{"schema_version":"onlylabs.public_signal.v1","title":"Google (DeepMind / Gemini) Writing: FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","description":"Google (DeepMind / Gemini) writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/9f252939-c040-4979-bcd9-913d3f6b9c53","json_url":"https://onlylabs.fyi/signals/9f252939-c040-4979-bcd9-913d3f6b9c53/signal.json","generated_at":"2026-06-07T21:15:17.59149+00:00","org":{"slug":"google-deepmind","name":"Google (DeepMind / Gemini)","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/google-deepmind","dossier_json_url":"https://onlylabs.fyi/labs/google-deepmind/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/9f252939-c040-4979-bcd9-913d3f6b9c53","signal_json":"https://onlylabs.fyi/signals/9f252939-c040-4979-bcd9-913d3f6b9c53/signal.json","source":"https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/","lab_dossier":"https://onlylabs.fyi/labs/google-deepmind","lab_dossier_json":"https://onlylabs.fyi/labs/google-deepmind/dossier.json","analysis":"https://onlylabs.fyi/analysis/google-deepmind","analysis_json":"https://onlylabs.fyi/analysis/google-deepmind/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/google-deepmind/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}]}},"answer_pack":{"answer":"Google (DeepMind / Gemini) published FACTS Benchmark Suite: Systematically evaluating the factuality of large language models. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Notable benchmark release from DeepMind, important for AI factuality. · FACTS Benchmark Suite: a new way to systematically evaluate LLMs factuality — Google DeepMind Skip to main content December 9, 2025 Responsibility & Safety FACTS.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality in the data-business radar.","signal_desk":"talking","source_context":{"source_url":"https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/","source_host":"deepmind.google","occurred_at":"2025-12-09T11:29:03+00:00","first_seen_at":"2026-06-05T05:42:58.356131+00:00","date_source":"rss.item_date","context":null},"context_markers":[{"label":"Lab","value":"Google (DeepMind / Gemini)","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"deepmind.google","source":"source"},{"label":"Notability","value":"Notable benchmark release from DeepMind, important for AI factuality.","source":"signal"},{"label":"Radar lane","value":"Evals and quality","source":"radar"},{"label":"Matched term","value":"eval","source":"radar"},{"label":"Matched term","value":"benchmark","source":"radar"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Data pipeline","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Safety and alignment","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-07T21:15:17.59149+00:00"},"data_business":{"matches":true,"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}],"matched_terms":["eval","benchmark"],"score":15,"reason":"Google (DeepMind / Gemini) has a writing signal matching evals and quality."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/9f252939-c040-4979-bcd9-913d3f6b9c53/signal.json","dossier_json":"https://onlylabs.fyi/labs/google-deepmind/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/google-deepmind/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/google-deepmind/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which data-business lane explains this signal: Evals and quality?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/9f252939-c040-4979-bcd9-913d3f6b9c53/signal.json","required":true},{"label":"source","url":"https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/google-deepmind/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/google-deepmind/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Google (DeepMind / Gemini)'s writing signal \"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"Google (DeepMind / Gemini)","predicate":"published","object":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","text":"Google (DeepMind / Gemini) published FACTS Benchmark Suite: Systematically evaluating the factuality of large language models."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"is classified as","object":"writing signal","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models is classified as writing signal."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"belongs to","object":"talking desk","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models belongs to talking desk."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has evidence coverage","object":"1 captured evidence page","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has evidence coverage 1 captured evidence page."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"matches data-business lanes","object":"Evals and quality","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models matches data-business lanes Evals and quality."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has captured page count","object":"1","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has captured page count 1."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has readable page count","object":"1","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has readable page count 1."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has related signal count","object":"6","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has related signal count 6."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has source host","object":"deepmind.google","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has source host deepmind.google."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has lab","object":"Google (DeepMind / Gemini)","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has lab Google (DeepMind / Gemini)."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has signal desk","object":"talking","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has signal desk talking."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has source host","object":"deepmind.google","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has source host deepmind.google."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has notability","object":"Notable benchmark release from DeepMind, important for AI factuality.","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has notability Notable benchmark release from DeepMind, important for AI factuality.."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has radar lane","object":"Evals and quality","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has radar lane Evals and quality."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has matched term","object":"eval","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has matched term eval."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has matched term","object":"benchmark","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has matched term benchmark."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has watch term","object":"Eval methodology","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has watch term Eval methodology."}]},"intelligence":{"signal_desk":"talking","answer":"Google (DeepMind / Gemini) published FACTS Benchmark Suite: Systematically evaluating the factuality of large language models. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Notable benchmark release from DeepMind, important for AI factuality. · FACTS Benchmark Suite: a new way to systematically evaluate LLMs factuality — Google DeepMind Skip to main content December 9, 2025 Responsibility & Safety FACTS.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality in the data-business radar.","semantic_triples":[{"subject":"Google (DeepMind / Gemini)","predicate":"published","object":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","text":"Google (DeepMind / Gemini) published FACTS Benchmark Suite: Systematically evaluating the factuality of large language models."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"is classified as","object":"writing signal","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models is classified as writing signal."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"belongs to","object":"talking desk","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models belongs to talking desk."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"has evidence coverage","object":"1 captured evidence page","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models has evidence coverage 1 captured evidence page."},{"subject":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","predicate":"matches data-business lanes","object":"Evals and quality","text":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models matches data-business lanes Evals and quality."}]},"signal":{"id":"9f252939-c040-4979-bcd9-913d3f6b9c53","url":"https://onlylabs.fyi/signals/9f252939-c040-4979-bcd9-913d3f6b9c53","json_url":"https://onlylabs.fyi/signals/9f252939-c040-4979-bcd9-913d3f6b9c53/signal.json","source_url":"https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/","title":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","summary":"Google (DeepMind / Gemini) published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"google-deepmind","name":"Google (DeepMind / Gemini)","category":"frontier-lab"},"occurred_at":"2025-12-09T11:29:03+00:00","first_seen_at":"2026-06-05T05:42:58.356131+00:00","date_source":"rss.item_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/"]},"facets":{},"traction":{"github_stars":null,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals"}],"score":15,"matched_terms":["eval","benchmark"],"reason":"Google (DeepMind / Gemini) has a writing signal matching evals and quality."}},"primary_evidence_page":{"url":"https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/","final_url":"https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/","title":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","http_status":200,"content_type":"text/html","capture_method":"plain","fetched_at":"2026-06-07T21:15:17.59149+00:00","bytes":161674,"raw_path":null,"content_hash":"b030d24817810817a6bf8f3e6ba679522d31d6e209986a208d7d1cf6d5cfb222","excerpt_chars":1200,"truncated":true,"excerpt":"FACTS Benchmark Suite: a new way to systematically evaluate LLMs factuality — Google DeepMind Skip to main content December 9, 2025 Responsibility & Safety FACTS Benchmark Suite: Systematically evaluating the factuality of large language models The FACTS team Share Large language models (LLMs) are increasingly becoming a primary source for information delivery across diverse use cases, so it’s important that their responses are factually accurate. In order to continue improving their performance on this industry-wide challenge, we have to better understand the types of use cases where models struggle to provide an accurate response and better measure factuality performance in those areas. The FACTS Benchmark Suite Today, we’re teaming up with Kaggle to introduce the FACTS Benchmark Suite . It extends our previous work developing the FACTS Grounding Benchmark , with three additional factuality benchmarks, including: A Parametric Benchmark that measures the model’s ability to access its internal knowledge accurately in factoid question use-cases. A Search Benchmark that tests a model’s ability to use Search as a tool to retrieve information and synthesize it correctly. A Multimodal..."},"evidence_pages":[{"url":"https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/","final_url":"https://deepmind.google/blog/facts-benchmark-suite-systematically-evaluating-the-factuality-of-large-language-models/","title":"FACTS Benchmark Suite: Systematically evaluating the factuality of large language models","http_status":200,"content_type":"text/html","capture_method":"plain","fetched_at":"2026-06-07T21:15:17.59149+00:00","bytes":161674,"raw_path":null,"content_hash":"b030d24817810817a6bf8f3e6ba679522d31d6e209986a208d7d1cf6d5cfb222","excerpt_chars":1200,"truncated":true,"excerpt":"FACTS Benchmark Suite: a new way to systematically evaluate LLMs factuality — Google DeepMind Skip to main content December 9, 2025 Responsibility & Safety FACTS Benchmark Suite: Systematically evaluating the factuality of large language models The FACTS team Share Large language models (LLMs) are increasingly becoming a primary source for information delivery across diverse use cases, so it’s important that their responses are factually accurate. In order to continue improving their performance on this industry-wide challenge, we have to better understand the types of use cases where models struggle to provide an accurate response and better measure factuality performance in those areas. The FACTS Benchmark Suite Today, we’re teaming up with Kaggle to introduce the FACTS Benchmark Suite . It extends our previous work developing the FACTS Grounding Benchmark , with three additional factuality benchmarks, including: A Parametric Benchmark that measures the model’s ability to access its internal knowledge accurately in factoid question use-cases. A Search Benchmark that tests a model’s ability to use Search as a tool to retrieve information and synthesize it correctly. A Multimodal..."}],"related_signals":[{"id":"8dc3cb48-ca03-4c5e-b776-2febb36923ff","url":"https://onlylabs.fyi/signals/8dc3cb48-ca03-4c5e-b776-2febb36923ff","source_url":"https://deepmind.google/blog/diffusiongemma-4x-faster-text-generation/","title":"DiffusionGemma: 4x faster text generation","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"google-deepmind","name":"Google (DeepMind / Gemini)","category":"frontier-lab"},"occurred_at":"2026-06-10T16:24:11+00:00","first_seen_at":"2026-06-11T07:01:08.279091+00:00","date_source":"rss.item_date"},{"id":"c3e1268c-6a62-4443-9861-8cc8b2fab2fa","url":"https://onlylabs.fyi/signals/c3e1268c-6a62-4443-9861-8cc8b2fab2fa","source_url":"https://deepmind.google/blog/fluid-natural-voice-translation-with-gemini-35-live-translate/","title":"Fluid, natural voice translation with Gemini 3.5 Live Translate","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"google-deepmind","name":"Google (DeepMind / Gemini)","category":"frontier-lab"},"occurred_at":"2026-06-09T15:16:25+00:00","first_seen_at":"2026-06-10T07:01:19.98686+00:00","date_source":"rss.item_date"},{"id":"91fc1de9-018b-4566-8d26-4287b86fc4d8","url":"https://onlylabs.fyi/signals/91fc1de9-018b-4566-8d26-4287b86fc4d8","source_url":"https://deepmind.google/blog/introducing-gemma-4-12b-a-unified-encoder-free-multimodal-model/","title":"Introducing Gemma 4 12B: a unified, encoder-free multimodal model","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"google-deepmind","name":"Google (DeepMind / Gemini)","category":"frontier-lab"},"occurred_at":"2026-06-09T14:10:19+00:00","first_seen_at":"2026-06-10T07:01:19.98686+00:00","date_source":"rss.item_date"},{"id":"8458c661-c5f3-4fef-afc1-abe720ecd4d5","url":"https://onlylabs.fyi/signals/8458c661-c5f3-4fef-afc1-abe720ecd4d5","source_url":"https://deepmind.google/blog/powering-the-future-of-robotics-in-europe/","title":"Powering the future of robotics in Europe","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"google-deepmind","name":"Google (DeepMind / Gemini)","category":"frontier-lab"},"occurred_at":"2026-06-09T14:02:33+00:00","first_seen_at":"2026-06-10T07:01:19.98686+00:00","date_source":"rss.item_date"},{"id":"136852e1-8df2-4a9d-9f2f-dd29e4038f1f","url":"https://onlylabs.fyi/signals/136852e1-8df2-4a9d-9f2f-dd29e4038f1f","source_url":"https://deepmind.google/blog/measuring-the-impact-of-learning-with-ai-in-sierra-leone-and-beyond/","title":"Measuring the impact of learning with AI in Sierra Leone and beyond","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"google-deepmind","name":"Google (DeepMind / Gemini)","category":"frontier-lab"},"occurred_at":"2026-06-08T13:04:59+00:00","first_seen_at":"2026-06-10T07:01:19.98686+00:00","date_source":"rss.item_date"},{"id":"5623a86e-2349-4e54-b92a-c76c14059b83","url":"https://onlylabs.fyi/signals/5623a86e-2349-4e54-b92a-c76c14059b83","source_url":"https://deepmind.google/blog/were-launching-the-google-deepmind-accelerator-program-in-asia-pacific-to-tackle-environmental-risks/","title":"We’re launching the Google DeepMind Accelerator program in Asia Pacific to tackle environmental risks","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"google-deepmind","name":"Google (DeepMind / Gemini)","category":"frontier-lab"},"occurred_at":"2026-05-21T19:46:42+00:00","first_seen_at":"2026-06-05T05:42:58.356131+00:00","date_source":"rss.item_date"}]}