{"schema_version":"onlylabs.public_signal.v1","title":"Anthropic Writing: Statistical Approach To Model Evals","description":"Anthropic writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/75a45571-7b23-414c-8c98-1536e1948484","json_url":"https://onlylabs.fyi/signals/75a45571-7b23-414c-8c98-1536e1948484/signal.json","generated_at":"2026-06-11T04:18:19.449627+00:00","org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/anthropic","dossier_json_url":"https://onlylabs.fyi/labs/anthropic/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/75a45571-7b23-414c-8c98-1536e1948484","signal_json":"https://onlylabs.fyi/signals/75a45571-7b23-414c-8c98-1536e1948484/signal.json","source":"https://www.anthropic.com/research/statistical-approach-to-model-evals","lab_dossier":"https://onlylabs.fyi/labs/anthropic","lab_dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis":"https://onlylabs.fyi/analysis/anthropic","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}]}},"answer_pack":{"answer":"Anthropic published Statistical Approach To Model Evals. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Basic statistical advice welcomed, but community wants more advanced methods and consistent eval terminology. · A statistical approach to model evaluations \\ Anthropic Evaluations A statistical approach to model evaluations Nov 19, 2024 Read the paper Suppose an AI model.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality in the data-business radar.","signal_desk":"talking","source_context":{"source_url":"https://www.anthropic.com/research/statistical-approach-to-model-evals","source_host":"anthropic.com","occurred_at":"2024-11-19T20:49:39+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod","context":null},"context_markers":[{"label":"Lab","value":"Anthropic","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"anthropic.com","source":"source"},{"label":"HN","value":"Basic statistical advice welcomed, but community wants more advanced methods and consistent eval terminology.","source":"source"},{"label":"Radar lane","value":"Evals and quality","source":"radar"},{"label":"Matched term","value":"eval","source":"radar"},{"label":"Matched term","value":"evals","source":"radar"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/statistical-approach-to-model-evals"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-11T04:18:19.449627+00:00"},"data_business":{"matches":true,"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}],"matched_terms":["eval","evals"],"score":17,"reason":"Anthropic has a writing signal matching evals and quality."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/75a45571-7b23-414c-8c98-1536e1948484/signal.json","dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which data-business lane explains this signal: Evals and quality?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/75a45571-7b23-414c-8c98-1536e1948484/signal.json","required":true},{"label":"source","url":"https://www.anthropic.com/research/statistical-approach-to-model-evals","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/anthropic/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/anthropic/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Anthropic's writing signal \"Statistical Approach To Model Evals\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Statistical Approach To Model Evals","text":"Anthropic published Statistical Approach To Model Evals."},{"subject":"Statistical Approach To Model Evals","predicate":"is classified as","object":"writing signal","text":"Statistical Approach To Model Evals is classified as writing signal."},{"subject":"Statistical Approach To Model Evals","predicate":"belongs to","object":"talking desk","text":"Statistical Approach To Model Evals belongs to talking desk."},{"subject":"Statistical Approach To Model Evals","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Statistical Approach To Model Evals has evidence coverage 1 captured evidence page."},{"subject":"Statistical Approach To Model Evals","predicate":"matches data-business lanes","object":"Evals and quality","text":"Statistical Approach To Model Evals matches data-business lanes Evals and quality."},{"subject":"Statistical Approach To Model Evals","predicate":"has captured page count","object":"1","text":"Statistical Approach To Model Evals has captured page count 1."},{"subject":"Statistical Approach To Model Evals","predicate":"has readable page count","object":"1","text":"Statistical Approach To Model Evals has readable page count 1."},{"subject":"Statistical Approach To Model Evals","predicate":"has related signal count","object":"6","text":"Statistical Approach To Model Evals has related signal count 6."},{"subject":"Statistical Approach To Model Evals","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"Statistical Approach To Model Evals has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"Statistical Approach To Model Evals","predicate":"has source host","object":"anthropic.com","text":"Statistical Approach To Model Evals has source host anthropic.com."},{"subject":"Statistical Approach To Model Evals","predicate":"has lab","object":"Anthropic","text":"Statistical Approach To Model Evals has lab Anthropic."},{"subject":"Statistical Approach To Model Evals","predicate":"has signal desk","object":"talking","text":"Statistical Approach To Model Evals has signal desk talking."},{"subject":"Statistical Approach To Model Evals","predicate":"has source host","object":"anthropic.com","text":"Statistical Approach To Model Evals has source host anthropic.com."},{"subject":"Statistical Approach To Model Evals","predicate":"has hn","object":"Basic statistical advice welcomed, but community wants more advanced methods and consistent eval terminology.","text":"Statistical Approach To Model Evals has hn Basic statistical advice welcomed, but community wants more advanced methods and consistent eval terminology.."},{"subject":"Statistical Approach To Model Evals","predicate":"has radar lane","object":"Evals and quality","text":"Statistical Approach To Model Evals has radar lane Evals and quality."},{"subject":"Statistical Approach To Model Evals","predicate":"has matched term","object":"eval","text":"Statistical Approach To Model Evals has matched term eval."},{"subject":"Statistical Approach To Model Evals","predicate":"has matched term","object":"evals","text":"Statistical Approach To Model Evals has matched term evals."},{"subject":"Statistical Approach To Model Evals","predicate":"has watch term","object":"Eval methodology","text":"Statistical Approach To Model Evals has watch term Eval methodology."}]},"intelligence":{"signal_desk":"talking","answer":"Anthropic published Statistical Approach To Model Evals. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Basic statistical advice welcomed, but community wants more advanced methods and consistent eval terminology. · A statistical approach to model evaluations \\ Anthropic Evaluations A statistical approach to model evaluations Nov 19, 2024 Read the paper Suppose an AI model.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality in the data-business radar.","semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Statistical Approach To Model Evals","text":"Anthropic published Statistical Approach To Model Evals."},{"subject":"Statistical Approach To Model Evals","predicate":"is classified as","object":"writing signal","text":"Statistical Approach To Model Evals is classified as writing signal."},{"subject":"Statistical Approach To Model Evals","predicate":"belongs to","object":"talking desk","text":"Statistical Approach To Model Evals belongs to talking desk."},{"subject":"Statistical Approach To Model Evals","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Statistical Approach To Model Evals has evidence coverage 1 captured evidence page."},{"subject":"Statistical Approach To Model Evals","predicate":"matches data-business lanes","object":"Evals and quality","text":"Statistical Approach To Model Evals matches data-business lanes Evals and quality."}]},"signal":{"id":"75a45571-7b23-414c-8c98-1536e1948484","url":"https://onlylabs.fyi/signals/75a45571-7b23-414c-8c98-1536e1948484","json_url":"https://onlylabs.fyi/signals/75a45571-7b23-414c-8c98-1536e1948484/signal.json","source_url":"https://www.anthropic.com/research/statistical-approach-to-model-evals","title":"Statistical Approach To Model Evals","summary":"Anthropic published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2024-11-19T20:49:39+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/statistical-approach-to-model-evals"]},"facets":{},"traction":{"github_stars":null,"hn_points":66,"hn_comments":49,"hn_story_id":"42220573","hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals"}],"score":17,"matched_terms":["eval","evals"],"reason":"Anthropic has a writing signal matching evals and quality."}},"primary_evidence_page":{"url":"https://www.anthropic.com/research/statistical-approach-to-model-evals","final_url":"https://www.anthropic.com/research/statistical-approach-to-model-evals","title":"Statistical Approach To Model Evals","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:18:19.449627+00:00","bytes":149125,"raw_path":"5e4f29b9b8d279315e72843bcda2e9118611e8bf5aa23bf1a46949c580ff65f6.html","content_hash":"13afc61253d5b26eb0c33647510d5d3d14fef1150cc0b32346fec78eb4431758","excerpt_chars":1200,"truncated":true,"excerpt":"A statistical approach to model evaluations \\ Anthropic Evaluations A statistical approach to model evaluations Nov 19, 2024 Read the paper Suppose an AI model outperforms another model on a benchmark of interest—testing its general knowledge, for example, or its ability to solve computer-coding questions. Is the difference in capabilities real, or could one model simply have gotten lucky in the choice of questions on the benchmark? With the amount of public interest in AI model evaluations—informally called “evals”—this question remains surprisingly understudied among the AI research community. This month, we published a new research paper that attempts to answer the question rigorously. Drawing on statistical theory and the experiment design literature, the paper makes a number of recommendations to the AI research community for reporting eval results in a scientifically informative way. In this post, we briefly go over the reporting recommendations, and the logic behind them. Recommendation #1: Use the Central Limit Theorem Evals often consist of hundreds or thousands of unrelated questions. MMLU , for instance, contains questions as diverse as: Who discovered the first virus?..."},"evidence_pages":[{"url":"https://www.anthropic.com/research/statistical-approach-to-model-evals","final_url":"https://www.anthropic.com/research/statistical-approach-to-model-evals","title":"Statistical Approach To Model Evals","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:18:19.449627+00:00","bytes":149125,"raw_path":"5e4f29b9b8d279315e72843bcda2e9118611e8bf5aa23bf1a46949c580ff65f6.html","content_hash":"13afc61253d5b26eb0c33647510d5d3d14fef1150cc0b32346fec78eb4431758","excerpt_chars":1200,"truncated":true,"excerpt":"A statistical approach to model evaluations \\ Anthropic Evaluations A statistical approach to model evaluations Nov 19, 2024 Read the paper Suppose an AI model outperforms another model on a benchmark of interest—testing its general knowledge, for example, or its ability to solve computer-coding questions. Is the difference in capabilities real, or could one model simply have gotten lucky in the choice of questions on the benchmark? With the amount of public interest in AI model evaluations—informally called “evals”—this question remains surprisingly understudied among the AI research community. This month, we published a new research paper that attempts to answer the question rigorously. Drawing on statistical theory and the experiment design literature, the paper makes a number of recommendations to the AI research community for reporting eval results in a scientifically informative way. In this post, we briefly go over the reporting recommendations, and the logic behind them. Recommendation #1: Use the Central Limit Theorem Evals often consist of hundreds or thousands of unrelated questions. MMLU , for instance, contains questions as diverse as: Who discovered the first virus?..."}],"related_signals":[{"id":"6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","url":"https://onlylabs.fyi/signals/6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","source_url":"https://www.anthropic.com/research/agents-in-biology","title":"Agents In Biology","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-10T15:16:01+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"2648db51-9d6a-42a9-aece-a0ca5f9ce64f","url":"https://onlylabs.fyi/signals/2648db51-9d6a-42a9-aece-a0ca5f9ce64f","source_url":"https://www.anthropic.com/news/claude-fable-5-mythos-5","title":"Claude Fable 5 Mythos 5","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-09T20:27:50+00:00","first_seen_at":"2026-06-10T07:01:05.666054+00:00","date_source":"sitemap.lastmod"},{"id":"8475487f-45b4-4689-9bc5-8e4c6ca0457d","url":"https://onlylabs.fyi/signals/8475487f-45b4-4689-9bc5-8e4c6ca0457d","source_url":"https://www.anthropic.com/engineering/how-we-contain-claude","title":"How We Contain Claude","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-06T00:28:16+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","url":"https://onlylabs.fyi/signals/e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","source_url":"https://www.anthropic.com/research/making-claude-a-chemist","title":"Making Claude A Chemist","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T20:13:40+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"cc62deba-9682-4751-aa6b-81c3bd7122a0","url":"https://onlylabs.fyi/signals/cc62deba-9682-4751-aa6b-81c3bd7122a0","source_url":"https://www.anthropic.com/research/measuring-agent-autonomy","title":"Measuring Agent Autonomy","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:49:18+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"93da14fd-7141-4e17-abd6-1c8d52435c70","url":"https://onlylabs.fyi/signals/93da14fd-7141-4e17-abd6-1c8d52435c70","source_url":"https://www.anthropic.com/research/values-wild","title":"Values Wild","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:38:54+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"}]}