{"schema_version":"onlylabs.public_signal.v1","title":"OpenAI Writing: MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","description":"OpenAI writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/df878412-cd52-4abe-bb51-5bf12e1d64e9","json_url":"https://onlylabs.fyi/signals/df878412-cd52-4abe-bb51-5bf12e1d64e9/signal.json","generated_at":"2026-06-08T15:45:56.006+00:00","org":{"slug":"openai","name":"OpenAI","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/openai","dossier_json_url":"https://onlylabs.fyi/labs/openai/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/df878412-cd52-4abe-bb51-5bf12e1d64e9","signal_json":"https://onlylabs.fyi/signals/df878412-cd52-4abe-bb51-5bf12e1d64e9/signal.json","source":"https://openai.com/index/mle-bench","lab_dossier":"https://onlylabs.fyi/labs/openai","lab_dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis":"https://onlylabs.fyi/analysis/openai","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}]}},"answer_pack":{"answer":"OpenAI published MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: New benchmark from major lab for ML agents. · MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering | OpenAI October 10, 2024 MLE-bench Evaluating Machine Learning Agents on Machine Learning.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality in the data-business radar.","signal_desk":"talking","source_context":{"source_url":"https://openai.com/index/mle-bench","source_host":"openai.com","occurred_at":"2024-10-10T10:00:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","context":null},"context_markers":[{"label":"Lab","value":"OpenAI","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"openai.com","source":"source"},{"label":"Notability","value":"New benchmark from major lab for ML agents.","source":"signal"},{"label":"Radar lane","value":"Evals and quality","source":"radar"},{"label":"Matched term","value":"eval","source":"radar"},{"label":"Matched term","value":"benchmark","source":"radar"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Data pipeline","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Safety and alignment","source":"evidence"},{"label":"Watch term","value":"Agents and tool use","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/mle-bench"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-08T15:45:56.006+00:00"},"data_business":{"matches":true,"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}],"matched_terms":["eval","benchmark"],"score":16,"reason":"OpenAI has a writing signal matching evals and quality."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/df878412-cd52-4abe-bb51-5bf12e1d64e9/signal.json","dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which data-business lane explains this signal: Evals and quality?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/df878412-cd52-4abe-bb51-5bf12e1d64e9/signal.json","required":true},{"label":"source","url":"https://openai.com/index/mle-bench","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/openai/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/openai/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze OpenAI's writing signal \"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","text":"OpenAI published MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"is classified as","object":"writing signal","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering is classified as writing signal."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"belongs to","object":"talking desk","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering belongs to talking desk."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has evidence coverage","object":"1 captured evidence page","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has evidence coverage 1 captured evidence page."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"matches data-business lanes","object":"Evals and quality","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering matches data-business lanes Evals and quality."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has captured page count","object":"1","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has captured page count 1."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has readable page count","object":"1","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has readable page count 1."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has related signal count","object":"6","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has related signal count 6."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has source host","object":"openai.com","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has source host openai.com."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has lab","object":"OpenAI","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has lab OpenAI."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has signal desk","object":"talking","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has signal desk talking."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has source host","object":"openai.com","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has source host openai.com."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has notability","object":"New benchmark from major lab for ML agents.","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has notability New benchmark from major lab for ML agents.."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has radar lane","object":"Evals and quality","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has radar lane Evals and quality."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has matched term","object":"eval","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has matched term eval."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has matched term","object":"benchmark","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has matched term benchmark."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has watch term","object":"Eval methodology","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has watch term Eval methodology."}]},"intelligence":{"signal_desk":"talking","answer":"OpenAI published MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: New benchmark from major lab for ML agents. · MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering | OpenAI October 10, 2024 MLE-bench Evaluating Machine Learning Agents on Machine Learning.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality in the data-business radar.","semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","text":"OpenAI published MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"is classified as","object":"writing signal","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering is classified as writing signal."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"belongs to","object":"talking desk","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering belongs to talking desk."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"has evidence coverage","object":"1 captured evidence page","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering has evidence coverage 1 captured evidence page."},{"subject":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","predicate":"matches data-business lanes","object":"Evals and quality","text":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering matches data-business lanes Evals and quality."}]},"signal":{"id":"df878412-cd52-4abe-bb51-5bf12e1d64e9","url":"https://onlylabs.fyi/signals/df878412-cd52-4abe-bb51-5bf12e1d64e9","json_url":"https://onlylabs.fyi/signals/df878412-cd52-4abe-bb51-5bf12e1d64e9/signal.json","source_url":"https://openai.com/index/mle-bench","title":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","summary":"OpenAI published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2024-10-10T10:00:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/mle-bench"]},"facets":{},"traction":{"github_stars":null,"hn_points":3,"hn_comments":0,"hn_story_id":"41808821","hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals"}],"score":16,"matched_terms":["eval","benchmark"],"reason":"OpenAI has a writing signal matching evals and quality."}},"primary_evidence_page":{"url":"https://openai.com/index/mle-bench","final_url":"https://openai.com/index/mle-bench","title":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:45:56.006+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering | OpenAI October 10, 2024 MLE-bench Evaluating Machine Learning Agents on Machine Learning Engineering Share We introduce MLE-bench, a benchmark for measuring how well AI agents perform at machine learning engineering. To this end, we curate 75 ML engineering-related competitions from Kaggle, creating a diverse set of challenging tasks that test real-world ML engineering skills such as training models, preparing datasets, and running experiments. We establish human baselines for each competition using Kaggle's publicly available leaderboards. We use open-source agent scaffolds to evaluate several frontier language models on our benchmark, finding that the best-performing setup — OpenAI's o1‑preview with AIDE scaffolding — achieves at least the level of a Kaggle bronze medal in 16.9% of competitions. In addition to our main results, we investigate various forms of resource-scaling for AI agents and the impact of contamination from pre-training. We open-source our benchmark code⁠ to facilitate future research in understanding the ML engineering capabilities of AI agents. - o1 - Software & Engineering -..."},"evidence_pages":[{"url":"https://openai.com/index/mle-bench","final_url":"https://openai.com/index/mle-bench","title":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:45:56.006+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering | OpenAI October 10, 2024 MLE-bench Evaluating Machine Learning Agents on Machine Learning Engineering Share We introduce MLE-bench, a benchmark for measuring how well AI agents perform at machine learning engineering. To this end, we curate 75 ML engineering-related competitions from Kaggle, creating a diverse set of challenging tasks that test real-world ML engineering skills such as training models, preparing datasets, and running experiments. We establish human baselines for each competition using Kaggle's publicly available leaderboards. We use open-source agent scaffolds to evaluate several frontier language models on our benchmark, finding that the best-performing setup — OpenAI's o1‑preview with AIDE scaffolding — achieves at least the level of a Kaggle bronze medal in 16.9% of competitions. In addition to our main results, we investigate various forms of resource-scaling for AI agents and the impact of contamination from pre-training. We open-source our benchmark code⁠ to facilitate future research in understanding the ML engineering capabilities of AI agents. - o1 - Software & Engineering -..."}],"related_signals":[{"id":"b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","url":"https://onlylabs.fyi/signals/b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","source_url":"https://openai.com/index/supporting-eu-trustworthy-ai-ecosystem","title":"Supporting Europe’s work in ensuring a trustworthy AI ecosystem ","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T08:00:56.140796+00:00","date_source":"rss.item_date"},{"id":"2638c0a7-b372-409c-ac72-f6d81d6464dc","url":"https://onlylabs.fyi/signals/2638c0a7-b372-409c-ac72-f6d81d6464dc","source_url":"https://openai.com/index/using-codex-to-simulate-black-holes","title":"How an astrophysicist uses Codex to help simulate black holes","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"509ea784-51ec-4ede-855b-5a4d1b27d3be","url":"https://onlylabs.fyi/signals/509ea784-51ec-4ede-855b-5a4d1b27d3be","source_url":"https://openai.com/index/openai-on-oracle-cloud","title":"Access OpenAI models and Codex through your Oracle cloud commitment","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T20:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4f051449-87f2-466e-941e-b5918381a8fe","url":"https://onlylabs.fyi/signals/4f051449-87f2-466e-941e-b5918381a8fe","source_url":"https://openai.com/index/prc-linked-influence-operations-ai-debates","title":"PRC-linked influence operations are targeting AI debates in the US","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T12:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","url":"https://onlylabs.fyi/signals/4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","source_url":"https://openai.com/index/lseg","title":"From data to decisions: how LSEG is scaling trusted AI","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T00:00:00+00:00","first_seen_at":"2026-06-10T09:18:54.26094+00:00","date_source":"rss.item_date"},{"id":"fb16aa7a-c4ef-4859-b514-0839c2f1330d","url":"https://onlylabs.fyi/signals/fb16aa7a-c4ef-4859-b514-0839c2f1330d","source_url":"https://openai.com/index/nextdoor","title":"How engineers at Nextdoor use Codex to build without limits","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-09T12:00:00+00:00","first_seen_at":"2026-06-10T07:01:28.700378+00:00","date_source":"rss.item_date"}]}