{"schema_version":"onlylabs.public_signal.v1","title":"OpenAI Writing: Introducing SWE-bench Verified","description":"OpenAI writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/b456d14b-d076-485e-b9c9-e674f8912986","json_url":"https://onlylabs.fyi/signals/b456d14b-d076-485e-b9c9-e674f8912986/signal.json","generated_at":"2026-06-08T15:45:59.746+00:00","org":{"slug":"openai","name":"OpenAI","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/openai","dossier_json_url":"https://onlylabs.fyi/labs/openai/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/b456d14b-d076-485e-b9c9-e674f8912986","signal_json":"https://onlylabs.fyi/signals/b456d14b-d076-485e-b9c9-e674f8912986/signal.json","source":"https://openai.com/index/introducing-swe-bench-verified","lab_dossier":"https://onlylabs.fyi/labs/openai","lab_dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis":"https://onlylabs.fyi/analysis/openai","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"evals","label":"Evals and 
quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}]}},"answer_pack":{"answer":"OpenAI published Introducing SWE-bench Verified. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: New benchmark from major lab · Introducing SWE-bench Verified | OpenAI August 13, 2024 Introducing SWE-bench Verified We’re releasing a human-validated subset of SWE-bench that more reliably evaluates.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality in the data-business radar.","signal_desk":"talking","source_context":{"source_url":"https://openai.com/index/introducing-swe-bench-verified","source_host":"openai.com","occurred_at":"2024-08-13T10:00:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","context":null},"context_markers":[{"label":"Lab","value":"OpenAI","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"openai.com","source":"source"},{"label":"Notability","value":"New benchmark from major lab","source":"signal"},{"label":"HN","value":"Skepticism about flawed benchmark examples and AI's misguided focus on real-world problems.","source":"source"},{"label":"Radar lane","value":"Evals and quality","source":"radar"},{"label":"Matched term","value":"eval","source":"radar"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Data pipeline","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Safety and alignment","source":"evidence"},{"label":"Watch term","value":"Agents and tool 
use","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/introducing-swe-bench-verified"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-08T15:45:59.746+00:00"},"data_business":{"matches":true,"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}],"matched_terms":["eval"],"score":14,"reason":"OpenAI has a writing signal matching evals and quality."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/b456d14b-d076-485e-b9c9-e674f8912986/signal.json","dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does 
this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which data-business lane explains this signal: Evals and quality?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/b456d14b-d076-485e-b9c9-e674f8912986/signal.json","required":true},{"label":"source","url":"https://openai.com/index/introducing-swe-bench-verified","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/openai/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/openai/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze OpenAI's writing signal \"Introducing SWE-bench Verified\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"Introducing SWE-bench Verified","text":"OpenAI published Introducing SWE-bench Verified."},{"subject":"Introducing SWE-bench Verified","predicate":"is classified as","object":"writing signal","text":"Introducing SWE-bench Verified is classified as writing signal."},{"subject":"Introducing SWE-bench Verified","predicate":"belongs to","object":"talking 
desk","text":"Introducing SWE-bench Verified belongs to talking desk."},{"subject":"Introducing SWE-bench Verified","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Introducing SWE-bench Verified has evidence coverage 1 captured evidence page."},{"subject":"Introducing SWE-bench Verified","predicate":"matches data-business lanes","object":"Evals and quality","text":"Introducing SWE-bench Verified matches data-business lanes Evals and quality."},{"subject":"Introducing SWE-bench Verified","predicate":"has captured page count","object":"1","text":"Introducing SWE-bench Verified has captured page count 1."},{"subject":"Introducing SWE-bench Verified","predicate":"has readable page count","object":"1","text":"Introducing SWE-bench Verified has readable page count 1."},{"subject":"Introducing SWE-bench Verified","predicate":"has related signal count","object":"6","text":"Introducing SWE-bench Verified has related signal count 6."},{"subject":"Introducing SWE-bench Verified","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"Introducing SWE-bench Verified has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention."},{"subject":"Introducing SWE-bench Verified","predicate":"has source host","object":"openai.com","text":"Introducing SWE-bench Verified has source host openai.com."},{"subject":"Introducing SWE-bench Verified","predicate":"has lab","object":"OpenAI","text":"Introducing SWE-bench Verified has lab OpenAI."},{"subject":"Introducing SWE-bench Verified","predicate":"has signal desk","object":"talking","text":"Introducing SWE-bench Verified has signal desk talking."},{"subject":"Introducing SWE-bench Verified","predicate":"has source 
host","object":"openai.com","text":"Introducing SWE-bench Verified has source host openai.com."},{"subject":"Introducing SWE-bench Verified","predicate":"has notability","object":"New benchmark from major lab","text":"Introducing SWE-bench Verified has notability New benchmark from major lab."},{"subject":"Introducing SWE-bench Verified","predicate":"has hn","object":"Skepticism about flawed benchmark examples and AI's misguided focus on real-world problems.","text":"Introducing SWE-bench Verified has hn Skepticism about flawed benchmark examples and AI's misguided focus on real-world problems."},{"subject":"Introducing SWE-bench Verified","predicate":"has radar lane","object":"Evals and quality","text":"Introducing SWE-bench Verified has radar lane Evals and quality."},{"subject":"Introducing SWE-bench Verified","predicate":"has matched term","object":"eval","text":"Introducing SWE-bench Verified has matched term eval."},{"subject":"Introducing SWE-bench Verified","predicate":"has watch term","object":"Eval methodology","text":"Introducing SWE-bench Verified has watch term Eval methodology."}]},"intelligence":{"signal_desk":"talking","answer":"OpenAI published Introducing SWE-bench Verified. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: New benchmark from major lab · Introducing SWE-bench Verified | OpenAI August 13, 2024 Introducing SWE-bench Verified We’re releasing a human-validated subset of SWE-bench that more reliably evaluates.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. 
It also maps to Evals and quality in the data-business radar.","semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"Introducing SWE-bench Verified","text":"OpenAI published Introducing SWE-bench Verified."},{"subject":"Introducing SWE-bench Verified","predicate":"is classified as","object":"writing signal","text":"Introducing SWE-bench Verified is classified as writing signal."},{"subject":"Introducing SWE-bench Verified","predicate":"belongs to","object":"talking desk","text":"Introducing SWE-bench Verified belongs to talking desk."},{"subject":"Introducing SWE-bench Verified","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Introducing SWE-bench Verified has evidence coverage 1 captured evidence page."},{"subject":"Introducing SWE-bench Verified","predicate":"matches data-business lanes","object":"Evals and quality","text":"Introducing SWE-bench Verified matches data-business lanes Evals and quality."}]},"signal":{"id":"b456d14b-d076-485e-b9c9-e674f8912986","url":"https://onlylabs.fyi/signals/b456d14b-d076-485e-b9c9-e674f8912986","json_url":"https://onlylabs.fyi/signals/b456d14b-d076-485e-b9c9-e674f8912986/signal.json","source_url":"https://openai.com/index/introducing-swe-bench-verified","title":"Introducing SWE-bench Verified","summary":"OpenAI published a writing signal. 
onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2024-08-13T10:00:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/introducing-swe-bench-verified"]},"facets":{},"traction":{"github_stars":null,"hn_points":46,"hn_comments":10,"hn_story_id":"41237204","hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals"}],"score":14,"matched_terms":["eval"],"reason":"OpenAI has a writing signal matching evals and quality."}},"primary_evidence_page":{"url":"https://openai.com/index/introducing-swe-bench-verified","final_url":"https://openai.com/index/introducing-swe-bench-verified","title":"Introducing SWE-bench Verified","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:45:59.746+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"Introducing SWE-bench Verified | OpenAI August 13, 2024 Introducing SWE-bench Verified We’re releasing a human-validated subset of SWE-bench that more reliably evaluates AI models’ ability to solve real-world software issues. Loading… Share Updated February 24, 2025 As part of our Preparedness Framework⁠, OpenAI develops a range of metrics to track, evaluate, and forecast models’ abilities to act autonomously. The ability to autonomously complete software engineering tasks is a key component of our Medium risk level in the Model Autonomy risk category. 
Evaluating these capabilities is challenging due to the complexity of software engineering tasks, the difficulty of accurately assessing generated code, and the challenge of simulating real-world development scenarios. Therefore, our approach to Preparedness must also involve careful examination of evaluations themselves, to reduce the potential for underestimating or overestimating performance in important risk categories. One of the most popular evaluation suites for software engineering is SWE-bench⁠ 1—a benchmark for evaluating large language models’ (LLMs’) abilities to solve real-world software issues sourced from GitHub. The..."},"evidence_pages":[{"url":"https://openai.com/index/introducing-swe-bench-verified","final_url":"https://openai.com/index/introducing-swe-bench-verified","title":"Introducing SWE-bench Verified","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:45:59.746+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"Introducing SWE-bench Verified | OpenAI August 13, 2024 Introducing SWE-bench Verified We’re releasing a human-validated subset of SWE-bench that more reliably evaluates AI models’ ability to solve real-world software issues. Loading… Share Updated February 24, 2025 As part of our Preparedness Framework⁠, OpenAI develops a range of metrics to track, evaluate, and forecast models’ abilities to act autonomously. The ability to autonomously complete software engineering tasks is a key component of our Medium risk level in the Model Autonomy risk category. Evaluating these capabilities is challenging due to the complexity of software engineering tasks, the difficulty of accurately assessing generated code, and the challenge of simulating real-world development scenarios. 
Therefore, our approach to Preparedness must also involve careful examination of evaluations themselves, to reduce the potential for underestimating or overestimating performance in important risk categories. One of the most popular evaluation suites for software engineering is SWE-bench⁠ 1—a benchmark for evaluating large language models’ (LLMs’) abilities to solve real-world software issues sourced from GitHub. The..."}],"related_signals":[{"id":"b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","url":"https://onlylabs.fyi/signals/b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","source_url":"https://openai.com/index/supporting-eu-trustworthy-ai-ecosystem","title":"Supporting Europe’s work in ensuring a trustworthy AI ecosystem ","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T08:00:56.140796+00:00","date_source":"rss.item_date"},{"id":"2638c0a7-b372-409c-ac72-f6d81d6464dc","url":"https://onlylabs.fyi/signals/2638c0a7-b372-409c-ac72-f6d81d6464dc","source_url":"https://openai.com/index/using-codex-to-simulate-black-holes","title":"How an astrophysicist uses Codex to help simulate black holes","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"509ea784-51ec-4ede-855b-5a4d1b27d3be","url":"https://onlylabs.fyi/signals/509ea784-51ec-4ede-855b-5a4d1b27d3be","source_url":"https://openai.com/index/openai-on-oracle-cloud","title":"Access OpenAI models and Codex through your Oracle cloud 
commitment","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T20:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4f051449-87f2-466e-941e-b5918381a8fe","url":"https://onlylabs.fyi/signals/4f051449-87f2-466e-941e-b5918381a8fe","source_url":"https://openai.com/index/prc-linked-influence-operations-ai-debates","title":"PRC-linked influence operations are targeting AI debates in the US","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T12:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","url":"https://onlylabs.fyi/signals/4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","source_url":"https://openai.com/index/lseg","title":"From data to decisions: how LSEG is scaling trusted AI","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T00:00:00+00:00","first_seen_at":"2026-06-10T09:18:54.26094+00:00","date_source":"rss.item_date"},{"id":"fb16aa7a-c4ef-4859-b514-0839c2f1330d","url":"https://onlylabs.fyi/signals/fb16aa7a-c4ef-4859-b514-0839c2f1330d","source_url":"https://openai.com/index/nextdoor","title":"How engineers at Nextdoor use Codex to build without limits","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-09T12:00:00+00:00","first_seen_at":"2026-06-10T07:01:28.700378+00:00","date_source":"rss.item_date"}]}