{"schema_version":"onlylabs.public_signal.v1","title":"Amazon (Nova) Repo: amazon-science/when-thinking-fails-RLLM-if-evaluation","description":"Amazon (Nova) repo signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/f70ee13e-25e9-49c1-ae70-1f5737238475","json_url":"https://onlylabs.fyi/signals/f70ee13e-25e9-49c1-ae70-1f5737238475/signal.json","generated_at":"2026-06-11T03:58:42.727155+00:00","org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/amazon","dossier_json_url":"https://onlylabs.fyi/labs/amazon/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/f70ee13e-25e9-49c1-ae70-1f5737238475","signal_json":"https://onlylabs.fyi/signals/f70ee13e-25e9-49c1-ae70-1f5737238475/signal.json","source":"https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation","lab_dossier":"https://onlylabs.fyi/labs/amazon","lab_dossier_json":"https://onlylabs.fyi/labs/amazon/dossier.json","analysis":"https://onlylabs.fyi/analysis/amazon","analysis_json":"https://onlylabs.fyi/analysis/amazon/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/amazon/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":null,"topic_signals_json":null,"topic_feed":null,"data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}]}},"answer_pack":{"answer":"Amazon 
(Nova) published amazon-science/when-thinking-fails-RLLM-if-evaluation (Python). This repository signal exposes tooling, eval, infrastructure, or model-adjacent work before it may appear in a launch post. High-signal details: repo amazon-science/when-thinking-fails-RLLM-if-evaluation · language Python · Research repo from amazon-science, topic notable but no traction info. onlylabs links this event to 1 captured evidence page and 6 related repo signals. It also maps to Evals and quality in the data-business radar.","signal_desk":"repos","source_context":{"source_url":"https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation","source_host":"github.com","occurred_at":"2025-10-20T22:41:15+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source","context":"Python"},"context_markers":[{"label":"Lab","value":"Amazon (Nova)","source":"signal"},{"label":"Signal desk","value":"repos","source":"signal"},{"label":"Source host","value":"github.com","source":"source"},{"label":"Repository","value":"amazon-science/when-thinking-fails-RLLM-if-evaluation","source":"source"},{"label":"Language","value":"Python","source":"source"},{"label":"Notability","value":"Research repo from amazon-science, topic notable but no traction info","source":"signal"},{"label":"Radar lane","value":"Evals and quality","source":"radar"},{"label":"Matched term","value":"eval","source":"radar"},{"label":"Matched term","value":"evaluation","source":"radar"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Model card","source":"model"},{"label":"Watch term","value":"Data 
pipeline","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-11T03:58:42.727155+00:00"},"data_business":{"matches":true,"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}],"matched_terms":["eval","evaluation"],"score":16,"reason":"Amazon (Nova) has a repo signal matching evals and quality."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/f70ee13e-25e9-49c1-ae70-1f5737238475/signal.json","dossier_json":"https://onlylabs.fyi/labs/amazon/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/amazon/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/amazon/evidence.json","topic_signals_json":null,"topic_feed":null,"category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn new repository signals into early evidence of tooling, eval, infrastructure, model-adjacent, or product work before it appears in polished launch channels.","evidence_focus":["repo name","owner","description","language","stars","source URL","first seen time","data, eval, infra, safety, and product terms"],"extraction_questions":["What technical area does this repository expose?","Does the repo imply eval, data, infrastructure, agent, or deployment work?","Is the repo new evidence for a lab direction that is not yet in writing or releases?","Which related signals should an analyst inspect next?"],"signal_questions":["What does this new repository reveal before a formal announcement 
exists?","What technical area does this repository expose?","Does the repo imply eval, data, infrastructure, agent, or deployment work?","Which data-business lane explains this signal: Evals and quality?","Do the 6 related repo signals show a repeated pattern?"],"output_fields":["org","repo","technical_theme","data_business_lane","evidence_url"],"data_business_relevance":"New repositories can expose organization build priorities early, especially around internal tooling, eval infrastructure, data systems, deployment, and agent workflows.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/f70ee13e-25e9-49c1-ae70-1f5737238475/signal.json","required":true},{"label":"source","url":"https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/amazon/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/amazon/evidence.json","required":true},{"label":"topic_signals_json","url":null,"required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Amazon (Nova)'s repo signal \"amazon-science/when-thinking-fails-RLLM-if-evaluation\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"Amazon (Nova)","predicate":"published repo","object":"amazon-science/when-thinking-fails-RLLM-if-evaluation","text":"Amazon (Nova) published repo amazon-science/when-thinking-fails-RLLM-if-evaluation."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"is classified as","object":"repo signal","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation is 
classified as repo signal."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"belongs to","object":"repos desk","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation belongs to repos desk."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has context","object":"Python","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has context Python."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has evidence coverage","object":"1 captured evidence page","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has evidence coverage 1 captured evidence page."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"matches data-business lanes","object":"Evals and quality","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation matches data-business lanes Evals and quality."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has captured page count","object":"1","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has captured page count 1."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has readable page count","object":"1","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has readable page count 1."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has related signal count","object":"6","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has related signal count 6."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has analysis playbook objective","object":"Turn new repository signals into early evidence of tooling, eval, infrastructure, model-adjacent, or product work before it appears in polished launch channels.","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has analysis playbook objective Turn new repository signals into early evidence of tooling, eval, 
infrastructure, model-adjacent, or product work before it appears in polished launch channels."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has source host","object":"github.com","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has source host github.com."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has lab","object":"Amazon (Nova)","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has lab Amazon (Nova)."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has signal desk","object":"repos","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has signal desk repos."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has source host","object":"github.com","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has source host github.com."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has repository","object":"amazon-science/when-thinking-fails-RLLM-if-evaluation","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has repository amazon-science/when-thinking-fails-RLLM-if-evaluation."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has language","object":"Python","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has language Python."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has notability","object":"Research repo from amazon-science, topic notable but no traction info","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has notability Research repo from amazon-science, topic notable but no traction info."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has radar lane","object":"Evals and quality","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has radar lane Evals and 
quality."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has matched term","object":"eval","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has matched term eval."}]},"intelligence":{"signal_desk":"repos","answer":"Amazon (Nova) published amazon-science/when-thinking-fails-RLLM-if-evaluation (Python). This repository signal exposes tooling, eval, infrastructure, or model-adjacent work before it may appear in a launch post. High-signal details: repo amazon-science/when-thinking-fails-RLLM-if-evaluation · language Python · Research repo from amazon-science, topic notable but no traction info. onlylabs links this event to 1 captured evidence page and 6 related repo signals. It also maps to Evals and quality in the data-business radar.","semantic_triples":[{"subject":"Amazon (Nova)","predicate":"published repo","object":"amazon-science/when-thinking-fails-RLLM-if-evaluation","text":"Amazon (Nova) published repo amazon-science/when-thinking-fails-RLLM-if-evaluation."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"is classified as","object":"repo signal","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation is classified as repo signal."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"belongs to","object":"repos desk","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation belongs to repos desk."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has context","object":"Python","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has context Python."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"has evidence coverage","object":"1 captured evidence page","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation has evidence coverage 1 captured evidence page."},{"subject":"amazon-science/when-thinking-fails-RLLM-if-evaluation","predicate":"matches data-business 
lanes","object":"Evals and quality","text":"amazon-science/when-thinking-fails-RLLM-if-evaluation matches data-business lanes Evals and quality."}]},"signal":{"id":"f70ee13e-25e9-49c1-ae70-1f5737238475","url":"https://onlylabs.fyi/signals/f70ee13e-25e9-49c1-ae70-1f5737238475","json_url":"https://onlylabs.fyi/signals/f70ee13e-25e9-49c1-ae70-1f5737238475/signal.json","source_url":"https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation","title":"amazon-science/when-thinking-fails-RLLM-if-evaluation","summary":"Amazon (Nova) published a new repository. onlylabs watches repos for tooling, eval, infra, and model-adjacent work.","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2025-10-20T22:41:15+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation"]},"facets":{"repo":"amazon-science/when-thinking-fails-RLLM-if-evaluation","language":"Python"},"traction":{"github_stars":0,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals"}],"score":16,"matched_terms":["eval","evaluation"],"reason":"Amazon (Nova) has a repo signal matching evals and quality."}},"primary_evidence_page":{"url":"https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation","final_url":"https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation","title":"amazon-science/when-thinking-fails-RLLM-if-evaluation repository 
metadata","http_status":200,"content_type":"application/json","capture_method":"plain","fetched_at":"2026-06-11T03:58:42.727155+00:00","bytes":17653,"raw_path":"5f23d86f0f6064976d3e69b2cd46681b14617aaebf5e989aa237b95939386aa1.json","content_hash":"cb01e68c7cd99be2228b5154b1271aaa73d1537c3add8acfa1b62bb27bb1e62c","excerpt_chars":1200,"truncated":true,"excerpt":"amazon-science/when-thinking-fails-RLLM-if-evaluation Language: Python License: NOASSERTION Stars: 0 Forks: 0 Open issues: 0 Created: 2025-10-20T22:41:15Z Pushed: 2025-10-22T18:01:02Z Default branch: main Fork: no Archived: no README: When Thinking Fails: The Pitfalls of Reasoning for Instruction-Following in LLMs 🎉 **Our paper has been accepted to NeurIPS 2025 as a Spotlight paper (https://neurips.cc/virtual/2025/poster/115354)!** Quickstart 1. **Clone the repo** 2. **Install dependencies** ```bash pip install -r requirements.txt ``` 3. **Generate CoT and non-CoT outputs** - Use `reasoning_prompt_template()` in `templates.py` to prompt LLMs on IFEval and ComplexBench instructions. 4. **Clean and parse responses** ```python from utils import clean_think_responses Example usage: responses = [...] # raw model generations answers, thinking = clean_think_responses(responses) ``` --- Evaluate mitigation strategies We compare four mitigation methods proposed in the paper: 1. Few-shot in-context learning 2. Self-reflection 3. Self-selective reasoning 4. Classifier-selective reasoning Code and evaluation metrics reproduce results reported in the paper. 
--- Data - **IFEval**: original..."},"evidence_pages":[{"url":"https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation","final_url":"https://github.com/amazon-science/when-thinking-fails-RLLM-if-evaluation","title":"amazon-science/when-thinking-fails-RLLM-if-evaluation repository metadata","http_status":200,"content_type":"application/json","capture_method":"plain","fetched_at":"2026-06-11T03:58:42.727155+00:00","bytes":17653,"raw_path":"5f23d86f0f6064976d3e69b2cd46681b14617aaebf5e989aa237b95939386aa1.json","content_hash":"cb01e68c7cd99be2228b5154b1271aaa73d1537c3add8acfa1b62bb27bb1e62c","excerpt_chars":1200,"truncated":true,"excerpt":"amazon-science/when-thinking-fails-RLLM-if-evaluation Language: Python License: NOASSERTION Stars: 0 Forks: 0 Open issues: 0 Created: 2025-10-20T22:41:15Z Pushed: 2025-10-22T18:01:02Z Default branch: main Fork: no Archived: no README: When Thinking Fails: The Pitfalls of Reasoning for Instruction-Following in LLMs 🎉 **Our paper has been accepted to NeurIPS 2025 as a Spotlight paper (https://neurips.cc/virtual/2025/poster/115354)!** Quickstart 1. **Clone the repo** 2. **Install dependencies** ```bash pip install -r requirements.txt ``` 3. **Generate CoT and non-CoT outputs** - Use `reasoning_prompt_template()` in `templates.py` to prompt LLMs on IFEval and ComplexBench instructions. 4. **Clean and parse responses** ```python from utils import clean_think_responses Example usage: responses = [...] # raw model generations answers, thinking = clean_think_responses(responses) ``` --- Evaluate mitigation strategies We compare four mitigation methods proposed in the paper: 1. Few-shot in-context learning 2. Self-reflection 3. Self-selective reasoning 4. Classifier-selective reasoning Code and evaluation metrics reproduce results reported in the paper. 
--- Data - **IFEval**: original..."}],"related_signals":[{"id":"087c32a2-6ad0-4981-9315-11fdd32a0153","url":"https://onlylabs.fyi/signals/087c32a2-6ad0-4981-9315-11fdd32a0153","source_url":"https://github.com/amazon-science/reskill","title":"amazon-science/reskill","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-06-04T02:13:35+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"e5701aed-6cd3-48dd-bfa6-ef839031e2e8","url":"https://onlylabs.fyi/signals/e5701aed-6cd3-48dd-bfa6-ef839031e2e8","source_url":"https://github.com/amazon-science/dualkv-flash-attn-for-rl","title":"amazon-science/dualkv-flash-attn-for-rl","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-05-27T17:38:58+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"8af28f0c-7331-4b08-b517-e18b3555e503","url":"https://onlylabs.fyi/signals/8af28f0c-7331-4b08-b517-e18b3555e503","source_url":"https://github.com/amazon-science/EvoMAS","title":"amazon-science/EvoMAS","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-05-19T19:23:29+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"e3ff8718-7daa-4ebd-a3e6-3d825c538b74","url":"https://onlylabs.fyi/signals/e3ff8718-7daa-4ebd-a3e6-3d825c538b74","source_url":"https://github.com/amazon-science/adaptive-layerwise-perturbation","title":"amazon-science/adaptive-layerwise-perturbation","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon 
(Nova)","category":"frontier-lab"},"occurred_at":"2026-05-14T17:44:17+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"9afcd328-0124-485c-8ace-9c3ad546e316","url":"https://onlylabs.fyi/signals/9afcd328-0124-485c-8ace-9c3ad546e316","source_url":"https://github.com/amazon-science/temporal-reasoning-dataset","title":"amazon-science/temporal-reasoning-dataset","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-05-13T13:07:08+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"e19ce80b-3d6a-4aaf-9b1a-82d1b19ab682","url":"https://onlylabs.fyi/signals/e19ce80b-3d6a-4aaf-9b1a-82d1b19ab682","source_url":"https://github.com/amazon-science/PROF-GRPO","title":"amazon-science/PROF-GRPO","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-05-12T19:43:55+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"}]}