{"schema_version":"onlylabs.public_signal.v1","title":"Anthropic Writing: Reward Tampering","description":"Anthropic writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/9d12f76a-aead-4236-ae8f-a40dfcece89f","json_url":"https://onlylabs.fyi/signals/9d12f76a-aead-4236-ae8f-a40dfcece89f/signal.json","generated_at":"2026-06-11T04:15:27.403304+00:00","org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/anthropic","dossier_json_url":"https://onlylabs.fyi/labs/anthropic/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/9d12f76a-aead-4236-ae8f-a40dfcece89f","signal_json":"https://onlylabs.fyi/signals/9d12f76a-aead-4236-ae8f-a40dfcece89f/signal.json","source":"https://www.anthropic.com/research/reward-tampering","lab_dossier":"https://onlylabs.fyi/labs/anthropic","lab_dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis":"https://onlylabs.fyi/analysis/anthropic","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":null},"answer_pack":{"answer":"Anthropic published Reward Tampering. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Sycophancy to subterfuge: Investigating reward tampering in language models \\ Anthropic Alignment Sycophancy to subterfuge: Investigating reward tampering in language.... onlylabs links this event to 1 captured evidence page and 6 related writing signals.","signal_desk":"talking","source_context":{"source_url":"https://www.anthropic.com/research/reward-tampering","source_host":"anthropic.com","occurred_at":"2024-06-17T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","context":null},"context_markers":[{"label":"Lab","value":"Anthropic","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"anthropic.com","source":"source"},{"label":"Watch term","value":"RL environments","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Safety and alignment","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/reward-tampering"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-11T04:15:27.403304+00:00"},"data_business":{"matches":false,"lanes":[],"matched_terms":[],"score":null,"reason":null},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/9d12f76a-aead-4236-ae8f-a40dfcece89f/signal.json","dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":null,"opportunities_json":null},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/9d12f76a-aead-4236-ae8f-a40dfcece89f/signal.json","required":true},{"label":"source","url":"https://www.anthropic.com/research/reward-tampering","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/anthropic/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/anthropic/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":null,"required":false}],"expected_output":["one-paragraph source-grounded interpretation","category-specific implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Anthropic's writing signal \"Reward Tampering\" for frontier lab strategy."},"semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Reward Tampering","text":"Anthropic published Reward Tampering."},{"subject":"Reward Tampering","predicate":"is classified as","object":"writing signal","text":"Reward Tampering is classified as writing signal."},{"subject":"Reward Tampering","predicate":"belongs to","object":"talking desk","text":"Reward Tampering belongs to talking desk."},{"subject":"Reward Tampering","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Reward Tampering has evidence coverage 1 captured evidence page."},{"subject":"Reward Tampering","predicate":"has captured page count","object":"1","text":"Reward Tampering has captured page count 1."},{"subject":"Reward Tampering","predicate":"has readable page count","object":"1","text":"Reward Tampering has readable page count 1."},{"subject":"Reward Tampering","predicate":"has related signal count","object":"6","text":"Reward Tampering has related signal count 6."},{"subject":"Reward Tampering","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"Reward Tampering has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"Reward Tampering","predicate":"has source host","object":"anthropic.com","text":"Reward Tampering has source host anthropic.com."},{"subject":"Reward Tampering","predicate":"has lab","object":"Anthropic","text":"Reward Tampering has lab Anthropic."},{"subject":"Reward Tampering","predicate":"has signal desk","object":"talking","text":"Reward Tampering has signal desk talking."},{"subject":"Reward Tampering","predicate":"has source host","object":"anthropic.com","text":"Reward Tampering has source host anthropic.com."},{"subject":"Reward Tampering","predicate":"has watch term","object":"RL environments","text":"Reward Tampering has watch term RL environments."},{"subject":"Reward Tampering","predicate":"has watch term","object":"Infrastructure","text":"Reward Tampering has watch term Infrastructure."},{"subject":"Reward Tampering","predicate":"has watch term","object":"Safety and alignment","text":"Reward Tampering has watch term Safety and alignment."}]},"intelligence":{"signal_desk":"talking","answer":"Anthropic published Reward Tampering. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Sycophancy to subterfuge: Investigating reward tampering in language models \\ Anthropic Alignment Sycophancy to subterfuge: Investigating reward tampering in language.... onlylabs links this event to 1 captured evidence page and 6 related writing signals.","semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Reward Tampering","text":"Anthropic published Reward Tampering."},{"subject":"Reward Tampering","predicate":"is classified as","object":"writing signal","text":"Reward Tampering is classified as writing signal."},{"subject":"Reward Tampering","predicate":"belongs to","object":"talking desk","text":"Reward Tampering belongs to talking desk."},{"subject":"Reward Tampering","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Reward Tampering has evidence coverage 1 captured evidence page."}]},"signal":{"id":"9d12f76a-aead-4236-ae8f-a40dfcece89f","url":"https://onlylabs.fyi/signals/9d12f76a-aead-4236-ae8f-a40dfcece89f","json_url":"https://onlylabs.fyi/signals/9d12f76a-aead-4236-ae8f-a40dfcece89f/signal.json","source_url":"https://www.anthropic.com/research/reward-tampering","title":"Reward Tampering","summary":"Anthropic published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2024-06-17T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/reward-tampering"]},"facets":{},"traction":{"github_stars":null,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":null},"primary_evidence_page":{"url":"https://www.anthropic.com/research/reward-tampering","final_url":"https://www.anthropic.com/research/reward-tampering","title":"Reward Tampering","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:15:27.403304+00:00","bytes":140660,"raw_path":"ac5f8a05b1ace50cea596ed2c54f46471f8ccf30089eafa208a13ce5aba45efd.html","content_hash":"46b8e31aeb54af51de9a266af29a37ed1c600b9a34b65a016ec95dfaf8889478","excerpt_chars":1200,"truncated":true,"excerpt":"Sycophancy to subterfuge: Investigating reward tampering in language models \\ Anthropic Alignment Sycophancy to subterfuge: Investigating reward tampering in language models Jun 17, 2024 Read the paper Perverse incentives are everywhere. Think of the concept of \"teaching to the test\", where teachers focus on the narrow goal of exam preparation and fail to give their students a broader education. Or think of scientists working in the \"publish or perish\" academic system, publishing large numbers of low-quality papers to advance their careers at the expense of what we actually want them to produce: rigorous research. Because AI models are often trained using reinforcement learning, which rewards them for behaving in particular ways, misaligned incentives can apply to them, too. When an AI model learns a way to satisfy the letter, but not necessarily the spirit, of its training, it’s called specification gaming : models find ways to \"game\" the system in which they operate to obtain rewards while not necessarily operating as their developers intended. As AI models become more capable, we want to ensure that specification gaming doesn’t lead them to behave in unintended and potentially..."},"evidence_pages":[{"url":"https://www.anthropic.com/research/reward-tampering","final_url":"https://www.anthropic.com/research/reward-tampering","title":"Reward Tampering","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:15:27.403304+00:00","bytes":140660,"raw_path":"ac5f8a05b1ace50cea596ed2c54f46471f8ccf30089eafa208a13ce5aba45efd.html","content_hash":"46b8e31aeb54af51de9a266af29a37ed1c600b9a34b65a016ec95dfaf8889478","excerpt_chars":1200,"truncated":true,"excerpt":"Sycophancy to subterfuge: Investigating reward tampering in language models \\ Anthropic Alignment Sycophancy to subterfuge: Investigating reward tampering in language models Jun 17, 2024 Read the paper Perverse incentives are everywhere. Think of the concept of \"teaching to the test\", where teachers focus on the narrow goal of exam preparation and fail to give their students a broader education. Or think of scientists working in the \"publish or perish\" academic system, publishing large numbers of low-quality papers to advance their careers at the expense of what we actually want them to produce: rigorous research. Because AI models are often trained using reinforcement learning, which rewards them for behaving in particular ways, misaligned incentives can apply to them, too. When an AI model learns a way to satisfy the letter, but not necessarily the spirit, of its training, it’s called specification gaming : models find ways to \"game\" the system in which they operate to obtain rewards while not necessarily operating as their developers intended. As AI models become more capable, we want to ensure that specification gaming doesn’t lead them to behave in unintended and potentially..."}],"related_signals":[{"id":"6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","url":"https://onlylabs.fyi/signals/6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","source_url":"https://www.anthropic.com/research/agents-in-biology","title":"Agents In Biology","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-10T15:16:01+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"2648db51-9d6a-42a9-aece-a0ca5f9ce64f","url":"https://onlylabs.fyi/signals/2648db51-9d6a-42a9-aece-a0ca5f9ce64f","source_url":"https://www.anthropic.com/news/claude-fable-5-mythos-5","title":"Claude Fable 5 Mythos 5","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-09T20:27:50+00:00","first_seen_at":"2026-06-10T07:01:05.666054+00:00","date_source":"sitemap.lastmod"},{"id":"8475487f-45b4-4689-9bc5-8e4c6ca0457d","url":"https://onlylabs.fyi/signals/8475487f-45b4-4689-9bc5-8e4c6ca0457d","source_url":"https://www.anthropic.com/engineering/how-we-contain-claude","title":"How We Contain Claude","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-06T00:28:16+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","url":"https://onlylabs.fyi/signals/e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","source_url":"https://www.anthropic.com/research/making-claude-a-chemist","title":"Making Claude A Chemist","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T20:13:40+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"cc62deba-9682-4751-aa6b-81c3bd7122a0","url":"https://onlylabs.fyi/signals/cc62deba-9682-4751-aa6b-81c3bd7122a0","source_url":"https://www.anthropic.com/research/measuring-agent-autonomy","title":"Measuring Agent Autonomy","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:49:18+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"93da14fd-7141-4e17-abd6-1c8d52435c70","url":"https://onlylabs.fyi/signals/93da14fd-7141-4e17-abd6-1c8d52435c70","source_url":"https://www.anthropic.com/research/values-wild","title":"Values Wild","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:38:54+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"}]}