{"schema_version":"onlylabs.public_signal.v1","title":"Anthropic Writing: The Capacity For Moral Self Correction In Large Language Models","description":"Anthropic writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/6f81eae5-3c0b-40e4-bd80-9a7cd1d9e925","json_url":"https://onlylabs.fyi/signals/6f81eae5-3c0b-40e4-bd80-9a7cd1d9e925/signal.json","generated_at":"2026-06-11T04:17:44.787241+00:00","org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/anthropic","dossier_json_url":"https://onlylabs.fyi/labs/anthropic/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/6f81eae5-3c0b-40e4-bd80-9a7cd1d9e925","signal_json":"https://onlylabs.fyi/signals/6f81eae5-3c0b-40e4-bd80-9a7cd1d9e925/signal.json","source":"https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models","lab_dossier":"https://onlylabs.fyi/labs/anthropic","lab_dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis":"https://onlylabs.fyi/analysis/anthropic","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":null},"answer_pack":{"answer":"Anthropic published The Capacity For Moral Self Correction In Large Language Models. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: The Capacity for Moral Self-Correction in Large Language Models \\ Anthropic Societal Impacts The Capacity for Moral Self-Correction in Large Language Models Feb 15, 2023.... onlylabs links this event to 1 captured evidence page and 6 related writing signals.","signal_desk":"talking","source_context":{"source_url":"https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models","source_host":"anthropic.com","occurred_at":"2023-02-15T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","context":null},"context_markers":[{"label":"Lab","value":"Anthropic","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"anthropic.com","source":"source"},{"label":"Watch term","value":"RL environments","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Safety and alignment","source":"evidence"},{"label":"Watch term","value":"Agents and tool use","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-11T04:17:44.787241+00:00"},"data_business":{"matches":false,"lanes":[],"matched_terms":[],"score":null,"reason":null},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/6f81eae5-3c0b-40e4-bd80-9a7cd1d9e925/signal.json","dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":null,"opportunities_json":null},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/6f81eae5-3c0b-40e4-bd80-9a7cd1d9e925/signal.json","required":true},{"label":"source","url":"https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/anthropic/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/anthropic/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":null,"required":false}],"expected_output":["one-paragraph source-grounded interpretation","category-specific implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Anthropic's writing signal \"The Capacity For Moral Self Correction In Large Language Models\" for frontier lab strategy."},"semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"The Capacity For Moral Self Correction In Large Language Models","text":"Anthropic published The Capacity For Moral Self Correction In Large Language Models."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"is classified as","object":"writing signal","text":"The Capacity For Moral Self Correction In Large Language Models is classified as writing signal."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"belongs to","object":"talking desk","text":"The Capacity For Moral Self Correction In Large Language Models belongs to talking desk."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has evidence coverage","object":"1 captured evidence page","text":"The Capacity For Moral Self Correction In Large Language Models has evidence coverage 1 captured evidence page."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has captured page count","object":"1","text":"The Capacity For Moral Self Correction In Large Language Models has captured page count 1."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has readable page count","object":"1","text":"The Capacity For Moral Self Correction In Large Language Models has readable page count 1."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has related signal count","object":"6","text":"The Capacity For Moral Self Correction In Large Language Models has related signal count 6."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"The Capacity For Moral Self Correction In Large Language Models has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has source host","object":"anthropic.com","text":"The Capacity For Moral Self Correction In Large Language Models has source host anthropic.com."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has lab","object":"Anthropic","text":"The Capacity For Moral Self Correction In Large Language Models has lab Anthropic."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has signal desk","object":"talking","text":"The Capacity For Moral Self Correction In Large Language Models has signal desk talking."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has source host","object":"anthropic.com","text":"The Capacity For Moral Self Correction In Large Language Models has source host anthropic.com."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has watch term","object":"RL environments","text":"The Capacity For Moral Self Correction In Large Language Models has watch term RL environments."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has watch term","object":"Infrastructure","text":"The Capacity For Moral Self Correction In Large Language Models has watch term Infrastructure."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has watch term","object":"Safety and alignment","text":"The Capacity For Moral Self Correction In Large Language Models has watch term Safety and alignment."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has watch term","object":"Agents and tool use","text":"The Capacity For Moral Self Correction In Large Language Models has watch term Agents and tool use."}]},"intelligence":{"signal_desk":"talking","answer":"Anthropic published The Capacity For Moral Self Correction In Large Language Models. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: The Capacity for Moral Self-Correction in Large Language Models \\ Anthropic Societal Impacts The Capacity for Moral Self-Correction in Large Language Models Feb 15, 2023.... onlylabs links this event to 1 captured evidence page and 6 related writing signals.","semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"The Capacity For Moral Self Correction In Large Language Models","text":"Anthropic published The Capacity For Moral Self Correction In Large Language Models."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"is classified as","object":"writing signal","text":"The Capacity For Moral Self Correction In Large Language Models is classified as writing signal."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"belongs to","object":"talking desk","text":"The Capacity For Moral Self Correction In Large Language Models belongs to talking desk."},{"subject":"The Capacity For Moral Self Correction In Large Language Models","predicate":"has evidence coverage","object":"1 captured evidence page","text":"The Capacity For Moral Self Correction In Large Language Models has evidence coverage 1 captured evidence page."}]},"signal":{"id":"6f81eae5-3c0b-40e4-bd80-9a7cd1d9e925","url":"https://onlylabs.fyi/signals/6f81eae5-3c0b-40e4-bd80-9a7cd1d9e925","json_url":"https://onlylabs.fyi/signals/6f81eae5-3c0b-40e4-bd80-9a7cd1d9e925/signal.json","source_url":"https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models","title":"The Capacity For Moral Self Correction In Large Language Models","summary":"Anthropic published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2023-02-15T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models"]},"facets":{},"traction":{"github_stars":null,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":null},"primary_evidence_page":{"url":"https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models","final_url":"https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models","title":"The Capacity For Moral Self Correction In Large Language Models","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:17:44.787241+00:00","bytes":107047,"raw_path":"4d5e87f558b04761562cbe292a66fcc41e339e20162ffbff6d1b5a00cb46670e.html","content_hash":"dbc1ec56f5b328fd49a35603568dcc5431d4d06408f0a48f2e51c151a97c6d7c","excerpt_chars":1200,"truncated":true,"excerpt":"The Capacity for Moral Self-Correction in Large Language Models \\ Anthropic Societal Impacts The Capacity for Moral Self-Correction in Large Language Models Feb 15, 2023 Read Paper Abstract We test the hypothesis that language models trained with reinforcement learning from human feedback (RLHF) have the capability to \"morally self-correct\" -- to avoid producing harmful outputs -- if instructed to do so. We find strong evidence in support of this hypothesis across three different experiments, each of which reveal different facets of moral self-correction. We find that the capability for moral self-correction emerges at 22B model parameters, and typically improves with increasing model size and RLHF training. We believe that at this level of scale, language models obtain two capabilities that they can use for moral self-correction: (1) they can follow instructions and (2) they can learn complex normative concepts of harm like stereotyping, bias, and discrimination. As such, they can follow instructions to avoid certain kinds of morally harmful outputs. We believe our results are cause for cautious optimism regarding the ability to train language models to abide by ethical..."},"evidence_pages":[{"url":"https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models","final_url":"https://www.anthropic.com/research/the-capacity-for-moral-self-correction-in-large-language-models","title":"The Capacity For Moral Self Correction In Large Language Models","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:17:44.787241+00:00","bytes":107047,"raw_path":"4d5e87f558b04761562cbe292a66fcc41e339e20162ffbff6d1b5a00cb46670e.html","content_hash":"dbc1ec56f5b328fd49a35603568dcc5431d4d06408f0a48f2e51c151a97c6d7c","excerpt_chars":1200,"truncated":true,"excerpt":"The Capacity for Moral Self-Correction in Large Language Models \\ Anthropic Societal Impacts The Capacity for Moral Self-Correction in Large Language Models Feb 15, 2023 Read Paper Abstract We test the hypothesis that language models trained with reinforcement learning from human feedback (RLHF) have the capability to \"morally self-correct\" -- to avoid producing harmful outputs -- if instructed to do so. We find strong evidence in support of this hypothesis across three different experiments, each of which reveal different facets of moral self-correction. We find that the capability for moral self-correction emerges at 22B model parameters, and typically improves with increasing model size and RLHF training. We believe that at this level of scale, language models obtain two capabilities that they can use for moral self-correction: (1) they can follow instructions and (2) they can learn complex normative concepts of harm like stereotyping, bias, and discrimination. As such, they can follow instructions to avoid certain kinds of morally harmful outputs. We believe our results are cause for cautious optimism regarding the ability to train language models to abide by ethical..."}],"related_signals":[{"id":"6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","url":"https://onlylabs.fyi/signals/6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","source_url":"https://www.anthropic.com/research/agents-in-biology","title":"Agents In Biology","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-10T15:16:01+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"2648db51-9d6a-42a9-aece-a0ca5f9ce64f","url":"https://onlylabs.fyi/signals/2648db51-9d6a-42a9-aece-a0ca5f9ce64f","source_url":"https://www.anthropic.com/news/claude-fable-5-mythos-5","title":"Claude Fable 5 Mythos 5","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-09T20:27:50+00:00","first_seen_at":"2026-06-10T07:01:05.666054+00:00","date_source":"sitemap.lastmod"},{"id":"8475487f-45b4-4689-9bc5-8e4c6ca0457d","url":"https://onlylabs.fyi/signals/8475487f-45b4-4689-9bc5-8e4c6ca0457d","source_url":"https://www.anthropic.com/engineering/how-we-contain-claude","title":"How We Contain Claude","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-06T00:28:16+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","url":"https://onlylabs.fyi/signals/e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","source_url":"https://www.anthropic.com/research/making-claude-a-chemist","title":"Making Claude A Chemist","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T20:13:40+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"cc62deba-9682-4751-aa6b-81c3bd7122a0","url":"https://onlylabs.fyi/signals/cc62deba-9682-4751-aa6b-81c3bd7122a0","source_url":"https://www.anthropic.com/research/measuring-agent-autonomy","title":"Measuring Agent Autonomy","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:49:18+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"93da14fd-7141-4e17-abd6-1c8d52435c70","url":"https://onlylabs.fyi/signals/93da14fd-7141-4e17-abd6-1c8d52435c70","source_url":"https://www.anthropic.com/research/values-wild","title":"Values Wild","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:38:54+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"}]}