{"schema_version":"onlylabs.public_signal.v1","title":"Nous Research Fork: NousResearch/datatrove","description":"Nous Research fork signal with public source context, captured evidence pages, related signals, and category-scoped analysis context.","url":"https://onlylabs.fyi/signals/5cfa99f6-5373-4834-8bfc-9e935b86276f","json_url":"https://onlylabs.fyi/signals/5cfa99f6-5373-4834-8bfc-9e935b86276f/signal.json","generated_at":"2026-06-11T02:54:27.547533+00:00","org":{"slug":"nous","name":"Nous Research","category":"neolab","category_label":"Neolab","dossier_url":"https://onlylabs.fyi/labs/nous","dossier_json_url":"https://onlylabs.fyi/labs/nous/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/5cfa99f6-5373-4834-8bfc-9e935b86276f","signal_json":"https://onlylabs.fyi/signals/5cfa99f6-5373-4834-8bfc-9e935b86276f/signal.json","source":"https://github.com/NousResearch/datatrove","lab_dossier":"https://onlylabs.fyi/labs/nous","lab_dossier_json":"https://onlylabs.fyi/labs/nous/dossier.json","analysis":"https://onlylabs.fyi/analysis/nous","analysis_json":"https://onlylabs.fyi/analysis/nous/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/nous/evidence.json","category":"https://onlylabs.fyi/neolabs","category_json":"https://onlylabs.fyi/neolabs.json","category_feed":"https://onlylabs.fyi/neolabs/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json?category=neolab","topic":"https://onlylabs.fyi/topics/forks","topic_signals_json":"https://onlylabs.fyi/topics/forks/signals.json?category=neolab","topic_feed":"https://onlylabs.fyi/topics/forks/feed.xml?category=neolab","data_business":null},"answer_pack":{"answer":"Nous Research forked NousResearch/datatrove (forked from huggingface/datatrove). This fork signal points to upstream code the lab may be inspecting, patching, or building on. High-signal details: repo NousResearch/datatrove · parent huggingface/datatrove · Routine fork, low stars. onlylabs links this event to 1 captured evidence page and 6 related fork signals.","signal_desk":"forks","source_context":{"source_url":"https://github.com/NousResearch/datatrove","source_host":"github.com","occurred_at":"2025-01-13T20:59:53+00:00","first_seen_at":"2026-06-06T01:49:37.982614+00:00","date_source":"source","context":"forked from huggingface/datatrove"},"context_markers":[{"label":"Lab","value":"Nous Research","source":"signal"},{"label":"Signal desk","value":"forks","source":"signal"},{"label":"Source host","value":"github.com","source":"source"},{"label":"Repository","value":"NousResearch/datatrove","source":"source"},{"label":"Parent","value":"huggingface/datatrove","source":"source"},{"label":"Stars","value":"8","source":"traction"},{"label":"Notability","value":"Routine fork, low stars","source":"signal"},{"label":"Watch term","value":"Data pipeline","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://github.com/NousResearch/datatrove"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-11T02:54:27.547533+00:00"},"data_business":{"matches":false,"lanes":[],"matched_terms":[],"score":null,"reason":null},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/5cfa99f6-5373-4834-8bfc-9e935b86276f/signal.json","dossier_json":"https://onlylabs.fyi/labs/nous/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/nous/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/nous/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/forks/signals.json?category=neolab","topic_feed":"https://onlylabs.fyi/topics/forks/feed.xml?category=neolab","category_signals_json":"https://onlylabs.fyi/signals.json?category=neolab","data_radar_json":null,"opportunities_json":null},"analysis_playbook":{"objective":"Turn fork activity into a map of upstream tools, libraries, model utilities, agent frameworks, and research dependencies being inspected or adapted.","evidence_focus":["forked parent repo","repo owner","language","description","stars","org and recency"],"extraction_questions":["Which upstream projects are repeated across labs?","Which forks point to evals, agents, infrastructure, model tooling, or data pipelines?","Which low-profile forks could precede a release or hiring wave?","Which labs are converging on the same technical adjacency?"],"signal_questions":["What upstream dependency or technical adjacency does this fork reveal?","Which upstream projects are repeated across labs?","Which forks point to evals, agents, infrastructure, model tooling, or data pipelines?","Do the 6 related fork signals show a repeated pattern?"],"output_fields":["org","upstream_repo","language","technical_theme","evidence_url"],"data_business_relevance":"Data-business lane extraction is scoped to frontier labs; for this category, keep conclusions tied to category-specific strategy, source evidence, and follow-up questions.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/5cfa99f6-5373-4834-8bfc-9e935b86276f/signal.json","required":true},{"label":"source","url":"https://github.com/NousResearch/datatrove","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/nous/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/nous/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/forks/signals.json?category=neolab","required":false},{"label":"data_radar_json","url":null,"required":false}],"expected_output":["one-paragraph source-grounded interpretation","category-specific implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Nous Research's fork signal \"NousResearch/datatrove\" for neolab strategy."},"semantic_triples":[{"subject":"Nous Research","predicate":"forked","object":"NousResearch/datatrove","text":"Nous Research forked NousResearch/datatrove."},{"subject":"NousResearch/datatrove","predicate":"is classified as","object":"fork signal","text":"NousResearch/datatrove is classified as fork signal."},{"subject":"NousResearch/datatrove","predicate":"belongs to","object":"forks desk","text":"NousResearch/datatrove belongs to forks desk."},{"subject":"NousResearch/datatrove","predicate":"has context","object":"forked from huggingface/datatrove","text":"NousResearch/datatrove has context forked from huggingface/datatrove."},{"subject":"NousResearch/datatrove","predicate":"has evidence coverage","object":"1 captured evidence page","text":"NousResearch/datatrove has evidence coverage 1 captured evidence page."},{"subject":"NousResearch/datatrove","predicate":"has captured page count","object":"1","text":"NousResearch/datatrove has captured page count 1."},{"subject":"NousResearch/datatrove","predicate":"has readable page count","object":"1","text":"NousResearch/datatrove has readable page count 1."},{"subject":"NousResearch/datatrove","predicate":"has related signal count","object":"6","text":"NousResearch/datatrove has related signal count 6."},{"subject":"NousResearch/datatrove","predicate":"has analysis playbook objective","object":"Turn fork activity into a map of upstream tools, libraries, model utilities, agent frameworks, and research dependencies being inspected or adapted.","text":"NousResearch/datatrove has analysis playbook objective Turn fork activity into a map of upstream tools, libraries, model utilities, agent frameworks, and research dependencies being inspected or adapted.."},{"subject":"NousResearch/datatrove","predicate":"has source host","object":"github.com","text":"NousResearch/datatrove has source host github.com."},{"subject":"NousResearch/datatrove","predicate":"has lab","object":"Nous Research","text":"NousResearch/datatrove has lab Nous Research."},{"subject":"NousResearch/datatrove","predicate":"has signal desk","object":"forks","text":"NousResearch/datatrove has signal desk forks."},{"subject":"NousResearch/datatrove","predicate":"has source host","object":"github.com","text":"NousResearch/datatrove has source host github.com."},{"subject":"NousResearch/datatrove","predicate":"has repository","object":"NousResearch/datatrove","text":"NousResearch/datatrove has repository NousResearch/datatrove."},{"subject":"NousResearch/datatrove","predicate":"has parent","object":"huggingface/datatrove","text":"NousResearch/datatrove has parent huggingface/datatrove."},{"subject":"NousResearch/datatrove","predicate":"has stars","object":"8","text":"NousResearch/datatrove has stars 8."},{"subject":"NousResearch/datatrove","predicate":"has notability","object":"Routine fork, low stars","text":"NousResearch/datatrove has notability Routine fork, low stars."},{"subject":"NousResearch/datatrove","predicate":"has watch term","object":"Data pipeline","text":"NousResearch/datatrove has watch term Data pipeline."}]},"intelligence":{"signal_desk":"forks","answer":"Nous Research forked NousResearch/datatrove (forked from huggingface/datatrove). This fork signal points to upstream code the lab may be inspecting, patching, or building on. High-signal details: repo NousResearch/datatrove · parent huggingface/datatrove · Routine fork, low stars. onlylabs links this event to 1 captured evidence page and 6 related fork signals.","semantic_triples":[{"subject":"Nous Research","predicate":"forked","object":"NousResearch/datatrove","text":"Nous Research forked NousResearch/datatrove."},{"subject":"NousResearch/datatrove","predicate":"is classified as","object":"fork signal","text":"NousResearch/datatrove is classified as fork signal."},{"subject":"NousResearch/datatrove","predicate":"belongs to","object":"forks desk","text":"NousResearch/datatrove belongs to forks desk."},{"subject":"NousResearch/datatrove","predicate":"has context","object":"forked from huggingface/datatrove","text":"NousResearch/datatrove has context forked from huggingface/datatrove."},{"subject":"NousResearch/datatrove","predicate":"has evidence coverage","object":"1 captured evidence page","text":"NousResearch/datatrove has evidence coverage 1 captured evidence page."}]},"signal":{"id":"5cfa99f6-5373-4834-8bfc-9e935b86276f","url":"https://onlylabs.fyi/signals/5cfa99f6-5373-4834-8bfc-9e935b86276f","json_url":"https://onlylabs.fyi/signals/5cfa99f6-5373-4834-8bfc-9e935b86276f/signal.json","source_url":"https://github.com/NousResearch/datatrove","title":"NousResearch/datatrove","summary":"Nous Research forked upstream code. onlylabs watches forks for tooling, infrastructure, research dependencies, and product-adjacent work.","context":"forked from huggingface/datatrove","kind":{"key":"repo_forked","label":"Fork"},"org":{"slug":"nous","name":"Nous Research","category":"neolab"},"occurred_at":"2025-01-13T20:59:53+00:00","first_seen_at":"2026-06-06T01:49:37.982614+00:00","date_source":"source","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://github.com/NousResearch/datatrove"]},"facets":{"repo":"NousResearch/datatrove","parent":"huggingface/datatrove"},"traction":{"github_stars":8,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":null},"primary_evidence_page":{"url":"https://github.com/NousResearch/datatrove","final_url":"https://github.com/NousResearch/datatrove","title":"NousResearch/datatrove repository metadata","http_status":200,"content_type":"application/json","capture_method":"plain","fetched_at":"2026-06-11T02:54:27.547533+00:00","bytes":47268,"raw_path":"d4d9555acdf3e844ef06be6fe8fbfdb65052b90e5fd3a70801f3975093070727.json","content_hash":"64d98b5b6642539c66fd034ac52565b33eadf4a1cf98b3c28a1991a36bcedd05","excerpt_chars":1200,"truncated":true,"excerpt":"NousResearch/datatrove Description: Freeing data processing from scripting madness by providing a set of platform-agnostic customizable pipeline processing blocks. License: Apache-2.0 Stars: 8 Forks: 2 Open issues: 0 Created: 2025-01-13T20:59:53Z Pushed: 2025-01-13T21:07:42Z Default branch: main Fork: yes Parent repository: huggingface/datatrove Archived: no README: DataTrove DataTrove is a library to process, filter and deduplicate text data at a very large scale. It provides a set of prebuilt commonly used processing blocks with a framework to easily add custom functionality. DataTrove processing pipelines are platform-agnostic, running out of the box locally or on a slurm cluster. Its (relatively) low memory usage and multiple step design makes it ideal for large workloads, such as to process an LLM's training data. Local, remote and other file systems are supported through [fsspec](https://filesystem-spec.readthedocs.io/en/latest/). Table of contents <!-- toc --> - [Installation](#installation) - [Quickstart examples](#quickstart-examples) - [Terminology](#terminology) - [Pipeline](#pipeline) * [DataTrove Document](#datatrove-document) * [Types of pipeline..."},"evidence_pages":[{"url":"https://github.com/NousResearch/datatrove","final_url":"https://github.com/NousResearch/datatrove","title":"NousResearch/datatrove repository metadata","http_status":200,"content_type":"application/json","capture_method":"plain","fetched_at":"2026-06-11T02:54:27.547533+00:00","bytes":47268,"raw_path":"d4d9555acdf3e844ef06be6fe8fbfdb65052b90e5fd3a70801f3975093070727.json","content_hash":"64d98b5b6642539c66fd034ac52565b33eadf4a1cf98b3c28a1991a36bcedd05","excerpt_chars":1200,"truncated":true,"excerpt":"NousResearch/datatrove Description: Freeing data processing from scripting madness by providing a set of platform-agnostic customizable pipeline processing blocks. License: Apache-2.0 Stars: 8 Forks: 2 Open issues: 0 Created: 2025-01-13T20:59:53Z Pushed: 2025-01-13T21:07:42Z Default branch: main Fork: yes Parent repository: huggingface/datatrove Archived: no README: DataTrove DataTrove is a library to process, filter and deduplicate text data at a very large scale. It provides a set of prebuilt commonly used processing blocks with a framework to easily add custom functionality. DataTrove processing pipelines are platform-agnostic, running out of the box locally or on a slurm cluster. Its (relatively) low memory usage and multiple step design makes it ideal for large workloads, such as to process an LLM's training data. Local, remote and other file systems are supported through [fsspec](https://filesystem-spec.readthedocs.io/en/latest/). Table of contents <!-- toc --> - [Installation](#installation) - [Quickstart examples](#quickstart-examples) - [Terminology](#terminology) - [Pipeline](#pipeline) * [DataTrove Document](#datatrove-document) * [Types of pipeline..."}],"related_signals":[{"id":"007a4fd1-4440-4405-853a-0a96c587ccb2","url":"https://onlylabs.fyi/signals/007a4fd1-4440-4405-853a-0a96c587ccb2","source_url":"https://github.com/NousResearch/Automodel","title":"NousResearch/Automodel","context":"forked from NVIDIA-NeMo/Automodel","kind":{"key":"repo_forked","label":"Fork"},"org":{"slug":"nous","name":"Nous Research","category":"neolab"},"occurred_at":"2026-05-27T12:18:10+00:00","first_seen_at":"2026-06-06T01:49:37.982614+00:00","date_source":"source"},{"id":"cded9ac1-e648-4965-8bda-97827b4128d0","url":"https://onlylabs.fyi/signals/cded9ac1-e648-4965-8bda-97827b4128d0","source_url":"https://github.com/NousResearch/Megatron-LM","title":"NousResearch/Megatron-LM","context":"forked from NVIDIA/Megatron-LM","kind":{"key":"repo_forked","label":"Fork"},"org":{"slug":"nous","name":"Nous Research","category":"neolab"},"occurred_at":"2026-05-27T12:17:29+00:00","first_seen_at":"2026-06-06T01:49:37.982614+00:00","date_source":"source"},{"id":"a66f6267-7e1f-452a-8efd-640d342f9ce6","url":"https://onlylabs.fyi/signals/a66f6267-7e1f-452a-8efd-640d342f9ce6","source_url":"https://github.com/NousResearch/Megatron-Bridge","title":"NousResearch/Megatron-Bridge","context":"forked from NVIDIA-NeMo/Megatron-Bridge","kind":{"key":"repo_forked","label":"Fork"},"org":{"slug":"nous","name":"Nous Research","category":"neolab"},"occurred_at":"2026-05-27T12:17:08+00:00","first_seen_at":"2026-06-06T01:49:37.982614+00:00","date_source":"source"},{"id":"8f913bfb-f7f5-4ebc-9d01-416bfaecdb12","url":"https://onlylabs.fyi/signals/8f913bfb-f7f5-4ebc-9d01-416bfaecdb12","source_url":"https://github.com/NousResearch/agent-governance-toolkit","title":"NousResearch/agent-governance-toolkit","context":"forked from microsoft/agent-governance-toolkit","kind":{"key":"repo_forked","label":"Fork"},"org":{"slug":"nous","name":"Nous Research","category":"neolab"},"occurred_at":"2026-05-10T22:34:18+00:00","first_seen_at":"2026-06-06T01:49:37.982614+00:00","date_source":"source"},{"id":"b857381d-51f3-446f-9a61-234abaced3d1","url":"https://onlylabs.fyi/signals/b857381d-51f3-446f-9a61-234abaced3d1","source_url":"https://github.com/NousResearch/Nemotron","title":"NousResearch/Nemotron","context":"forked from NVIDIA-NeMo/Nemotron","kind":{"key":"repo_forked","label":"Fork"},"org":{"slug":"nous","name":"Nous Research","category":"neolab"},"occurred_at":"2026-04-27T14:08:47+00:00","first_seen_at":"2026-06-06T01:49:37.982614+00:00","date_source":"source"},{"id":"a0d4e67f-3f35-4ea4-ab24-efba1dfd20b7","url":"https://onlylabs.fyi/signals/a0d4e67f-3f35-4ea4-ab24-efba1dfd20b7","source_url":"https://github.com/NousResearch/NemoClaw","title":"NousResearch/NemoClaw","context":"forked from NVIDIA/NemoClaw","kind":{"key":"repo_forked","label":"Fork"},"org":{"slug":"nous","name":"Nous Research","category":"neolab"},"occurred_at":"2026-04-24T18:35:00+00:00","first_seen_at":"2026-06-06T01:49:37.982614+00:00","date_source":"source"}]}