scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27482B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "InternBootcamp Technical Report: Boosting LLM Reasoning with Verifiable Task Scaling",
      6     "authors": [
      7       "Peiji Li",
      8       "Jiasheng Ye",
      9       "Yongkang Chen",
     10       "Yichuan Ma",
     11       "Zijie Yu",
     12       "Kedi Chen",
     13       "Ganqu Cui",
     14       "Haozhan Li",
     15       "Jiacheng Chen",
     16       "Chengqi Lyu",
     17       "Wenwei Zhang",
     18       "Linyang Li",
     19       "Qipeng Guo",
     20       "Dahua Lin",
     21       "Bowen Zhou",
     22       "Kai Chen"
     23     ],
     24     "year": 2025,
     25     "venue": "arXiv.org",
     26     "arxiv_id": "2508.08636",
     27     "doi": "10.48550/arXiv.2508.08636"
     28   },
     29   "checklist": {
     30     "claims_and_evidence": {
     31       "abstract_claims_supported": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The abstract's core claims—1000+ task environments, automated generation with configurable difficulty, integrated verification, and task scaling improving performance—are all supported by Section 3-4 and Tables 5-6. The claim that the 32B model 'excels on other established benchmarks' is supported by Table 6 showing the SFT-RL model achieves the best average across 9 OOD benchmarks.",
     35         "source": "haiku"
     36       },
     37       "causal_claims_justified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper claims task scaling causally improves performance, but experiments don't fully disentangle task count from data volume—more tasks also means more diverse training data. The near-linear scaling fit (Figure 5b) is from 4 points (8/32/128/512 tasks) without controlling for total tokens seen.",
     41         "source": "haiku"
     42       },
     43       "generalization_bounded": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims training produces 'reasoning generalists' and delivers 'comprehensive reasoning capabilities,' but OOD results in Table 6 are mixed—Qwen2.5-32B+Bootcamp-RL shows no improvement on BBEH, GPQA-Diamond, or SuperGPQA. Generalizations exceed the evidence.",
     47         "source": "haiku"
     48       },
     49       "alternative_explanations_discussed": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper mentions one alternative ('RL activates existing knowledge') but does not discuss alternatives such as the effect of training data volume vs. task diversity, selection bias in BOOTCAMP-EVAL construction by the same team, or the role of the SFT distillation from DeepSeek-R1.",
     53         "source": "haiku"
     54       },
     55       "proxy_outcome_distinction": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper measures benchmark accuracy on BOOTCAMP-EVAL and OOD benchmarks and consistently refers to these as 'reasoning performance' scores; it does not conflate benchmark scores with broader cognitive capabilities beyond benchmark context.",
     59         "source": "haiku"
     60       }
     61     },
     62     "limitations_and_scope": {
     63       "limitations_section_present": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "There is no dedicated limitations or threats-to-validity section anywhere in the paper. The conclusion only discusses open-sourcing contributions.",
     67         "source": "haiku"
     68       },
     69       "threats_to_validity_specific": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No threats to validity are discussed—the paper does not address BOOTCAMP-EVAL construction bias, the self-referential evaluation problem, or the possibility that improvements reflect data volume rather than task diversity.",
     73         "source": "haiku"
     74       },
     75       "scope_boundaries_stated": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The paper does not explicitly state what its results do not show. There are no statements bounding the scope of the 'reasoning generalist' claim to the tested tasks or model families.",
     79         "source": "haiku"
     80       }
     81     },
     82     "conflicts_of_interest": {
     83       "funding_disclosed": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding source or acknowledgment section appears in the paper.",
     87         "source": "haiku"
     88       },
     89       "affiliations_disclosed": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Author affiliations with Shanghai AI Laboratory and Fudan University are disclosed in the header.",
     93         "source": "haiku"
     94       },
     95       "funder_independent_of_outcome": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Shanghai AI Laboratory is an organization that builds and promotes LLMs; the primary evaluation is on BOOTCAMP-EVAL, a benchmark constructed by the same team, creating a conflict of interest between the evaluators and the evaluated system.",
     99         "source": "haiku"
    100       },
    101       "financial_interests_declared": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No competing interests statement or financial disclosure appears in the paper.",
    105         "source": "haiku"
    106       }
    107     },
    108     "scope_and_framing": {
    109       "key_terms_defined": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Key technical terms are defined: 'task scaling' (increasing the number of training tasks), 'bootcamp class' (a modular task environment with case_generator, prompt_function, verify_function), and 'RLVR' are all explained in context.",
    113         "source": "haiku"
    114       },
    115       "intended_contribution_clear": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Three contributions are explicitly listed: the INTERNBOOTCAMP framework, the BOOTCAMP-EVAL benchmark, and empirical demonstration of task scaling effectiveness.",
    119         "source": "haiku"
    120       },
    121       "engagement_with_prior_work": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 2 explicitly situates the work against existing reasoning studies (CoT, RLVR), domain-specific environments (Go, chess, math), and puzzle-based multi-task frameworks like Enigmata, SynLogic, and KORGym, showing how INTERNBOOTCAMP extends these with broader domain coverage.",
    125         "source": "haiku"
    126       }
    127     }
    128   },
    129   "type_checklist": {
    130     "empirical": {
    131       "artifacts": {
    132         "code_released": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "Source code is publicly released at https://github.com/InternLM/InternBootcamp, confirmed in the abstract and footnote 3.",
    136           "source": "haiku"
    137         },
    138         "data_released": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "BOOTCAMP-EVAL benchmark (9,232 samples across 118 tasks) is open-sourced: 'The complete benchmark suite is open-sourced.'",
    142           "source": "haiku"
    143         },
    144         "environment_specified": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "No requirements.txt, Dockerfile, or dependency specification is mentioned in the paper. Code examples reference Python imports but no environment spec is provided.",
    148           "source": "haiku"
    149         },
    150         "reproduction_instructions": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Code integration examples (Figures 9-10) illustrate API usage but no step-by-step instructions for reproducing the training experiments (data generation, RL training, evaluation) are provided.",
    154           "source": "haiku"
    155         }
    156       },
    157       "statistical_methodology": {
    158         "confidence_intervals_or_error_bars": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No confidence intervals or error bars are shown in any figure. For the 8-task experiments 4 trials were averaged, but variance is not reported.",
    162           "source": "haiku"
    163         },
    164         "significance_tests": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No statistical significance tests are applied to any comparative claims between training configurations or model comparisons.",
    168           "source": "haiku"
    169         },
    170         "effect_sizes_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Absolute score improvements are reported in Tables 5 and 6 with baseline values provided (e.g., Qwen2.5-32B-Instruct 24.4% → SFT-RL 59.5% on BOOTCAMP-EVAL).",
    174           "source": "haiku"
    175         },
    176         "sample_size_justified": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "The 9,232-sample benchmark and 118-task evaluation set are described but no power analysis or statistical justification for sample sizes is given.",
    180           "source": "haiku"
    181         },
    182         "variance_reported": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "Variance across runs is not reported for any experiment. The 8-task condition used 4 trials with averaging, but standard deviation is not provided.",
    186           "source": "haiku"
    187         }
    188       },
    189       "evaluation_design": {
    190         "baselines_included": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Multiple competitive baselines are included: DeepSeek-V3-0324, DeepSeek-R1-0528, QwQ-32B, Qwen3-32B, Qwen3-235B-A22B in Table 5, and ablation baselines in Table 6.",
    194           "source": "haiku"
    195         },
    196         "baselines_contemporary": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "All baselines are 2024-2025 state-of-the-art models; the paper explicitly compares against Qwen3, QwQ-32B, and DeepSeek-R1-0528.",
    200           "source": "haiku"
    201         },
    202         "ablation_study": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Task count ablation across 4 levels (8, 32, 128, 512 tasks) and training paradigm ablation (RL only, SFT only, SFT+RL) are both conducted systematically.",
    206           "source": "haiku"
    207         },
    208         "multiple_metrics": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Evaluation spans 9 OOD benchmarks covering logic (BBEH, KOR-Bench), knowledge (GPQA-Diamond, SuperGPQA, MMLU-Pro), math (AIME 2025, LiveMathBench-Hard), and code (HumanEval, LiveCodeBench) domains.",
    212           "source": "haiku"
    213         },
    214         "human_evaluation": {
    215           "applies": false,
    216           "answer": false,
    217           "justification": "The paper evaluates LLM performance on automated verifiable tasks; human evaluation of system outputs is not applicable.",
    218           "source": "haiku"
    219         },
    220         "held_out_test_set": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "BOOTCAMP-EVAL is explicitly held out from training: 'we ensure no data leakage by deduplicating the training data against BOOTCAMP-EVAL, guaranteeing clean evaluation.'",
    224           "source": "haiku"
    225         },
    226         "per_category_breakdown": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Figure 7 provides per-domain performance curves across all 8 BOOTCAMP-EVAL categories for each task count configuration; Table 5 also breaks down scores by domain.",
    230           "source": "haiku"
    231         },
    232         "failure_cases_discussed": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "The paper explicitly discusses the 8-task entropy collapse failure (Figure 6), and three tasks (Hyperbaton, PropositionalLogicFormalization, Wordscapes) that fail under isolated single-task training (Figure 8).",
    236           "source": "haiku"
    237         },
    238         "negative_results_reported": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Negative results are reported: standalone RL training of Qwen2.5-32B-Instruct shows minimal OOD improvement (Table 6), and 8-task training collapses after 300 steps.",
    242           "source": "haiku"
    243         }
    244       },
    245       "setup_transparency": {
    246         "model_versions_specified": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Specific model versions are named: Qwen2.5-7B-Instruct, Qwen2.5-32B-Instruct, DeepSeek-R1-Distill-Qwen-32B, DeepSeek-R1, and DeepSeek-V3 for bootcamp generation.",
    250           "source": "haiku"
    251         },
    252         "prompts_provided": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The full training prompt template is provided verbatim in Appendix D (Figure 11), including the <think> tag formatting instruction.",
    256           "source": "haiku"
    257         },
    258         "hyperparameters_reported": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Key hyperparameters are reported: prompt batch size 128, 8 responses per step, temperature 1.0, rollout batch size 384, max 500 training steps, SFT 3 epochs, no token-level loss.",
    262           "source": "haiku"
    263         },
    264         "scaffolding_described": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "The DAPO-based rollout with dynamic sampling, format penalties, length penalties, and oversampling strategy (3× rollout batch size) are described in detail in Section 4.1.",
    268           "source": "haiku"
    269         },
    270         "data_preprocessing_documented": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Data preprocessing is documented: evolutionary generation with 3 iterations, self-consistent unittest filtering (accuracy thresholds 0.03 and 0.85), deduplication, difficulty calibration, and quality filtering yielding 704 tasks.",
    274           "source": "haiku"
    275         }
    276       },
    277       "data_integrity": {
    278         "raw_data_available": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "BOOTCAMP-EVAL raw benchmark data is open-sourced, and the code to generate training data is publicly available on GitHub.",
    282           "source": "haiku"
    283         },
    284         "data_collection_described": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Section 3.1 and Appendix A describe four task source categories (puzzles, reasoning benchmarks, algorithm problems, scientific tasks) with specific origin datasets cited.",
    288           "source": "haiku"
    289         },
    290         "recruitment_methods_described": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "No human participants are involved; tasks are collected from existing datasets and generated automatically.",
    294           "source": "haiku"
    295         },
    296         "data_pipeline_documented": {
    297           "applies": true,
    298           "answer": true,
    299           "justification": "The full pipeline—task description collection → evolutionary bootcamp generation with DeepSeek-R1 → self-consistent unittest filtering → human review → deduplication and quality filtering—is documented in Section 3.3.",
    300           "source": "haiku"
    301         }
    302       },
    303       "contamination": {
    304         "training_cutoff_stated": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "Training data cutoffs for the base models (Qwen2.5 and DeepSeek-R1) are not stated, making it impossible to assess whether OOD evaluation benchmarks were in pretraining data.",
    308           "source": "haiku"
    309         },
    310         "train_test_overlap_discussed": {
    311           "applies": true,
    312           "answer": true,
    313           "justification": "The paper explicitly addresses train-test overlap for BOOTCAMP-EVAL: 'we ensure no data leakage by deduplicating the training data against BOOTCAMP-EVAL, guaranteeing clean evaluation.'",
    314           "source": "haiku"
    315         },
    316         "benchmark_contamination_addressed": {
    317           "applies": true,
    318           "answer": false,
    319           "justification": "The paper evaluates on OOD benchmarks including AIME 2025 and LiveCodeBench but does not discuss whether these benchmarks were potentially seen during pretraining of Qwen2.5 or DeepSeek-R1 base models.",
    320           "source": "haiku"
    321         }
    322       },
    323       "human_studies": {
    324         "pre_registered": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "irb_or_ethics_approval": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "demographics_reported": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "inclusion_exclusion_criteria": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "randomization_described": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         },
    354         "blinding_described": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "No human participants.",
    358           "source": "haiku"
    359         },
    360         "attrition_reported": {
    361           "applies": false,
    362           "answer": false,
    363           "justification": "No human participants.",
    364           "source": "haiku"
    365         }
    366       },
    367       "cost_and_practicality": {
    368         "inference_cost_reported": {
    369           "applies": true,
    370           "answer": false,
    371           "justification": "No inference cost, latency, or GPU memory figures are reported for running the 32B models or executing the training pipeline.",
    372           "source": "haiku"
    373         },
    374         "compute_budget_stated": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "No total computational budget (GPU hours, hardware specs, estimated cost) is stated for any experiment including the full 32B SFT+RL training.",
    378           "source": "haiku"
    379         }
    380       }
    381     }
    382   },
    383   "claims": [
    384     {
    385       "claim": "Scaling training tasks from 8 to 512 yields near-linear improvement on BOOTCAMP-EVAL, following S = 2.65×10⁻⁴N + 0.22",
    386       "evidence": "Figure 5b shows a linear fit across 4 task counts (8/32/128/512), though the fit is from only 4 data points",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Training with 8 tasks causes entropy collapse after ~300 steps, while 32+ tasks maintains stable training",
    391       "evidence": "Figure 6 shows rollout batch generation exploding for 8-task training, with clear divergence from multi-task configurations",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Multi-task training enables emergence of capabilities on tasks that cannot be learned in single-task isolation",
    396       "evidence": "Figure 8 shows three specific tasks (Hyperbaton, PropositionalLogicFormalization, Wordscapes) achieving zero gain in single-task training but significant improvement around step 300 with 512-task training",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Bootcamp-SFT-RL 32B model achieves best average across 9 OOD benchmarks vs. evaluated baselines",
    401       "evidence": "Table 6 shows SFT-RL Qwen2.5-32B achieves 61.8% average vs. 52.5% for DeepSeek-R1-Distill-32B and 42.3% for base Qwen2.5-32B-Instruct",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "InternBootcamp Bootcamp-SFT model outperforms all listed open-source frontier models on BOOTCAMP-EVAL",
    406       "evidence": "Table 5 shows Bootcamp-SFT achieves 61.1% vs. Qwen3-235B-A22B at 54.5% and QwQ-32B at 51.4%, though the evaluated model is 32B vs. larger parameter-count models",
    407       "supported": "moderate"
    408     },
    409     {
    410       "claim": "Automated evolutionary bootcamp generation with self-consistent unittest filtering achieves 70.6% no-obvious-error rate in the 0.03-0.85 accuracy range",
    411       "evidence": "Table 3 directly shows the filtering results with manual inspection confirming the accuracy thresholds distinguish error types effectively",
    412       "supported": "strong"
    413     }
    414   ],
    415   "methodology_tags": [
    416     "benchmark-eval"
    417   ],
    418   "key_findings": "InternBootcamp demonstrates that scaling the number of diverse verifiable training tasks (from 8 to 512) systematically improves LLM reasoning performance with a near-linear relationship, and that training with very few tasks (8) causes entropy collapse in RL training. The paper shows 'emergent moments' where capabilities on tasks that fail in single-task training emerge after sufficient multi-task exposure. The 32B SFT+RL model trained on all 1000+ bootcamp tasks achieves the best average across 9 OOD reasoning benchmarks among evaluated baselines, though improvements are uneven across benchmarks and the primary evaluation benchmark was constructed by the same team.",
    419   "red_flags": [
    420     {
    421       "flag": "Self-referential primary benchmark",
    422       "detail": "BOOTCAMP-EVAL, the primary evaluation benchmark, was constructed by the same team that built the training framework. This creates circular evaluation where the benchmark may favor the design choices already embedded in the training tasks."
    423     },
    424     {
    425       "flag": "No limitations section",
    426       "detail": "The paper has no dedicated limitations or threats-to-validity section despite making broad claims about 'reasoning generalists' and cross-domain generalization."
    427     },
    428     {
    429       "flag": "Confounded task scaling",
    430       "detail": "More training tasks also mean more (and more diverse) training data; the paper does not control for total training tokens or data volume, making it unclear whether the benefits come from the number of distinct tasks or simply from the additional data."
    431     },
    432     {
    433       "flag": "No statistical tests or error bars",
    434       "detail": "All comparisons between training configurations and baselines are made without significance tests or reported variance. Only the 8-task experiments used multiple trials (4 runs, averaged), and even there no standard deviation is given."
    435     },
    436     {
    437       "flag": "Mixed OOD results underdiscussed",
    438       "detail": "Table 6 shows Qwen2.5-32B+Bootcamp-RL achieves no improvement on BBEH, GPQA-Diamond, or SuperGPQA compared to base, but this is not discussed as a limitation."
    439     },
    440     {
    441       "flag": "No compute budget",
    442       "detail": "No GPU hours, hardware specifications, or cost estimates are provided for any training run, making practical replication assessment impossible."
    443     }
    444   ],
    445   "cited_papers": [
    446     {
    447       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    448       "relevance": "Primary baseline and the model used for automated bootcamp generation; central to the RL training paradigm."
    449     },
    450     {
    451       "title": "Enigmata: Scaling Logical Reasoning in Large Language Models with Synthetic Verifiable Puzzles",
    452       "relevance": "Direct prior work on synthetic verifiable puzzle datasets for LLM reasoning; INTERNBOOTCAMP extends its scope."
    453     },
    454     {
    455       "title": "SynLogic: Synthesizing Verifiable Reasoning Data at Scale for Learning Logical Reasoning and Beyond",
    456       "relevance": "Related work on large-scale verifiable reasoning data synthesis; direct competitor in the task-scaling space."
    457     },
    458     {
    459       "title": "KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks",
    460       "relevance": "One of the OOD evaluation benchmarks and a task source for INTERNBOOTCAMP; used as both training material and evaluation."
    461     },
    462     {
    463       "title": "BigBench Extra Hard (BBEH)",
    464       "relevance": "OOD evaluation benchmark and task source for INTERNBOOTCAMP; measures reasoning difficulty beyond standard benchmarks."
    465     },
    466     {
    467       "title": "DAPO: An Open-Source LLM Reinforcement Learning System at Scale",
    468       "relevance": "The RL algorithm used for training; key methodological component of the task scaling experiments."
    469     },
    470     {
    471       "title": "HybridFlow: A Flexible and Efficient RLHF Framework (VeRL)",
    472       "relevance": "One of the RL training frameworks INTERNBOOTCAMP integrates with; cited as integration target."
    473     },
    474     {
    475       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    476       "relevance": "OOD evaluation benchmark for code generation; used to assess generalization beyond reasoning tasks."
    477     }
    478   ],
    479   "engagement_factors": {
    480     "practical_relevance": {
    481       "score": 3,
    482       "justification": "Open-source framework with 1000+ tasks that practitioners can directly use for RL training, SFT data synthesis, and evaluation of their own models."
    483     },
    484     "surprise_contrarian": {
    485       "score": 1,
    486       "justification": "Task scaling following data scaling laws is expected; the 'emergent moment' finding for isolated tasks is mildly surprising but framed as a known phenomenon in transfer learning."
    487     },
    488     "fear_safety": {
    489       "score": 0,
    490       "justification": "No safety or risk concerns raised; the paper focuses on improving reasoning capabilities without discussing misuse potential."
    491     },
    492     "drama_conflict": {
    493       "score": 0,
    494       "justification": "No controversy or conflict framing; straightforward technical contribution from a research lab."
    495     },
    496     "demo_ability": {
    497       "score": 3,
    498       "justification": "GitHub repository is publicly available and the framework provides immediately runnable bootcamp classes with documented API; practitioners can run BOOTCAMP-EVAL on their models today."
    499     },
    500     "brand_recognition": {
    501       "score": 2,
    502       "justification": "Shanghai AI Laboratory is a well-known Chinese AI research organization behind InternLM; not as globally prominent as DeepSeek or OpenAI but recognized in the LLM community."
    503     }
    504   },
    505   "hn_data": {
    506     "threads": [
    507       {
    508         "hn_id": "37288987",
    509         "title": "Answering ambiguous questions with a database of questions, answers, revisions",
    510         "points": 21,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=37288987",
    513         "created_at": "2023-08-28T02:28:01Z"
    514       },
    515       {
    516         "hn_id": "44908292",
    517         "title": "Distillation Scaling Laws",
    518         "points": 5,
    519         "comments": 0,
    520         "url": "https://news.ycombinator.com/item?id=44908292",
    521         "created_at": "2025-08-15T03:22:10Z"
    522       },
    523       {
    524         "hn_id": "43039955",
    525         "title": "Distillation Scaling Laws",
    526         "points": 3,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=43039955",
    529         "created_at": "2025-02-13T19:11:35Z"
    530       },
    531       {
    532         "hn_id": "44427833",
    533         "title": "Simple low-dimensional computations explain variability in neuronal activity",
    534         "points": 2,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=44427833",
    537         "created_at": "2025-06-30T21:01:35Z"
    538       },
    539       {
    540         "hn_id": "45222579",
    541         "title": "KNNSampler: Stochastic Imputations for Recovering Missing Value Distributions",
    542         "points": 1,
    543         "comments": 1,
    544         "url": "https://news.ycombinator.com/item?id=45222579",
    545         "created_at": "2025-09-12T14:32:36Z"
    546       },
    547       {
    548         "hn_id": "44604096",
    549         "title": "Coordination and Collaborative Reasoning in Multi-Agent LLMs",
    550         "points": 1,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=44604096",
    553         "created_at": "2025-07-18T12:45:27Z"
    554       }
    555     ],
    556     "top_points": 21,
    557     "total_points": 33,
    558     "total_comments": 1
    559   }
    560 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs