scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30698B)
      1 {
      2   "paper": {
      3     "title": "InternBootcamp Technical Report: Boosting LLM Reasoning with Verifiable Task Scaling",
      4     "authors": [
      5       "Peiji Li",
      6       "Jiasheng Ye",
      7       "Yongkang Chen",
      8       "Yichuan Ma",
      9       "Zijie Yu",
     10       "Kedi Chen",
     11       "Ganqu Cui",
     12       "Haozhan Li",
     13       "Jiacheng Chen",
     14       "Chengqi Lyu",
     15       "Wenwei Zhang",
     16       "Linyang Li",
     17       "Qipeng Guo",
     18       "Dahua Lin",
     19       "Bowen Zhou",
     20       "Kai Chen"
     21     ],
     22     "year": 2025,
     23     "venue": "arXiv preprint",
     24     "arxiv_id": "2508.08636",
     25     "doi": "10.48550/arXiv.2508.08636"
     26   },
     27   "scan_version": 2,
     28   "active_modules": ["experimental_rigor", "data_leakage"],
     29   "methodology_tags": ["benchmark-eval"],
     30   "key_findings": "InternBootcamp is a large-scale framework of 1000+ reasoning tasks enabling RLVR training and data synthesis. Task scaling experiments show that increasing training task count from 8 to 512 consistently improves both reasoning performance and training efficiency on a 7B model. A 32B model trained on all 1000+ tasks achieves state-of-the-art on the authors' BOOTCAMP-EVAL benchmark (61.1% with SFT only; 59.5% with SFT+RL), and the SFT+RL variant shows strong gains on OOD benchmarks (61.8% avg across 9 benchmarks, up from 42.3% baseline).",
     31   "checklist": {
     32     "artifacts": {
     33       "code_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "GitHub repository link provided: https://github.com/InternLM/InternBootcamp. Abstract states 'All data and code are publicly available.'"
     37       },
     38       "data_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Abstract states 'All data and code are publicly available.' BOOTCAMP-EVAL benchmark is described as 'open-sourced' (Section 3.4). Training data generation is supported by the released framework."
     42       },
     43       "environment_specified": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No environment specifications (requirements.txt, Dockerfile, conda environment) are described in the paper. Only framework interfaces and code examples are shown."
     47       },
     48       "reproduction_instructions": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No step-by-step reproduction instructions are included in the paper. While code examples for using the framework are shown (Figures 9-10, Appendix C-D), there are no specific instructions for reproducing the main experimental results."
     52       }
     53     },
     54     "statistical_methodology": {
     55       "confidence_intervals_or_error_bars": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No confidence intervals or error bars are reported in any tables or figures. Tables 5 and 6 report only point estimates. The training curves in Figures 5-8 show no uncertainty bands."
     59       },
     60       "significance_tests": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No statistical significance tests are used. Claims like 'scaling up training tasks improves both final performance and learning efficiency' are based on comparing raw numbers without any significance testing."
     64       },
     65       "effect_sizes_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Tables 5 and 6 provide baseline scores alongside trained model scores, allowing effect magnitude assessment (e.g., Qwen2.5-32B-Instruct from 24.4 to 61.1 overall on BOOTCAMP-EVAL). Figure 5b provides a linear fit quantifying the relationship: S = 2.65×10⁻⁴N + 0.22."
     69       },
     70       "sample_size_justified": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No justification is given for sample sizes. The BOOTCAMP-EVAL benchmark uses 100 samples per task (except Cryptography) without justification. The choice of 4 trials for the 8-task setting is described but not justified statistically."
     74       },
     75       "variance_reported": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "For the 8-task setting, 4 independent trials were averaged, but no standard deviation or variance is reported across these trials. The 32, 128, and 512-task experiments appear to be single runs with no variance information."
     79       }
     80     },
     81     "evaluation_design": {
     82       "baselines_included": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Multiple baselines included: DeepSeek-V3-0324, DeepSeek-R1-0528, QwQ-32B, Qwen3-32B, Qwen3-235B-A22B, plus ablated versions of their own models (Tables 5-6)."
     86       },
     87       "baselines_contemporary": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Baselines include very recent models: DeepSeek-R1-0528, Qwen3-32B, Qwen3-235B-A22B (all 2025 models). These represent the state of the art at submission time."
     91       },
     92       "ablation_study": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Task scaling experiments (8, 32, 128, 512 tasks) serve as an ablation of task count. Section 4.3 ablates training strategies: RL only, SFT only, SFT+RL. Single-task vs multi-task comparison in Section 4.2.2."
     96       },
     97       "multiple_metrics": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Models evaluated on 10 benchmarks: BOOTCAMP-EVAL plus 9 OOD benchmarks (BBEH, KOR-Bench, GPQA-Diamond, Super GPQA, MMLU-Pro, AIME 2025, LiveMathBench Hard, HumanEval, LiveCodeBench v6) (Tables 5-6)."
    101       },
    102       "human_evaluation": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "All evaluation is automated via rule-based verify_function or benchmark metrics. No human evaluation of model reasoning outputs is performed. Human review is used only for bootcamp quality validation during data preparation, not for evaluating the trained models."
    106       },
    107       "held_out_test_set": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 3.4 explicitly states: 'we ensure no data leakage by deduplicating the training data against BOOTCAMP-EVAL, guaranteeing clean evaluation and preventing train-test contamination.' OOD benchmarks are separate from training data."
    111       },
    112       "per_category_breakdown": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Table 5 shows per-category scores across 8 domains (Graphical Puzzles, Algorithm, Logical Reasoning, Cryptography, Mathematical Modeling, Character Puzzles, Natural Science, Language Analysis). Figure 7 shows per-domain training curves."
    116       },
    117       "failure_cases_discussed": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "No error analysis or failure cases of the final trained model are discussed. The 8-task entropy collapse and single-task training failures (Figure 8) describe baseline limitations that motivate the approach, not where the proposed approach breaks down."
    121       },
    122       "negative_results_reported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Several negative results reported: (1) 8-task training collapses after 300 steps due to entropy (Section 4.2.1, Figure 6), (2) three tasks fail to learn under single-task training (Figure 8), (3) RL-only on Qwen2.5-32B-Instruct shows minimal OOD gains (Table 6), (4) 33/228 auto-generated bootcamps are problematic (Table 2)."
    126       }
    127     },
    128     "claims_and_evidence": {
    129       "abstract_claims_supported": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Abstract claims of SOTA on BOOTCAMP-EVAL are supported by Table 5 (61.1% vs 54.5% for Qwen3-235B). Task scaling claim is supported by Figures 5-6. 'Excels on other established benchmarks' is supported by Table 6 (61.8% avg for SFT-RL model)."
    133       },
    134       "causal_claims_justified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The main causal claim ('task scaling improves performance') is supported by controlled experiments: same base model, same training algorithm, same compute budget per step, varying only the number of tasks (8, 32, 128, 512). The SFT/RL ablation (Section 4.3) uses controlled single-variable manipulation."
    138       },
    139       "generalization_bounded": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Task scaling experiments use only Qwen2.5-7B-Instruct. Full training tested on only Qwen2.5-32B and DeepSeek-R1-Distilled-Qwen-32B. The title claims 'Boosting LLM Reasoning' generically, but results are limited to two model families and specific sizes. No discussion of whether findings generalize to other architectures or scales."
    143       },
    144       "alternative_explanations_discussed": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No alternative explanations are discussed. The paper does not consider: (1) whether the improvement comes from task diversity vs quantity, (2) whether similar gains could come from more data on fewer tasks, (3) whether the BOOTCAMP-EVAL improvements reflect genuine generalization vs distribution matching with training tasks."
    148       },
    149       "proxy_outcome_distinction": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper measures accuracy on BOOTCAMP-EVAL and other benchmarks but frames results as 'reasoning capability' and 'reasoning generalization' without discussing whether benchmark accuracy is a valid proxy for general reasoning ability."
    153       }
    154     },
    155     "setup_transparency": {
    156       "model_versions_specified": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Specific model versions identified: Qwen2.5-7B-Instruct, Qwen2.5-32B-Instruct, DeepSeek-R1-Distilled-Qwen-32B for training. Baselines include dated versions: DeepSeek-V3-0324, DeepSeek-R1-0528. These are specific open-source model releases with unique identifiers."
    160       },
    161       "prompts_provided": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Training prompt template provided in Appendix D / Figure 11. Example task prompt shown in Figure 9 via prompt_func output. The framework's actual prompt generation code is released in the open-source repository."
    165       },
    166       "hyperparameters_reported": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4.1 reports: prompt batch size 128, 8 responses per step, temperature 1, oversampling ratio 3x, max 500 steps (7B). For 32B: prompt batch size 256, SFT 3 epochs, RL max 300 steps. DAPO-like algorithm with dynamic sampling described."
    170       },
    171       "scaffolding_described": {
    172         "applies": false,
    173         "answer": false,
    174         "justification": "No agentic scaffolding is used in the evaluated models. Models are evaluated via direct text generation with a simple prompt template. The agent workflow for bootcamp synthesis (Section 3.3) is a data preparation tool, not model inference scaffolding."
    175       },
    176       "data_preprocessing_documented": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 3.3 documents the full data pipeline: task collection from multiple sources (Appendix A), evolutionary bootcamp generation (3 iterations), self-consistent unittest filtering (accuracy thresholds 0.03-0.85), deduplication, difficulty calibration, quality filtering. Counts provided: 228 → 704 retained tasks. SFT data: 55K samples + 11K math."
    180       }
    181     },
    182     "limitations_and_scope": {
    183       "limitations_section_present": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No dedicated limitations section exists. The paper has Introduction, Related Work, InternBootcamp (method), Experiments, and Conclusion sections. No discussion of limitations is present."
    187       },
    188       "threats_to_validity_specific": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No threats to validity are discussed. The paper does not address potential issues such as the limited model diversity in experiments, the circularity of evaluating on a self-designed benchmark, or the scalability of the approach beyond the tested configurations."
    192       },
    193       "scope_boundaries_stated": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show, what model families or scales were not tested, or what types of reasoning are not covered by the framework."
    197       }
    198     },
    199     "data_integrity": {
    200       "raw_data_available": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The paper states 'All data and code are publicly available' with a GitHub link. The framework itself generates the raw data (task instances) and the BOOTCAMP-EVAL benchmark is open-sourced."
    204       },
    205       "data_collection_described": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 3.1 and Appendix A describe task sources in detail: puzzle websites (PuzzleTeam), reasoning benchmarks (ARC-AGI, KOR-Bench, BBEH), algorithm problems (CodeContest), and scientific tasks. The curation process for each source type is described."
    209       },
    210       "recruitment_methods_described": {
    211         "applies": false,
    212         "answer": false,
    213         "justification": "No human participants in this study. Data sources are public benchmarks, puzzle websites, and synthetically generated task instances."
    214       },
    215       "data_pipeline_documented": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The pipeline is documented with counts at each stage: task description collection → LLM bootcamp generation (228 attempts) → evolutionary refinement (3 iterations, problematic rates: 33→19→14) → self-consistent unittest filtering → deduplication and quality filtering → 704 retained tasks. For SFT: 55K long-CoT + 11K math data."
    219       }
    220     },
    221     "conflicts_of_interest": {
    222       "funding_disclosed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding or acknowledgments section is present in the paper. Authors are from Shanghai AI Laboratory and Fudan University, but no funding sources are disclosed."
    226       },
    227       "affiliations_disclosed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Author affiliations are clearly stated: Shanghai AI Laboratory (primary) and Fudan University (secondary). Contact emails at pjlab.org.cn are provided."
    231       },
    232       "funder_independent_of_outcome": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No funding is disclosed. Shanghai AI Laboratory, the primary affiliation, has a vested interest in demonstrating the value of their open-source framework and training infrastructure. Independence cannot be assessed."
    236       },
    237       "financial_interests_declared": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No competing interests or financial interests statement is present in the paper."
    241       }
    242     },
    243     "contamination": {
    244       "training_cutoff_stated": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No training data cutoff dates are stated for the base models (Qwen2.5-7B-Instruct, Qwen2.5-32B-Instruct, DeepSeek-R1-Distilled-Qwen-32B). This makes it impossible to assess whether the base models were trained on data that includes the OOD benchmarks."
    248       },
    249       "train_test_overlap_discussed": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "Section 3.4 explicitly states: 'we ensure no data leakage by deduplicating the training data against BOOTCAMP-EVAL, guaranteeing clean evaluation and preventing train-test contamination.' Section 4.1 reaffirms: 'filtering all training instances to exclude any that share identities with BOOTCAMP-EVAL.'"
    253       },
    254       "benchmark_contamination_addressed": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "Contamination is addressed for BOOTCAMP-EVAL (via deduplication and fresh instance generation) but not for OOD benchmarks like MMLU-Pro, GPQA, or HumanEval, which have been publicly available for years and could be in the base models' training data."
    258       }
    259     },
    260     "human_studies": {
    261       "pre_registered": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study. All experiments involve model training and automated benchmark evaluation."
    265       },
    266       "irb_or_ethics_approval": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "demographics_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "inclusion_exclusion_criteria": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "randomization_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       },
    286       "blinding_described": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this study."
    290       },
    291       "attrition_reported": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants in this study."
    295       }
    296     },
    297     "cost_and_practicality": {
    298       "inference_cost_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No inference cost, latency, or per-example cost is reported for any of the models or experiments."
    302       },
    303       "compute_budget_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No GPU hours, hardware specifications, or total compute budget is stated. Only training step counts (500 for 7B, 300 for 32B) and batch sizes are provided, without hardware context."
    307       }
    308     },
    309     "experimental_rigor": {
    310       "seed_sensitivity_reported": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "For the 8-task setting, 4 trials with different task subsets were run, but this tests task sampling sensitivity, not seed sensitivity. No results across multiple random seeds are reported for the 32, 128, or 512-task experiments."
    314       },
    315       "number_of_runs_stated": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Only the 8-task experiment states '4 independent trials.' The main experiments (32, 128, 512 tasks) do not state how many runs produced the results, implying single runs."
    319       },
    320       "hyperparameter_search_budget": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "No hyperparameter search procedure or budget is reported. Key hyperparameters (batch size, temperature, step count) appear chosen without documented search."
    324       },
    325       "best_config_selection_justified": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The choice of training configuration (DAPO algorithm, batch sizes, step counts) is not justified. No comparison with alternative RL algorithms or training schedules is provided."
    329       },
    330       "multiple_comparison_correction": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Multiple comparisons are made across models and benchmarks (Tables 5-6) without any correction for multiple testing."
    334       },
    335       "self_comparison_bias_addressed": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The authors evaluate their own trained models against baselines on their own benchmark (BOOTCAMP-EVAL) without acknowledging potential self-evaluation bias. No independent evaluation is performed."
    339       },
    340       "compute_budget_vs_performance": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Performance is not reported as a function of compute budget. While all task scaling experiments use the same number of training steps, compute differences from rollout overhead (Figure 6 shows the 8-task setting requires many more generated batches) are not quantified or discussed."
    344       },
    345       "benchmark_construct_validity": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "BOOTCAMP-EVAL is claimed to assess 'cross-domain reasoning capabilities' but no formal discussion of construct validity is provided. No analysis of whether rule-based puzzle solving correlates with genuine reasoning ability or whether the 118 tasks are representative of general reasoning."
    349       },
    350       "scaffold_confound_addressed": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "No scaffolding is used in model evaluation. All models are evaluated via direct text generation with a standard prompt template."
    354       }
    355     },
    356     "data_leakage": {
    357       "temporal_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "While BOOTCAMP-EVAL generates fresh instances (inherently avoiding temporal leakage), this is not framed as addressing temporal leakage. For OOD benchmarks (MMLU-Pro, GPQA, HumanEval), temporal leakage is not discussed."
    361       },
    362       "feature_leakage_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether the evaluation setup provides information that would not be available in real usage, or whether BOOTCAMP-EVAL tasks structurally leak solution information through their prompt format."
    366       },
    367       "non_independence_addressed": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "Training and BOOTCAMP-EVAL data are generated from the same bootcamp classes. While instance-level deduplication is performed, the structural similarity between training and test data (same task types, same generation templates) is not discussed as a potential source of non-independence."
    371       },
    372       "leakage_detection_method": {
    373         "applies": true,
    374         "answer": true,
    375         "justification": "Section 3.4 states that training data is deduplicated against BOOTCAMP-EVAL, and Section 4.1 describes the identity-based method: 'filtering all training instances to exclude any that share identities with BOOTCAMP-EVAL.' This is a concrete prevention method applied to their primary benchmark."
    376       }
    377     }
    378   },
    379   "claims": [
    380     {
    381       "claim": "Scaling the number of training tasks from 8 to 512 significantly enhances LLM reasoning performance on BOOTCAMP-EVAL",
    382       "evidence": "Figure 5a shows scores increasing from ~0.22 (8 tasks) to ~0.35 (512 tasks) at 500 training steps. Figure 5b shows a near-linear relationship between task count and performance. Experiments use Qwen2.5-7B-Instruct with same training setup.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Task scaling improves training efficiency by preventing entropy collapse",
    387       "evidence": "Figure 6 shows the 8-task configuration requires exponentially more rollout batches per step, indicating degenerate response patterns. The 32+ task configurations maintain stable batch generation. 8-task training was terminated at step 300 due to collapse.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Multitask training enables 'emergent' learning on tasks unsolvable in isolation (Emergent Moment)",
    392       "evidence": "Figure 8 shows 3 tasks (Hyperbaton, PropositionalLogicFormalization, Wordscapes) where single-task training yields flat learning curves but 512-task mixed training shows sudden improvement after ~300 steps.",
    393       "supported": "weak"
    394     },
    395     {
    396       "claim": "Their 32B SFT model achieves state-of-the-art on BOOTCAMP-EVAL",
    397       "evidence": "Table 5 shows Qwen2.5-32B + Bootcamp-SFT achieves 61.1% overall, exceeding Qwen3-235B-A22B at 54.5% and DeepSeek-R1-0528 at 51.0%.",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Sequential SFT followed by RL achieves maximum performance gains across both in-domain and OOD benchmarks",
    402       "evidence": "Tables 5-6 show SFT-RL achieves 59.5% on BOOTCAMP-EVAL and 61.8% avg on OOD benchmarks, outperforming RL-only (46.9/43.0%) on both and SFT-only on OOD (53.2%), although SFT-only scores higher on BOOTCAMP-EVAL (61.1%).",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "A linear relationship exists between task count and reasoning performance",
    407       "evidence": "Figure 5b fits S = 2.65×10⁻⁴N + 0.22 to data points at 8, 32, 128, and 512 tasks.",
    408       "supported": "weak"
    409     },
    410     {
    411       "claim": "The automated agent workflow effectively scales bootcamp generation from 100 to 1000+ tasks",
    412       "evidence": "Section 3.3 describes the evolutionary generation and self-consistent unittest filtering. Tables 1-3 show iterative improvement: problematic bootcamps decrease from 33/228 to 14/228 over 3 iterations. 704 tasks retained after filtering.",
    413       "supported": "moderate"
    414     }
    415   ],
    416   "red_flags": [
    417     {
    418       "flag": "Self-designed benchmark dominance",
    419       "detail": "The primary evaluation benchmark (BOOTCAMP-EVAL) is designed by the same team that built the training framework. The training and evaluation data come from the same bootcamp classes, creating potential structural advantage. SOTA claims are primarily on this self-designed benchmark."
    420     },
    421     {
    422       "flag": "Cherry-picked 'Emergent Moment' examples",
    423       "detail": "The 'Emergent Moment' claim (Section 4.2.2) is based on 3 hand-selected tasks out of 1000+. No systematic analysis of how many tasks show this pattern vs. how many don't. The loaded term 'emergent' is used without engagement with the broader debate (cf. Schaeffer et al.)."
    424     },
    425     {
    426       "flag": "No error bars or variance on main results",
    427       "detail": "The task scaling experiments (32, 128, 512 tasks) appear to be single runs. Without variance estimates, it's impossible to determine whether observed differences are reliable or within noise margins."
    428     },
    429     {
    430       "flag": "Linear scaling law from 4 data points",
    431       "detail": "Figure 5b claims a linear relationship between task count and performance based on only 4 data points (8, 32, 128, 512 tasks). This is insufficient to establish a scaling law, and the linear fit could be an artifact of sparse sampling."
    432     },
    433     {
    434       "flag": "No limitations section",
    435       "detail": "The paper has no limitations or threats-to-validity section. Key unaddressed issues include: limited model diversity (2 families tested), structural similarity between training and evaluation data, potential overfitting to BOOTCAMP-EVAL's task format, and lack of compute budget reporting."
    436     },
    437     {
    438       "flag": "Incomplete OOD contamination analysis",
    439       "detail": "While contamination is addressed for BOOTCAMP-EVAL via deduplication, no contamination analysis is provided for OOD benchmarks (MMLU-Pro, GPQA, HumanEval) that have been publicly available for years and could be in the base models' training data."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    445       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    446       "year": 2025,
    447       "arxiv_id": "2501.12948",
    448       "relevance": "Core RL method for reasoning enhancement in LLMs, used as both the bootcamp generation model and a training paradigm reference."
    449     },
    450     {
    451       "title": "DAPO: An open-source llm reinforcement learning system at scale",
    452       "authors": ["Qiying Yu", "Zheng Zhang", "Ruofei Zhu"],
    453       "year": 2025,
    454       "arxiv_id": "2503.14476",
    455       "relevance": "The RL algorithm used for training in the paper's experiments; key infrastructure for RLVR training."
    456     },
    457     {
    458       "title": "Enigmata: Scaling logical reasoning in large language models with synthetic verifiable puzzles",
    459       "authors": ["Jiangjie Chen", "Qianyu He", "Siyu Yuan"],
    460       "year": 2025,
    461       "relevance": "Prior work on synthesizing verifiable reasoning puzzles for LLM training, directly comparable approach with narrower scope."
    462     },
    463     {
    464       "title": "Synlogic: Synthesizing verifiable reasoning data at scale for learning logical reasoning and beyond",
    465       "authors": ["Junteng Liu", "Yuanxiang Fan"],
    466       "year": 2025,
    467       "arxiv_id": "2505.19641",
    468       "relevance": "Concurrent work on synthesizing verifiable reasoning data for LLM training with multi-task data generation."
    469     },
    470     {
    471       "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code",
    472       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    473       "year": 2024,
    474       "arxiv_id": "2403.07974",
    475       "relevance": "Contamination-free code benchmark used as an OOD evaluation target."
    476     },
    477     {
    478       "title": "GPQA: A graduate-level google-proof q&a benchmark",
    479       "authors": ["David Rein", "Betty Li Hou"],
    480       "year": 2024,
    481       "relevance": "Graduate-level knowledge reasoning benchmark used for OOD evaluation."
    482     },
    483     {
    484       "title": "Big-bench extra hard",
    485       "authors": ["Mehran Kazemi", "Bahare Fatemi"],
    486       "year": 2025,
    487       "arxiv_id": "2502.19187",
    488       "relevance": "Challenging reasoning benchmark used for OOD evaluation and as a source of reasoning tasks."
    489     },
    490     {
    491       "title": "Kor-bench: Benchmarking language models on knowledge-orthogonal reasoning tasks",
    492       "authors": ["Kaijing Ma", "Xinrun Du"],
    493       "year": 2024,
    494       "arxiv_id": "2410.06526",
    495       "relevance": "Knowledge-orthogonal reasoning benchmark used for OOD evaluation, directly relevant to assessing reasoning vs knowledge."
    496     },
    497     {
    498       "title": "Competition-level code generation with alphacode",
    499       "authors": ["Yujia Li", "David Choi"],
    500       "year": 2022,
    501       "doi": "10.1126/science.abq1158",
    502       "relevance": "Competitive programming dataset (CodeContest) used as a task source for algorithm problems in InternBootcamp."
    503     },
    504     {
    505       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    506       "authors": ["Jason Wei", "Xuezhi Wang"],
    507       "year": 2022,
    508       "relevance": "Foundational work on CoT reasoning that underlies the training approach used in this paper."
    509     },
    510     {
    511       "title": "Hybridflow: A flexible and efficient rlhf framework",
    512       "authors": ["Guangming Sheng", "Chi Zhang"],
    513       "year": 2024,
    514       "arxiv_id": "2409.19256",
    515       "relevance": "RL training framework (VeRL) that InternBootcamp integrates with for RLVR training."
    516     },
    517     {
    518       "title": "Korgym: A dynamic game platform for llm reasoning evaluation",
    519       "authors": ["Jiajun Shi", "Jian Yang"],
    520       "year": 2025,
    521       "relevance": "Related platform for LLM reasoning evaluation through interactive game environments."
    522     }
    523   ]
    524 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs