scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19762B)
      1 {
      2   "paper": {
      3     "title": "Competitive Programming with Large Reasoning Models",
      4     "authors": ["OpenAI"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2502.06807"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No source code or repository URL is provided in the paper. The paper includes sample IOI solutions in Appendix C but no reproducible codebase."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The CodeForces problems and IOI problems are publicly available, but the paper does not release its specific test set, evaluation scripts, or any curated dataset."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, dependency lists, or hardware details for reproduction are provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The models (o1, o1-ioi, o3) are proprietary and not publicly available."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (ratings, percentiles, scores). No confidence intervals or error bars are provided for any result."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims o3 outperforms o1-ioi and other models but provides no statistical significance tests. Comparisons are based solely on comparing point estimates."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports improvements with baseline context, e.g., 'rating increase from 2214 (98th percentile) to 2724 (99.8th percentile)', and gives the specific IOI scores being compared (e.g., 213 vs 362.14 vs 395.64 points)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The test set uses 12 Division 1 contests but no justification is given for why this number is sufficient. One contest was excluded for convenience."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "SWE-bench results are 'averaged over 3 trials' but no standard deviation or variance is reported. CodeForces and IOI results appear to be single-run evaluations with pass@10 but no spread measures."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares gpt-4o, o1-preview, o1, o1-ioi, and o3 across benchmarks, providing a progression of baselines."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "All baselines are OpenAI's own models. No comparison with external systems like DeepSeek-R1, Kimi k1.5, or AlphaCode2 on the same benchmarks despite mentioning them in the introduction."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper effectively ablates the test-time strategy by comparing o1-ioi with random submissions vs. hand-crafted strategy vs. relaxed limits (Figure 4), and compares o3's simpler strategy against o1-ioi's complex one."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are used: CodeForces rating/percentile, IOI score, pass@1, pass@10, average score on HackerRank Astra, and SWE-bench verified percentage."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "All evaluation is through automated competitive programming judges and test suites. Human evaluation is not relevant for this type of benchmark evaluation."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section B.1 states Division 1 contests from late 2023 and 2024 were used, 'all of which occurred after the o3 training set data cut-off.' A separate Division 2 validation set was used during training."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 1 in Appendix B provides detailed per-problem breakdowns across all 12 contests with pass@1, pass@10, and ranking information."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The paper notes that 'the very best humans still solve more problems than AI' but does not analyze specific failure cases or why certain problems remain unsolved."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Every experiment shows improvement over the previous model. No failed approaches, configurations that didn't work, or negative results are reported."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims about o1-ioi placing 49th percentile at IOI, achieving gold under relaxed constraints, and o3 achieving gold without hand-crafted strategies are all supported by the results in Sections 3.4 and 4.2."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims that 'reinforcement learning... significantly boosts performance' and 'scaling general-purpose reinforcement learning... offers a robust path.' However, multiple variables change between models (architecture, training data volume, data quality, RL compute) without controlled isolation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract and conclusion make claims about 'reasoning domains' broadly, but the evaluation is limited to competitive programming (CodeForces, IOI) and two software engineering benchmarks. The conclusion states results 'will unlock many new use cases for AI in science, coding, math, and many other fields.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed. The improvements could be due to larger training data, more compute, architecture changes, or other factors besides RL, but these are not addressed."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are referred to as 'o1', 'o1-ioi', 'o3', 'gpt-4o', and 'o1-preview' without specific version identifiers, snapshot dates, or API versions. o3 is described as 'early checkpoints' without further specification."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Section 4.2 mentions prompts include 'the problem description, constraints, and any available sample test cases' but the actual prompt text or template is not provided."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, or sampling parameters. The number of samples (10K for o1-ioi, 1K for o3) is stated but inference parameters are not."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The o1-ioi test-time strategy is described in detail in Section 3.2: subtask decomposition, 10K sampling, clustering via model-generated test inputs, reranking with learned scoring function, and round-robin submission. o3's simpler approach is also described."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section B.1 describes the test set construction: Division 1 contests from late 2023 and 2024, all held after the training cut-off; one contest excluded because it contained an interactive problem; and a validation set of Division 2 problems used during training."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no limitations or threats-to-validity section in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show or what settings are excluded from the claims."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (model outputs, individual trial results, generated solutions beyond Appendix C samples) is made available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section B.1 describes the test set selection: Division 1 contests from late 2023 and 2024, post-cutoff, with contamination checks via embedding search."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The data sources are standard competitive programming benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section B describes the evaluation pipeline: problem selection, grading methodology (full test suite, matching official CodeForces grading), rating calculation methodology, and thinking time adjustments."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is disclosed. The paper is authored by 'OpenAI' but no specific funding or grant information is provided."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The paper is clearly authored by OpenAI, evaluating OpenAI's own models. The affiliation is stated in the author line."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "OpenAI has a direct financial interest in demonstrating the capabilities of o1 and o3. The funder is not independent of the outcome."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Section B.1 states all test contests 'occurred after the o3 training set data cut-off.' Section 4.2 states 'IOI 2024 occurred after the training cut-off for this model.'"
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section B.1: 'we used embedding search to confirm that the test problems have not been seen by the model during training.' Section 4.2: 'we additionally confirmed with search that the IOI test problems are not contaminated with the training set.'"
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Contamination is addressed via temporal separation (post-cutoff contests) and embedding-based search verification. Section 2.1 also mentions 'a contamination check as a sanity measure.'"
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in the study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, API cost, or latency is reported despite sampling 10,000 solutions per problem for o1-ioi and 1,024 for o3."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No computational budget is stated for training or inference. The paper mentions 'significantly greater compute resources' for o3 without quantifying."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "o3 achieves a CodeForces rating of 2724 (99.8th percentile), placing it among the top 200 active competitors worldwide.",
    286       "evidence": "Figure 5 and Section 4.1 report this rating. Figure 10 shows o3 vs top active competitors. Appendix B provides per-problem breakdown.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "o3 achieves a gold medal on IOI 2024 (395.64 points) under the 50-submission limit without hand-crafted test-time strategies.",
    291       "evidence": "Figure 7 and Section 4.2 report this result. Gold threshold was approximately 360 points.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Scaling general-purpose reinforcement learning, rather than domain-specific techniques, offers a robust path toward state-of-the-art AI in reasoning domains.",
    296       "evidence": "Comparison of o1-ioi (domain-specific) vs o3 (general-purpose) across CodeForces and IOI benchmarks in Sections 3-4.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "o3 achieves 71.7% on SWE-bench Verified, a 22.8 percentage-point improvement over o1.",
    301       "evidence": "Figure 9 shows these results, referenced from o1 system card and early o3 checkpoint results.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "o1-ioi's hand-crafted test-time selection strategy contributed nearly 60 additional points over random submission at IOI 2024.",
    306       "evidence": "Figure 4 shows 213 points with hand-crafted strategy vs 156 points with random 50 submissions.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "OpenAI's o3 reasoning model achieves a CodeForces rating of 2724 (99.8th percentile) and an IOI 2024 gold medal score of 395.64 under standard competition constraints, surpassing the domain-specific o1-ioi system without hand-crafted test-time strategies. The paper demonstrates a progression from gpt-4o (11th percentile on CodeForces) through o1-preview, o1, and o1-ioi to o3, attributing gains to reinforcement learning at scale. o3 also achieves 71.7% on SWE-bench Verified, suggesting reasoning improvements transfer beyond competitive programming.",
    312   "red_flags": [
    313     {
    314       "flag": "Company evaluating own product",
    315       "detail": "OpenAI is evaluating its own proprietary models (o1, o1-ioi, o3) with no independent verification. All baselines are also OpenAI models — no comparison with competitors like DeepSeek-R1 or AlphaCode2 on the same benchmarks."
    316     },
    317     {
    318       "flag": "No limitations section",
    319       "detail": "The paper contains no limitations section, no threats to validity, and no discussion of what the results do not show. This is unusual for a paper making broad claims about reasoning capabilities."
    320     },
    321     {
    322       "flag": "No statistical rigor",
    323       "detail": "All results are point estimates with no confidence intervals, error bars, significance tests, or variance reporting. SWE-bench is averaged over 3 trials with no spread reported."
    324     },
    325     {
    326       "flag": "Unreproducible",
    327       "detail": "The models are proprietary and not publicly available. No code, prompts, hyperparameters, or environment details are provided. Results cannot be independently verified or reproduced."
    328     },
    329     {
    330       "flag": "Overclaimed generalization",
    331       "detail": "The conclusion claims results 'will unlock many new use cases for AI in science, coding, math, and many other fields' based on competitive programming and two software engineering benchmarks."
    332     },
    333     {
    334       "flag": "Confounded causal claims",
    335       "detail": "The paper attributes improvements to RL scaling, but multiple variables change between models (architecture, training data volume, compute budget, data quality) without controlled isolation."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Competition-Level Code Generation with AlphaCode",
    341       "authors": ["Li et al."],
    342       "year": 2022,
    343       "relevance": "Foundational work on AI competitive programming using large-scale sampling and test-time heuristics."
    344     },
    345     {
    346       "title": "Evaluating Large Language Models Trained on Code",
    347       "authors": ["Chen et al."],
    348       "year": 2021,
    349       "relevance": "Introduced Codex and the HumanEval benchmark for code generation evaluation."
    350     },
    351     {
    352       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    353       "authors": ["DeepSeek-AI"],
    354       "year": 2025,
    355       "relevance": "Independent demonstration of RL-based chain-of-thought improving programming and math performance."
    356     },
    357     {
    358       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    359       "authors": ["Jimenez et al."],
    360       "year": 2024,
    361       "relevance": "Benchmark for evaluating AI on real-world software engineering tasks, used in this paper's evaluation."
    362     },
    363     {
    364       "title": "AlphaCode 2 Technical Report",
    365       "authors": ["Google DeepMind"],
    366       "year": 2023,
    367       "relevance": "Prior state-of-the-art in competitive programming AI, achieving 85th percentile on CodeForces."
    368     },
    369     {
    370       "title": "Learning to Reason with LLMs",
    371       "authors": ["OpenAI"],
    372       "year": 2024,
    373       "relevance": "Technical report on o1 reasoning model, foundational to this paper's systems."
    374     },
    375     {
    376       "title": "Kimi k1.5: Scaling Reinforcement Learning with LLMs",
    377       "authors": ["Kimi Team"],
    378       "year": 2025,
    379       "relevance": "Independent work on RL-based reasoning for mathematical and programming challenges."
    380     },
    381     {
    382       "title": "Program Synthesis with Large Language Models",
    383       "authors": ["Austin et al."],
    384       "year": 2021,
    385       "relevance": "Early work showing LLM code generation scaling with model size, foundational to the field."
    386     }
    387   ]
    388 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs