scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25251B)
      1 {
      2   "paper": {
      3     "title": "Co-Evolving LLM Coder and Unit Tester via Reinforcement Learning",
      4     "authors": [
      5       "Yinjie Wang",
      6       "Ling Yang",
      7       "Ye Tian",
      8       "Ke Shen",
      9       "Mengdi Wang"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2506.03136"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper provides a GitHub link (https://github.com/Gen-Verse/CURE) on page 1 and a HuggingFace model link (https://huggingface.co/Gen-Verse/ReasonFlux-Coder)."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper uses publicly available benchmarks: LiveBench, MBPP, LiveCodeBench, CodeContests, and CodeForces. The training data is drawn from CodeContests (a public dataset) with a described split. These are all publicly accessible."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions using 8 A100 GPUs and vLLM for generation but does not provide a requirements.txt, Dockerfile, or detailed dependency/version listing to recreate the environment."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "While the algorithm is described in pseudocode (Algorithm 1) and hyperparameters are stated, there are no step-by-step reproduction instructions (e.g., a README with commands to run) in the paper itself."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Table 3 in the appendix reports standard errors for accuracy improvements across 16 independent runs for unit test and code accuracy. For example, ReasonFlux-Coder-14B on LiveBench: UT 0.455 (0.008), Code 0.111 (0.0041)."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper makes comparative claims (e.g., 'outperforming similarly sized Qwen-Coder, DeepSeek-Coder, and Seed-Coder') but does not report any statistical significance tests (p-values, t-tests, etc.) to support these comparisons."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper reports effect sizes with baseline context throughout. For example, '5.3% improvement in one-shot code generation accuracy' and '9.0% in BoN accuracy' relative to base models. Tables 1 and 3 provide both absolute numbers and improvements."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No justification is given for why 16 rollouts were used for code and unit tests, why the training set is 4.5k examples, or why 200 evaluation examples from CodeContests were used. No power analysis is provided."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Table 3 reports standard errors across 16 independent runs for UT and Code accuracy. The paper notes 'accuracies of unit test and code are evaluated over 16 independent runs.' BoN is reported on a single run, which is acknowledged."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Table 1 compares against multiple baselines: DeepSeek-Coder-V2-16B, Qwen2.5-14B-Coder-Instruct, Qwen2.5-14B-Instruct, Seed-Coder-8B-Instruct, Qwen2.5-7B-Coder-Instruct, Qwen2.5-7B-Instruct, and Qwen3-4B."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines include recent models: Qwen2.5-Coder-Instruct (2024), DeepSeek-Coder-V2 (2024), Seed-Coder-8B (2025), and Qwen3-4B (2025). These are contemporary and competitive."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Section 4.2.6 and Figure 6(b) present ablation studies on optimization strategies (RL vs. SFT, code-only vs. co-evolving) and reward design choices (theoretically-derived reward vs. simpler p_u-based reward). Results demonstrate CURE outperforms ablated variants."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Three main metrics are reported: unit test accuracy (UT), one-shot code generation accuracy (Code), and Best-of-N accuracy (BoN). These are reported across all five benchmarks."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No human evaluation is included. All evaluation is automated through execution-based metrics (pass/fail on test cases). Given claims about code quality, human evaluation of generated code or unit tests could have strengthened the work."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The training set is 4.5k examples from CodeContests. Evaluation is on separate benchmarks: LiveBench, MBPP test set, LiveCodeBench v2, a 200-example eval split from CodeContests, and 500 non-overlapping CodeForces examples. Section 4.1.1 describes these splits."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table 1 provides per-benchmark breakdowns across all five benchmarks (LiveBench, MBPP, LiveCodeBench, CodeContests, CodeForces), with per-metric results (UT, Code, BoN) for each."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "The paper does not discuss failure cases or where the approach breaks down. There is no error analysis of when CURE fails, which types of problems it struggles with, or qualitative analysis of incorrect outputs."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The ablation study (Section 4.2.6) reports negative results: code-only optimization 'does not improve the model's ability to produce accurate unit tests,' and the simpler reward leads to 'poor control over key error probabilities' (p01=42.2%, p00=14.7%)."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims of 5.3% code generation improvement and 9.0% BoN improvement are supported by Table 1. The claim of outperforming Qwen-Coder, DeepSeek-Coder, and Seed-Coder is supported by the same table. The 8.1% agentic coding improvement is supported by Figure 6(a). The 64.8% inference efficiency claim is supported by Table 4."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper makes causal claims via ablation studies (Section 4.2.6): removing co-evolution, switching RL to SFT, and simplifying the reward function each degrade performance. These controlled single-variable manipulations provide adequate support for the causal claims about which components matter."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The title is broad ('Co-Evolving LLM Coder and Unit Tester') but results are only on competitive programming benchmarks. The paper does not test on real-world software engineering tasks, non-Python languages, or other coding domains. No explicit bounding of generalization scope is provided."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for the observed improvements. For instance, could the improvements be due to the additional training signal from unit tests rather than co-evolution specifically? Could the gains come from data augmentation effects? No confound analysis is provided."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper uses 'Qwen2.5-7B-Instruct', 'Qwen2.5-14B-Instruct', 'Qwen3-4B' without snapshot dates or specific version identifiers. Similarly, 'GPT-4o-mini' and 'GPT-4.1-mini' are used without API version or snapshot dates."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Appendix C.1 provides the full prompt templates for both code generation and unit test generation, including the exact text with the placeholder {{problem}} for the task description. The actual prompts sent to models are fully reconstructable."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section 4.1.2 reports: temperature 1.0, top-p 1.0, learning rate 1e-6, KL coefficient beta=0.01, 350 training steps for 7B/14B, 50 steps for 4B, 16 rollouts each for code and unit tests. Temperature 0.8 for long-CoT. 8K token truncation in Section 3.4."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The full co-evolution framework is described in Algorithm 1 (Section 3.3), including the reward design, iterative optimization procedure, and how code and unit test generation interact. The agentic coding pipelines (BoN, MPSC, AlphaCodium, S*) are described in Section 4.1.3 and Appendix C.3."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Appendix C.2 documents the preprocessing: conversion from functional to stdio format with detailed examples. Section 4.1.1 describes dataset splits: 4.5k training / 200 eval for CodeContests, 500 sampled from CodeForces, etc. Ground-truth code collection for some benchmarks is described (QwQ-32B BoN with max 3 samples)."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion section (Section 5) is primarily a summary of contributions and a brief mention of future work. No substantive limitations discussion exists."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No specific threats to validity are discussed. There is no mention of limitations regarding competitive-programming-only evaluation, model size constraints, potential training instabilities, or other study-specific concerns."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what the results do NOT show. There is no mention that results are limited to competitive programming, limited to specific model families, or limited to Python. The broad framing ('coding capabilities of LLMs') is not bounded."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "Raw experimental data (individual model outputs, execution matrices, per-problem results) are not made available. Only aggregate metrics are reported in tables."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 4.1.1 describes the data collection: five benchmarks are named, CodeContests split into 4.5k training / 200 eval with difficulty level <=2, CodeForces 500 randomly sampled, LiveCodeBench v2 with 511 problems, MBPP standard test set."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. The study uses publicly available benchmark datasets."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The pipeline from data to results is documented: dataset selection → preprocessing (Appendix C.2) → rollout generation with vLLM → execution matrix construction → reward computation (Section 3.3) → RL optimization (Algorithm 1). Ground-truth code collection for evaluation is described in Appendix C.2."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding source or acknowledgments section is present in the paper. Author affiliations include ByteDance Seed, but no funding disclosure is made."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are listed on page 1: University of Chicago, Princeton University, Peking University, and ByteDance Seed. The ByteDance affiliation is clearly stated."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "One author (Ke Shen) is affiliated with ByteDance Seed, a company that develops and deploys LLMs. ByteDance has a commercial interest in improved code generation capabilities. No discussion of potential conflict of interest is provided, and no funding disclosure is made to assess independence."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests statement is present in the paper. Given the ByteDance affiliation and the release of models under the Gen-Verse name, financial interests may exist but are not declared."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper does not state the training data cutoff dates for Qwen2.5 or Qwen3 models used as base models. This is important because MBPP and other benchmarks may be in their training data."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No discussion of potential train/test overlap. MBPP (published 2021) and CodeContests (published 2022) could be in the training data of the base models. The paper does not address this risk."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The paper uses LiveBench and LiveCodeBench which are designed to be contamination-free, but also uses MBPP and CodeContests which predate the base models. No contamination analysis is provided for any benchmark. The use of LiveBench/LiveCodeBench partially mitigates this but the paper does not discuss it."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved in this study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Table 2 reports API costs per task (in units of 10^-3 USD) for GPT-series models, comparing one-shot, BoN-16, and CURE-enhanced costs across all five benchmarks. For example, 4o-mini-CURE BoN-16 costs 4.7 vs. 4o-mini BoN-16 at 10.8 on LiveBench."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "Section 4.1.2 states training was done on 8 A100 GPUs. The number of training steps is specified (350 for 7B/14B, 50 for 4B). However, total GPU hours or wall-clock time are not explicitly stated, though the hardware and step counts provide a reasonable estimate."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "ReasonFlux-Coder 7B and 14B improve code generation accuracy by 5.3% and BoN accuracy by 9.0% over Qwen2.5-Instruct base models.",
    292       "evidence": "Table 1 shows averaged improvements across five benchmarks. Table 3 provides standard errors for these improvements.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "ReasonFlux-Coder models outperform similarly sized Qwen-Coder, DeepSeek-Coder, and Seed-Coder.",
    297       "evidence": "Table 1 shows ReasonFlux-Coder-14B and 7B achieving higher BoN scores than respective baselines on most benchmarks. However, no significance tests are provided.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "CURE achieves 8.1% improvement over the base model in agentic coding tasks.",
    302       "evidence": "Figure 6(a) shows improvements across BoN, MPSC, AlphaCodium, and S* methods on LiveBench, averaging 8.1%.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "ReasonFlux-Coder-4B consistently outperforms Qwen3-4B while achieving 64.8% inference efficiency in unit test generation.",
    307       "evidence": "Table 1 shows higher accuracy for ReasonFlux-Coder-4B vs Qwen3-4B across benchmarks. Table 4 shows response length reduction to 64.8% on average.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "The trained unit test generator can serve as a reward model for RL, achieving competitive improvements compared to ground-truth labeled supervision.",
    312       "evidence": "Figure 7 shows that RL with ReasonFlux-Coder-4B generated tests as rewards achieves comparable performance to RL with labeled unit tests on LiveBench.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "The theoretically derived reward (optimizing mu) better constrains error probabilities compared to simpler reward designs.",
    317       "evidence": "Section 4.2.6 reports the simple reward leads to p01=42.2%, p00=14.7% while the theoretically derived reward achieves 36.5% and 9.1% respectively.",
    318       "supported": "strong"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval"
    323   ],
    324   "key_findings": "The paper introduces CURE, a reinforcement learning framework that co-evolves a code generator and unit test generator without requiring ground-truth code solutions. The approach improves code generation accuracy by 5.3% and Best-of-N accuracy by 9.0% on five benchmarks when applied to Qwen2.5-Instruct models. A key practical finding is that the trained model can serve as a cost-effective unit tester for API-based models, improving GPT-4o-mini BoN performance by 5.5% while reducing API costs. The theoretically derived reward function based on 'reward precision' outperforms simpler alternatives in ablation studies.",
    325   "red_flags": [
    326     {
    327       "flag": "No limitations section",
    328       "detail": "The paper has no limitations, threats-to-validity, or scope-bounding discussion. All experiments are on competitive programming benchmarks, but the paper frames results broadly in terms of 'coding capabilities of LLMs' without acknowledging this scope restriction."
    329     },
    330     {
    331       "flag": "No significance tests for comparative claims",
    332       "detail": "The paper claims to 'outperform' multiple baseline models but provides no statistical significance tests. While standard errors are reported in the appendix, formal comparisons are absent."
    333     },
    334     {
    335       "flag": "Benchmark contamination risk unaddressed",
    336       "detail": "MBPP and CodeContests predate the Qwen2.5 base models and could be in their training data. The paper does not discuss contamination risk for any benchmark, though LiveBench and LiveCodeBench are designed to mitigate this."
    337     },
    338     {
    339       "flag": "ByteDance affiliation without conflict disclosure",
    340       "detail": "One author is from ByteDance Seed, a company with commercial interest in LLM coding capabilities. No conflicts of interest statement or funding disclosure is provided."
    341     },
    342     {
    343       "flag": "No failure case analysis",
    344       "detail": "The paper does not examine where CURE fails or what types of problems remain challenging. All reported results show improvements, with no qualitative error analysis."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    350       "authors": ["Daya Guo"],
    351       "year": 2025,
    352       "arxiv_id": "2501.12948",
    353       "relevance": "Major RL-for-reasoning approach that demonstrates reinforcement learning for improving LLM capabilities, directly relevant to the survey's scope on LLM optimization methods."
    354     },
    355     {
    356       "title": "Codet: Code generation with generated tests",
    357       "authors": ["Bei Chen", "Fengji Zhang"],
    358       "year": 2022,
    359       "arxiv_id": "2207.10397",
    360       "relevance": "Foundational work on using generated unit tests for code selection (BoN), a key baseline and methodological precursor for this paper."
    361     },
    362     {
    363       "title": "Dynamic scaling of unit tests for code reward modeling",
    364       "authors": ["Zeyao Ma"],
    365       "year": 2025,
    366       "arxiv_id": "2501.01054",
    367       "relevance": "Directly related work on using unit tests as rewards for scaling code generation at inference time."
    368     },
    369     {
    370       "title": "S*: Test time scaling for code generation",
    371       "authors": ["Dacheng Li"],
    372       "year": 2025,
    373       "arxiv_id": "2502.14382",
    374       "relevance": "Test-time scaling method for code generation using iterative debugging and unit tests, used as an agentic coding baseline in this paper."
    375     },
    376     {
    377       "title": "Code generation with alphacodium: From prompt engineering to flow engineering",
    378       "authors": ["Tal Ridnik"],
    379       "year": 2024,
    380       "arxiv_id": "2401.08500",
    381       "relevance": "Agentic coding approach using test-driven iterative refinement, used as a baseline in the paper's agentic coding experiments."
    382     },
    383     {
    384       "title": "Stepcoder: Improve code generation with reinforcement learning from compiler feedback",
    385       "authors": ["Shihan Dou"],
    386       "year": 2024,
    387       "arxiv_id": "2402.01391",
    388       "relevance": "RL method for code generation using compiler feedback as reward signal, directly relevant to survey scope on RL-based coding improvement."
    389     },
    390     {
    391       "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code",
    392       "authors": ["Naman Jain"],
    393       "year": 2024,
    394       "arxiv_id": "2403.07974",
    395       "relevance": "Contamination-free code evaluation benchmark, relevant to survey's focus on methodological quality of code generation evaluations."
    396     },
    397     {
    398       "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation",
    399       "authors": ["Jiawei Liu"],
    400       "year": 2023,
    401       "relevance": "Addresses rigor in evaluating LLM-generated code, directly relevant to methodological quality assessment in the survey."
    402     },
    403     {
    404       "title": "No more manual tests? evaluating and improving chatgpt for unit test generation",
    405       "authors": ["Zhiqiang Yuan"],
    406       "year": 2023,
    407       "arxiv_id": "2305.04207",
    408       "relevance": "Evaluation of LLMs for unit test generation with agentic refinement, relevant to the survey's scope on LLM-based testing."
    409     },
    410     {
    411       "title": "An empirical evaluation of using large language models for automated unit test generation",
    412       "authors": ["Max Schäfer", "Sarah Nadi"],
    413       "year": 2023,
    414       "relevance": "Empirical study of LLM-based unit test generation, relevant to the survey's assessment of code generation research quality."
    415     },
    416     {
    417       "title": "Learning to generate unit tests for automated debugging",
    418       "authors": ["Archiki Prasad"],
    419       "year": 2025,
    420       "arxiv_id": "2502.01619",
    421       "relevance": "UTGEN approach to training LLMs for unit test generation using ground-truth code, a direct methodological comparison point for CURE."
    422     },
    423     {
    424       "title": "Competition-level code generation with alphacode",
    425       "authors": ["Yujia Li"],
    426       "year": 2022,
    427       "doi": "10.1126/science.abq1158",
    428       "relevance": "Landmark work in competitive programming code generation that established the CodeContests benchmark used in this paper."
    429     }
    430   ]
    431 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs