scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24928B)
      1 {
      2   "paper": {
      3     "title": "CODERL+: Improving Code Generation via Reinforcement with Execution Semantics Alignment",
      4     "authors": [
      5       "Xue Jiang",
      6       "Yihong Dong",
      7       "Mengyang Liu",
      8       "Hongyi Deng",
      9       "Tian Wang",
     10       "Yongding Tao",
     11       "Rongyu Cao",
     12       "Binhua Li",
     13       "Zhi Jin",
     14       "Wenpin Jiao",
     15       "Fei Huang",
     16       "Yongbin Li",
     17       "Ge Li"
     18     ],
     19     "year": 2025,
     20     "venue": "arXiv preprint",
     21     "arxiv_id": "2510.18471"
     22   },
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper states 'Our source code will be released at https://github.com/jiangxxxue/CODERLPLUS' (footnote 1, page 1). This is a promise of future release, not an actual release. Per schema rules, a promise counts as NO."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The training data uses publicly available datasets: APPS, CodeContests, TACO, and Codeforces (Section 4.1). Evaluation uses public benchmarks: HumanEval, LeetCode, LiveCodeBench. All are publicly accessible standard benchmarks."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Section 4.1 mentions using VeRL framework and 8 NVIDIA A100 80G GPUs, but provides no requirements.txt, Dockerfile, conda environment, or library versions. This is not enough to recreate the environment."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided. The paper describes experimental setup in Section 4.1 but does not include a README, reproduction scripts, or commands to replicate results."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "All results in Tables 1, 2, and 3 are reported as point estimates (e.g., '90.9', '63.3') with no confidence intervals, error bars, or uncertainty notation."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims CODERL+ 'outperforms' baselines (e.g., '4.6% average relative improvement in pass@1') based solely on comparing numbers without any statistical significance tests."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports percentage improvements with baseline context, e.g., '4.6% average relative improvement in pass@1' (abstract), '11.2% over the GRPO baseline' (Section 4.2), and '+7.4% average on code generation' (Section 4.2). Absolute numbers are provided in Tables 1-3 allowing the reader to compute differences."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is given for why these specific benchmarks and their sizes were chosen. No power analysis or discussion of whether benchmark sizes are adequate to detect the claimed improvements."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or multi-run results are reported. The paper uses greedy sampling (temperature 0.0) for evaluation, but does not report variance from training stochasticity."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table 1 compares against the base model (Qwen2.5-Coder-7B-Instruct), GRPO, and seven baselines: OlympicCoder, OCR-Qwen-7B, Skywork-OR1, CodePRM, CODEI/O, CodeReasoner, and CodeBoost."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines are contemporary: OlympicCoder, OCR-Qwen-7B, Skywork-OR1, CodePRM, CODEI/O, CodeReasoner, and CodeBoost are all from 2025, representing the current state of the art in RL-based and distillation-based code generation."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 4.3 presents ablation studies with three variants: CODERL+ (Random Rollout), CODERL+ (Off-policy Sem), and CODERL+ (IO), each isolating a different design choice. Results are shown in Figure 3."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper evaluates on pass@1 across three code generation benchmarks (HumanEval, LeetCode, LiveCodeBench), plus accuracy on LiveCodeBench-Reason and LiveCodeBench-Test. The probe analysis uses MSE. Multiple tasks serve as distinct metrics."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation is included. All evaluation is automated: pass@1 via test case execution and accuracy on reasoning benchmarks. For code generation papers claiming quality improvements, human inspection of generated code quality would add value."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The evaluation benchmarks (HumanEval, LeetCode, LiveCodeBench) are separate from the training data (APPS, CodeContests, TACO, Codeforces). Section 4.1 explicitly describes this separation."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are broken down per benchmark in Tables 1-3. Table 2 provides per-model-family breakdown. Table 3 provides per-RL-algorithm breakdown. The ablation study provides per-task breakdown."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Figure 1 shows concrete failure cases (base model and RLVR failing on loop programs). The case study in Appendix A (Figure 6) discusses failure modes of the base model and GRPO compared to CODERL+."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper reports that GRPO sometimes shows performance degradation (e.g., on LLaMA-3.1-8B in Section 4.2). The ablation study shows degraded performance for each ablated variant. CodeReasoner outperforms CODERL+ on Test Output Generation."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims '4.6% average relative improvement in pass@1' — verified in Table 1 (CODERL+ averages 63.7 vs. 60.9 for the GRPO baseline). Claims of '15.5%' and '4.4%' improvements on reasoning and test output generation are supported by Table 1 results. Claims of generalization across LLMs and RL algorithms are supported by Tables 2 and 3."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper makes causal claims (e.g., 'execution semantics alignment improves code generation'). The ablation study (Section 4.3) provides controlled single-variable manipulation: each ablation removes one design choice and shows the effect. This is adequate causal evidence within the RL training paradigm."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims 'Improving Code Generation' broadly, but results are only on Python-based benchmarks (HumanEval, LeetCode, LiveCodeBench are all Python). The approach is tested on models up to 8B parameters only. The Limitations section (Appendix D) acknowledges the model size limitation but does not bound the language generalization claim."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper does not substantively discuss alternative explanations for the improvements. For example, it does not consider whether the improvement comes from additional training signal volume (more data per batch) rather than execution semantics per se. The ablation study partially addresses this but does not control for total training signal quantity."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 4.1 specifies exact model names: 'Qwen2.5-Coder-7B-Instruct', 'LLaMA-3.1-8B-Instruct', and 'Qwen2.5-Coder-1.5B'. These are specific enough as they are open-weight models with fixed weights, not API models with changing behavior."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix C provides the full execution trace inference prompt with placeholders AND a description of what fills them (function name, variable names, code, input). The code generation prompt is standard (problem descriptions from benchmarks). The evaluation prompt is fully reproduced."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 4.1 reports: batch size 128, mini-batch size 64, learning rate 1e-06, max 1000 training steps, 8 rollout samples, max response length 8192 tokens, mixing ratio 0.4. Evaluation uses greedy sampling (temperature 0.0)."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "The paper does not use agentic scaffolding. CODERL+ is a training method (RL pipeline), not an agentic system with tools, retry logic, or feedback loops at inference time."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.1 describes the training data: 'prime code data sourced from APPS, CodeContests, TACO, and Codeforces, comprising 27K coding problems along with their corresponding test cases.' The pipeline for constructing execution semantics alignment data is described in Section 3.2."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Appendix D is titled 'Limitations' and provides substantive discussion of three specific limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The Limitations section (Appendix D) discusses three specific threats: (1) computational constraints limiting evaluation to models up to 8B parameters, (2) no hyperparameter tuning due to cost, (3) additional computational overhead of the approach. These are specific to this study."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Appendix D explicitly states 'computational constraints limited our evaluation to models up to 8B parameters, which may affect the generalizability of our conclusions to larger-scale LLMs.' This is a specific scope boundary."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw experimental data (model outputs, per-example results, training logs) are made available. Only aggregated results in tables are reported."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 4.1 describes data sources (APPS, CodeContests, TACO, Codeforces), the total size (27K problems), and evaluation benchmarks. Section 3.2 describes how execution semantics alignment data is dynamically constructed from failed rollouts."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard public benchmarks."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 3.2 documents the full training pipeline: code generation rollouts, test case execution for reward, failed program repurposing for alignment training, and construction of alignment prompts. The evaluation pipeline is described in Section 4.1."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding disclosure, acknowledgments section, or grant information is found in the paper. The footnote mentions 'Work done during Xue Jiang and Yihong Dong's internship at Tongyi Lab' but no explicit funding statement."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly stated: Peking University and Tongyi Lab (Alibaba Group). The paper header lists institutional affiliations for all authors."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Several authors are affiliated with Tongyi Lab (Alibaba Group), and the work was done during internships there. Alibaba has commercial interest in code generation capabilities. No explicit statement about funder independence. The funder (Alibaba via Tongyi Lab) has a stake in demonstrating code generation improvements."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement, patent disclosures, or financial interest declarations are present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper uses Qwen2.5-Coder-7B-Instruct and LLaMA-3.1-8B-Instruct but does not state the training data cutoff dates for these base models. This is important because benchmarks like HumanEval (published 2021) could be in the training data."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No analysis of whether HumanEval, LeetCode, or LiveCodeBench examples appeared in the base models' pre-training data. The paper does not discuss train/test overlap for any benchmark."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "HumanEval was published in 2021 and is widely known to be contaminated in models trained after 2021. The paper uses Qwen2.5-Coder models which were certainly trained after HumanEval's publication, yet does not discuss contamination risk. LiveCodeBench is designed to be contamination-free (referenced as Jain et al., 2024), but the paper does not explicitly leverage or discuss this property."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost or latency is reported. The paper does not mention API costs, tokens consumed, or wall-clock time for inference/evaluation."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section 4.1 states 'All experiments are conducted on a cluster of 8 NVIDIA A100 80G GPUs.' Appendix D mentions 'each training run requires roughly three days.' This provides hardware and approximate training time."
    294       }
    295     }
    296   },
    297   "claims": [
    298     {
    299       "claim": "CODERL+ achieves a 4.6% average relative improvement in pass@1 over post-training baselines on code generation benchmarks.",
    300       "evidence": "Table 1 shows CODERL+ achieves 63.7 average pass@1 across HumanEval (90.9), LeetCode (63.3), and LiveCodeBench (36.9), compared to GRPO baseline at 60.9 average. The relative improvement over GRPO is (63.7-60.9)/60.9 = 4.6%.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "CODERL+ generalizes to code reasoning with 15.5% higher accuracy and test output generation with 4.4% higher accuracy compared to baselines.",
    305       "evidence": "Table 1 shows CODERL+ at 85.0 on LiveCodeBench-Reason vs. CodeReasoner at 78.5 (best code reasoning baseline). The percentage given is relative to an unspecified baseline. For test output generation, CODERL+ achieves 53.2 vs. GRPO at 48.4.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "CODERL+ consistently improves across different model families (LLaMA, Qwen) and sizes (1.5B, 7B, 8B).",
    310       "evidence": "Table 2 shows CODERL+ outperforming GRPO on all three models across all benchmarks. On LLaMA-3.1-8B-Instruct, the average code generation improvement is 11.2% over GRPO.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "CODERL+ seamlessly integrates with various RL algorithms (GRPO, PPO, REINFORCE++) and consistently enhances them.",
    315       "evidence": "Table 3 shows CODERL+ improving all three RL algorithms across all benchmarks. The most substantial improvement is with PPO (+7.4% average on code generation).",
    316       "supported": "strong"
    317     },
    318     {
    319       "claim": "CODERL+ strengthens the alignment between code's textual representations and its underlying execution semantics.",
    320       "evidence": "Figure 5 shows probing analysis where CODERL+-trained model achieves lower MSE across all model layers compared to base model and GRPO, indicating better encoding of execution semantics in internal representations. Figure 1b shows 81.8% relative improvement on execution trace inference.",
    321       "supported": "moderate"
    322     },
    323     {
    324       "claim": "Using failed rollout programs for execution semantics alignment is more effective than random rollouts or off-policy data.",
    325       "evidence": "Figure 3 ablation study shows CODERL+ outperforms CODERL+ (Random Rollout), CODERL+ (Off-policy Sem), and CODERL+ (IO) across all three task types.",
    326       "supported": "strong"
    327     }
    328   ],
    329   "methodology_tags": [
    330     "benchmark-eval"
    331   ],
    332   "key_findings": "CODERL+ integrates execution semantics alignment into reinforcement learning training for code generation, using failed program rollouts to teach models about variable-level execution trajectories. The approach achieves 4.6% average relative improvement in pass@1 on code generation benchmarks over GRPO and generalizes to code reasoning tasks (85.0 vs 78.5 accuracy on LiveCodeBench-Reason). Results are consistent across two model families (Qwen and LLaMA), three model sizes (1.5B, 7B, 8B), and three RL algorithms. Probing analysis provides evidence that the training strengthens internal representations' alignment with execution semantics.",
    333   "red_flags": [
    334     {
    335       "flag": "No variance or multiple-run results",
    336       "detail": "All results appear to be single-run numbers with no standard deviations, confidence intervals, or significance tests. RL training is known to be high-variance, and differences of 1-3 percentage points could be within noise without multiple seeds."
    337     },
    338     {
    339       "flag": "HumanEval contamination risk unaddressed",
    340       "detail": "HumanEval (published 2021) is likely in the pre-training data of Qwen2.5-Coder and LLaMA-3.1 models. The paper does not acknowledge or discuss this contamination risk. Results on HumanEval should be interpreted cautiously."
    341     },
    342     {
    343       "flag": "Company affiliation evaluating code capabilities",
    344       "detail": "Multiple authors are from Tongyi Lab (Alibaba Group), which has commercial interest in code generation. The work was done during internships at Alibaba. No competing interests statement is provided."
    345     },
    346     {
    347       "flag": "Generalization claims beyond tested setting",
    348       "detail": "The paper's title and abstract frame this as improving 'Code Generation' generally, but all benchmarks are Python-only. No evaluation on other programming languages is included."
    349     },
    350     {
    351       "flag": "Code not actually released",
    352       "detail": "The paper promises 'Our source code will be released' but the code is not yet available. Without the code, independent verification is not possible."
    353     }
    354   ],
    355   "cited_papers": [
    356     {
    357       "title": "DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement learning",
    358       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    359       "year": 2025,
    360       "relevance": "Foundational work on using RL (GRPO) with chain-of-thought reasoning for LLM capabilities, which CODERL+ builds upon."
    361     },
    362     {
    363       "title": "CodeRL: Mastering code generation through pretrained models and deep reinforcement learning",
    364       "authors": ["Hung Le", "Yue Wang", "Akhilesh Deepak Gotmare", "Silvio Savarese", "Steven Chu Hong Hoi"],
    365       "year": 2022,
    366       "relevance": "Pioneering work on actor-critic RL for code generation using unit test feedback."
    367     },
    368     {
    369       "title": "CodeIO: Condensing reasoning patterns via code input-output prediction",
    370       "authors": ["Junlong Li", "Daya Guo", "Dejian Yang", "Runxin Xu"],
    371       "year": 2025,
    372       "relevance": "Key baseline on learning code execution through distillation from teacher models."
    373     },
    374     {
    375       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    376       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    377       "year": 2024,
    378       "arxiv_id": "2403.07974",
    379       "relevance": "Contamination-free code benchmark design, directly relevant to evaluation methodology in code generation research."
    380     },
    381     {
    382       "title": "Evaluating large language models trained on code",
    383       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    384       "year": 2021,
    385       "arxiv_id": "2107.03374",
    386       "relevance": "Introduced HumanEval benchmark, foundational for code generation evaluation."
    387     },
    388     {
    389       "title": "CodePRM: Execution feedback-enhanced process reward model for code generation",
    390       "authors": ["Qingyao Li", "Xinyi Dai", "Xiangyang Li"],
    391       "year": 2025,
    392       "relevance": "Process reward models for code generation RL, addressing sparse reward problems."
    393     },
    394     {
    395       "title": "StepCoder: Improving code generation with reinforcement learning from compiler feedback",
    396       "authors": ["Shihan Dou", "Yan Liu", "Haoxiang Jia"],
    397       "year": 2024,
    398       "relevance": "Curriculum learning with RL for code generation, decomposing complex tasks into subtasks."
    399     },
    400     {
    401       "title": "CodeReasoner: Enhancing the code reasoning ability with reinforcement learning",
    402       "authors": ["Lingxiao Tang", "He Ye", "Zhongxin Liu"],
    403       "year": 2025,
    404       "relevance": "RL-based code reasoning improvement, key baseline combining instruction fine-tuning and GRPO."
    405     },
    406     {
    407       "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models",
    408       "authors": ["Yihong Dong", "Xue Jiang", "Huanyu Liu"],
    409       "year": 2024,
    410       "relevance": "Addresses data contamination in LLM evaluation, directly relevant to benchmark reliability."
    411     },
    412     {
    413       "title": "The false promise of imitating proprietary LLMs",
    414       "authors": ["Arnav Gudibande", "Eric Wallace", "Charlie Snell"],
    415       "year": 2023,
    416       "arxiv_id": "2305.15717",
    417       "relevance": "Critiques SFT/distillation approaches as imitating surface patterns rather than learning reasoning, motivating RL-based approaches."
    418     },
    419     {
    420       "title": "Qwen2.5-Coder technical report",
    421       "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"],
    422       "year": 2024,
    423       "arxiv_id": "2409.12186",
    424       "relevance": "Technical report for the primary base model used in CODERL+ experiments."
    425     }
    426   ]
    427 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs