scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23996B)
      1 {
      2   "paper": {
      3     "title": "RLTF: Reinforcement Learning from Unit Test Feedback",
      4     "authors": ["Jiate Liu", "Yiqin Zhu", "Kaiwen Xiao", "Qiang Fu", "Xiao Han", "Wei Yang", "Deheng Ye"],
      5     "year": 2023,
      6     "venue": "Transactions on Machine Learning Research",
      7     "arxiv_id": "2307.04349",
      8     "doi": "10.48550/arXiv.2307.04349"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "RLTF proposes an online RL framework with multi-granularity unit test feedback (coarse, fine-grained, and adaptive) for improving code LLMs on program synthesis. On APPS and MBPP benchmarks using CodeT5 770M, RLTF achieves state-of-the-art results among CodeT5-based methods. Ablation studies show each feedback type contributes incrementally, with fine-grained feedback providing the largest boost. The approach generalizes across base models (CodeT5 770M and CodeGen 2.7B).",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'Our code is available at: https://github.com/Zyq-scut/RLTF' providing a GitHub repository URL."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available benchmarks APPS (Hendrycks et al., 2021) and MBPP (Austin et al., 2021), which are standard public datasets."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions '8 NVIDIA V100 GPUs, each with 32GB of memory' but does not provide a requirements.txt, Dockerfile, or detailed dependency/library version listing."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. Key details like batch size and learning rate are mentioned, but no structured reproduction guide is given."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 3-9 are point estimates with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims RLTF outperforms CodeRL and PPOCoder but provides no statistical significance tests. Comparisons are based solely on raw numbers."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Percentage improvements are reported with baseline context, e.g., pass@1 improves from 1.30% (SL only) to 1.45% (full RLTF) in Table 5, and RLTF's 1.45% compares against CodeRL's 1.30% in Table 3, providing enough context to assess magnitude."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for the number of test problems or generated samples. APPS and MBPP sizes are inherited from prior work without discussion of statistical adequacy."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviation, variance, or spread measures are reported across runs. It is unclear whether results represent single runs or averages."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 3 compares against Codex, AlphaCode, GPT variants, CodeRL, and PPOCoder. Table 5 includes SL-only baseline and incremental ablations."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "CodeRL (2022) and PPOCoder (2023) are the most recent RL-based code generation methods at time of submission, representing state of the art."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Extensive ablations in Tables 4-8: impact of framework (online vs offline), feedback combinations, Rfine values, temperature, and different base models."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Results reported across pass@1, pass@5, pass@10, pass@100, and pass@1000 metrics."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant here; code generation correctness is objectively measured via unit test pass/fail."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "APPS has explicit train/test splits (Section 4.1). MBPP evaluation is zero-shot on a separate test set of 500 instances."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 3 breaks down APPS results by difficulty level: Introductory, Interview, and Competition. Figure 2 shows per-error-type analysis."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figure 2 provides qualitative analysis of error types before and after RLTF. Section 4.5 discusses that RLTF is more effective at runtime/compiler errors than semantic errors."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper notes that timeout errors slightly increase after applying RLTF (Section 4.5), and pass@1000 with Critic Sampling is slightly lower than CodeRL (Table 3, 20.32 vs 20.98)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'state-of-the-art performance on APPS and MBPP benchmarks,' which is supported by Tables 3 and 9 showing improvements over CodeRL and PPOCoder."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims like 'RLTF improves performance' are supported by controlled ablation studies (Tables 4-5) that isolate individual components via single-variable manipulation."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper tests only on Python benchmarks (APPS and MBPP) with two base models, and Section 5 acknowledges the limitation that 'manual categorization of sub-error types makes it challenging to transfer RLTF to other programming languages.' However, the title and abstract make broad claims about 'program synthesis' without bounding to Python."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for the improvements. For example, whether the online framework's improvement comes from simply training longer or seeing more data rather than the feedback mechanism specifically."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures pass@k on unit tests and frames results in terms of pass@k. No proxy gap — the claims match the granularity of the measurements."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model versions with sizes are stated: 'CodeT5 770M' and 'CodeGen 2.7B'. These are specific open-source model checkpoints with known architectures."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "This is a fine-tuning/RL training paper, not a prompting paper. The models are trained end-to-end on input-output pairs, not prompted."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 reports: batch size 32, learning rate 2e-6, nucleus sampling top-p 0.95, temperature 0.6 (APPS) and 1.2 (MBPP), online buffer length 6400, buffer update every 50 steps."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. RLTF is an RL training framework, not an agentic system."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.1 states: 'We adhere to the same preprocessing steps as those in (Hendrycks et al., 2021)' and documents the subprocess modification for segfault handling. MBPP prompt format is explicitly described."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section. Some limitations are briefly mentioned in Section 5 (Conclusions and Future Work) but as future directions rather than as a structured limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity discussed. The conclusion mentions transferability to other languages as a limitation, but does not discuss threats like evaluation methodology, baseline fairness, or reproducibility concerns."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 5 explicitly states: 'The manual categorization of sub-error types we employed makes it challenging to transfer RLTF to other programming languages, which should be considered as another limitation.' Also acknowledges benchmark test diversity limitations."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (generated programs, individual test outcomes, training logs) is provided for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 describes both benchmarks in detail: APPS (10,000 problems, train/test splits, difficulty levels, unit test counts) and MBPP (974 instances, splits described)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard public benchmarks (APPS and MBPP)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The online buffer workflow is documented in Figure 1b and Section 3.2, including buffer size (6400), update frequency (every 50 steps), and the queue mechanism for data flow."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section. All authors are from Tencent but no funding disclosure is provided."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All seven authors are clearly listed as affiliated with Tencent, with their Tencent email addresses prominently displayed."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "All authors are Tencent employees. Tencent has commercial interests in code generation tools, so the funder/employer is not independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is provided. All authors work at Tencent, which has potential commercial interests in code generation technology."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No discussion of training data cutoff for the base CodeT5 or CodeGen models. The models are fine-tuned on APPS training data, but pre-training data temporal scope is not addressed."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether APPS or MBPP test problems could have appeared in the CodeT5 or CodeGen pre-training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "APPS (2021) and MBPP (2021) were published before both CodeT5 and CodeGen training. No contamination analysis is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost or latency reported for generating solutions."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 4.1 states: '8 NVIDIA V100 GPUs... training process took approximately 24 hours. Concurrently, three additional machines with similar 8-card V100 GPU configurations were used to generate the latest samples.'"
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds. Results appear to be from single runs with no seed sensitivity analysis."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated. It is unclear whether results are from a single training run or averaged."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "While ablations over Rfine and temperature are shown, the total hyperparameter search budget (number of configurations tried) is not reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper presents results for Rfine=-0.3 and temperature=1.0 as the final configuration but does not explain whether selection was based on validation set or test set performance."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors compare RLTF against their own reproduction of CodeRL and scaled PPOCoder results without acknowledging potential bias in re-implementation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "RLTF uses 4 machines (32 V100 GPUs total) while the compute used by baselines CodeRL and PPOCoder is not discussed. The online framework requires substantially more compute for sample generation, but no matched-budget comparison is provided."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether APPS/MBPP adequately measure code generation capability or their construct validity limitations."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding involved. This is a training method comparison, not an agentic system evaluation."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether APPS/MBPP problems or their solutions appeared in CodeT5/CodeGen pre-training data, despite both benchmarks predating the models."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "MBPP includes unit test assert statements in the input sequence, which could leak answer information. The paper acknowledges this 'occasionally encourages models to overfit' but does not treat it as a leakage concern."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether APPS training and test problems share structural similarities or come from the same coding websites."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "RLTF achieves state-of-the-art performance on the APPS benchmark among CodeT5-based methods",
    365       "evidence": "Table 3 shows RLTF achieves pass@1=1.45%, pass@5=3.78%, pass@1000=19.92% without Critic Sampling, outperforming CodeRL (1.30/3.32/17.78) and PPOCoder-scaled (1.32/3.37/17.84).",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "The online framework improves performance over offline training",
    370       "evidence": "Table 4 ablation shows online+RLTF (pass@1=1.45%) > online-only (1.37%) > offline+RLTF (1.34%) > offline-only (1.29%).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Fine-grained feedback contributes the most significant performance boost among the feedback types",
    375       "evidence": "Table 5 shows adding fine-grained feedback improves pass@1 from 1.37% to 1.41% (largest single-feedback jump), and Section 4.3 states this explicitly.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "RLTF generalizes across base models (CodeT5 and CodeGen)",
    380       "evidence": "Table 8 shows improvements on both CodeT5 770M (1.30→1.45 pass@1) and CodeGen 2.7B (1.64→2.04 pass@1).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "RLTF achieves state-of-the-art zero-shot performance on MBPP",
    385       "evidence": "Table 9 shows CodeT5+RLTF achieves pass@1=30.4%, pass@80=71.3% vs CodeRL (25.7/68.1) and PPOCoder (26.1/68.2).",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No variance or statistical tests",
    392       "detail": "All results are point estimates without error bars, standard deviations, or significance tests. Improvements are often small (e.g., 0.15 percentage points on pass@1) and could be within noise."
    393     },
    394     {
    395       "flag": "Unfair baseline comparison methodology",
    396       "detail": "PPOCoder results are 'proportionally scaled' from CodeRL's open-source model rather than independently reproduced. The paper acknowledges discrepancies between reported and reproduced CodeRL results, raising concerns about evaluation consistency."
    397     },
    398     {
    399       "flag": "Substantially higher compute budget than baselines",
    400       "detail": "RLTF uses 4 machines with 32 V100 GPUs total (1 machine for training + 3 machines for sample generation) for 24 hours. The online framework inherently requires more compute than offline methods, but no matched-budget comparison is provided."
    401     },
    402     {
    403       "flag": "All authors from same company",
    404       "detail": "All seven authors are from Tencent with no conflict of interest declaration. Tencent has commercial interests in code generation technology."
    405     },
    406     {
    407       "flag": "No contamination analysis",
    408       "detail": "APPS and MBPP were published in 2021, before CodeT5 and CodeGen training data collection. No analysis of whether benchmark solutions leaked into pre-training data."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "CodeRL: Mastering Code Generation through Pretrained Models and Deep Reinforcement Learning",
    414       "authors": ["Hung Le", "Yue Wang", "Akhilesh Deepak Gotmare", "Silvio Savarese", "Steven Chu Hong Hoi"],
    415       "year": 2022,
    416       "relevance": "Primary baseline; introduces RL with unit test feedback and Critic Sampling for code generation."
    417     },
    418     {
    419       "title": "Execution-based Code Generation using Deep Reinforcement Learning",
    420       "authors": ["Parshin Shojaee", "Aneesh Jain", "Sindhu Tipirneni", "Chandan K Reddy"],
    421       "year": 2023,
    422       "arxiv_id": "2301.13816",
    423       "relevance": "Primary baseline; applies PPO to CodeRL framework for code generation improvement."
    424     },
    425     {
    426       "title": "Evaluating Large Language Models Trained on Code",
    427       "authors": ["Mark Chen"],
    428       "year": 2021,
    429       "arxiv_id": "2107.03374",
    430       "relevance": "Introduces Codex and HumanEval benchmark for evaluating LLM code generation."
    431     },
    432     {
    433       "title": "Measuring Coding Challenge Competence with APPS",
    434       "authors": ["Dan Hendrycks"],
    435       "year": 2021,
    436       "arxiv_id": "2105.09938",
    437       "relevance": "Introduces the APPS benchmark used as primary evaluation in this paper."
    438     },
    439     {
    440       "title": "Program Synthesis with Large Language Models",
    441       "authors": ["Jacob Austin"],
    442       "year": 2021,
    443       "arxiv_id": "2108.07732",
    444       "relevance": "Introduces the MBPP benchmark used as secondary evaluation in this paper."
    445     },
    446     {
    447       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    448       "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven CH Hoi"],
    449       "year": 2021,
    450       "arxiv_id": "2109.00859",
    451       "relevance": "Base model used in RLTF experiments; encoder-decoder architecture for code tasks."
    452     },
    453     {
    454       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    455       "authors": ["Erik Nijkamp"],
    456       "year": 2022,
    457       "arxiv_id": "2203.13474",
    458       "relevance": "Second base model used in RLTF experiments to demonstrate generalization."
    459     },
    460     {
    461       "title": "Competition-Level Code Generation with AlphaCode",
    462       "authors": ["Yujia Li"],
    463       "year": 2022,
    464       "relevance": "Competitive code generation system used as baseline comparison in APPS evaluation."
    465     },
    466     {
    467       "title": "CodeBERT: A Pre-trained Model for Programming and Natural Languages",
    468       "authors": ["Zhangyin Feng"],
    469       "year": 2020,
    470       "arxiv_id": "2002.08155",
    471       "relevance": "Early pre-trained model for code; encoder-only architecture for code understanding."
    472     },
    473     {
    474       "title": "Self-critiquing Models for Assisting Human Evaluators",
    475       "authors": ["William Saunders"],
    476       "year": 2022,
    477       "arxiv_id": "2206.05802",
    478       "relevance": "Explores LLM self-critique capabilities relevant to AI-assisted code review."
    479     }
    480   ]
    481 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs