scan.json (23996B)
1 { 2 "paper": { 3 "title": "RLTF: Reinforcement Learning from Unit Test Feedback", 4 "authors": ["Jiate Liu", "Yiqin Zhu", "Kaiwen Xiao", "Qiang Fu", "Xiao Han", "Wei Yang", "Deheng Ye"], 5 "year": 2023, 6 "venue": "Transactions on Machine Learning Research", 7 "arxiv_id": "2307.04349", 8 "doi": "10.48550/arXiv.2307.04349" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "RLTF proposes an online RL framework with multi-granularity unit test feedback (coarse, fine-grained, and adaptive) for improving code LLMs on program synthesis. On APPS and MBPP benchmarks using CodeT5 770M, RLTF achieves state-of-the-art results among CodeT5-based methods. Ablation studies show each feedback type contributes incrementally, with fine-grained feedback providing the largest boost. The approach generalizes across base models (CodeT5 770M and CodeGen 2.7B).", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The abstract states 'Our code is available at: https://github.com/Zyq-scut/RLTF' providing a GitHub repository URL." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available benchmarks APPS (Hendrycks et al., 2021) and MBPP (Austin et al., 2021), which are standard public datasets." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions '8 NVIDIA V100 GPUs, each with 32GB of memory' but does not provide a requirements.txt, Dockerfile, or detailed dependency/library version listing." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. Key details like batch size and learning rate are mentioned but no structured reproduction guide." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 3-9 are point estimates with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims RLTF outperforms CodeRL and PPOCoder but provides no statistical significance tests. Comparisons are based solely on raw numbers." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Percentage improvements are reported with baseline context, e.g., pass@1 improves from 1.30% (SL only) to 1.45% (full RLTF) in Table 5, and RLTF's 1.45% is shown alongside CodeRL's 1.30% in Table 3, providing enough context to assess magnitude." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for the number of test problems or generated samples. APPS and MBPP sizes are inherited from prior work without discussion of statistical adequacy." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviation, variance, or spread measures are reported across runs. It is unclear whether results represent single runs or averages." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Table 3 compares against Codex, AlphaCode, GPT variants, CodeRL, and PPOCoder. Table 5 includes SL-only baseline and incremental ablations." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "CodeRL (2022) and PPOCoder (2023) are the most recent RL-based code generation methods at time of submission, representing state of the art."
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Extensive ablations in Tables 4-8: impact of framework (online vs offline), feedback combinations, Rfine values, temperature, and different base models." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Results reported across pass@1, pass@5, pass@10, pass@100, and pass@1000 metrics." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant here; code generation correctness is objectively measured via unit test pass/fail." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "APPS has explicit train/test splits (Section 4.1). MBPP evaluation is zero-shot on a separate test set of 500 instances." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 3 breaks down APPS results by difficulty level: Introductory, Interview, and Competition. Figure 2 shows per-error-type analysis." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Figure 2 provides qualitative analysis of error types before and after RLTF. Section 4.5 discusses that RLTF is more effective at runtime/compiler errors than semantic errors." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper notes that timeout errors slightly increase after applying RLTF (Section 4.5), and pass@1000 with Critic Sampling is slightly lower than CodeRL (Table 3, 20.32 vs 20.98)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims 'state-of-the-art performance on APPS and MBPP benchmarks,' which is supported by Tables 3 and 9 showing improvements over CodeRL and PPOCoder." 
116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims like 'RLTF improves performance' are supported by controlled ablation studies (Tables 4-5) that isolate individual components via single-variable manipulation." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper tests only on Python benchmarks (APPS and MBPP) with two base models. Section 5 does acknowledge that 'manual categorization of sub-error types makes it challenging to transfer RLTF to other programming languages,' but the title and abstract still make broad claims about 'program synthesis' without bounding them to Python." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for the improvements. For example, whether the online framework's improvement comes from simply training longer or seeing more data rather than the feedback mechanism specifically." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures pass@k on unit tests and frames results in terms of pass@k. No proxy gap — the claims match the granularity of the measurements." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions with sizes are stated: 'CodeT5 770M' and 'CodeGen 2.7B'. These are specific open-source model checkpoints with known architectures." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "This is a fine-tuning/RL training paper, not a prompting paper. The models are trained end-to-end on input-output pairs, not prompted."
148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.1 reports: batch size 32, learning rate 2e-6, nucleus sampling top-p 0.95, temperature 0.6 (APPS) and 1.2 (MBPP), online buffer length 6400, buffer update every 50 steps." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. RLTF is an RL training framework, not an agentic system." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.1 states: 'We adhere to the same preprocessing steps as those in (Hendrycks et al., 2021)' and documents the subprocess modification for segfault handling. MBPP prompt format is explicitly described." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations section. Some limitations are briefly mentioned in Section 5 (Conclusions and Future Work) but as future directions rather than as a structured limitations discussion." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No specific threats to validity discussed. The conclusion mentions transferability to other languages as a limitation, but does not discuss threats like evaluation methodology, baseline fairness, or reproducibility concerns." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5 explicitly states: 'The manual categorization of sub-error types we employed makes it challenging to transfer RLTF to other programming languages, which should be considered as another limitation.' Also acknowledges benchmark test diversity limitations." 
180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (generated programs, individual test outcomes, training logs) is provided for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.1 describes both benchmarks in detail: APPS (10,000 problems, train/test splits, difficulty levels, unit test counts) and MBPP (974 instances, splits described)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public benchmarks (APPS and MBPP)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The online buffer workflow is documented in Figure 1b and Section 3.2, including buffer size (6400), update frequency (every 50 steps), and the queue mechanism for data flow." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section. All authors are from Tencent but no funding disclosure is provided." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All seven authors are clearly listed as affiliated with Tencent, with their Tencent email addresses prominently displayed." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "All authors are Tencent employees. Tencent has commercial interests in code generation tools, so the funder/employer is not independent of the outcome." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is provided. 
All authors work at Tencent, which has potential commercial interests in code generation technology." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No discussion of training data cutoff for the base CodeT5 or CodeGen models. The models are fine-tuned on APPS training data, but pre-training data temporal scope is not addressed." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether APPS or MBPP test problems could have appeared in the CodeT5 or CodeGen pre-training data." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "APPS (2021) and MBPP (2021) were published before both CodeT5 and CodeGen training. No contamination analysis is performed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 
278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost or latency reported for generating solutions." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Section 4.1 states: '8 NVIDIA V100 GPUs... training process took approximately 24 hours. Concurrently, three additional machines with similar 8-card V100 GPU configurations were used to generate the latest samples.'" 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds. Results appear to be from single runs with no seed sensitivity analysis." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is not stated. It is unclear whether results are from a single training run or averaged." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "While ablations over Rfine and temperature are shown, the total hyperparameter search budget (number of configurations tried) is not reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper presents results for Rfine=-0.3 and temperature=1.0 as the final configuration but does not explain whether selection was based on validation set or test set performance." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 
317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors compare RLTF against their own reproduction of CodeRL and scaled PPOCoder results without acknowledging potential bias in re-implementation." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "RLTF uses 4 machines (32 V100 GPUs total) while the compute used by baselines CodeRL and PPOCoder is not discussed. The online framework requires substantially more compute for sample generation, but no matched-budget comparison is provided." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether APPS/MBPP adequately measure code generation capability or their construct validity limitations." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding involved. This is a training method comparison, not an agentic system evaluation." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether APPS/MBPP problems or their solutions appeared in CodeT5/CodeGen pre-training data, despite both benchmarks predating the models." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "MBPP includes unit test assert statements in the input sequence, which could leak answer information. The paper acknowledges this 'occasionally encourages models to overfit' but does not treat it as a leakage concern." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether APPS training and test problems share structural similarities or come from the same coding websites." 
354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "RLTF achieves state-of-the-art performance on the APPS benchmark among CodeT5-based methods", 365 "evidence": "Table 3 shows RLTF achieves pass@1=1.45%, pass@5=3.78%, pass@1000=19.92% without Critic Sampling, outperforming CodeRL (1.30/3.32/17.78) and PPOCoder-scaled (1.32/3.37/17.84).", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "The online framework improves performance over offline training", 370 "evidence": "Table 4 ablation shows online+RLTF (pass@1=1.45%) > online-only (1.37%) > offline+RLTF (1.34%) > offline-only (1.29%).", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Fine-grained feedback contributes the most significant performance boost among the feedback types", 375 "evidence": "Table 5 shows adding fine-grained feedback improves pass@1 from 1.37% to 1.41% (largest single-feedback jump), and Section 4.3 states this explicitly.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "RLTF generalizes across base models (CodeT5 and CodeGen)", 380 "evidence": "Table 8 shows improvements on both CodeT5 770M (1.30→1.45 pass@1) and CodeGen 2.7B (1.64→2.04 pass@1).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "RLTF achieves state-of-the-art zero-shot performance on MBPP", 385 "evidence": "Table 9 shows CodeT5+RLTF achieves pass@1=30.4%, pass@80=71.3% vs CodeRL (25.7/68.1) and PPOCoder (26.1/68.2).", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No variance or statistical tests", 392 "detail": "All results are point estimates without error bars, standard deviations, or significance tests. Improvements are often small (e.g., 0.15% pass@1) and could be within noise."
393 }, 394 { 395 "flag": "Unfair baseline comparison methodology", 396 "detail": "PPOCoder results are 'proportionally scaled' from CodeRL's open-source model rather than independently reproduced. The paper acknowledges discrepancies between reported and reproduced CodeRL results, raising concerns about evaluation consistency." 397 }, 398 { 399 "flag": "Substantially higher compute budget than baselines", 400 "detail": "RLTF uses 4 machines with 32 V100 GPUs total (1 for training + 3 for sample generation) for 24 hours. The online framework inherently requires more compute than offline methods, but no matched-budget comparison is provided." 401 }, 402 { 403 "flag": "All authors from same company", 404 "detail": "All seven authors are from Tencent with no conflict of interest declaration. Tencent has commercial interests in code generation technology." 405 }, 406 { 407 "flag": "No contamination analysis", 408 "detail": "APPS and MBPP were published in 2021, before CodeT5 and CodeGen training data collection. No analysis of whether benchmark solutions leaked into pre-training data." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "CodeRL: Mastering Code Generation through Pretrained Models and Deep Reinforcement Learning", 414 "authors": ["Hung Le", "Yue Wang", "Akhilesh Deepak Gotmare", "Silvio Savarese", "Steven Chu Hong Hoi"], 415 "year": 2022, 416 "relevance": "Primary baseline; introduces RL with unit test feedback and Critic Sampling for code generation." 417 }, 418 { 419 "title": "Execution-based Code Generation using Deep Reinforcement Learning", 420 "authors": ["Parshin Shojaee", "Aneesh Jain", "Sindhu Tipirneni", "Chandan K Reddy"], 421 "year": 2023, 422 "arxiv_id": "2301.13816", 423 "relevance": "Primary baseline; applies PPO to CodeRL framework for code generation improvement." 
424 }, 425 { 426 "title": "Evaluating Large Language Models Trained on Code", 427 "authors": ["Mark Chen"], 428 "year": 2021, 429 "arxiv_id": "2107.03374", 430 "relevance": "Introduces Codex and HumanEval benchmark for evaluating LLM code generation." 431 }, 432 { 433 "title": "Measuring Coding Challenge Competence with APPS", 434 "authors": ["Dan Hendrycks"], 435 "year": 2021, 436 "arxiv_id": "2105.09938", 437 "relevance": "Introduces the APPS benchmark used as primary evaluation in this paper." 438 }, 439 { 440 "title": "Program Synthesis with Large Language Models", 441 "authors": ["Jacob Austin"], 442 "year": 2021, 443 "arxiv_id": "2108.07732", 444 "relevance": "Introduces the MBPP benchmark used as secondary evaluation in this paper." 445 }, 446 { 447 "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation", 448 "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven CH Hoi"], 449 "year": 2021, 450 "arxiv_id": "2109.00859", 451 "relevance": "Base model used in RLTF experiments; encoder-decoder architecture for code tasks." 452 }, 453 { 454 "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis", 455 "authors": ["Erik Nijkamp"], 456 "year": 2022, 457 "arxiv_id": "2203.13474", 458 "relevance": "Second base model used in RLTF experiments to demonstrate generalization." 459 }, 460 { 461 "title": "Competition-Level Code Generation with AlphaCode", 462 "authors": ["Yujia Li"], 463 "year": 2022, 464 "relevance": "Competitive code generation system used as baseline comparison in APPS evaluation." 465 }, 466 { 467 "title": "CodeBERT: A Pre-trained Model for Programming and Natural Languages", 468 "authors": ["Zhangyin Feng"], 469 "year": 2020, 470 "arxiv_id": "2002.08155", 471 "relevance": "Early pre-trained model for code; encoder-only architecture for code understanding." 
472 }, 473 { 474 "title": "Self-critiquing Models for Assisting Human Evaluators", 475 "authors": ["William Saunders"], 476 "year": 2022, 477 "arxiv_id": "2206.05802", 478 "relevance": "Explores LLM self-critique capabilities relevant to AI-assisted code review." 479 } 480 ] 481 }