scan.json (23300B)
1 { 2 "paper": { 3 "title": "Applying RLAIF for Code Generation with API-usage in Lightweight LLMs", 4 "authors": ["Sujan Dutta", "Sayantan Mahinder", "Raviteja Anantha", "Bortik Bandyopadhyay"], 5 "year": 2024, 6 "venue": "NLRSE", 7 "arxiv_id": "2406.20060", 8 "doi": "10.48550/arXiv.2406.20060" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No GitHub link, Zenodo archive, or any code repository URL is provided in the paper. The authors mention using the transformers and TRL libraries but do not release their own training code or pipeline." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper uses the publicly available Gorilla dataset published by Patil et al. (2023). The dataset is a standard public benchmark that they did not modify." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper mentions using 'transformers' and 'TRL' Python libraries (Appendix B.2) and NVIDIA A100 40GB GPUs (Appendix B.3), but does not provide a requirements.txt, Dockerfile, or specific library versions." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided. The methodology section describes the approach conceptually but lacks concrete reproduction details." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "Table 1 reports only point estimates (e.g., 27.9% executability rate) with no confidence intervals, error bars, or ± notation despite reporting results as the mean of three inference runs (Appendix B.2)." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper claims MRL 'boosts performance' and 'outperforms' MGorilla based solely on comparing raw numbers (e.g., 27.9% vs 26.9%) without any statistical significance test." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports absolute improvements with baseline context: 'CodeBLEU (1.6 points abs), AST (0.66% abs) and Executability Rate (4.5% abs)' (Section 4), and the abstract states '4.5% improvement in executability rate' and '1.0% higher code executability rate' over the 7B baseline." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "No justification is given for the dataset split (90/10 train/test from ~9k examples) or for using only three inference runs. No power analysis is discussed." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "Appendix B.2 states 'results are reported by taking the mean of three inference runs' but no standard deviation, variance, or spread measure is reported in Table 1 or elsewhere." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper compares against two baselines: their own supervised fine-tuned model (MSFT, 780M) and the Gorilla fine-tuned LLaMA-7B (MGorilla) from Patil et al. (2023), shown in Table 1." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "MGorilla (Patil et al., 2023) is a contemporary and directly relevant baseline for API-call code generation on the Gorilla dataset. The paper was submitted in 2024 and compares against a 2023 baseline." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": false, 74 "justification": "No ablation study is performed. 
The framework has multiple components (8 evaluation prompts, reward model, PPO training) but no experiments isolate the contribution of individual components (e.g., varying the number of evaluation questions, comparing different reward models)." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper reports four metrics: Executability Rate, ROUGE (average of ROUGE-1/2/L/sum), CodeBLEU, and AST sub-tree matching (Table 1)." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": false, 84 "justification": "No human evaluation of generated code quality is included. All evaluation is automated via metrics (ROUGE, CodeBLEU, AST, executability). Given that the paper claims quality improvements in code generation, human evaluation of output quality would be relevant." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "Appendix B.1 states 'We trained our model on 90% of the data and kept the rest for evaluation,' indicating a separate evaluation split." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": false, 94 "justification": "The Gorilla dataset spans 37 different API domains, but the paper only reports aggregate metrics in Table 1. No per-domain or per-category breakdown is provided. The paper even acknowledges in the Limitations section that they 'have not analyzed the performance between more frequent APIs (head) and infrequent APIs (tail).'" 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": false, 99 "justification": "Figure 2 shows one positive example where MRL fixes an undefined variable error, but no systematic failure analysis or examples of where MRL fails are presented." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": false, 104 "justification": "Every metric in Table 1 shows improvement for MRL over MSFT. No negative results, failed approaches, or configurations that did not work are reported." 
105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims a 4.5% improvement in executability rate (supported by Table 1: 27.9% vs 23.4%) and that the 780M model surpasses the 7B model by 1.0% (supported by Table 1: 27.9% vs 26.9%). These claims match the reported results." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": false, 116 "justification": "The paper claims RLAIF 'boosts the performance' and 'significantly enhances' the baseline, implying RLAIF causes the improvement. However, there is no ablation or controlled experiment isolating the effect of RLAIF from other factors (e.g., additional training steps, the reward model architecture)." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title and abstract broadly claim improvement in 'code generation' and 'API-usage' but the evaluation is limited to only the HuggingFace portion of the Gorilla dataset (Python only, ML API calls only). The Limitations section acknowledges Python-only but the title and abstract do not bound the claims." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper does not discuss alternative explanations for the performance gains. For example, the additional training steps in RL could contribute independently of the AI feedback quality, or the improvement could come from the reward model regularization rather than the feedback signal." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper refers to 'GPT-3.5' without specifying a version (e.g., gpt-3.5-turbo-0613) and 'GPT-2-large' without a specific checkpoint identifier. No API version or snapshot date is provided." 
134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "Table 2 in Appendix A lists all 8 prompt templates used for AI feedback, each with [instruction] and [code] placeholders. The placeholders are filled from the dataset (which is public), so the templates fully specify the prompts used." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Appendix B.2 reports learning rates for all three training stages: MSFT (5×10^-4), Mreward (5×10^-5), and PPO (6×10^-6). The paper also states that no hyperparameter search was performed." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "No agentic scaffolding is used. The approach is a standard training pipeline (SFT → reward model → PPO), not an agentic system." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper states it uses the HuggingFace part of the Gorilla dataset with a 90/10 split but does not describe any preprocessing steps, filtering, or how the split was created (random vs. stratified by domain)." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 6 is a dedicated 'Limitations' section discussing inherited biases from GPT-2-large, lack of programming language diversity (Python only), missing head/tail API analysis, and the offline learning limitation." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "The Limitations section discusses specific threats: biases from GPT-2-large affecting generated code comments, Python-only evaluation limiting generalizability, lack of head vs. tail API analysis, and the offline learning problem given rapidly evolving APIs." 
166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "While the Limitations section mentions Python-only and missing head/tail analysis, it does not explicitly state what the results do NOT show. The paper does not bound claims to the specific tested setting (HuggingFace APIs only, single dataset, single base model)." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": false, 177 "justification": "The Gorilla dataset is publicly available, but the generated preference data used to train the reward model and the intermediate outputs (GPT-3.5 feedback scores, generated code samples) are not released." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 2 describes the Gorilla dataset origin (from Patil et al. 2023), its composition (925+ unique APIs across 37 domains, 10 instructions per API), and Section 3 describes how preference data is generated via GPT-3.5 scoring." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants are involved. The data comes from a standard benchmark dataset (Gorilla)." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The three-step pipeline is documented: (1) SFT on Gorilla data, (2) generate two outputs per instruction, score via GPT-3.5 with 8 binary questions, create preference pairs, train reward model, (3) PPO fine-tuning with reward model. The flow is shown in Figure 1." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding information or acknowledgments section listing grants or sponsors is provided. The paper notes the first author's work was done during an Apple internship, but no formal funding disclosure exists." 
200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly listed: Rochester Institute of Technology (first author) and Apple (three co-authors). The footnote states 'Work done as a part of an internship at Apple.'" 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "Three of four authors are Apple employees, and Apple has commercial interests in AI/ML model capabilities. The research was conducted during an Apple internship. The funder (Apple, implicitly) has a stake in demonstrating that lightweight models can perform well, which aligns with Apple's on-device AI strategy." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests statement, patent disclosures, or financial interest declarations are provided in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper uses GPT-2-large and GPT-3.5 without stating training data cutoff dates for either model. GPT-2 may have seen code from HuggingFace repositories, and GPT-3.5 may have seen the Gorilla dataset or related content." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "No discussion of whether GPT-2-large's pre-training data or GPT-3.5's training data overlaps with the Gorilla dataset or HuggingFace API documentation used to construct the benchmark." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "The Gorilla dataset was published in 2023 (arXiv:2305.15334). GPT-2 was trained before this, but GPT-3.5 may have been trained on data overlapping with the API documentation used to construct Gorilla. This contamination risk is not discussed." 
232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants are involved in this study." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "No inference cost, latency, or API cost for the GPT-3.5 feedback queries is reported. The paper does not quantify the cost of running the AI feedback pipeline per example." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": true, 280 "justification": "Appendix B.3 states 'We used a cluster of NVIDIA A100 40GB GPUs for our experiments. 
We spent in total ~60 GPU hours for all of the experiments.'" 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "RLAIF achieves a 4.5% absolute improvement in executability rate over the supervised fine-tuned baseline (MSFT).", 287 "evidence": "Table 1 shows MRL at 27.9% executability rate vs MSFT at 23.4% (Section 4).", 288 "supported": "moderate" 289 }, 290 { 291 "claim": "A 780M parameter model (MRL) trained with RLAIF surpasses a 7B parameter fine-tuned model (MGorilla) by 1.0% in code executability rate.", 292 "evidence": "Table 1 shows MRL at 27.9% vs MGorilla at 26.9% (Section 4). However, this 1.0% difference has no statistical test and is within likely noise range for three inference runs without reported variance.", 293 "supported": "weak" 294 }, 295 { 296 "claim": "MRL outperforms MGorilla across ROUGE, CodeBLEU, AST, and executability rate metrics.", 297 "evidence": "Table 1 shows MRL ahead on all four metrics: ROUGE 47.5 vs 41.2, CodeBLEU 42.2 vs 36.8, AST 73.62% vs 71.68%, Executability 27.9% vs 26.9%.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "RLAIF using binary questions to GPT-3.5 is an effective alternative to human feedback or code execution-based feedback for API-usage code generation.", 302 "evidence": "The framework is described in Section 3 and results in Table 1 show improvement, but no comparison with RLHF or execution-based feedback is provided. The claim rests on a single dataset comparison without ablations.", 303 "supported": "weak" 304 } 305 ], 306 "methodology_tags": ["benchmark-eval"], 307 "key_findings": "The paper proposes an RLAIF framework that uses GPT-3.5 to provide binary feedback on 8 quality dimensions of generated code, which is then used to train a reward model for PPO fine-tuning of GPT-2-large (780M parameters). 
On the HuggingFace subset of the Gorilla dataset, the RLAIF-trained model improves executability rate by 4.5 percentage points (23.4% to 27.9%) over the SFT baseline and marginally outperforms a fine-tuned LLaMA-7B model (27.9% vs 26.9% executability). The results suggest, but do not establish, that AI feedback can substitute for expensive code execution feedback in settings where running generated code is prohibitively costly; no comparison against execution-based or human feedback is reported.", 308 "red_flags": [ 309 { 310 "flag": "No statistical tests on small differences", 311 "detail": "The key claim that the 780M model surpasses the 7B model rests on a 1.0% executability rate difference (27.9% vs 26.9%) with no significance test and no reported variance across the three inference runs. This difference could easily be noise." 312 }, 313 { 314 "flag": "No ablation study", 315 "detail": "The framework has multiple components (8 evaluation prompts, reward model architecture, PPO training) but no ablation isolates which components contribute to the gains. It is unclear whether all 8 prompts are necessary or whether simpler reward signals would suffice." 316 }, 317 { 318 "flag": "Missing variance despite multiple runs", 319 "detail": "Results are reported as the mean of three inference runs (Appendix B.2) but no standard deviation or spread measure is provided, making it impossible to assess result stability." 320 }, 321 { 322 "flag": "Potential industry conflict of interest", 323 "detail": "Three of four authors are Apple employees. Apple has commercial interest in demonstrating that small (<1B) models can perform well (on-device deployment). No conflict of interest statement is provided." 324 }, 325 { 326 "flag": "Very low absolute executability rates", 327 "detail": "The best model achieves only 27.9% executability rate, meaning over 72% of generated code does not execute. The paper does not discuss whether these low absolute numbers undermine the practical value of the approach." 
328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Gorilla: Large Language Model Connected with Massive APIs", 333 "authors": ["Shishir G. Patil", "Tianjun Zhang", "Xin Wang", "Joseph E. Gonzalez"], 334 "year": 2023, 335 "arxiv_id": "2305.15334", 336 "relevance": "Introduced the Gorilla dataset and approach for teaching LLMs to generate API calls, which is the direct baseline and evaluation setting for this paper." 337 }, 338 { 339 "title": "RLAIF: Scaling Reinforcement Learning from Human Feedback with AI Feedback", 340 "authors": ["Harrison Lee", "Samrat Phatale", "Hassan Mansoor"], 341 "year": 2023, 342 "arxiv_id": "2309.00267", 343 "relevance": "Demonstrated that RLAIF can achieve human-level performance in summarization, foundational work for applying AI feedback to replace human feedback." 344 }, 345 { 346 "title": "Constitutional AI: Harmlessness from AI Feedback", 347 "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"], 348 "year": 2022, 349 "arxiv_id": "2212.08073", 350 "relevance": "Introduced the RLAIF concept combining AI and human preferences for alignment, foundational for AI feedback approaches." 351 }, 352 { 353 "title": "CodeRL: Mastering Code Generation through Pretrained Models and Deep Reinforcement Learning", 354 "authors": ["Hung Le", "Yue Wang", "Akhilesh Deepak Gotmare"], 355 "year": 2022, 356 "relevance": "Applied actor-critic RL for code generation using execution feedback, a key prior approach that this paper argues is inapplicable for API-heavy code." 357 }, 358 { 359 "title": "Training Language Models to Follow Instructions with Human Feedback", 360 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 361 "year": 2022, 362 "relevance": "Foundational RLHF paper (InstructGPT) whose pipeline this work adapts by replacing human feedback with AI feedback." 
363 }, 364 { 365 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 366 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"], 367 "year": 2024, 368 "relevance": "Teaches LLMs to use external tools, relevant to the broader context of LLM tool/API usage research." 369 }, 370 { 371 "title": "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-World APIs", 372 "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"], 373 "year": 2023, 374 "arxiv_id": "2307.16789", 375 "relevance": "Trains LLMs to use a large collection (16,000+) of real-world APIs, directly related to code generation with API usage." 376 }, 377 { 378 "title": "Proximal Policy Optimization Algorithms", 379 "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal"], 380 "year": 2017, 381 "arxiv_id": "1707.06347", 382 "relevance": "PPO algorithm used as the RL training method in this paper's RLAIF framework." 383 }, 384 { 385 "title": "WizardMath: Empowering Mathematical Reasoning for Large Language Models via Reinforced Evol-Instruct", 386 "authors": ["Haipeng Luo", "Qingfeng Sun", "Can Xu"], 387 "year": 2023, 388 "arxiv_id": "2308.09583", 389 "relevance": "Applied AI feedback to enhance mathematical reasoning in LLMs, a parallel application of RLAIF in a different domain." 390 }, 391 { 392 "title": "Program Synthesis with Large Language Models", 393 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 394 "year": 2021, 395 "arxiv_id": "2108.07732", 396 "relevance": "Introduced the MBPP benchmark for code generation, relevant to evaluation of LLM code generation capabilities." 397 } 398 ] 399 }