scan.json (30698B)
1 { 2 "paper": { 3 "title": "Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation with Large Language Models", 4 "authors": [ 5 "Martin Weyssow", 6 "Xin Zhou", 7 "Kisub Kim", 8 "David Lo", 9 "Houari Sahraoui" 10 ], 11 "year": 2024, 12 "venue": "ACM Transactions on Software Engineering and Methodology", 13 "arxiv_id": "2308.10462", 14 "doi": "10.1145/3714461" 15 }, 16 "scan_version": 2, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "PEFT techniques, especially LoRA, consistently outperform both ICL and RAG for Python code generation across multiple LLM families (CodeLlama, CodeGen2) and three benchmark datasets (Conala, CodeAlpacaPy, APPS). LLMs fine-tuned with LoRA using only ~1% of parameters significantly surpass fully fine-tuned SLMs by 39.8-72.3% in EM@k. QLoRA with 4-bit quantization achieves a 2x reduction in GPU memory while maintaining or improving effectiveness, enabling fine-tuning of models up to 34B parameters on a single 24GB GPU.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The paper states 'We make our code publicly available: https://github.com/martin-wey/peft-llm-code' in Section 4.6." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "All three datasets are public: Conala (curated version from Zhou et al. 2023), CodeAlpaca (Chaudhary 2023), and APPS (Hendrycks et al. 2021). The authors constructed CodeAlpacaPy by filtering Python samples from CodeAlpaca and release their code which includes this filtering." 
31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper mentions using HuggingFace and PEFT libraries and an NVIDIA RTX A5000 24GB GPU, but does not provide library versions, a requirements.txt, Dockerfile, or detailed environment specification sufficient to recreate the software environment." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "While the paper provides detailed methodology in Section 4.6 and releases code on GitHub, the paper itself does not include step-by-step reproduction instructions or commands to replicate results." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Tables 3, 4 and Figures 3-7 are reported as point estimates only. No confidence intervals, error bars, or ± notation appear anywhere in the paper." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper uses language like 'significantly outperform' and 'consistently outperform' throughout but performs no statistical significance tests. All comparative claims are based solely on numerical differences between point estimates." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "The paper consistently reports relative improvements with baseline context, e.g., 'surpasses the best small model by 39.8%, 41.7%, and 47.1%' (Section 5.2), '23.1% improvement in EM@10 on Conala (36.28 for LoRA vs. 29.47 for ICL)' (Section 5.3), and 'QLoRA-4bit results in a notable 52% increase' (Section 5.4)." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "No justification is provided for why the dataset sizes (Conala: 543 test, CodeAlpacaPy: 628 test, APPS: 750 test) are sufficient for the claims being made. No power analysis is discussed." 
63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No variance, standard deviation, or spread measures are reported for any experiment. Results appear to be from single runs with no indication of variability across seeds or runs." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Comprehensive baselines are included: zero-shot, ICL with varying numbers of examples, RAG with GTE-small retrieval (Section 4.3), full fine-tuning for SLMs, and multiple PEFT techniques compared against each other." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Baselines include CodeLlama (2023), CodeGen2 (2023), and CodeT5+ (2023), which were recent models at the time of writing. RAG uses GTE-small, a competitive embedding model." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "The study systematically compares LoRA, IA3, Prompt tuning, Prefix tuning, QLoRA-8bit, and QLoRA-4bit across the same models and datasets, effectively serving as an ablation across techniques. The quantization precision (8-bit vs 4-bit) and the number of ICL examples are also systematically varied." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Conala and CodeAlpacaPy use EM@1, EM@10, and CodeBLEU. APPS uses average test cases passed and Pass@k (k=1,2,5). Multiple metrics reported throughout." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "All evaluation is entirely automated (exact match, CodeBLEU, test case pass rates). No human evaluation of generated code quality is performed." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "All three datasets have separate train/validation/test splits. 
Section 4.6 states 'We selected the checkpoint with the lowest evaluation loss for inference,' confirming model selection on validation, reporting on test." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down by model, by PEFT technique, by dataset, and for APPS by difficulty level (introductory, interview, competition) in Table 4. Table 3 provides per-model per-technique breakdowns." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper discusses where approaches fail: Prefix tuning 'fails to effectively adapt the larger models' (Section 5.2, some configurations show 0.0 EM@10 in Table 3), RAG underperforms ICL on CodeAlpacaPy (Section 5.3), and improvements are 'less substantial for interview and competition-level code generation' (Section 5.4)." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Several negative results are reported: Prefix tuning fails for larger models (0.0 EM scores in Table 3), increasing ICL examples beyond a threshold degrades performance (Section 5.1), RAG yields lower EM@10 than ICL on CodeAlpacaPy (Section 5.3), and QLoRA-8bit sometimes underperforms LoRA (Table 4)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims PEFT superiority over ICL and RAG, memory reduction with QLoRA, and broader applicability. All are supported by Tables 3-4 and Figures 1, 5-7. The abstract's claims about 'diverse set of LLMs' and 'three representative Python code generation datasets' accurately reflect the study scope." 
122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper's causal claims (e.g., 'LoRA significantly enhances performance', PEFT 'outperforms' alternatives) are based on controlled comparisons where only the tuning technique varies while model, data, and hardware are held constant. This controlled experimental design is adequate for the claims made." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title says 'Code Generation' broadly while testing only Python. The conclusion claims PEFT 'opens opportunities for broader applications of PEFT in software engineering scenarios' without evidence for non-Python languages or non-generation tasks. The threats-to-validity section acknowledges the monolingual limitation but then speculates that 'PEFT is also applicable to other programming languages.'" 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": true, 136 "justification": "The Discussion section hypothesizes QLoRA-4bit's improvement 'stems from the regularization effect of reducing weight precision to 4 bits' (Section 6). The threats to validity section discusses hyperparameter sensitivity, model selection bias, and dataset representativeness as factors that could influence results." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper consistently frames its measurements (EM@k, CodeBLEU, Pass@k) as what they are — match-based and execution-based code generation metrics — without inflating them to broader claims about developer productivity or code quality. No proxy gap exists between measurements and claims." 
142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Specific model names and sizes are given: CodeLlama-7B, CodeLlama-7B-Instruct, CodeLlama-7B-Python, CodeLlama-13B-Python, CodeLlama-34B-Python, CodeGen2-1B/3.7B/7B, CodeGen-350M-mono, CodeT5+-220M/770M. For open-source models with fixed checkpoints, these names uniquely identify the model weights." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Table 2 shows the full prompt template with actual examples: '### Instruction: [intent] ### Response:'. The paper also describes the ICL prompt construction where examples are concatenated with the same template format." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 4.6 provides detailed hyperparameters: learning rates (5e-5 for full FT, 3e-4 for LoRA/IA3/QLoRA, 3e-3/3e-2 for Prompt/Prefix tuning), r=16, alpha=32, 20 virtual tokens, Adafactor optimizer, 16-bit float, 5 epochs, batch size 8, beam search with beam size 10, max token lengths (64/128/1024)." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. This is standard fine-tuning and inference with direct model prompting." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 4.2 describes CodeAlpacaPy construction (filtering Python samples from CodeAlpaca, removing syntactically invalid code), Conala curation (ensured no function overlap between splits, no same-post overlap), and APPS split (4500/500/750 with 250 test samples per difficulty). Train/val/test sizes are stated for all datasets." 
169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 7 'Threats to Validity' provides a dedicated, substantive discussion organized by external, internal, and construct validity." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Specific threats are discussed: model selection limited to open-source (Section 7, external), monolingual Python-only evaluation (external), hyperparameter values taken from prior work without exhaustive tuning (internal), CodeBLEU's reliance on dataflow graphs not always available for small examples (construct), lack of unit tests in Conala/CodeAlpacaPy (construct)." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "The paper explicitly states: closed-source models excluded due to parameter inaccessibility, full fine-tuning infeasible for LLMs within 24GB GPU constraint, HumanEval/MBPP excluded for lacking training data, Python only (acknowledging monolingual limitation), and no larger models beyond QLoRA experiments." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "While the input datasets are public and code is released, raw experimental outputs (model predictions, per-example scores) are not made available for independent verification." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 4.2 describes how each dataset was collected: Conala crawled from StackOverflow with manual annotations, CodeAlpacaPy filtered from CodeAlpaca for Python with syntactic validation, APPS from coding competitions with difficulty categorization." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. 
All data comes from standard public benchmarks (Conala from StackOverflow, CodeAlpaca from LLM-generated data, APPS from coding competitions)." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The data pipeline is documented: CodeAlpacaPy was constructed by filtering Python samples from CodeAlpaca and removing syntactically invalid code. Dataset splits are specified with exact counts (e.g., Conala: 2,135/201/543). The Conala curation process ensuring no overlap between splits is described." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding information, acknowledgments section, or grant numbers appear in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: DIRO, University of Montreal (Weyssow, Sahraoui) and Singapore Management University (Zhou, Kim, Lo). These are academic institutions with no apparent conflict with the evaluated open-source models." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "Without funding disclosure, independence of the funder cannot be assessed. The authors are at academic institutions and evaluate open-source models they did not develop, suggesting likely independence, but this is not explicitly stated." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests statement or financial interest declaration appears in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for any of the models used (CodeLlama, CodeGen2, CodeGen, CodeT5+). 
The paper mentions CodeLlama is based on Llama 2 and CodeGen2 was pre-trained on TheStack, but does not state when training data was collected." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "The paper discusses train/test split integrity for fine-tuning (Conala curated to avoid function overlap), but does not discuss whether pretrained models may have seen test examples during pretraining. Conala (StackOverflow) and APPS (competition problems) are publicly available and could appear in pretraining corpora." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "All three benchmarks (Conala from 2018, CodeAlpaca from 2023, APPS from 2021) were available online before the models' training periods. The paper does not discuss whether models may have seen benchmark solutions during pretraining." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study. All experiments are automated benchmark evaluations of code generation models." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants. The study evaluates models on existing public benchmarks." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants involved." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants involved." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants involved." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants involved." 
279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants involved." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "The paper focuses extensively on training memory consumption (Figures 1 and 5) but does not report inference cost, latency, or time per example for any configuration." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "The paper states hardware used (single NVIDIA RTX A5000 24GB GPU) and peak memory consumption, but does not report total GPU hours, wall-clock training time, or total computational budget across all experiments." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No seed sensitivity analysis is reported. For ICL, the paper mentions 'selecting random few-shot examples using different seeds' (Section 8) but does not report results across seeds. Fine-tuning results appear to be single-seed." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of experimental runs is never stated. Results appear to be from single runs with no indication of how many trials produced them." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "No hyperparameter search budget is reported. Section 7 (internal validity) states 'we used hyperparameters values which have been used in previous work' but does not report whether any search was performed or how many configurations were tried." 
313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Section 4.6 clearly states: 'We selected the checkpoint with the lowest evaluation loss for inference,' which is a principled selection on the validation set, not the test set." 318 }, 319 "multiple_comparison_correction": { 320 "applies": false, 321 "answer": false, 322 "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors implement all techniques (LoRA, IA3, Prompt tuning, Prefix tuning, ICL, RAG) themselves using the PEFT library. They do not acknowledge potential bias in their own implementations of these methods or discuss whether their ICL/RAG implementations are competitive." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": true, 332 "justification": "Table 1 explicitly categorizes techniques by computation cost vs effectiveness. Figure 5 shows EM@10 alongside GPU peak memory for different quantization levels. Figure 1 compares memory consumption across fine-tuning approaches. The computation-effectiveness tradeoff is a central theme." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "The paper does not discuss whether Conala, CodeAlpacaPy, or APPS actually measure meaningful code generation capability. EM@k requires exact string match to reference solutions, a very narrow proxy for code quality, but this limitation is not discussed. The construct validity threats in Section 7 address only metric choice, not benchmark validity." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding is involved. 
All models are evaluated with the same direct inference setup (prompt in, code out), so no scaffold confound exists." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "Not discussed. Conala (2018) and APPS (2021) predate the models used (CodeLlama and CodeGen2, both 2023). Models could have seen benchmark solutions during pretraining, but temporal leakage is not mentioned." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "Not discussed. The Conala dataset includes variable hints in natural language intents (e.g., 'map two lists keys and values into a dictionary'), which provide ground-truth variable names, but this is not analyzed as potential feature leakage." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "The paper notes Conala's curated version ensures no function overlap between fine-tuning splits and no same-post overlap, but does not address whether pretrained models' training corpora overlap with benchmark data. StackOverflow data (Conala) is commonly included in code pretraining corpora." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines are used." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "ICL drastically improves effectiveness of all models compared to zero-shot, with CodeLlama-7B achieving EM@10 of 29.83 (vs 7.73 zero-shot) on Conala.", 371 "evidence": "Figure 3 shows EM@10 scores across models with 0-16 ICL examples on Conala and CodeAlpacaPy. All models show large gains from zero-shot to ICL. 
Section 5.1 reports specific numbers.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "LLMs fine-tuned with PEFT consistently and significantly outperform fully fine-tuned SLMs by 39.8-72.3% in EM@k under the same 24GB GPU constraint.", 376 "evidence": "Table 3 shows CodeLlama-7B-Python with LoRA (EM@10: 36.28 on Conala, 15.92 on CodeAlpacaPy) vs CodeGen-350M-mono full FT (EM@10: 18.42 on Conala, 5.73 on CodeAlpacaPy). Section 5.2 reports the percentage improvements.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "LoRA is the most effective PEFT technique among LoRA, IA3, Prompt tuning, and Prefix tuning across the studied models.", 381 "evidence": "Table 3 shows LoRA achieving the highest or near-highest scores for most model-dataset combinations. Section 5.2 states 'LoRA emerges as the most effective PEFT technique among the studied ones.'", 382 "supported": "strong" 383 }, 384 { 385 "claim": "QLoRA-4bit achieves a 2x reduction in peak memory usage compared to LoRA while improving or preserving effectiveness.", 386 "evidence": "Figure 5 shows CodeLlama-7B-Python LoRA at ~19GB vs QLoRA-4bit at ~9GB, with QLoRA-4bit achieving comparable or better EM@10 scores. CodeLlama-34B-Python fine-tuned with QLoRA-4bit fits within 24GB.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "LoRA significantly outperforms both ICL and RAG across CodeLlama-7B variants on Conala and CodeAlpacaPy.", 391 "evidence": "Figure 6 shows all models achieving higher EM@10 with LoRA vs ICL. Figure 7 shows CodeLlama-7B LoRA (39.31 EM@10) vs RAG peak (35.17) vs ICL (29.83) on Conala. Section 5.3 provides detailed comparisons.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "QLoRA-4bit boosts CodeLlama-7B-Instruct's average passed test cases on APPS introductory problems by 52% over the base model.", 396 "evidence": "Table 4 shows average passed tests increasing from 13.66 (base) to 20.84 (QLoRA-4bit) for introductory level, and from 13.44 to 20.34 for interview level. 
Pass@5 increases from 8.80 to 12.40 at introductory level.", 397 "supported": "moderate" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "No variance or uncertainty quantification", 403 "detail": "All results across all experiments appear to be from single runs. No standard deviations, confidence intervals, or repeated-run statistics are reported despite the paper making strong comparative claims. Results could shift substantially with different random seeds." 404 }, 405 { 406 "flag": "Statistical significance claims without tests", 407 "detail": "The paper repeatedly uses language like 'significantly outperform' and 'LoRA significantly enhances' without performing any statistical significance tests. All comparative conclusions rest on point-estimate comparisons." 408 }, 409 { 410 "flag": "No contamination analysis for pretrained models", 411 "detail": "All benchmark datasets (Conala from StackOverflow 2018, APPS from competitions 2021) were publicly available before the models' training data collection. CodeLlama and CodeGen2 likely encountered StackOverflow content during pretraining, potentially inflating zero-shot and ICL baselines." 412 }, 413 { 414 "flag": "No total compute budget reported", 415 "detail": "Despite running experiments across 11 models, 6 PEFT techniques, and 3 datasets, the paper does not report total GPU hours, training time, or wall-clock time for any experiment. Peak memory is reported but total computational cost is unknown." 416 } 417 ], 418 "cited_papers": [ 419 { 420 "title": "Evaluating large language models trained on code", 421 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 422 "year": 2021, 423 "arxiv_id": "2107.03374", 424 "relevance": "Introduced Codex and HumanEval benchmark, foundational work on LLM code generation evaluation." 
425 }, 426 { 427 "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis", 428 "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"], 429 "year": 2023, 430 "arxiv_id": "2203.13474", 431 "relevance": "Open-source code LLM family used as baseline; demonstrates scaling effects for code generation." 432 }, 433 { 434 "title": "Code llama: Open foundation models for code", 435 "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"], 436 "year": 2023, 437 "arxiv_id": "2308.12950", 438 "relevance": "Primary LLM family evaluated; open foundation model for code demonstrating instruction tuning and specialization." 439 }, 440 { 441 "title": "LoRA: Low-rank adaptation of large language models", 442 "authors": ["Edward J Hu", "Yelong Shen", "Phillip Wallis"], 443 "year": 2021, 444 "arxiv_id": "2106.09685", 445 "relevance": "Introduced LoRA, the best-performing PEFT technique in this study; foundational for efficient fine-tuning of LLMs." 446 }, 447 { 448 "title": "QLoRA: Efficient finetuning of quantized LLMs", 449 "authors": ["Tim Dettmers", "Artidoro Pagnoni", "Ari Holtzman"], 450 "year": 2023, 451 "arxiv_id": "2305.14314", 452 "relevance": "Introduced QLoRA combining LoRA with quantization; enables fine-tuning larger models on consumer GPUs." 453 }, 454 { 455 "title": "CodeT5+: Open code large language models for code understanding and generation", 456 "authors": ["Yue Wang", "Hung Le", "Akhilesh Deepak Gotmare"], 457 "year": 2023, 458 "arxiv_id": "2305.07922", 459 "relevance": "Encoder-decoder code LLM used as SLM baseline; demonstrates pre-training objectives for code intelligence." 460 }, 461 { 462 "title": "DocPrompting: Generating code by retrieving the docs", 463 "authors": ["Shuyan Zhou", "Uri Alon", "Frank F Xu"], 464 "year": 2023, 465 "relevance": "RAG approach for code generation; provides curated Conala dataset version used in this study." 
466 }, 467 { 468 "title": "Measuring Coding Challenge Competence With APPS", 469 "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"], 470 "year": 2021, 471 "relevance": "Introduced APPS benchmark with execution-based evaluation of code generation at varying difficulty levels." 472 }, 473 { 474 "title": "A Systematic Evaluation of Large Language Models of Code", 475 "authors": ["Frank F. Xu", "Uri Alon", "Graham Neubig"], 476 "year": 2022, 477 "doi": "10.1145/3520312.3534862", 478 "relevance": "Systematic evaluation of code LLMs providing baseline comparisons and evaluation methodology." 479 }, 480 { 481 "title": "Program synthesis with large language models", 482 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 483 "year": 2021, 484 "arxiv_id": "2108.07732", 485 "relevance": "Early work on LLM program synthesis including MBPP benchmark; demonstrates zero-shot code generation capability." 486 }, 487 { 488 "title": "Few-shot parameter-efficient fine-tuning is better and cheaper than in-context learning", 489 "authors": ["Haokun Liu", "Derek Tam", "Mohammed Muqeeth"], 490 "year": 2022, 491 "relevance": "Introduced IA3 PEFT technique; directly relevant comparison showing PEFT advantages over ICL in NLP." 492 }, 493 { 494 "title": "Execution-Based Evaluation for Open-Domain Code Generation", 495 "authors": ["Zhiruo Wang", "Shuyan Zhou", "Daniel Fried"], 496 "year": 2022, 497 "arxiv_id": "2212.10481", 498 "relevance": "Execution-based evaluation methodology for code generation; provides context on evaluation challenges." 499 } 500 ] 501 }