scan-v5.json (26149B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Enhancing Code Generation for Low-Resource Languages: No Silver Bullet", 6 "authors": [ 7 "Alessandro Giagnorio", 8 "Alberto Martin-Lopez", 9 "Gabriele Bavota" 10 ], 11 "year": 2025, 12 "venue": "IEEE International Conference on Program Comprehension", 13 "arxiv_id": "2501.19085", 14 "doi": "10.1109/ICPC66645.2025.00058" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "All abstract claims (fine-tuning best for small models, ICL scales with size, large models degrade with fine-tuning) are directly supported by Table III with statistical comparisons.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Claims like 'fine-tuning improves small models' and 'ICL boosts performance' are supported by controlled experiments comparing techniques against a baseline using the same benchmark, models, and languages; McNemar's tests with ORs quantify the differences.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper explicitly limits findings to R, Racket, 6 models, and 4 sizes in the threats section, stating 'Our findings may not generalize to other settings'; the title 'No Silver Bullet' itself signals bounded scope.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": true, 39 "justification": "The authors discuss alternative reasons for the performance gap (language similarity, domain of use, programming paradigm) and speculate on why fine-tuning hurts large models (insufficient data to update weights) and why small models struggle with ICL (limited ability to interpret complex prompts).", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": 
"The paper uses pass@1 on unit tests as a direct measure of functional code correctness, which matches what is claimed; no conflation with broader productivity or quality proxies.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "Section V 'Threats to Validity' is dedicated to limitations, covering construct, internal, and external validity separately.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "Specific threats include: no hyperparameter tuning (resource constraint), prompt sensitivity for ICL techniques, training limited to 3 epochs potentially capping fine-tuning gains, and restriction to 4 low-resource languages and 6 models.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "The external validity section explicitly bounds results to 'four low-resource languages (Julia, Lua, R and Racket), one closed-source tool (GitHub Copilot), two open source models (DeepSeek Coder and Code Llama) and four model sizes.'", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": true, 73 "justification": "Acknowledgments disclose Swiss National Science Foundation funding for the PARSED project (SNF Project No. 
219294) and CHOOSE sponsorship for conference travel.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All three authors list affiliation with Software Institute – USI Università della Svizzera italiana, Switzerland; none are affiliated with the evaluated tools (DeepSeek, Meta, GitHub).", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": true, 84 "answer": true, 85 "justification": "Swiss National Science Foundation is a government research funder with no stake in the evaluated commercial or open-source tools.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests or financial interests statement is present in the paper; absence of declaration = NO.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Low-resource languages are explicitly defined as 'niche programming languages characterized by the scarcity of training data'; pass@k is defined with its computation procedure; in-context learning and fine-tuning are described.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper clearly states it contributes a comparative empirical study of five techniques across six LLMs for code generation on low-resource languages, filling a gap in previous work that studied techniques in isolation.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section II engages substantively with Cassano et al., Athiwaratkun et al., Van Dam et al., and Orlanski et al., explaining why their approaches are reused, extended, or not applicable, not merely listing them.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 
"empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": true, 122 "justification": "A replication package is released at https://doi.org/10.5281/zenodo.13128630 (reference [27]); Zenodo is a persistent archival repository, not a promise of future release.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "The study uses MultiPL-E benchmark and MultiPL-T datasets, both publicly available from Cassano et al.; no proprietary datasets were created.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper specifies GPU hardware (A30/A40/A100) and bfloat16 precision but does not provide Python version, PyTorch/CUDA/Transformers library versions, or a Dockerfile/requirements.txt.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "The training procedure is described in narrative form (Section IV.B), but the paper does not provide step-by-step reproduction instructions; the replication package may contain more but its contents are not enumerated.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "Tables I and III report only mean pass@1 scores across 50 repetitions; no confidence intervals or standard deviations are reported around those averages.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": true, 154 "justification": "McNemar's test is used for all pairwise comparisons with Benjamini-Hochberg p-value correction for multiple comparisons; all comparisons are run with 157×50 = 7,850 observations.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Odds Ratios (OR) are reported 
throughout, e.g., 'the odds of generating a correct program in Java are about 5 times higher than in Julia' (OR=5.93), with full OR tables (Tables II and III).", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": true, 166 "justification": "The paper justifies n=50 repetitions by citing Cassano et al.'s finding that pass@1 'appears to stabilize at n=20', and uses k=1 with temperature 0.2 consistent with prior work.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "Only average pass@1 rates are reported in Tables I and III; variance or standard deviation across the 50 repetitions is not shown.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Table III explicitly includes baseline rows for each model (model used out-of-the-box on the low-resource language) against which all five techniques are compared.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "DeepSeek Coder (2024), Code Llama (2023), and GitHub Copilot are all state-of-the-art at the time of publication; no suspiciously old or weak baselines.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Pre-training + fine-tuning vs. 
fine-tuning only serves as an ablation of the pre-training component; the paper explicitly compares these and finds pre-training adds no consistent benefit.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": false, 198 "justification": "Only pass@1 is used as the evaluation metric; no alternative metrics such as pass@k (k>1), CodeBLEU, or manual code quality rating are reported.", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": false, 203 "answer": false, 204 "justification": "Evaluation is fully automated via unit tests (pass@1); human evaluation of generated code quality is not applicable to this benchmark-based study.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": true, 210 "justification": "Fine-tuning uses MultiPL-T datasets; evaluation is on MultiPL-E (HumanEval translations), a separate benchmark not used during training.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Results are broken down by model family, model size (1B/7B/13B/33B), technique (5 + baseline), and language (R vs. Racket) across Table III.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Figure 1 and Listing 3 show concrete failure cases with analysis: R generation fails on edge cases (null vs. empty list, vector vs. 
list return type), and failure reasons are categorized.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "DeepSeek Coder 33B performance degrades after fine-tuning (ORs of 1.64 for R and 1.42 for Racket against baseline); the translation-rules technique performs worse than the baseline for 4/6 models in Racket.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": false, 236 "justification": "Models are identified by marketing names (DeepSeek Coder 1B/7B/33B instruct, Code Llama 7B/13B instruct, GitHub Copilot) without specific snapshot dates or checkpoint identifiers.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "Listing 1 and Listing 2 provide actual prompt templates for translation examples and translation rules; Listing 3 shows a fully instantiated prompt with real content.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": true, 248 "justification": "Learning rates (2×10⁻⁵ for DeepSeek, 5×10⁻⁵ for Code Llama), optimizers (AdamW), schedulers, max sequence lengths (1024/2048/3072), temperature (0.2), batch size, precision (bfloat16), and epochs (3) are all reported.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": false, 253 "answer": false, 254 "justification": "No agentic scaffolding is used; models are invoked directly via prompts without multi-step orchestration.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "Section IV.A.5 details the process for building the code translation pre-training dataset: matching Python functions to translations, docstring alignment, exclusion of ambiguous matches; fine-tuning dataset construction (combining D, S, B) is also described.", 261 "source": "haiku" 262 } 263 },
264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": true, 268 "justification": "The Zenodo replication package (https://doi.org/10.5281/zenodo.13128630) is referenced and includes at minimum all statistical test results (162 tests); this is a persistent public archive.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "The construction of fine-tuning and pre-training datasets from MultiPL-T is described in detail, including the matching procedure, exclusion criteria, and resulting dataset sizes (22,796 R pairs, 25,390 Racket pairs).", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants; study uses standard public benchmarks with no recruitment.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "The full pipeline from dataset construction (MultiPL-T → pre-training/fine-tuning sets) through training (Section IV.B) to evaluation (MultiPL-E, pass@1 with 50 reps) is described sequentially.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No training data cutoff is stated for any of the six models; for Code Llama the paper notes that 'the complete list of programming languages used for its training is not publicly available.'", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "The paper does not discuss whether HumanEval problems (the basis for MultiPL-E) may have appeared in model training data, despite these benchmarks predating all tested models.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "HumanEval was 
published in 2021 and all tested models were trained after this date; potential benchmark contamination is not addressed anywhere in the paper.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants in this study.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants in this study.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants in this study.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants in this study.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human participants in this study.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human participants in this study.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human participants in this study.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "Inference cost is mentioned qualitatively ('extremely expensive' for 70B models) as a reason for exclusion, but no actual latency or cost figures are reported for the evaluated models.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "The HPC cluster is described (NVIDIA A30/A40/A100 GPUs) and training cost for 33B is called 'extremely high', but no GPU-hours, wall-clock time, or dollar cost is reported.", 365 "source": "haiku" 366 } 
367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "There is a significant performance gap between high-resource (Python, Java) and low-resource (R, Racket) languages across all six tested LLMs.", 373 "evidence": "Table I shows pass@1 for R ranging 7.0–32.9% and Racket 7.0–33.1% vs. Java 30.6–58.1% and Python 33.7–74.9%; Table II reports statistically significant ORs for all high- vs. low-resource comparisons.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Julia and Lua behave more like high-resource languages than like R and Racket despite all four being classified as low-resource in prior work.", 378 "evidence": "Pass@1 for Julia/Lua (19.2–61.4%) is far closer to Java/Python than to R/Racket (7.0–33.1%); ORs for Java vs. Julia/Lua (0.72–19.34) are substantially smaller than for Java vs. R/Racket (4.05–251.59).", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Fine-tuning is the best technique for the smallest model (1B), substantially outperforming in-context learning approaches.", 383 "evidence": "For DeepSeek Coder 1B, fine-tuning achieves 16.7%/18.1% and pre-training+FT 16.0%/18.4% on R/Racket vs. baseline 13.9%/7.0%; ORs vs. ICL techniques range 1.45–14.01.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Fine-tuning degrades performance of large models (33B) on low-resource languages, likely due to insufficient training data.", 388 "evidence": "DeepSeek Coder 33B drops from baseline 30.2%/32.5% to 25.3%/28.0% (FT) and 25.8%/26.8% (pre+FT) on R/Racket; ORs of 1.64/1.42 vs. 
baseline are statistically significant.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "In-context learning with translation examples is a safe bet that consistently improves performance across all model sizes (excluding 1B) and languages.", 393 "evidence": "Translation examples improves over baseline for all 5 non-1B model configurations in both R and Racket (Table III), with ORs 1.28–2.27; few-shot worsens Code Llama 13B on Racket.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "No single technique is universally best across all combinations of model size and language (no silver bullet).", 398 "evidence": "Table III shows different techniques winning for different model-size-language combinations: FT wins for 1B, mixed for 7B/13B, ICL wins for 33B; no technique achieves best performance in all 10 cells.", 399 "supported": "strong" 400 } 401 ], 402 "methodology_tags": [ 403 "benchmark-eval", 404 "observational" 405 ], 406 "key_findings": "No single technique dominates for improving LLM code generation on low-resource languages (R, Racket). Fine-tuning works best for the smallest models (~1B) but actually degrades performance for the largest (33B), likely because the scarce training data cannot effectively update their parameters. In-context learning with translation examples is the most reliable technique — it always improves over baseline for models ≥7B and is cheap to apply. Julia and Lua, though classified as low-resource in prior work, now perform close to high-resource languages with modern LLMs, suggesting the number of GitHub repositories is a poor proxy for LLM performance on a language.", 407 "red_flags": [ 408 { 409 "flag": "No variance reported", 410 "detail": "Tables report only average pass@1 across 50 repetitions; no standard deviation or confidence intervals are shown, making it impossible to assess reliability of small differences (e.g., +0.9 percentage points for Code Llama 7B in Racket with translation examples)."
411 }, 412 { 413 "flag": "Benchmark contamination unaddressed", 414 "detail": "HumanEval (2021) and its MultiPL-E translations predate all tested models; potential contamination of test problems in training data is never discussed, which could inflate all reported pass@1 scores." 415 }, 416 { 417 "flag": "Model versions not pinned", 418 "detail": "No snapshot dates or checkpoint hashes are given for DeepSeek Coder, Code Llama, or Copilot; as instruct models receive periodic updates, exact reproduction requires guessing which version was used." 419 }, 420 { 421 "flag": "Single evaluation metric", 422 "detail": "Only pass@1 is reported; no pass@k (k>1) or other metrics are used, limiting understanding of model behavior on harder problems." 423 }, 424 { 425 "flag": "Copilot asymmetry", 426 "detail": "GitHub Copilot cannot be fine-tuned, so only 3 of 5 techniques are evaluated for it, making cross-technique comparisons structurally asymmetric for the most commercially important model." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Knowledge transfer from high-resource to low-resource programming languages for code LLMs (MultiPL-T)", 432 "relevance": "Primary source of fine-tuning datasets (37,592 R and 40,489 Racket functions) and prior work on fine-tuning for low-resource languages" 433 }, 434 { 435 "title": "MultiPL-E: A scalable and polyglot approach to benchmarking neural code generation", 436 "relevance": "The benchmark used for all evaluations; translates HumanEval to 18 languages including R and Racket" 437 }, 438 { 439 "title": "Multi-lingual evaluation of code generation models", 440 "relevance": "Prior work on few-shot learning for out-of-domain languages; few-shot technique replicated in this study" 441 }, 442 { 443 "title": "DeepSeek-Coder: When the large language model meets programming — the rise of code intelligence", 444 "relevance": "Primary open-source model family evaluated (1B, 7B, 33B)" 445 }, 446 { 447 "title": "Code Llama: Open 
foundation models for code", 448 "relevance": "Second open-source model family evaluated (7B, 13B)" 449 }, 450 { 451 "title": "Measuring the impact of programming language distribution", 452 "relevance": "Prior work treating low-resource as a data distribution issue; their approach (balanced training across 14 languages) not applicable here due to no control over pretraining" 453 }, 454 { 455 "title": "On the transferability of pre-trained language models for low-resource programming languages", 456 "relevance": "Prior work on fine-tuning for similar low-resource languages; motivates the multilingual pretraining approach" 457 }, 458 { 459 "title": "A survey on LLM-based code generation for low-resource and domain-specific programming languages", 460 "relevance": "Recent survey highlighting scarcity of benchmarks for niche languages; contextualizes this study's contribution" 461 }, 462 { 463 "title": "Evaluating large language models trained on code (HumanEval)", 464 "relevance": "Original benchmark whose problems form the basis of MultiPL-E used in all evaluations" 465 } 466 ], 467 "engagement_factors": { 468 "practical_relevance": { 469 "score": 2, 470 "justification": "Gives actionable guidance to practitioners using R/Racket with LLMs: use ICL with translation examples for large models, fine-tune only for small ones." 471 }, 472 "surprise_contrarian": { 473 "score": 2, 474 "justification": "Challenges the assumption that fine-tuning always helps — large models actually get worse, and Julia/Lua turn out not to be meaningfully low-resource anymore." 475 }, 476 "fear_safety": { 477 "score": 0, 478 "justification": "No safety or risk concerns raised; purely a performance benchmarking study." 479 }, 480 "drama_conflict": { 481 "score": 1, 482 "justification": "Mild tension with prior work that reported uniformly positive fine-tuning results; the degradation finding at 33B is notable but not framed controversially." 
483 }, 484 "demo_ability": { 485 "score": 2, 486 "justification": "All techniques (especially ICL with translation examples) can be tried immediately with any API-accessible LLM and the public MultiPL-E benchmark." 487 }, 488 "brand_recognition": { 489 "score": 1, 490 "justification": "USI is a respected Swiss research university but not a famous AI lab; evaluates GitHub Copilot which adds some brand recognition." 491 } 492 }, 493 "hn_data": { 494 "threads": [], 495 "top_points": 0, 496 "total_points": 0, 497 "total_comments": 0 498 } 499 }