scan.json (24306B)
1 { 2 "paper": { 3 "title": "Enhancing Code Generation for Low-Resource Languages: No Silver Bullet", 4 "authors": ["Alessandro Giagnorio", "Alberto Martin-Lopez", "Gabriele Bavota"], 5 "year": 2025, 6 "venue": "IEEE International Conference on Program Comprehension", 7 "arxiv_id": "2501.19085", 8 "doi": "10.1109/ICPC66645.2025.00058" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Modern LLMs handle some 'low-resource' languages (Julia, Lua) nearly as well as high-resource ones, but R and Racket show major performance gaps (pass@1 of 7-33% vs 31-75% for Java/Python). Fine-tuning helps smaller models (1B) but hurts larger ones (33B), likely due to insufficient data for weight updates. In-context learning with translation examples is a 'safe bet' that consistently improves performance across all model sizes, though no single technique is universally best.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "A replication package is provided via Zenodo (ref [27], https://doi.org/10.5281/zenodo.13128630)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The study uses publicly available MultiPL-E benchmark [19] and MultiPL-T datasets [2]. The replication package is released via Zenodo." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "Hardware is mentioned (NVIDIA A30/A40/A100 GPUs) but no requirements.txt, Dockerfile, or detailed software dependency listing is provided in the paper." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is referenced but no README or commands are described." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Only point estimates of pass@1 are reported in Tables I and III. No confidence intervals or error bars are shown." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "McNemar's test is used for pairwise comparisons of dichotomous results, with Benjamini-Hochberg correction for multiple comparisons (Section III-D)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Odds Ratios are reported alongside significance tests (Table II and Section IV-D), providing effect size magnitude for all comparisons." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No power analysis or justification for the choice of n=50 repetitions beyond citing prior work that 'this rate appears to stabilize at n=20'." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Only average pass@1 rates are reported. No standard deviations, interquartile ranges, or spread measures across the 50 repetitions are provided." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Each model's baseline (out-of-the-box) performance is compared against all techniques (Table III white rows)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "DeepSeek Coder and Code Llama were reasonable at submission time, but GitHub Copilot is a black box with unknown model version. More critically, newer models (e.g., GPT-4, Claude) are not included despite being available." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The study systematically compares five techniques (three in-context learning variants, fine-tuning, pre-training+fine-tuning) across models and sizes, effectively serving as an ablation of technique components." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "Only pass@1 is used as the evaluation metric. No additional metrics (e.g., pass@10, CodeBLEU, syntactic correctness rate) are reported." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "The authors manually analyzed a sample of generated programs to understand failure reasons (Section III-E): 'We analyzed a sample of the generated programs to understand the reasons behind the performance gap.'" 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "MultiPL-E HumanEval is used as the test set, separate from the MultiPL-T fine-tuning data. The 157/161 programs used for evaluation are distinct from training data." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per model, per model size, per language, and per technique in Tables I and III." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Specific failure examples are analyzed: R returning null instead of empty list, vector instead of list (Fig. 1), Julia push vs push! API errors (Section III-E)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that fine-tuning worsens performance for DeepSeek Coder 33B (ORs of 1.64 and 1.42 for degradation), that translation rules sometimes hurt, and that pre-training doesn't consistently help."
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims about fine-tuning helping small models, in-context learning being a safe bet, and large models degrading with fine-tuning are all supported by Tables I and III." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims ('fine-tuning [...] helps in substantially boosting performance', 'possibly due to the fact that even a small dataset is sufficient') but the study design is observational across models — confounds between model architecture, training data, and size are not controlled." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The title says 'No Silver Bullet' and the paper consistently qualifies findings by model size and language. The threats-to-validity section explicitly acknowledges that findings 'may not generalize to other settings' (Section V)." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper discusses alternative explanations for performance gaps: language similarity to high-resource languages, programming paradigm differences, domain-specificity of R, repository size vs count (Section III-E)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "Claims match measurement granularity: the paper measures pass@1 on HumanEval and reports it as pass@1 on HumanEval, without inflating to broader 'code quality' or 'developer productivity' claims." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Model families and sizes are specified (DeepSeek Coder 1B/7B/33B, Code Llama 7B/13B) but no specific version snapshots or dates. Copilot version is completely unspecified."
143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Prompt templates are shown in Listings 1-3 with structural detail. Full prompts are stated to be in the replication package [27]. The few-shot examples and translation rules are described with enough detail." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Temperature (0.2), learning rates (2×10⁻⁵ for DeepSeek, 5×10⁻⁵ for Code Llama), optimizers (AdamW), schedulers, max sequence lengths (1024/2048/3072), mixed precision (bfloat16), and epochs (3) are all reported (Sections III-C, IV-B)." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. Models are prompted directly for code generation." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The paper documents how fine-tuning datasets were constructed from Cassano et al.'s data, the matching process for pre-training pairs (name+docstring matching with quality filtering), and filtering from 161→157 common programs (Sections III-C, IV-A)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section V 'Threats to Validity' provides a dedicated discussion of construct, internal, and external validity threats." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats discussed: pass@k metric bias and mitigation via n=50, no hyperparameter tuning performed, training limited to 3 epochs which may cap results, specific prompt choices may be suboptimal (Section V)." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "External validity explicitly states: 'We decided to focus our study on four low-resource languages...Our findings may not generalize to other settings' and notes the specific models and sizes tested (Section V)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Replication package released via Zenodo [27] (doi: 10.5281/zenodo.13128630) containing experimental results." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data sources are well-described: MultiPL-E benchmark with 157 common programs, MultiPL-T datasets (37,592 R functions, 40,489 Racket functions), and the matching process for pre-training pairs (Sections III-C, IV-A)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data source is a standard public benchmark (MultiPL-E/HumanEval)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from 164 HumanEval → 161 MultiPL-E → 157 common programs is documented with reasons for each filtering step. Fine-tuning dataset construction and pre-training pair matching are documented with counts (Section IV-A.5)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Section VII acknowledges 'the financial support of the Swiss National Science Foundation for the PARSED project (SNF Project No. 219294)' and CHOOSE sponsorship." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are affiliated with Software Institute, USI Università della Svizzera italiana, Switzerland. No conflict with evaluated tools." 
214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "Swiss National Science Foundation is an independent government funding agency with no stake in whether fine-tuning or in-context learning performs better." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the models used. DeepSeek Coder and Code Llama training data temporal boundaries are not discussed." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether HumanEval problems appeared in the training data of any evaluated model." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "HumanEval was published in 2021. All models were trained after 2021 and could have seen these problems. This contamination risk is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs, API costs, or wall-clock times are reported despite running 50 repetitions × 157-161 problems × 6 models × multiple techniques." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Hardware is mentioned (NVIDIA A30/A40/A100 GPUs) but no GPU hours, training time, or total compute budget is stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Results are averaged over 50 repetitions (via temperature=0.2 sampling) but no seed sensitivity analysis or variance across seeds is reported." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Explicitly stated: 'We compute pass@1 with n = 50 repetitions' (Section III-C)." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "The paper explicitly states 'we did not perform hyperparameter tuning' (Section V) and used default configurations. No search budget reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "For fine-tuned models, 'we evaluate each epoch on the MultiPL-E benchmark and only report the best model's results' — selection on the evaluation benchmark, which is documented (Section IV-C). All epoch results in replication package." 
312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": true, 316 "justification": "Benjamini-Hochberg procedure is applied to adjust p-values for multiple comparisons (Sections III-D, IV-C)." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors devised the translation examples and translation rules prompts and compare them against existing techniques without acknowledging author-evaluation bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Fine-tuning and in-context learning have vastly different compute costs, but performance is not compared at matched compute budgets." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "HumanEval is used without discussing whether it adequately represents real-world code generation tasks for low-resource languages. The paper notes R is used for data analysis but HumanEval tests general programming — this construct validity gap is not addressed." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved; models are directly prompted." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "HumanEval (2021) predates all models' training. No discussion of temporal leakage." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether evaluation setup leaks information (e.g., function signatures providing too much structure)."
349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether fine-tuning data (MultiPL-T) shares structural similarities with test data (MultiPL-E HumanEval), despite both being derived from similar Python function collections." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention methods are applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Modern LLMs show a major performance gap between high-resource (Java, Python) and some low-resource languages (R, Racket), with pass@1 gaps of 24-39 percentage points.", 365 "evidence": "Table I shows pass@1 ranges of 7-33% for R/Racket vs 31-75% for Java/Python across all 6 models. All differences statistically significant with McNemar's test (Section III-E).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Julia and Lua are not as 'low-resource' as previously characterized — performance is close to high-resource languages.", 370 "evidence": "Table I shows Julia/Lua pass@1 of 19-61% vs Java/Python 31-75%. Average gap is 10.9%/20.8% for Julia and 3.6%/13.5% for Lua vs Java/Python (Section III-E).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Fine-tuning is best for small models (1B) but degrades performance for large models (33B).", 375 "evidence": "Table III: DeepSeek 1B improves from 7.0% to 18.4% on Racket with fine-tuning. 
DeepSeek 33B degrades from 30.2% to 25.3% on R and 32.5% to 28.0% on Racket (ORs 1.64 and 1.42).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "In-context learning with translation examples is a 'safe bet' that always improves performance for models ≥7B.", 380 "evidence": "Table III shows consistent improvement for all models ≥7B across both languages, with ORs between 1.28 and 2.27 (Section IV-D).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "The pre-training task (code translation) does not provide relevant additional benefit over fine-tuning alone.", 385 "evidence": "Fine-tuning alone works better than pre-training+fine-tuning for 3/5 models on R and 4/5 on Racket. Average pass@1 difference is 0.3% (Section IV-D).", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No contamination analysis", 392 "detail": "HumanEval was published in 2021 and all models were trained after that. The benchmark solutions could be in training data, inflating absolute numbers. While this affects all models equally for relative comparisons, it undermines claims about absolute performance levels." 393 }, 394 { 395 "flag": "Single benchmark", 396 "detail": "All conclusions rest on HumanEval (157 problems). MBPP was available but excluded. Results may not generalize to other code generation tasks or benchmarks." 397 }, 398 { 399 "flag": "No variance reporting", 400 "detail": "Despite 50 repetitions, no standard deviations or confidence intervals are reported. Statistical tests are performed but the reader cannot assess result stability." 401 }, 402 { 403 "flag": "Best-epoch selection on test set", 404 "detail": "Fine-tuned models are evaluated each epoch and 'only report the best model's results' on the MultiPL-E benchmark — this is effectively selecting on the test set, which inflates fine-tuning results."
405 } 406 ], 407 "cited_papers": [ 408 { 409 "title": "Knowledge transfer from high-resource to low-resource programming languages for code LLMs", 410 "authors": ["F. Cassano", "J. Gouwar", "F. Lucchetti"], 411 "year": 2024, 412 "relevance": "Introduces MultiPL-T framework for fine-tuning LLMs on low-resource languages via automated code translation." 413 }, 414 { 415 "title": "Multi-lingual evaluation of code generation models", 416 "authors": ["B. Athiwaratkun", "S. K. Gouda"], 417 "year": 2023, 418 "relevance": "Studies few-shot learning for multi-lingual code generation including out-of-domain languages." 419 }, 420 { 421 "title": "DeepSeek-Coder: When the large language model meets programming", 422 "authors": ["D. Guo", "Q. Zhu"], 423 "year": 2024, 424 "arxiv_id": "2401.14196", 425 "relevance": "One of the main model families evaluated; state-of-the-art code generation LLM." 426 }, 427 { 428 "title": "Code Llama: Open foundation models for code", 429 "authors": ["B. Roziere", "J. Gehring"], 430 "year": 2023, 431 "arxiv_id": "2308.12950", 432 "relevance": "Second main model family evaluated; open-source code generation LLM." 433 }, 434 { 435 "title": "MultiPL-E: A scalable and polyglot approach to benchmarking neural code generation", 436 "authors": ["F. Cassano", "J. Gouwar"], 437 "year": 2023, 438 "relevance": "The benchmark used for evaluation — translates HumanEval/MBPP to 18 languages including low-resource ones." 439 }, 440 { 441 "title": "Evaluating large language models trained on code", 442 "authors": ["M. Chen", "J. Tworek"], 443 "year": 2021, 444 "arxiv_id": "2107.03374", 445 "relevance": "Introduces HumanEval benchmark and Codex, foundational to code generation evaluation." 446 }, 447 { 448 "title": "Measuring the impact of programming language distribution", 449 "authors": ["G. Orlanski", "K. Xiao"], 450 "year": 2023, 451 "relevance": "Shows balanced language distribution in training reduces performance disparities among languages." 
452 }, 453 { 454 "title": "StarCoder: May the source be with you", 455 "authors": ["R. Li", "L. B. Allal"], 456 "year": 2023, 457 "relevance": "Major open-source code LLM used in the MultiPL-T pipeline for generating training data." 458 }, 459 { 460 "title": "A survey on LLM-based code generation for low-resource and domain-specific programming languages", 461 "authors": ["S. Joel", "J. J. Wu", "F. H. Fard"], 462 "year": 2024, 463 "arxiv_id": "2410.03981", 464 "relevance": "Directly relevant survey highlighting scarcity of benchmarks for niche languages and need for advanced techniques." 465 }, 466 { 467 "title": "On the transferability of pre-trained language models for low-resource programming languages", 468 "authors": ["F. Chen", "F. Fard", "D. Lo", "T. Bryksin"], 469 "year": 2022, 470 "relevance": "Studies fine-tuning on similar languages to boost low-resource performance, directly builds on this approach." 471 } 472 ] 473 }