scan.json (21349B)
1 { 2 "paper": { 3 "title": "A Survey on LLM-based Code Generation for Low-Resource and Domain-Specific Programming Languages", 4 "authors": ["Sathvik Joel", "Jie JW Wu", "Fatemeh Fard"], 5 "year": 2024, 6 "venue": "ACM Transactions on Software Engineering and Methodology (TOSEM)", 7 "arxiv_id": "2410.03981" 8 }, 9 "scan_version": 2, 10 "active_modules": ["survey_methodology"], 11 "methodology_tags": ["meta-analysis"], 12 "key_findings": "Systematic review of 111 papers on LLM-based code generation for low-resource and domain-specific programming languages. Found that fine-tuning and prompting are the most prevalent improvement methods, LLaMA family is the most popular base model for fine-tuning, and there is a significant lack of standardized benchmarks and evaluation metrics for most LRPLs and DSLs. The survey categorizes evaluation techniques into four types (automatic, user-centric, domain-specific, manual) and improvement methods into six main groups.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper provides a GitHub repository to organize surveyed papers: https://github.com/jie-jw-wu/Survey-CodeLLM4LowResource-DSL, mentioned in the abstract and introduction." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The GitHub repository organizes the papers of this survey. The list of 111 filtered papers and their categorizations constitute the data of this SLR." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment or dependency specifications are provided. This is a survey paper, but analysis scripts or tools used for screening could have been documented." 
29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided for replicating the systematic search and screening process beyond the methodology description." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": false, 39 "answer": false, 40 "justification": "This is a systematic literature review that does not run experiments or report quantitative results requiring confidence intervals." 41 }, 42 "significance_tests": { 43 "applies": false, 44 "answer": false, 45 "justification": "Survey paper with no experimental comparisons requiring significance tests." 46 }, 47 "effect_sizes_reported": { 48 "applies": false, 49 "answer": false, 50 "justification": "No experiments conducted; this is a literature review." 51 }, 52 "sample_size_justified": { 53 "applies": false, 54 "answer": false, 55 "justification": "No experimental sample sizes to justify; the paper pool of 111 is determined by the search methodology." 56 }, 57 "variance_reported": { 58 "applies": false, 59 "answer": false, 60 "justification": "No experimental runs to report variance over." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Section 2.2 explicitly compares this survey against prior related surveys (Fan et al., Hou et al., Watson et al., Zan et al., etc.) and explains how this work differs from and extends them." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The related surveys compared against are from 2023-2024, which are contemporary given the 2024 publication date." 73 }, 74 "ablation_study": { 75 "applies": false, 76 "answer": false, 77 "justification": "Survey paper with no system components to ablate." 
78 }, 79 "multiple_metrics": { 80 "applies": false, 81 "answer": false, 82 "justification": "Survey paper with no experimental evaluation requiring multiple metrics." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "Survey paper; human evaluation of system outputs is not applicable." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "No experimental evaluation requiring train/test splits." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "The survey provides extensive breakdowns: by language (LRPL vs DSL), by technique category (6 main groups in Table 6), by evaluation metric type (Tables 3-4), by venue (Figure 2), and by dataset type (Tables 11-14)." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 7 discusses challenges and limitations extensively, including evaluation challenges, data scarcity issues, and where current approaches fail for LRPLs/DSLs." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The survey reports negative findings such as LLMs struggling with COQ syntax, low performance on LRPLs compared to HRPLs (Figure 1), and cases requiring 117 rounds of iteration to produce correct code." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract's claims about filtering 111 papers from 27,000, identifying four evaluation techniques, six improvement method groups, and lack of standard benchmarks are all supported by the detailed analysis in Sections 4-7." 115 }, 116 "causal_claims_justified": { 117 "applies": false, 118 "answer": false, 119 "justification": "The paper is a systematic literature review that reports findings from surveyed papers rather than making causal claims of its own." 
120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "Section 8 (Threats to Validity) explicitly bounds generalization, noting database selection limitations, the exclusion of SQL, and the distinction between LRPLs and DSLs within the survey scope." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": false, 128 "answer": false, 129 "justification": "Pure survey/taxonomy paper presenting no empirical results of its own that would need alternative explanations." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": false, 133 "answer": false, 134 "justification": "Survey paper with no measurements of its own; no proxy-outcome gap to address." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "Survey paper that does not use any LLM models in its own methodology." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "No prompting used in the survey methodology." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "No experiments conducted requiring hyperparameter reporting." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding used." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 3 provides detailed documentation of the paper selection pipeline: Table 1 shows keyword groups, Table 2 shows filtering counts at each iteration (27,330 → 506 → 204 → 192 → 189 → 75 + 36 snowballing = 111), and Section 3.4-3.5 describe eligibility criteria and screening process." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 8 'Threats to Validity' provides a dedicated discussion of limitations." 
169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 8 discusses specific threats: database selection limitations, exclusion of SQL, the difference between LRPLs and DSLs within the survey, and incomplete backward snowballing for highly-cited papers (>300 citations). Also notes the extended 2025 search may have missed some papers." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "The paper explicitly states scope boundaries: LLMs defined as ≥1B parameters (Section 3.3), time period 2020-2024 (Section 3.3), SQL explicitly excluded (Section 3.4), and exclusion criteria listed in Section 3.4." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The GitHub repository (https://github.com/jie-jw-wu/Survey-CodeLLM4LowResource-DSL) organizes the surveyed papers, making the raw paper list verifiable." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3.3 describes the search strategy in detail: four databases (arXiv, IEEE Xplore, Web of Science, ACM DL), keyword groups (Table 1), search period (Jan 2020 - May 2024), and snowballing approach." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants; data source is published papers identified through systematic search." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Table 2 documents the full pipeline from 27,330 initial papers through five iterations to the final 111, with counts at each stage separated by database. The screening process is described in Section 3.5 with four distinct iterations." 
201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Acknowledgments section states: 'This research is supported by a grant from the Natural Sciences and Engineering Research Council of Canada RGPIN-2019-05175.'" 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: IIT Madras (Joel) and University of British Columbia (Wu, Fard). No commercial product is being evaluated." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": true, 217 "justification": "NSERC is a Canadian government research council with no financial stake in the survey's findings about LLM code generation." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this systematic literature review." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 
257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "Survey paper; no method with inference costs." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "Survey paper; no computational experiments." 289 } 290 }, 291 "survey_methodology": { 292 "prisma_or_structured_protocol": { 293 "applies": true, 294 "answer": true, 295 "justification": "The survey follows a structured systematic review protocol adapted from Rodriguez et al. [148], with a four-iteration screening process, explicit keyword groups (Table 1), defined databases, eligibility criteria (Section 3.4), and counts at each filtering stage (Table 2). While not explicitly PRISMA, it follows an established SLR methodology." 296 }, 297 "quality_assessment_of_sources": { 298 "applies": true, 299 "answer": false, 300 "justification": "The survey does not assess the methodological quality of its 111 source papers. All papers that pass the inclusion criteria are treated equally regardless of their rigor. No quality scoring rubric or risk-of-bias assessment is applied." 301 }, 302 "publication_bias_discussed": { 303 "applies": true, 304 "answer": false, 305 "justification": "The survey does not discuss publication bias. 
There is no consideration of whether published papers skew toward positive results for LLM code generation, no funnel plots, and no acknowledgment that negative results may be underrepresented." 306 } 307 } 308 }, 309 "claims": [ 310 { 311 "claim": "111 relevant papers were filtered from over 27,000 published studies from 2020-2024 on LLM code generation for LRPLs and DSLs.", 312 "evidence": "Table 2 shows the filtering pipeline: 27,330 initial papers across 4 databases, reduced through 5 iterations to 75 unique papers, plus 36 from snowballing, totaling 111. An additional 5 papers from 2024-2025 were added in an expanded search.", 313 "supported": "strong" 314 }, 315 { 316 "claim": "Fine-tuning is the most prevalent technique for improving LLM performance on LRPLs/DSLs, used in 48 papers.", 317 "evidence": "Figure 3(a) shows the frequency distribution of techniques, with fine-tuning at 48, followed by prompting strategies (25) and pre-training (22).", 318 "supported": "strong" 319 }, 320 { 321 "claim": "The LLaMA family is the most popular base model for fine-tuning in LRPL/DSL code generation.", 322 "evidence": "Figure 3(b) shows LLaMA at 14 instances, followed by DeepSeek (10) and StarCoder (9).", 323 "supported": "strong" 324 }, 325 { 326 "claim": "Fine-tuned smaller models can match or outperform GPT-3.5/GPT-4 on LRPL/DSL tasks.", 327 "evidence": "Table 8 provides 12 examples where fine-tuned models outperform GPT-3.5 or GPT-4, including DeepSeek-FT solving 5/148 FIMO problems vs GPT-4's 0/148, and CodeV achieving 78.1% vs GPT-4's 60.0% on VerilogEval-Machine.", 328 "supported": "strong" 329 }, 330 { 331 "claim": "There is a lack of standardized benchmarks and evaluation metrics for most LRPLs and DSLs.", 332 "evidence": "Section 7.2 discusses the lack of language-specific benchmarks for LRPLs compared to HRPLs (which have APPS, CodeContests, SWE-bench, etc.), and notes that many DSL researchers must create custom evaluation datasets.", 333 "supported": 
"moderate" 334 }, 335 { 336 "claim": "60.4% of surveyed papers use proprietary models in their research.", 337 "evidence": "Figure 3(c) shows the distribution with 59.5% using proprietary models vs 40.5% using only open-source models.", 338 "supported": "strong" 339 } 340 ], 341 "red_flags": [ 342 { 343 "flag": "No quality assessment of source papers", 344 "detail": "The survey treats all 111 papers equally without assessing their methodological quality. This means findings from rigorous studies are given equal weight to findings from methodologically weak papers, potentially laundering low-quality results." 345 }, 346 { 347 "flag": "No publication bias discussion", 348 "detail": "The survey does not consider whether published papers on LLM code generation for LRPLs/DSLs skew toward positive results. Negative findings (LLMs failing on certain languages) may be underrepresented." 349 }, 350 { 351 "flag": "Primary screening by single author", 352 "detail": "The initial title screening of all 27,330 papers was done by the first author alone. While second and third authors reviewed categorizations and disagreements were discussed, the initial filtering represents a single-rater bottleneck." 353 }, 354 { 355 "flag": "Incomplete backward snowballing", 356 "detail": "Section 3.5 notes that backward snowballing was skipped for papers with over 300 citations, potentially missing relevant work citing foundational papers." 357 } 358 ], 359 "cited_papers": [ 360 { 361 "title": "MultiPL-E: A Scalable and Polyglot Approach to Benchmarking Neural Code Generation", 362 "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"], 363 "year": 2023, 364 "relevance": "Key multilingual code generation benchmark covering low-resource languages, widely used as evaluation standard." 
365 }, 366 { 367 "title": "StarCoder: may the source be with you!", 368 "authors": ["Raymond Li"], 369 "year": 2023, 370 "relevance": "Major open-source code LLM trained on 86 programming languages, frequently used as base model for fine-tuning." 371 }, 372 { 373 "title": "Code Llama: Open Foundation Models for Code", 374 "authors": ["Baptiste Rozière"], 375 "year": 2024, 376 "arxiv_id": "2308.12950", 377 "relevance": "Most popular base model family for fine-tuning on LRPL/DSL code generation tasks." 378 }, 379 { 380 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming", 381 "authors": ["Daya Guo"], 382 "year": 2024, 383 "relevance": "Second most popular base model for LRPL/DSL fine-tuning with strong coding benchmark performance." 384 }, 385 { 386 "title": "Evaluating Large Language Models Trained on Code", 387 "authors": ["Mark Chen"], 388 "year": 2021, 389 "arxiv_id": "2107.03374", 390 "relevance": "Introduces HumanEval benchmark and Codex, foundational for code generation evaluation." 391 }, 392 { 393 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 394 "authors": ["Carlos E Jimenez", "John Yang"], 395 "year": 2024, 396 "relevance": "Major benchmark for real-world software engineering tasks, cited as example of sophisticated evaluation frameworks." 397 }, 398 { 399 "title": "Large Language Models for Software Engineering: Survey and Open Problems", 400 "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman"], 401 "year": 2023, 402 "arxiv_id": "2310.03533", 403 "relevance": "Related survey on LLMs for SE that this paper positions against as not covering LRPLs/DSLs." 404 }, 405 { 406 "title": "A Survey on Large Language Models for Software Engineering", 407 "authors": ["Quanjun Zhang"], 408 "year": 2023, 409 "arxiv_id": "2312.15223", 410 "relevance": "Related survey on LLMs for SE; this paper fills the gap left by its lack of LRPL/DSL coverage." 
411 }, 412 { 413 "title": "Knowledge transfer from high-resource to low-resource programming languages for code llms", 414 "authors": ["Federico Cassano"], 415 "year": 2024, 416 "relevance": "Demonstrates knowledge transfer techniques (MultiPL-T) to improve LLM performance on LRPLs." 417 }, 418 { 419 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 420 "authors": ["Sida Peng", "Eirini Kalliamvakou"], 421 "year": 2023, 422 "arxiv_id": "2302.06590", 423 "relevance": "Key study on AI coding tool productivity impact, relevant to understanding the practical stakes of LRPL/DSL support." 424 }, 425 { 426 "title": "IRCoder: Intermediate Representations Make Language Models Robust Multilingual Code Generators", 427 "authors": ["Indraneil Paul", "Goran Glavaš"], 428 "year": 2024, 429 "relevance": "Novel cross-lingual transfer approach using LLVM IR for improving code generation across languages including LRPLs." 430 }, 431 { 432 "title": "VerilogEval: Evaluating large language models for Verilog code generation", 433 "authors": ["Mingjie Liu", "Nathaniel Pinckney"], 434 "year": 2023, 435 "relevance": "Standard benchmark for Verilog DSL code generation, widely adopted in the hardware design LLM community." 436 } 437 ] 438 }