scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25734B)
      1 {
      2   "paper": {
      3     "title": "A Survey on LLM-based Code Generation for Low-Resource and Domain-Specific Programming Languages",
      4     "authors": ["Sathvik Joel", "Jie JW Wu", "Fatemeh Fard"],
      5     "year": 2024,
      6     "venue": "ACM Transactions on Software Engineering and Methodology (TOSEM)",
      7     "arxiv_id": "2410.03981",
      8     "doi": "10.48550/arXiv.2410.03981"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper provides a GitHub repository for organizing surveyed papers: 'A GitHub repository was created to organize the papers of this survey at https://github.com/jie-jw-wu/Survey-CodeLLM4LowResource-DSL.' This is a curated list rather than analysis code, but it is a released artifact."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "While the GitHub repository organizes the paper list, the extracted survey data (categorizations, technique classifications, detailed extraction spreadsheets) is only available in the paper's tables. No downloadable structured dataset of the survey's extracted information is mentioned."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment or dependency specifications are provided. As a survey, the paper has no experimental environment to specify, but it also does not describe any analysis scripts or data-processing tools that may have been used."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. While the search strategy and screening process are described, there are no instructions for replicating the full extraction and categorization process."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is a systematic literature review that does not run experiments or report quantitative results requiring confidence intervals."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "Survey paper with no statistical comparisons of its own. All reported statistics are counts and proportions of categorized papers."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "Survey paper that does not conduct experiments requiring effect size reporting."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "Survey paper — the paper corpus size is determined by the search strategy, not by statistical power requirements."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "Survey paper with no experimental runs to report variance across."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 2.2 'Related Surveys' compares against multiple prior surveys [71, 89, 179, 192, 198, 200, 203, 207] and explicitly discusses how their work differs from and extends these prior reviews."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The compared surveys are recent (2023-2024), including Hou et al. [89], Zhang et al. [200], and Zheng et al. [207], representing the current state of the survey landscape."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "Survey paper with no system components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "Survey paper that does not evaluate a system with metrics."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Human evaluation is not applicable to a systematic literature review."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "Survey paper with no train/test split applicable."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The survey provides extensive breakdowns: by language type (LRPL vs DSL, 51 vs 59 papers), by technique category (Figure 3a, Table 6 with 6 main categories), by venue (Figure 2 waffle chart), by year, and by model family (Figure 3b)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 7 'Challenges and Opportunities' extensively discusses limitations and failure modes of current approaches, including evaluation challenges, data scarcity issues, and methodological limitations specific to LRPLs and DSLs."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The survey reports negative findings: LLMs perform poorly on LRPLs (Figure 1), standardized benchmarks are lacking, DSL-specific challenges remain unsolved, and iterative feedback often requires extensive human guidance even with advanced models."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims are supported: 111 papers filtered from 27,000+ (Table 2), four evaluation techniques identified (Section 4), six method categories (Table 6), and dataset curation approaches classified (Section 6). All claims match the paper content."
    114       },
    115       "causal_claims_justified": {
    116         "applies": false,
    117         "answer": false,
    118         "justification": "The paper is a descriptive survey that categorizes and summarizes existing work. It does not make its own causal claims — when it discusses what 'improves' performance, it is reporting claims from the surveyed papers."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The survey explicitly bounds its scope: LLMs above 1 billion parameters (Section 3.7), code generation specifically (not code analysis), LRPLs and DSLs only, papers from 2020-2024, SQL explicitly excluded (Section 3.4). Section 8 discusses limitations of database selection."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": false,
    127         "answer": false,
    128         "justification": "This is a pure survey/taxonomy that presents no empirical results requiring alternative explanations."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "Survey paper with no measurements of its own. It reviews other papers' measurements but makes no proxy-to-outcome claims itself."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "Survey paper that does not use any LLMs in its own methodology."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "Survey paper that does not use prompting."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "Survey paper with no experiments requiring hyperparameters."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "Survey paper that does not use agentic scaffolding."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3 documents the full screening pipeline: four-iteration filtering approach (title screening → abstract screening → preliminary content review → final full-text review), with explicit inclusion/exclusion criteria (Section 3.4), search keywords (Table 1), and paper counts at each stage (Table 2)."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 8 'Threats to Validity' provides a dedicated discussion of the study's limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 8 discusses specific threats: database selection limitations, exclusion of SQL, inability to conduct backward snowballing for highly-cited papers (>300 citations), potential for missed recent papers in the expanded search, and the distinction between LRPL and DSL challenges that may affect interpretation."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly states what it does not cover: SQL is excluded (Section 3.4), models under 1B parameters are generally excluded (Section 3.7), code analysis and translation are out of scope, and the time boundary (2020-2024, expanded to July 2025) is clearly stated."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The GitHub repository organizes the paper list, but the raw extraction data (categorization decisions, detailed coding sheets, inter-rater disagreement records) is not available for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 3.3 provides detailed search strategy: four databases (ArXiv, IEEE Xplore, Web of Science, ACM Digital Library), date range (Jan 2020 - May 2024), complete keyword groups (Table 1), and the Boolean logic connecting them."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The paper describes how papers were 'recruited' for the review: systematic search across four databases, manual initial search on arXiv, forward and backward snowballing (Section 3.5), with explicit inclusion/exclusion criteria (Section 3.4)."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Table 2 documents the full pipeline with counts: 27,333 initial → 506 (Iter 1) → 204 (Iter 2) → 192 (Iter 3) → 189 (Iter 4) → 75 (combined/deduped) + 36 (snowballing) = 111 final papers. Each iteration's purpose is described in Section 3.5."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Acknowledgments section states: 'This research is supported by a grant from the Natural Sciences and Engineering Research Council of Canada RGPIN-2019-05175.'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: Indian Institute of Technology Madras and University of British Columbia. Neither institution has a product being evaluated in the survey."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "NSERC (Natural Sciences and Engineering Research Council of Canada) is a government funding agency with no financial interest in the survey's conclusions about LLM code generation for LRPLs/DSLs."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "Survey paper that does not evaluate a pre-trained model on any benchmark."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Survey paper that does not evaluate a pre-trained model on any benchmark."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Survey paper that does not evaluate a pre-trained model on any benchmark."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this systematic literature review."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this systematic literature review."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this systematic literature review."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this systematic literature review."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this systematic literature review."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this systematic literature review."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this systematic literature review."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "Survey paper with no computational method of its own."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "Survey paper with no computational experiments."
    288       }
    289     },
    290     "survey_methodology": {
    291       "prisma_or_structured_protocol": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "The survey follows a structured SLR protocol adapted from Rodriguez et al. [148], with a systematic search strategy using reproducible keyword queries (Table 1), explicit database selection, four-iteration screening process (Section 3.5), defined inclusion/exclusion criteria (Section 3.4), and forward/backward snowballing. It also uses the Goal Question Metric (GQM) approach [7] for defining research objectives."
    295       },
    296       "quality_assessment_of_sources": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The survey does not assess the methodological quality of its 111 included papers. It categorizes papers by topic, technique, and language but treats all papers equally regardless of their methodological rigor. No quality scoring rubric or risk-of-bias assessment is applied."
    300       },
    301       "publication_bias_discussed": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The survey does not discuss publication bias. There is no consideration of whether the surveyed literature skews toward positive results, no funnel plots, and no acknowledgment that papers reporting successful LLM applications may be overrepresented. Section 8 discusses database selection threats but not publication bias."
    305       }
    306     }
    307   },
    308   "claims": [
    309     {
    310       "claim": "LLMs show significantly lower performance on low-resource programming languages compared to high-resource languages on the MultiPL-E benchmark.",
    311       "evidence": "Figure 1 shows a heatmap comparing model performance across HRPLs and LRPLs, demonstrating consistently higher scores for Python and Java compared to LRPLs like R, Julia, and D across 13 models.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "Fine-tuning is the most prevalent technique for enhancing LLM performance in LRPL/DSL code generation, used in 48 papers.",
    316       "evidence": "Figure 3(a) and Table 6 show fine-tuning in 48 papers, followed by prompting strategies (25) and pre-training (22).",
    317       "supported": "strong"
    318     },
    319     {
    320       "claim": "The LLaMA family is the most popular base model for fine-tuning in LRPL/DSL code generation.",
    321       "evidence": "Figure 3(b) shows LLaMA used in 14 instances, followed by DeepSeek (10) and StarCoder (9). The paper attributes this to earlier release date, open-source availability, range of model sizes, and LoRA compatibility.",
    322       "supported": "strong"
    323     },
    324     {
    325       "claim": "Smaller fine-tuned models can match or outperform GPT-3.5/GPT-4 on specialized LRPL/DSL tasks.",
    326       "evidence": "Table 8 shows 12 examples across Lean, Verilog, PowerShell, CQL, and other languages where fine-tuned models (e.g., DeepSeek-FT, CodeV, BetterV) match or surpass GPT-3.5/GPT-4.",
    327       "supported": "moderate"
    328     },
    329     {
    330       "claim": "There is a lack of standardized benchmark datasets and evaluation metrics for most LRPLs and DSLs.",
    331       "evidence": "Section 7 discusses this gap extensively. Many DSLs lack standardized benchmarks (Section 4.3), and researchers frequently create their own evaluation datasets. Only Verilog has standard benchmarks (VerilogEval, RTLLM).",
    332       "supported": "strong"
    333     },
    334     {
    335       "claim": "Research output in LRPL/DSL code generation has increased dramatically, from 1 paper in 2020 to 49 papers in just the first half of 2024.",
    336       "evidence": "Section 3.6 states: '1 and 4 papers in 2020 and 2021, respectively, followed by a notable increase to 13 publications in 2022, 44 papers in 2023, and reaching a zenith of 49 publications just in the first half of 2024.'",
    337       "supported": "strong"
    338     },
    339     {
    340       "claim": "Approximately 60% of surveyed papers use proprietary models in their research.",
    341       "evidence": "Figure 3(c) shows 59.5% use proprietary models versus 40.5% using only open-source models.",
    342       "supported": "strong"
    343     },
    344     {
    345       "claim": "Iterative feedback approaches significantly improve code quality and correctness for DSLs with available validators.",
    346       "evidence": "Section 5.2 and Table 10 summarize 11 iterative feedback approaches across Verilog, XDL, PDDL, PLC, and Verus, reporting improvements in compilation rates and functional correctness. E.g., Yao et al. improved automated proofs from 3/20 to 14/20 with iterative feedback.",
    347       "supported": "moderate"
    348     }
    349   ],
    350   "methodology_tags": ["meta-analysis"],
    351   "key_findings": "This systematic literature review of 111 papers (from 27,000+ screened) identifies fine-tuning (48 papers) and prompting (25 papers) as the dominant strategies for improving LLM code generation in low-resource and domain-specific programming languages, with LLaMA being the most popular base model. The survey reveals that while smaller fine-tuned models can match or exceed GPT-3.5/GPT-4 on specialized tasks, there remains a critical lack of standardized benchmarks and evaluation metrics for most LRPLs and DSLs. DSLs face additional challenges requiring domain-specific evaluation infrastructure, specialized validators, and expert judgment that general-purpose metrics cannot capture. Research output in this area has grown exponentially from 1 paper in 2020 to 49 in the first half of 2024.",
    352   "red_flags": [
    353     {
    354       "flag": "No quality assessment of source papers",
    355       "detail": "The survey treats all 111 papers equally regardless of their methodological quality. There is no quality scoring rubric, risk-of-bias assessment, or structured evaluation of included studies. This means findings from rigorous peer-reviewed work carry the same weight as weaker arXiv preprints."
    356     },
    357     {
    358       "flag": "High proportion of non-peer-reviewed sources",
    359       "detail": "46 out of 111 papers (41%) are arXiv preprints (Figure 2). While the paper justifies including arXiv due to the rapid pace of the field, this high proportion of unreviewed work could affect the reliability of the survey's conclusions. Six technical reports were explicitly included despite exclusion criteria."
    360     },
    361     {
    362       "flag": "No discussion of publication bias",
    363       "detail": "The survey does not consider whether published papers skew toward positive results about LLM capabilities. Papers showing LLMs failing at LRPL/DSL tasks may be less likely to be published, potentially biasing the landscape toward optimistic findings about improvement techniques."
    364     },
    365     {
    366       "flag": "Selective reporting of improvement claims",
    367       "detail": "Tables 7 and 8 showcase successful fine-tuning improvements and cases where fine-tuned models beat GPT-3.5/GPT-4, but the survey does not systematically report or quantify how often fine-tuning fails or provides marginal improvement. The presentation selectively highlights success stories."
    368     }
    369   ],
    370   "cited_papers": [
    371     {
    372       "title": "Evaluating Large Language Models Trained on Code",
    373       "authors": ["Mark Chen et al."],
    374       "year": 2021,
    375       "arxiv_id": "2107.03374",
    376       "relevance": "Introduces HumanEval benchmark and Codex, foundational for evaluating LLM code generation capabilities."
    377     },
    378     {
    379       "title": "MultiPL-E: A Scalable and Polyglot Approach to Benchmarking Neural Code Generation",
    380       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    381       "year": 2023,
    382       "relevance": "Key multilingual benchmark for evaluating LLM code generation across low-resource programming languages including Bash, Lua, R, Rust, and others."
    383     },
    384     {
    385       "title": "StarCoder: may the source be with you!",
    386       "authors": ["Raymond Li et al."],
    387       "year": 2023,
    388       "relevance": "Major open-source code LLM trained on 86 programming languages, widely used as baseline and fine-tuning base for LRPL/DSL code generation."
    389     },
    390     {
    391       "title": "Code Llama: Open Foundation Models for Code",
    392       "authors": ["Baptiste Rozière et al."],
    393       "year": 2024,
    394       "arxiv_id": "2308.12950",
    395       "relevance": "Code-specialized models built on Llama 2; this survey identifies the LLaMA family as the most popular base model family for fine-tuning in LRPL/DSL code generation."
    396     },
    397     {
    398       "title": "Knowledge transfer from high-resource to low-resource programming languages for code llms",
    399       "authors": ["Federico Cassano", "John Gouwar", "Francesca Lucchetti"],
    400       "year": 2024,
    401       "relevance": "Directly addresses the core topic of transferring LLM capabilities to low-resource languages using synthetic data generation."
    402     },
    403     {
    404       "title": "VerilogEval: Evaluating large language models for Verilog code generation",
    405       "authors": ["Mingjie Liu", "Nathaniel Pinckney", "Brucek Khailany", "Haoxing Ren"],
    406       "year": 2023,
    407       "relevance": "Standard benchmark for evaluating LLM code generation in Verilog, a key DSL in hardware design."
    408     },
    409     {
    410       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    411       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    412       "year": 2024,
    413       "relevance": "Major benchmark for evaluating LLM software engineering capabilities on real-world code tasks."
    414     },
    415     {
    416       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    417       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    418       "year": 2023,
    419       "arxiv_id": "2302.06590",
    420       "relevance": "Key study on developer productivity with AI coding assistants, relevant to understanding the motivation for LRPL/DSL support."
    421     },
    422     {
    423       "title": "Grammar prompting for domain-specific language generation with large language models",
    424       "authors": ["Bailin Wang", "Zi Wang", "Xuezhi Wang"],
    425       "year": 2023,
    426       "relevance": "Novel prompting technique for DSL generation that augments examples with specialized grammars, effective across diverse DSLs."
    427     },
    428     {
    429       "title": "DocPrompting: Generating Code by Retrieving the Docs",
    430       "authors": ["Shuyan Zhou", "Uri Alon", "Frank F. Xu", "Graham Neubig"],
    431       "year": 2023,
    432       "relevance": "RAG approach for code generation using documentation retrieval, applied to Bash and other languages."
    433     },
    434     {
    435       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    436       "authors": ["Xinyi Hou et al."],
    437       "year": 2024,
    438       "arxiv_id": "2308.10620",
    439       "relevance": "Major related survey on LLMs for software engineering that this paper positions against and extends."
    440     },
    441     {
    442       "title": "Magicoder: empowering code generation with OSS-INSTRUCT",
    443       "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"],
    444       "year": 2024,
    445       "relevance": "Demonstrates synthetic data generation approach for improving code generation including in low-resource languages."
    446     },
    447     {
    448       "title": "IRCoder: Intermediate Representations Make Language Models Robust Multilingual Code Generators",
    449       "authors": ["Indraneil Paul", "Goran Glavaš", "Iryna Gurevych"],
    450       "year": 2024,
    451       "relevance": "Uses LLVM IR as cross-lingual bridge to improve code generation for low-resource languages like D, Ruby, and Swift."
    452     },
    453     {
    454       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    455       "authors": ["Edward J. Hu et al."],
    456       "year": 2022,
    457       "relevance": "Parameter-efficient fine-tuning method widely used for adapting LLMs to LRPL/DSL code generation with limited compute."
    458     }
    459   ]
    460 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs