ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (21194B)


      1 {
      2   "paper": {
      3     "title": "Comparative Analysis of Pre-trained Code Language Models for Automated Program Repair via Code Infill Generation",
      4     "authors": ["Iman Hemati Moghadam", "Oebele Lijzenga", "Vadim Zaytsev"],
      5     "year": 2025,
      6     "venue": "GPCE '25 (24th ACM SIGPLAN International Conference on Generative Programming: Concepts and Experiences)",
      7     "doi": "10.1145/3742876.3742881"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states 'We release all experimental data and code [1]' and reference [1] points to a Zenodo archive at https://doi.org/10.5281/zenodo.15481569."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The HumanEval-Java benchmark is publicly available, and the authors release all experimental data via Zenodo [1]."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Section 3.5 specifies the hardware: Intel Xeon Silver 4216, 32GB RAM, NVidia A40 GPU with 48GB VRAM, 16-bit quantization. The paper text itself mentions no software dependency list (requirements.txt, library versions), so the environment is only partially specified in the paper; the Zenodo archive may contain these details. Marked true on the assumption that the Zenodo release includes the software environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper describes the methodology but does not include step-by-step reproduction instructions or a README with commands. The Zenodo archive is referenced but the paper itself contains no reproduction guide."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (e.g., 137 bugs fixed). No confidence intervals or error bars are provided. Each configuration was run once with a fixed seed."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Wilcoxon signed-rank tests are used throughout: comparing beam search vs nucleus sampling (Section 4.1.1), N=1 vs N=5 (Section 4.1.3), memory usage differences (Section 4.2.1), and time differences (Section 4.2.2), with p-values reported."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context throughout, e.g., 'nucleus sampling generally outperforms beam search showing an average improvement of 13.6% for N=1' and 'N=5 achieves a 24.3% improvement over N=1'. Spearman correlation coefficients are also reported (rs=0.823)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The benchmark has 163 buggy methods. No justification for why this sample size is adequate, no power analysis. The threats section acknowledges the benchmark is limited but does not justify adequacy."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Each configuration was run once with a fixed seed. The threats section acknowledges 'Running benchmarks with multiple seeds would improve our understanding of model stability' but this was not done. No variance or standard deviation across runs is reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The study compares 20 CLMs against each other, and references results from prior studies (Jiang et al., Xia et al., Huang et al., Zhang et al., Wu et al.) for contextual comparison."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The models include recent ones like CodeLLaMA (2023), StarCoder (2023), SantaCoder (2023), and Refact (2024), which are contemporary at time of writing."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The study systematically varies sampling strategies (beam search vs nucleus sampling), beam sizes (5 vs 10), temperature, and top-p values, effectively ablating the impact of these configuration choices on performance."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are used: N=1 and N=5 fix rates, compilation rate, VRAM usage, time consumption, and patch diversity (unique infills)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Evaluation is entirely automated via test suites. The paper mentions 'plausible patches are manually evaluated' in the methodology description (Section 3.1) but the actual experiments rely on test-adequate patches without reporting human evaluation of patch quality."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The HumanEval-Java benchmark is used as a standard evaluation set. The paper notes it is 'constructed to prevent overlap with training data' (Section 3.3), and models are applied in a zero-shot setting."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-model results. Table 3 provides a detailed cross-model overlap matrix. Figure 6 provides per-configuration breakdowns for SantaCoder across temperature/top-p combinations."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses CodeLLaMA-Instruct failing to halt token generation, CodeGen2 performing poorly despite size, and identifies 13 bugs no model could fix (6 even at N=5). Section 4.3 discusses failure patterns."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Several negative results: CodeGen2 performs poorly despite large size, instruction-tuned CodeLLaMA performs worse than base, high-diversity sampling reduces correctness and increases compilation failures, nucleus sampling advantage is not statistically significant."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract's claims — that larger models generally perform better (but not always), that memory usage correlates with model size while time consumption does not, and that nucleus sampling slightly outperforms beam search without statistical significance — are all supported by the results in Sections 4.1-4.3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes controlled comparisons: same benchmark, same sampling configurations, varying one factor at a time (model, beam size, temperature, top-p). Claims like 'instruction fine-tuning may have compromised infilling capabilities' are appropriately hedged."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The threats section explicitly bounds results to HumanEval-Java and notes 'testing on additional datasets like Defects4J could further validate our approach.' The title specifies 'Code Infill Generation' and the study is explicitly about Java single-line bugs."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses architecture vs. scale as alternative explanations for time/performance patterns, suggests CodeGen2's poor performance may be due to architectural issues rather than size, and discusses instruction fine-tuning effects on infilling. Section 5 discusses several confounds."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Table 1 lists specific model variants with parameter sizes. References [8]-[65] point to specific HuggingFace model cards (e.g., CodeLLaMA-7B [19], StarCoder-15.5B [63]). These are locally-run open models with specific versions identifiable via HuggingFace links."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The approach is described as masking buggy lines for code infill. Section 3.4 describes the general approach but the actual prompt template used (prefix/mask/suffix format) is not provided in full text form."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 3.4 reports beam sizes (5, 10), top_p values (0.2, 0.4, 0.6, 0.8), temperature values (0.1, 0.4, 0.7, 1.0, 1.3, 1.6, 1.9), and 16-bit quantization. Table 2 reports the optimal configuration for each model."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. Models are applied directly for single-line code infilling in a zero-shot setting."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.2 documents the CLM selection process with four inclusion/exclusion criteria and explains which models were excluded and why. Section 3.3 describes the HumanEval-Java dataset construction. Section 3.4 describes the line masking approach."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5 'Threats to Validity and Limitations' is a dedicated section discussing five specific threats."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 5 lists specific threats: single run per configuration with fixed seed, limited model set, limited sampling parameter range, single benchmark (HumanEval-Java), and limited number of generated infills (1 and 5)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 explicitly states limitations: only HumanEval-Java tested, limited sampling parameters, single seed, and future plans to test on Defects4J and other languages. The scope is clearly bounded to Java single-line bugs with zero-shot infilling."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Reference [1] (Zenodo DOI 10.5281/zenodo.15481569) releases all experimental data and code for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.3 describes the HumanEval-Java dataset: 163 single-hunk Java bugs manually injected, adapted from HumanEval, with average 6.3 test cases per infill task."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The study evaluates models on a standard benchmark."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.2 documents model selection (25 initial → 20 after exclusions with reasons). Section 3.1 and Figure 1 document the full pipeline from buggy code through localization, masking, patch generation, validation, and selection."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is visible in the paper text."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Eindhoven University of Technology and University of Twente. No affiliation with any of the evaluated model providers."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff dates are stated for any of the 20 models evaluated."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section 3.3 states the dataset is 'constructed to prevent overlap with training data' and 'mitigates data leakage and ensures that pre-trained models have not been exposed to these samples.'"
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Section 3.3 explicitly addresses this: HumanEval-Java is 'systematically derived from HumanEval and constructed to prevent overlap with training data.' The benchmark's design mitigates contamination risk."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 2 reports mask time (seconds) and VRAM (GB) for each model. Sections 4.2.1 and 4.2.2 provide detailed analysis of memory and time costs per model and configuration."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 3.5 specifies hardware (NVidia A40, 48GB VRAM). Table 2 provides per-model time and VRAM. The total number of model-configuration combinations (20 models × 30 configurations each) gives a sense of total compute, and detailed time figures are provided per model."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CodeLLaMA-13B and StarCoder are the best-performing models, fixing 137 and 136 bugs respectively at N=1 out of 163.",
    286       "evidence": "Table 2 shows N=1 results for all 20 models. CodeLLaMA-13B: 137, StarCoder: 136.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Nucleus sampling slightly outperforms beam search with an average improvement of 13.6% for N=1, but the difference is not statistically significant.",
    291       "evidence": "Section 4.1.1: a Wilcoxon signed-rank test indicates the differences are not statistically significant at the 0.05 level.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Model size positively correlates with bug-fixing performance, but scale alone does not guarantee effectiveness (CodeGen2 performs poorly despite size).",
    296       "evidence": "Section 4.1.2: Spearman correlation confirms a positive relationship. Table 2 shows CodeGen2-1B fixes only 2 bugs and CodeGen2-16B fixes 124, while some smaller models fix more.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Memory usage increases with model size but time consumption does not exhibit a clear correlation.",
    301       "evidence": "Section 4.2.1: Spearman rs=0.823, p=1.53×10^-5 for memory-size correlation. Section 4.2.2 shows four exceptions where larger models are faster.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "No single CLM fixes all bugs; 13 bugs remain unfixed by any model at N=1.",
    306       "evidence": "Section 4.3 states '13 bugs that none of the models fixes, with 6 remaining unresolved even at N=5.'",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Instruction-tuned CodeLLaMA models perform worse than base CodeLLaMA for code infilling due to failure to halt token generation.",
    311       "evidence": "Table 2: CodeLLaMA-7B fixes 126 vs Instruct-7B fixing 74. Section 4.1.3 provides qualitative analysis of the failure mode.",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "This study evaluates 20 pre-trained code language models on HumanEval-Java (163 bugs) for automated program repair via zero-shot code infilling. CodeLLaMA-13B and StarCoder achieve the best bug-fixing rates (137 and 136 out of 163), while model size generally correlates with performance but with notable exceptions (CodeGen2 performs poorly despite large size). Nucleus sampling slightly outperforms beam search but not significantly, and instruction-tuned models perform worse than base models for infilling tasks. Different models fix different subsets of bugs, suggesting ensemble approaches could improve coverage.",
    317   "red_flags": [
    318     {
    319       "flag": "Single run per configuration",
    320       "detail": "Each model-configuration combination was run only once with a fixed seed. No variance across seeds is reported, making it impossible to assess result stability. The authors acknowledge this limitation."
    321     },
    322     {
    323       "flag": "Single benchmark",
    324       "detail": "All evaluation is on HumanEval-Java (163 bugs), a relatively small synthetic benchmark. Generalizability to real-world bugs (e.g., Defects4J) is unknown."
    325     }
    326   ],
    327   "cited_papers": [
    328     {
    329       "title": "Evaluating Large Language Models Trained on Code",
    330       "authors": ["Mark Chen"],
    331       "year": 2021,
    332       "arxiv_id": "2107.03374",
    333       "relevance": "Introduces HumanEval benchmark and Codex, foundational for LLM code generation evaluation."
    334     },
    335     {
    336       "title": "Impact of Code Language Models on Automated Program Repair",
    337       "authors": ["Nan Jiang"],
    338       "year": 2023,
    339       "relevance": "Prior comparative study of 10 CLMs for APR on HumanEval-Java, direct predecessor to this work."
    340     },
    341     {
    342       "title": "How Effective Are Neural Networks for Fixing Security Vulnerabilities",
    343       "authors": ["Yi Wu"],
    344       "year": 2023,
    345       "relevance": "Evaluates CLMs for security vulnerability repair, comparing 5 models."
    346     },
    347     {
    348       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning",
    349       "authors": ["Chunqiu Steven Xia"],
    350       "year": 2022,
    351       "relevance": "Compares 9 CLMs for zero-shot APR, establishing evaluation protocols used in this study."
    352     },
    353     {
    354       "title": "An Extensive Study on Pre-trained Models for Program Understanding and Generation",
    355       "authors": ["Zhengran Zeng"],
    356       "year": 2023,
    357       "relevance": "Survey of pre-trained models for code understanding and generation tasks."
    358     },
    359     {
    360       "title": "Code Llama: Open Foundation Models for Code",
    361       "authors": ["Baptiste Rozière"],
    362       "year": 2023,
    363       "relevance": "Introduces CodeLLaMA models, the top-performing family in this evaluation."
    364     },
    365     {
    366       "title": "StarCoder: May the Source Be with You!",
    367       "authors": ["Raymond Li"],
    368       "year": 2023,
    369       "relevance": "Introduces StarCoder, one of the top-performing models evaluated in this study."
    370     },
    371     {
    372       "title": "Practical Program Repair via Byte-Level Code Language Model",
    373       "authors": ["Nan Jiang"],
    374       "year": 2024,
    375       "relevance": "Advances CLM-based APR with byte-level approaches."
    376     },
    377     {
    378       "title": "Pre-trained Model-based Automated Software Vulnerability Repair: How Far are We?",
    379       "authors": ["Quanjun Zhang"],
    380       "year": 2023,
    381       "relevance": "Evaluates 5 CLMs for vulnerability repair, comparing fine-tuning strategies."
    382     },
    383     {
    384       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    385       "authors": ["Chunqiu Steven Xia"],
    386       "year": 2023,
    387       "relevance": "Comprehensive study of LLMs for APR including conversation-driven repair."
    388     }
    389   ]
    390 }

Impressum · Datenschutz