scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26122B)
      1 {
      2   "paper": {
      3     "title": "Assessing the Impact of Code Changes on the Fault Localizability of Large Language Models",
      4     "authors": [
      5       "Sabaat Haroon",
      6       "Ahmad Faraz Khan",
      7       "Ahmad Humayun",
      8       "Waris Gill",
      9       "Abdul Haddi Amjad",
     10       "Ali R. Butt",
     11       "Mohammad Taha Khan",
     12       "Muhammad Ali Gulzar"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2504.04372",
     17     "doi": ""
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states 'The evaluation framework, datasets, and code generated and analyzed during the study are publicly available on Zenodo: https://zenodo.org/records/15550782' in the Data Availability section."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The Zenodo archive includes datasets used in the study, as stated in the Data Availability section."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions 'NVIDIA L40S GPU' and '48 GB RAM, 48 cores' (Table IV) for open-source models and that proprietary models used their respective APIs, but no requirements.txt, Dockerfile, or detailed dependency/version listing is provided in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper itself. The Zenodo link is given but the paper does not describe how to run the framework, replicate experiments, or which commands to execute."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Results are reported as point estimates (e.g., accuracy percentages in Table II and Figures 6-11). No confidence intervals, error bars, or uncertainty measures are provided for any of the reported metrics."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper makes comparative claims (e.g., 'Claude family achieves the best overall performance', 'Misleading Comments result in lower average accuracy compared to Misleading Variable Names') but no statistical significance tests are used to support any of these comparisons."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports effect sizes with baseline context throughout, e.g., 'SPMs cause an LLM to fail to localize the same fault it correctly localized earlier in 78% of cases', 'dead code alone reducing average accuracy to 20.38%', 'accuracy drops by 1.04% per mutation step in Java and 1.93% per step in Python'. These provide magnitude context."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for why 10 LLMs were chosen or why 1,307 seed programs are sufficient, and no power analysis is provided. The scale is large (750K tasks) but the choice of parameters is not justified."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No standard deviations, variance, or spread measures are reported across runs. It is unclear whether experiments were run multiple times. All results appear to be single-run numbers."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper establishes baselines by measuring fault localization accuracy before applying SPMs (Table II), then compares post-SPM accuracy against this baseline. It also compares against prior benchmarks in Table V (HumanEval+, DebugBench, LiveCodeBench, SOAPFL, FlexFL)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The compared benchmarks include recent work: FlexFL (2025), SOAPFL (2025), LiveCodeBench (2025), and the LLMs evaluated are contemporary (Claude 4.5 Sonnet, Gemini 2.5 Flash, etc.)."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper systematically varies individual components: SPM types (dead code, misleading comments, misleading variable names, function reordering), SPM strengths (1-8), fault locations (quartiles), and SPM combinations (Mc, Mv, Md, Mc(Mv), Mc(Mv(Md))), effectively serving as an ablation study of which mutation factors affect performance."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The paper uses only a single metric: fault localization accuracy (whether the LLM identifies the correct faulty line number). No additional metrics such as rank of the fault in the model's output, distance from the correct line, or partial credit measures are used."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation of the LLMs' fault localization outputs is performed. All evaluation is automated (comparing predicted line number against injected fault line number). Given the paper's claims about LLM code reasoning quality, human inspection of reasoning traces would be relevant."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "This is not a machine learning training study. The evaluation framework dynamically generates tasks; there is no train/dev/test split to hold out."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Extensive per-category breakdowns are provided: by fault type (Table II), by language (Figure 7), by SPM type (Figure 6), by fault location quartile (Figure 9), by LLM category (Figure 10), by SPM strength (Figure 8), and by model family version (Figure 11)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses specific failure cases, e.g., the N-Queen motivating example (Section II/Figures 1-3), the autonomous car dead code example (Section V-B), and the Blender API make_tile example (Section V-B)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The primary finding is itself a negative result: LLMs fail to localize faults in 78% of cases after SPMs. The paper also reports that newer model versions show only marginal 1-2% improvements (RQ5), which is an honest negative finding about model evolution."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims that 'SPMs cause an LLM to fail to localize the same fault it correctly localized earlier in 78% of cases' and '56% of correctly localized faults appear within the first 25% of program lines, compared to only 6% in the final 25%'. These are directly supported by the results in Sections V-A and V-C."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper makes causal claims (e.g., SPMs 'cause' LLMs to fail). The experimental design supports this: the same LLM is tested on the same program before and after SPMs, with the SPM being the only controlled variable. This is a valid single-variable manipulation design."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title says 'Large Language Models' generically but the study tests only 10 specific LLMs on Python and Java single-file programs. The abstract and conclusion make broad claims about 'LLMs' code-reasoning' without bounding to the tested setting. The threats to validity section mentions language and model limitations but the framing throughout is general."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section VI discusses alternative explanations: whether prompt engineering could improve results, whether single-file programs are too simple (and argues this is a conservative lower bound), and whether code representation changes (CFGs, CPGs) could address the limitations. The threats section also discusses alternative explanations for fault injection coverage."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Table III lists models as 'GPT-4o', 'Claude 3.7 Sonnet', 'Claude 4.5 Sonnet', 'Gemini 2.0-Flash', 'Gemini 1.5-Pro', 'Gemini 2.5-Flash', 'Qwen2.5-coder', 'Llama3.1', 'Phi4', 'Qwen-QWQ'. These are marketing names without API versions or snapshot dates (e.g., no 'gpt-4o-2024-05-13' or similar)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The full prompt used for fault localization is provided in Section II: 'This code is designed to solve the N-Queen problem. Given an input N, it should return all valid arrangements of N queens on an N x N board such that no two queens attack each other. However, the code produces incorrect output. Can you identify the specific line of code responsible for the error? The program is attached below. <CODE>'. This is the actual uniform baseline prompt used across experiments."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No temperature, top-p, max tokens, or other sampling hyperparameters are reported for any of the 10 LLMs. These significantly affect LLM output and are not mentioned."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The LLMs are given a single prompt and return a single response. There is no multi-turn interaction, tool use, or feedback loop."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section IV-A describes data selection criteria (>50 LOC, specifications required, avoiding contaminated benchmarks, context window limits), with specific counts: 18,612 Python programs and 812 Java programs filtered to 637 Python and 670 Java. Section IV-B describes the fault injection and underspecification filtering pipeline."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section VII 'Threats to Validity' provides a dedicated discussion of limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The threats section discusses specific issues: injected faults may not cover all execution paths, the selection of 10 specific LLMs may not generalize to all models, results limited to Python and Java, four fault types may not capture real bug diversity (e.g., multi-location bugs), and SPMs do not cover all semantic-preserving transformations (e.g., refactorings, API-equivalent rewrites)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The threats section explicitly states what is not tested: other programming languages, multi-location bugs, more complex semantic-preserving transformations, and models beyond the 10 tested. Section VI also states they 'intentionally avoid extensive prompt engineering' and test single-file programs only."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The Zenodo archive (https://zenodo.org/records/15550782) is stated to contain the evaluation framework, datasets, and code, making raw data available for verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section IV-A describes data collection in detail: Python programs from iamtarun/python_code_instructions_18k_alpaca on HuggingFace, Java programs from CodeSearchNet. Selection criteria (LOC >= 50, spec availability, context limits) and filtering steps are documented."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. The data sources are standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The full pipeline is documented in Section IV and Figure 4: seed program procurement (18,612 + 812 programs) -> size/context filtering (637 + 670) -> fault injection (4 types x 4 locations) -> underspecification filtering -> SPM application (6 variants per program) -> 750,013 total tasks. Each transformation step is explained."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding sources or acknowledgments section is present in the paper. There is no mention of grants, sponsorships, or funding agencies."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: seven authors from Virginia Tech and one from Carnegie Mellon University. Neither institution is directly affiliated with any of the evaluated model providers."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure means this criterion is not satisfied."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper discusses contamination as a key motivation (Defects4J and BugsInPy are in training data) and designs the framework to avoid it, but does not state the training data cutoff dates for any of the 10 evaluated LLMs."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "The paper extensively discusses contamination: Section I identifies that 'datasets have become part of LLM training data, leading to biased results', and the entire framework design addresses this by dynamically injecting unseen faults. Section IV-A explicitly avoids Defects4J and BugsInPy for contamination reasons. However, the seed programs themselves (from a public HuggingFace dataset and CodeSearchNet) could be in training data, which is only partially addressed by the mutation approach."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Contamination is a central concern of the paper. The framework generates novel fault localization tasks by injecting unseen faults and SPMs into programs, a design motivated by the observation that 'LLMs undergo continuous training and are eventually exposed to public datasets' (Section I). The dynamic generation approach is the paper's key contribution for addressing contamination."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper reports total tokens analyzed (3.8 billion) and total prompts (1,163,686) but does not report API costs, cost per task, wall-clock time, or total expenditure for running 750K+ tasks across 10 LLMs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Table IV mentions '48 GB RAM, 48 cores with NVIDIA L40S GPU' as hardware but does not state total GPU hours, total API spend, or total computation time for the experiments."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "SPMs cause LLMs to fail to localize the same fault they correctly localized earlier in 78% of cases.",
    296       "evidence": "Section V-A reports this aggregate figure across all 10 LLMs, fault types, and SPM types. Figure 7 shows the per-model breakdown.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Dead code injection reduces LLM fault localization accuracy to 20.38% on average.",
    301       "evidence": "Section V-B and Figure 6 show dead code has the most substantial impact among SPM types, with 20.38% average accuracy.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "56% of correctly localized faults appear within the first 25% of program lines, compared to only 6% in the final 25%.",
    306       "evidence": "Section V-C and Figure 9 present a heatmap showing fault localization accuracy by fault location quartile across all fault types.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Fault localization accuracy drops by 1.04% per mutation strength step in Java and 1.93% per step in Python.",
    311       "evidence": "Section V-B and Figure 8 present mutation strength analysis from strength 1 to 8, showing near-linear degradation.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "Newer Claude and Gemini variants show only very modest fault localizability gains (1-2%).",
    316       "evidence": "Section V-E and Figure 11 show Gemini 2.0 Flash to 2.5 Flash improves 1.8%, Claude 3.7 Sonnet to 4.5 Sonnet improves 1.0%.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "Reasoning and multimodal models achieve the highest fault localization accuracy, while coding-specialized models yield the lowest performance.",
    321       "evidence": "Section V-D and Figure 10 categorize LLMs and show reasoning/multimodal models outperform generic and coder models.",
    322       "supported": "moderate"
    323     },
    324     {
    325       "claim": "Function reordering causes an 83% accuracy drop in Java programs.",
    326       "evidence": "Section V-B states 'We observe a significant accuracy drop of 83%' for function reordering mutations on Java.",
    327       "supported": "moderate"
    328     }
    329   ],
    330   "methodology_tags": [
    331     "benchmark-eval"
    332   ],
    333   "key_findings": "LLMs' fault localization reasoning is highly fragile to semantic-preserving code changes: across 750K tasks on 10 LLMs, applying SPMs causes failure in 78% of cases where the model initially succeeded. Dead code insertion and misleading comments are the most disruptive mutation types. LLMs exhibit strong positional bias, localizing faults in the first 25% of code 9x more often than in the final 25%. Newer model versions show only marginal (1-2%) improvements in fault localization robustness, suggesting model scaling alone does not address fundamental limitations in code reasoning.",
    334   "red_flags": [
    335     {
    336       "flag": "No statistical significance testing",
    337       "detail": "The paper makes numerous comparative claims (model A outperforms model B, mutation type X causes more degradation than Y) but provides no statistical tests. With 750K tasks, even trivially small differences would be statistically significant, making the absence notable."
    338     },
    339     {
    340       "flag": "No uncertainty quantification",
    341       "detail": "No confidence intervals, error bars, or variance measures are reported for any results. It is unclear whether experiments involve any randomness (e.g., API temperature settings) or were run multiple times."
    342     },
    343     {
    344       "flag": "Missing hyperparameter reporting",
    345       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 10 LLMs. These settings can significantly affect output quality and reproducibility, especially for a task such as fault localization, where a single correct line is expected."
    346     },
    347     {
    348       "flag": "Single metric evaluation",
    349       "detail": "Only exact line-number match accuracy is used. This binary metric does not capture near-misses (identifying the right function but wrong line) or cases where the model identifies the fault correctly but reports a slightly different line number. This could inflate the apparent fragility."
    350     },
    351     {
    352       "flag": "Seed program contamination not fully addressed",
    353       "detail": "While the paper's key contribution is addressing contamination through dynamic fault injection, the seed programs themselves come from public datasets (a HuggingFace dataset for Python and CodeSearchNet for Java) that are likely in LLM training data. LLMs may have memorized the correct versions, which could confound the analysis of semantic-preserving mutations."
    354     }
    355   ],
    356   "cited_papers": [
    357     {
    358       "title": "Evaluating large language models trained on code",
    359       "authors": ["M. Chen"],
    360       "year": 2021,
    361       "arxiv_id": "2107.03374",
    362       "relevance": "Foundational benchmark (HumanEval) for LLM code generation evaluation, relevant to survey scope on LLM programming capabilities."
    363     },
    364     {
    365       "title": "Can large language models reason about code?",
    366       "authors": ["T. Li", "Z. Wei", "M. Allamanis"],
    367       "year": 2024,
    368       "relevance": "Directly evaluates LLM code reasoning capabilities, central to survey scope on AI programming quality."
    369     },
    370     {
    371       "title": "Investigating data contamination in modern benchmarks for large language models",
    372       "authors": ["C. Deng", "Y. Zhao", "X. Tang", "M. Gerstein", "A. Cohan"],
    373       "year": 2024,
    374       "relevance": "Addresses benchmark contamination in LLM evaluation, a key methodological concern for the survey."
    375     },
    376     {
    377       "title": "Are large language models memorizing bug benchmarks?",
    378       "authors": ["D. Ramos", "C. Mamede", "K. Jain", "P. Canelas", "C. Gamboa", "C. Le Goues"],
    379       "year": 2025,
    380       "relevance": "Directly relevant to benchmark contamination in LLM code evaluation, a key methodological issue."
    381     },
    382     {
    383       "title": "Large language models for test-free fault localization",
    384       "authors": ["A. Z. H. Yang", "C. Le Goues", "R. Martins", "V. Hellendoorn"],
    385       "year": 2024,
    386       "relevance": "Evaluates LLM-based fault localization capabilities, directly relevant to AI programming quality assessment."
    387     },
    388     {
    389       "title": "SOAPFL: A standard operating procedure for LLM-based method-level fault localization",
    390       "authors": ["Y. Qin", "S. Wang", "Y. Lou", "J. Dong", "K. Wang", "X. Li", "X. Mao"],
    391       "year": 2025,
    392       "doi": "10.1109/TSE.2025",
    393       "relevance": "LLM-based fault localization methodology, relevant to survey scope on AI-assisted software engineering."
    394     },
    395     {
    396       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    397       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    398       "year": 2023,
    399       "relevance": "HumanEval+ rigorous evaluation of LLM code generation, relevant to methodology quality assessment."
    400     },
    401     {
    402       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    403       "authors": ["N. Jain", "K. Han", "A. Gu"],
    404       "year": 2025,
    405       "relevance": "Contamination-free LLM code benchmark, relevant to evaluation methodology in the survey."
    406     },
    407     {
    408       "title": "DebugBench: Evaluating debugging capability of large language models",
    409       "authors": ["R. Tian", "Y. Ye", "Y. Qin"],
    410       "year": 2024,
    411       "relevance": "Benchmark for LLM debugging capabilities, directly relevant to AI programming evaluation."
    412     },
    413     {
    414       "title": "FlexFL: Flexible and effective fault localization with open-source large language models",
    415       "authors": ["C. Xu", "Z. Liu", "X. Ren", "D. Lo"],
    416       "year": 2025,
    417       "relevance": "Open-source LLM fault localization, relevant to survey scope on AI-assisted debugging."
    418     },
    419     {
    420       "title": "Top score on the wrong exam: On benchmarking in machine learning for vulnerability detection",
    421       "authors": ["N. Risse", "J. Liu", "M. Böhme"],
    422       "year": 2024,
    423       "arxiv_id": "2408.12986",
    424       "relevance": "Critiques benchmarking methodology in ML for vulnerability detection, relevant to evaluation methodology quality."
    425     },
    426     {
    427       "title": "DIP: Dead code insertion based black-box attack for programming language model",
    428       "authors": ["C. Na", "Y. Choi", "J.-H. Lee"],
    429       "year": 2023,
    430       "relevance": "Dead code insertion attacks on code models, relevant to robustness evaluation of AI programming tools."
    431     }
    432   ]
    433 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs