scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24303B)
      1 {
      2   "paper": {
      3     "title": "Large Language Models Meet Automated Program Repair: Innovations, Challenges and Solutions",
      4     "authors": ["Yiting Tang"],
      5     "year": 2024,
      6     "venue": "Proceedings of the 2nd International Conference on Machine Learning and Automation",
      7     "doi": "10.54254/2755-2721/113/2024.18303"
      8   },
      9   "scan_version": 3,
     10   "active_modules": ["survey_methodology"],
     11   "methodology_tags": ["meta-analysis"],
     12   "key_findings": "This survey reviews 8 LLM-based automated program repair systems (FitRepair, TypeFix, InferFix, RepairAgent, FixAgent, ThinkRepair, SRepair, ChatRepair), comparing their model architectures, techniques, and benchmark performance. The paper argues that LLM-based APR surpasses traditional neural program repair in performance and generality, particularly in zero-shot scenarios. It identifies data leakage and high computational overhead as key open challenges and proposes solution directions including data filtering, memorization detection, candidate patch reduction, and self-correction techniques.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code, analysis scripts, or repository links are provided. A survey can release its analysis code and collected data but this paper does not."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset, spreadsheet of reviewed systems, or structured data from the review is released. The comparison data in Table 1 is the extent of structured information and is only in the paper body."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment or tooling specifications are provided. This is a narrative survey with no computational component described."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No instructions for reproducing the review are provided — no search queries, databases searched, or systematic process to follow."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "Survey paper with no original experiments. The paper reports numbers from other papers' results (Table 1) but does not conduct its own statistical analyses."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "Survey paper with no original experiments or meta-analytic statistical aggregation."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "Survey paper with no original experiments or statistical aggregation."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "Survey paper with no original experiments."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "Survey paper with no original experiments."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper claims to be 'the first to provide a comprehensive review' of LLM-based APR but does not compare its coverage or methodology against any prior surveys on APR or related topics."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper does not position itself against any prior survey (including the overlapping neural program repair survey it lists as reference [7]) to establish how this review advances the state of knowledge."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No system with components to ablate — this is a survey paper."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No original experiments conducted."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No experiments conducted that would warrant human evaluation."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No experiments conducted."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 1 provides a per-system breakdown across model name, architecture, technique, and performance on multiple benchmarks (Defects4J, HumanEval-Java, QuixBugs, BugsInPy, Codeflaws)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 5 discusses challenges including data leakage (Section 5.1) and high computational overhead (Section 5.3) as failure modes of LLM-based APR."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports negative aspects including data leakage concerns, high fine-tuning costs, lack of domain-specific knowledge, and insufficient adaptability in complex repair scenarios (Sections 5.1, 5.3, and the abstract)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The abstract claims 'our paper is the first to provide a comprehensive review of the LLM-based APR domain' but the paper reviews only 8 systems with very brief descriptions (1-3 sentences each in Sections 3.1-3.8). The claim of comprehensiveness is not supported by the shallow coverage. The claim that 'zero-shot LLM-based APRs has surpassed that of NPR' is stated without systematic evidence — Table 1 has mostly empty cells making comparison impossible."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper is a narrative survey that summarizes existing work. It does not make original causal claims requiring experimental justification."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes broad claims like 'LLMs have demonstrated advantages over traditional NPR approaches in various repair scenarios' and 'LLM-based APR excels in repair performance and flexibility' but only reviews 8 systems tested on a narrow set of Java/Python benchmarks. No scope boundaries are stated for these generalizations."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not consider alternative explanations for why LLM-based APR outperforms NPR (e.g., differences in training data scale, evaluation methodology differences across studies, or benchmark-specific advantages)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper equates benchmark fix counts (e.g., bugs fixed on Defects4J) with 'repair performance' and 'software reliability' without discussing the gap between fixing curated benchmark bugs and actual software repair quality in real-world settings."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "Survey paper that does not use any models directly. Model names are listed for reviewed systems but the survey itself has no experimental setup."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "Survey paper that does not use prompting."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "Survey paper with no experimental setup requiring hyperparameters."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "Survey paper that does not use agentic scaffolding."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The paper does not document any paper selection pipeline — no search queries, databases searched, inclusion/exclusion criteria, or filtering steps. The 8 systems appear to have been selected ad hoc with no stated methodology."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. The conclusion (Section 6) mentions future directions but does not discuss limitations of the review itself."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed. The paper does not acknowledge potential issues with its own review methodology, scope, or selection bias."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what is excluded from the review. There are no statements about which types of APR systems, time periods, or venues were out of scope."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data or supplementary materials are provided. The comparison data in Table 1 is the only structured data and appears to be manually compiled with no verification possible."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper does not describe how the 8 APR systems were identified or selected for review. No search strategy, databases, or selection criteria are mentioned."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants in this survey paper."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No data pipeline is documented. The paper provides no information about how papers were found, screened, or how performance numbers in Table 1 were extracted and verified."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding statement or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The author's affiliation with Northwest Minzu University, Lanzhou, China is clearly stated below the author name."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so funder independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial disclosure statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks. The paper does discuss contamination as a challenge for reviewed systems (Section 5.1) but does not itself perform benchmark evaluation."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this survey paper."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this survey paper."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this survey paper."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this survey paper."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this survey paper."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this survey paper."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this survey paper."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Survey paper with no original method or experiments. The paper discusses costs of reviewed systems but has no cost of its own."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Survey paper with no computational experiments."
    289       }
    290     },
    291     "survey_methodology": {
    292       "prisma_or_structured_protocol": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No PRISMA flow diagram, no structured search strategy, no reproducible queries, and no review protocol. The 8 systems appear selected without any stated methodology."
    296       },
    297       "quality_assessment_of_sources": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The paper does not assess the methodological quality of the 8 reviewed systems. It reports their claimed performance numbers from Table 1 without evaluating whether those results are trustworthy, comparable, or methodologically sound."
    301       },
    302       "publication_bias_discussed": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No discussion of publication bias. The paper does not consider whether the reviewed systems represent a biased sample of positive results or whether negative results in APR are underrepresented."
    306       }
    307     }
    308   },
    309   "engagement_factors": {
    310     "practical_relevance": {
    311       "score": 1,
    312       "justification": "Provides a brief overview of 8 APR tools that practitioners might find useful as an entry point, but offers no actionable guidance or new tools."
    313     },
    314     "surprise_contrarian": {
    315       "score": 0,
    316       "justification": "Confirms the expected narrative that LLMs outperform traditional NPR methods, with no surprising findings."
    317     },
    318     "fear_safety": {
    319       "score": 0,
    320       "justification": "No safety or security concerns raised."
    321     },
    322     "drama_conflict": {
    323       "score": 0,
    324       "justification": "No controversy or conflict; straightforward positive framing of LLM-based APR progress."
    325     },
    326     "demo_ability": {
    327       "score": 0,
    328       "justification": "No code, demo, or tool released."
    329     },
    330     "brand_recognition": {
    331       "score": 0,
    332       "justification": "Unknown venue (2nd International Conference on Machine Learning and Automation; not to be confused with IEEE ICMLA), single author from a regional university, no association with prominent labs."
    333     }
    334   },
    335   "claims": [
    336     {
    337       "claim": "LLM-based APR exhibits superior repair performance and enhanced generality compared to traditional NPR methods.",
    338       "evidence": "Stated in abstract and Section 1. Table 1 shows performance numbers for 8 systems on various benchmarks, but most cells are empty (dashes), making systematic comparison impossible.",
    339       "supported": "weak"
    340     },
    341     {
    342       "claim": "Zero-shot LLM-based APR has surpassed NPR performance.",
    343       "evidence": "Stated in abstract. Section 4.1 mentions AlphaRepair as a zero-shot approach but provides no direct comparative data between zero-shot LLM-APR and NPR on the same benchmarks.",
    344       "supported": "unsupported"
    345     },
    346     {
    347       "claim": "This is the first comprehensive review of LLM-based APR from perspectives of innovation, challenges, and solutions.",
    348       "evidence": "Stated in contributions (Section 1). No search of prior surveys is conducted to verify this novelty claim. The review covers only 8 systems with 1-3 sentence descriptions each.",
    349       "supported": "unsupported"
    350     },
    351     {
    352       "claim": "Data leakage and high computational overhead are key challenges for LLM-based APR.",
    353       "evidence": "Section 5 discusses these challenges with references to relevant work (Yang et al. 2024 on memorization, Zhong et al. 2022 on reducing candidate patches). The challenges are well-supported by the cited literature.",
    354       "supported": "moderate"
    355     },
    356     {
    357       "claim": "Reducing candidate patches to approximately 170 achieves 90% of optimal performance.",
    358       "evidence": "Section 5.4.1 cites StandUp4NPR (Zhong et al. 2022) for this claim. The number comes from the cited paper, not original analysis.",
    359       "supported": "moderate"
    360     }
    361   ],
    362   "red_flags": [
    363     {
    364       "flag": "No systematic review methodology",
    365       "detail": "The paper claims to be a 'comprehensive review' but has no PRISMA protocol, no search strategy, no inclusion/exclusion criteria, and no stated methodology for selecting the 8 reviewed systems. The selection appears ad hoc."
    366     },
    367     {
    368       "flag": "Extremely shallow coverage despite comprehensiveness claim",
    369       "detail": "Each of the 8 APR systems receives only 1-3 sentences of description (Sections 3.1-3.8). FitRepair, TypeFix, InferFix, RepairAgent, and FixAgent each get a single short paragraph with no technical depth. This does not constitute a comprehensive review."
    370     },
    371     {
    372       "flag": "Table 1 is mostly empty",
    373       "detail": "The main comparison table (Table 1) has dashes (-) in the majority of performance cells, making cross-system comparison impossible. For example, InferFix shows no results on any benchmark, and most systems are missing results on 4 of 5 benchmarks."
    374     },
    375     {
    376       "flag": "No quality assessment of reviewed papers",
    377       "detail": "The survey reports performance numbers from reviewed papers at face value without assessing their methodological quality, whether results are comparable across different evaluation setups, or whether claimed numbers are trustworthy. Unvetted claims from the sources are thereby passed along as if they were validated findings."
    378     },
    379     {
    380       "flag": "Duplicate and inconsistent references",
    381       "detail": "InferFix appears as references [8], [15], and [21] — the same paper cited three times with the same text. SRepair (Wu et al. 2024) appears as [19], [29], [32], and [35]. This suggests careless reference management."
    382     },
    383     {
    384       "flag": "Author name discrepancy",
    385       "detail": "The paper header lists 'Yiting Tang' (the email prefix 'tangyt' is consistent with this), but the registry entry lists 'Yi Tang', which differs from the paper itself. Minor."
    386     },
    387     {
    388       "flag": "Overclaiming novelty",
    389       "detail": "The paper claims to be 'the first to provide a comprehensive review' of LLM-based APR, but does not discuss or compare against existing surveys on APR — e.g., Zhong et al. 2022 'Neural Program Repair: Systems, Challenges, and Solutions' is listed as reference [7] and covers overlapping ground, yet the paper never positions itself relative to it."
    390     }
    391   ],
    392   "cited_papers": [
    393     {
    394       "title": "Cure: Code-Aware Neural Machine Translation for Automated Program Repair",
    395       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    396       "year": 2021,
    397       "doi": "10.1109/ICSE43902.2021.00107",
    398       "relevance": "Neural machine translation approach to APR, demonstrates code-aware techniques for automated bug fixing."
    399     },
    400     {
    401       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning",
    402       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    403       "year": 2022,
    404       "relevance": "Introduces AlphaRepair, demonstrating that zero-shot LLM-based repair can surpass trained NPR systems, key milestone in APR evolution."
    405     },
    406     {
    407       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    408       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    409       "year": 2023,
    410       "arxiv_id": "2304.00385",
    411       "relevance": "ChatRepair — demonstrates conversational LLM-based APR with multi-round dialogue, reports cost-effectiveness metrics."
    412     },
    413     {
    414       "title": "Revisiting the Plastic Surgery Hypothesis via Large Language Models",
    415       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    416       "year": 2023,
    417       "arxiv_id": "2303.10494",
    418       "relevance": "FitRepair — uses model ensemble with retrieval-enhanced CodeT5 for program repair."
    419     },
    420     {
    421       "title": "InferFix: End-to-end program repair with LLMs over Retrieval-augmented prompts",
    422       "authors": ["Matthew Jin", "Syed Shahriar", "Michele Tufano", "Xin Shi", "Shuai Lu", "Neel Sundaresan", "Alexey Svyatkovskiy"],
    423       "year": 2023,
    424       "relevance": "Retrieval-augmented LLM approach to APR using Codex, demonstrates industry-scale program repair."
    425     },
    426     {
    427       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    428       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    429       "year": 2024,
    430       "arxiv_id": "2403.17134",
    431       "relevance": "Autonomous agent-based APR using GPT-3.5, represents the shift toward agentic program repair systems."
    432     },
    433     {
    434       "title": "ThinkRepair: Self-Directed Automated Program Repair",
    435       "authors": ["Xin Yin", "Chao Ni", "Shaohua Wang", "Zhenhao Li", "Limin Zeng", "Xiaohu Yang"],
    436       "year": 2024,
    437       "doi": "10.1145/3650212.3680359",
    438       "relevance": "Uses Chain-of-Thought reasoning for APR, demonstrating advanced prompting techniques for bug repair."
    439     },
    440     {
    441       "title": "A Unified Debugging Approach via LLM-Based Multi-Agent Synergy",
    442       "authors": ["Cheryl Lee", "Chunqiu Steven Xia", "Jen-tse Huang", "Zhouruixin Zhu", "Lingming Zhang", "Michael R. Lyu"],
    443       "year": 2024,
    444       "relevance": "FixAgent — multi-agent system for debugging combining multiple repair strategies with GPT-4."
    445     },
    446     {
    447       "title": "How Far Can We Go with Practical Function-Level Program Repair?",
    448       "authors": ["Mingyuan Wu", "Jiahong Xiang", "Xiaoyang Xu", "Fanchu Kong", "Haotian Zhang", "Yuqun Zhang"],
    449       "year": 2024,
    450       "arxiv_id": "2404.12833",
    451       "relevance": "SRepair — function-level APR using GPT-3.5 + Magicoder-7B pipeline, addresses practical scalability of LLM-based repair."
    452     },
    453     {
    454       "title": "Domain Knowledge Matters: Improving Prompts with Fix Templates for Repairing Python Type Errors",
    455       "authors": ["Yun Peng", "Shuzheng Gao", "Cuiyun Gao", "Yintong Huo", "Michael R. Lyu"],
    456       "year": 2023,
    457       "relevance": "TypeFix — domain-specific APR for Python type errors using retrieval and fine-tuning on CodeT5."
    458     },
    459     {
    460       "title": "Unveiling Memorization in Code Models",
    461       "authors": ["Zhou Yang", "Zhipeng Zhao", "Chenyu Wang", "Jieke Shi", "Dongsun Kim", "Donggyun Han", "David Lo"],
    462       "year": 2024,
    463       "doi": "10.1145/3597503.3639074",
    464       "relevance": "Directly relevant to data contamination and leakage detection in code LLMs, proposes Type-1 clone detection and perplexity-based memorization detection."
    465     },
    466     {
    467       "title": "StandUp4NPR: Standardizing SetUp for Empirically Comparing Neural Program Repair Systems",
    468       "authors": ["Wenkang Zhong", "Hongliang Ge", "Hongfei Ai", "Chuanyi Li", "Kui Liu", "Jidong Ge", "Bin Luo"],
    469       "year": 2022,
    470       "doi": "10.1145/3551349.3556943",
    471       "relevance": "Standardizes NPR evaluation methodology, provides empirical findings on candidate patch reduction for cost optimization."
    472     }
    473   ]
    474 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs