scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25220B)
      1 {
      2   "paper": {
      3     "title": "LIDL: LLM Integration Defect Localization via Knowledge Graph-Enhanced Multi-Agent Analysis",
      4     "authors": ["Gou Tan", "Zilong He", "Min Li", "Pengfei Chen", "Jieke Shi", "Zhensu Sun", "Ting Zhang", "Danwen Chen", "Lwin Khin Shar", "Chuanfu Zhang", "David Lo"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.05539",
      8     "doi": "10.48550/arXiv.2601.05539"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "LIDL achieves 0.64 Top-3 accuracy and 0.48 MAP on 146 real-world LLM integration defects, outperforming the best baseline (AutoCodeRover) by 64.1%. It reduces cost by 92.5% compared to AutoCodeRover ($0.008 vs $0.106 per instance). Ablation shows annotation-based retrieval contributes most (-17.2% Top-3 when removed). The largest gains are in LLM System Management defects (+170.8% over best baseline).",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'All benchmark data and our implementation are publicly available at: https://github.com/IntelligentDDS/LIDL' (Section I)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The benchmark dataset of 146 defect instances is stated as publicly available at the same GitHub repository."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Section IV-A states 'Ubuntu 24.04, 64-core Intel Xeon Gold 6326 CPU, 128GB RAM, and 6 NVIDIA A40 48GB GPUs' and 'Python 3.10.16 with popular libraries.' Hardware is detailed but specific library versions are not enumerated."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are described in the paper. The GitHub link is provided but no README content or reproduction scripts are described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., Top-3: 0.64, MAP: 0.48). No confidence intervals, error bars, or uncertainty quantification is provided."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims LIDL 'significantly outperforms' baselines but no statistical significance tests (p-values, t-tests, etc.) are reported. Comparisons are based solely on point metric differences."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context throughout, e.g., '64.1% improvement over AutoCodeRover (0.39)' and '120.7% over SWE-agent (0.29)' (Section IV-B)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The dataset contains 146 instances after filtering from two sources, but no justification is given for why this sample size is sufficient for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread across runs is reported. Results appear to be from single runs (temperature=0.0 is stated but no mention of multiple runs)."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Five baselines are compared: SWE-agent, Agentless, AutoCodeRover, SWE-agent*, and Agentless* (Section IV)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include recent state-of-the-art methods: SWE-agent (NeurIPS 2024), Agentless (2024), AutoCodeRover (ISSTA 2024), RepoGraph (ICLR 2025). All are contemporary."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section IV-D (RQ3) presents a systematic ablation removing each of the four components individually (direct extraction, symptom inference, annotation retrieval, validator) with results in Table VI."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Four metrics are used: Top-1, Top-3, MAP, and MRR. Additionally, cost and token consumption are reported (Section IV-B, IV-C)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of LIDL's outputs is conducted. Evaluation is entirely automated against ground-truth patches. Human evaluation could assess whether the localization reports are useful to developers."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Parameter settings were determined through a 'pilot study on 15 defects' (Section IV-A) but there is no explicit statement that these 15 were excluded from the final evaluation, or that a separate held-out test set was used."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Fig. 7 provides per-category breakdowns across all four defect categories (Prompt/Context, LLM Interface, Tool Integration, LLM System) for all methods."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The case study in Section IV-E only shows a success case. No failure analysis is provided — the 36% of instances where LIDL fails to locate the defect in Top-3 are not analyzed."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that adding RepoGraph to baselines actually hurts performance: 'SWE-agent* drops from 0.29 to 0.21 Top-3 (-27.6%)' (Section IV-B). This is a negative result."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 0.64 Top-3 accuracy, 0.48 MAP, 64.1% improvement, and 92.5% cost reduction are all supported by Table V and the results in Section IV."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims ('each core component contributes meaningfully') are supported by the ablation study in Table VI, which systematically removes components. This is adequate for single-variable manipulation claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper title and abstract frame this as a general solution for 'LLM integration defects' but the evaluation is limited to Python applications from GitHub. The limitations section acknowledges Python-only support but the title/abstract do not bound this scope."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations for the observed improvements are discussed. For example, LIDL's advantage could partly stem from the specific composition of the benchmark rather than generalizable superiority."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures file-level localization accuracy (Top-k, MAP, MRR) and frames claims at that same granularity. It does not overreach to claim broader capabilities like 'defect repair' from localization metrics."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model versions are listed: 'Llama3.3-70B-Instruct', 'Qwen2.5-72B-Instruct', 'DeepSeek-V3.2', 'Kimi-K2', 'GPT-5.1', 'Claude-Sonnet-4.5', and 'BGE-M3' (Section IV-A)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Prompts are described in natural language (e.g., 'The LLM is prompted to perform three reasoning steps') but the actual prompt text used is not provided in the paper or appendix."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV-A reports temperature=0.0, and all key parameters: ks=10, kh=1, ke=5, ki=kr=5, wc=0.7, wd=0.3. Pilot study ranges are also given."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The multi-agent architecture is described in detail in Section III with workflow diagrams (Fig. 3, Fig. 4), showing the three agents, their coordination, and the knowledge graph construction pipeline."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section IV-A describes the data cleaning process: removing instances with missing repos, incomplete information, uncertain categories. Two annotators independently labeled defects with Cohen's kappa=0.9351."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section V includes 'Threats to Validity' (V-A) and 'Limitations and Future Work' (V-B), providing substantive discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats are discussed: dataset reduction may introduce selection bias (mitigated by two independent annotators), Python-only support, pattern library limited to popular frameworks, and model pricing variability."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section V-B explicitly states three limitations: cost varies by model, pattern library covers only popular frameworks, and LIDL currently supports Python only. Section V-A notes the dataset 'may not represent industrial codebases.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The benchmark data is stated as publicly available at the GitHub repository. The defects are sourced from public GitHub issues with direct links provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section IV-A describes two data sources: Hydrangea (888 original defects from 105 GitHub applications) and AgentIssue-Bench (50 original defects from 16 agent systems), with cleaning criteria."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are existing public datasets (Hydrangea and AgentIssue-Bench), which are standard benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from 888+50 original defects to 146 final instances is described with explicit filtering criteria: missing repos, incomplete info, uncertain categories. Annotator agreement (Cohen's kappa 0.9351) is reported."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper. No grants or sponsors are mentioned."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Sun Yat-sen University, Singapore Management University, and Monash University. No conflict with evaluated products."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure does not establish unfunded status."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper evaluates multiple LLMs on a benchmark but does not state training cutoff dates for any of the six models used. The benchmark defects are from public GitHub issues that could appear in training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the GitHub issues in the benchmark appeared in the training data of the evaluated LLMs. The Hydrangea and AgentIssue-Bench datasets are public and could be in training sets."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The benchmark is constructed from public GitHub issues. The LLMs may have seen these issues and their solutions during training, which could inflate performance. This is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section IV-C reports average cost per instance for all methods across all models. LIDL costs $0.008 per instance on kimi-k2, with detailed token consumption (19.5k input, 0.3k output)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Hardware is specified (6 NVIDIA A40 48GB GPUs, 64-core CPU, 128GB RAM). Per-instance costs and token budgets are reported for all methods in Table V."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Temperature is set to 0.0 for reproducibility but no seed sensitivity analysis is reported. Even at temperature 0, LLM outputs can vary across API calls."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper does not state how many runs produced the reported results. Single-run results are implied but not explicitly confirmed."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section IV-A states parameters were determined through a 'pilot study on 15 defects' with specific ranges tested: ks∈{5,10,15}, kh∈{1,2}, ke,ki,kr∈{3,5,8}. Performance stability across ranges is noted."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The pilot study approach and the specific ranges tested are described. The paper notes 'Performance remained stable across these ranges,' justifying the selected configuration."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper compares LIDL against 5 baselines across 6 models and 4 metrics with no correction for multiple comparisons. No statistical tests are performed at all."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement LIDL and compare against their own implementations/configurations of baselines. No discussion of author-evaluation bias per Lucic et al. (2018)."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Section IV-C explicitly compares cost vs accuracy, showing LIDL achieves the best accuracy-cost trade-off: '60% more than Agentless ($0.008 vs. $0.005) but improves Top-3 accuracy by 68.4%.'"
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the 146-defect benchmark adequately represents the space of LLM integration defects. The benchmark is constructed from two existing datasets without analysis of its construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "LIDL uses a fundamentally different architecture (knowledge graph + multi-agent) than baselines. Performance differences could be attributed to the scaffold/architecture rather than the specific techniques. This confound is not discussed."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The benchmark defects are from public GitHub issues. No discussion of whether these issues and their solutions existed before the LLMs' training cutoff dates."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The defect descriptions include error traces and file paths from real GitHub issues. No discussion of whether the evaluation setup provides information that would not be available in practice."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
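    353         "justification": "Multiple defects come from the same repositories: the 146 instances derive from source datasets covering at most 105 GitHub applications (Hydrangea) and 16 agent systems (AgentIssue-Bench). No discussion of non-independence between instances from the same codebase."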
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is used. No analysis of whether models have seen the benchmark issues during training."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "LIDL achieves 0.64 Top-3 accuracy and 0.48 MAP, representing a 64.1% improvement over the best-performing baseline (AutoCodeRover).",
    365       "evidence": "Table V shows kimi-k2 results: LIDL Top-3=0.64, MAP=0.48 vs AutoCodeRover Top-3=0.39, MAP=0.28. Section IV-B.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "LIDL reduces cost by 92.5% compared to AutoCodeRover while maintaining superior accuracy.",
    370       "evidence": "Table V: LIDL $0.008 vs AutoCodeRover $0.106 per instance on kimi-k2. Section IV-C.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "LIDL uniquely localizes 18 defects (12.3%) that all baselines miss.",
    375       "evidence": "Overlap analysis in Fig. 8 and Section IV-B shows 18 unique defects found only by LIDL.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Annotation-based retrieval is the most critical component, with removal causing a 17.2% drop in Top-3.",
    380       "evidence": "Ablation study in Table VI: LIDL w/o R shows Top-3=0.53 vs full LIDL Top-3=0.64. Section IV-D.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Adding RepoGraph to baselines shows no improvement and can hurt performance.",
    385       "evidence": "Table V and Section IV-B: SWE-agent* drops from 0.29 to 0.21 Top-3 (-27.6%), Agentless* drops from 0.38 to 0.36 (-5.3%).",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No statistical significance testing",
    392       "detail": "The paper repeatedly claims LIDL 'significantly outperforms' baselines but provides no statistical tests. All comparisons are based on point estimates from what appear to be single runs. With 146 instances, bootstrap or permutation tests would be appropriate."
    393     },
    394     {
    395       "flag": "No variance or multiple runs reported",
    396       "detail": "Even with temperature=0.0, LLM API calls can produce different outputs. No repeated runs are reported, making it impossible to assess result stability."
    397     },
    398     {
    399       "flag": "Benchmark contamination risk",
    400       "detail": "The benchmark consists of public GitHub issues with their solutions. Models like GPT-5.1 and Claude-Sonnet-4.5 were likely trained on GitHub data including these issues. Performance differences between models could partly reflect training data overlap rather than method quality."
    401     },
    402     {
    403       "flag": "No failure case analysis",
    404       "detail": "LIDL fails to locate defects in Top-3 for 36% of instances. No analysis of these failures is provided — only a success case study is shown."
    405     },
    406     {
    407       "flag": "Pilot study not separated from evaluation",
    408       "detail": "Parameters were tuned on 15 defects from the same pool but there is no statement that these were excluded from the 146-instance evaluation."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    414       "authors": ["John Yang", "Carlos E. Jimenez"],
    415       "year": 2024,
    416       "relevance": "Major baseline for repository-level defect localization using LLM agents."
    417     },
    418     {
    419       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    420       "authors": ["Chunqiu Steven Xia", "Yinlin Deng"],
    421       "year": 2024,
    422       "arxiv_id": "2407.01489",
    423       "relevance": "Lightweight hierarchical approach to LLM-based defect localization, key baseline."
    424     },
    425     {
    426       "title": "AutoCodeRover: Autonomous Program Improvement",
    427       "authors": ["Yuntong Zhang", "Haifeng Ruan"],
    428       "year": 2024,
    429       "relevance": "Best-performing baseline using code search APIs for defect localization."
    430     },
    431     {
    432       "title": "RepoGraph: Enhancing AI Software Engineering with Repository-Level Code Graph",
    433       "authors": ["Siru Ouyang", "Wenhao Yu"],
    434       "year": 2025,
    435       "relevance": "Repository graph approach for augmenting LLM-based SE tools."
    436     },
    437     {
    438       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    439       "authors": ["Xingyao Wang", "Boxuan Li"],
    440       "year": 2025,
    441       "relevance": "Open platform for LLM-based software development agents."
    442     },
    443     {
    444       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    445       "authors": ["Qingyun Wu", "Gagan Bansal"],
    446       "year": 2023,
    447       "arxiv_id": "2308.08155",
    448       "relevance": "Multi-agent framework frequently used as subject system in LLM integration defect studies."
    449     },
    450     {
    451       "title": "Are LLMs Correctly Integrated into Software Systems?",
    452       "authors": ["Yuchen Shao", "Yuheng Huang"],
    453       "year": 2025,
    454       "relevance": "Prior study identifying and categorizing LLM integration defects (Hydrangea dataset source)."
    455     },
    456     {
    457       "title": "Can Agents Fix Agent Issues?",
    458       "authors": ["Alfin Wijaya Rahardja", "Junwei Liu"],
    459       "year": 2025,
    460       "arxiv_id": "2505.20749",
    461       "relevance": "AgentIssue-Bench dataset source for evaluating agent defect fixing capabilities."
    462     },
    463     {
    464       "title": "CodexGraph: Bridging Large Language Models and Code Repositories via Code Graph Databases",
    465       "authors": ["Xiangyan Liu", "Bo Lan"],
    466       "year": 2025,
    467       "relevance": "Repository-structured approach using code graph databases for LLM reasoning about code."
    468     },
    469     {
    470       "title": "Why Do Multi-Agent LLM Systems Fail?",
    471       "authors": ["Mert Cemri", "Melissa Z. Pan"],
    472       "year": 2025,
    473       "arxiv_id": "2503.13657",
    474       "relevance": "Analysis of failure modes in multi-agent LLM systems, directly relevant to LLM integration defects."
    475     },
    476     {
    477       "title": "Defining and Detecting the Defects of the Large Language Model-based Autonomous Agents",
    478       "authors": ["Kaiwen Ning", "Jiachi Chen"],
    479       "year": 2024,
    480       "arxiv_id": "2412.18371",
    481       "relevance": "Taxonomy of defects in LLM-based autonomous agents."
    482     }
    483   ]
    484 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs