scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (23960B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LIDL: LLM Integration Defect Localization via Knowledge Graph-Enhanced Multi-Agent Analysis",
      6     "authors": [
      7       "Gou Tan",
      8       "Zilong He",
      9       "Min Li",
     10       "Pengfei Chen",
     11       "Jieke Shi",
     12       "Zhensu Sun",
     13       "Ting Zhang",
     14       "Danwen Chen",
     15       "Lwin Khin Shar",
     16       "Chuanfu Zhang",
     17       "David Lo"
     18     ],
     19     "year": 2026,
     20     "venue": "arXiv.org",
     21     "arxiv_id": "2601.05539",
     22     "doi": "10.48550/arXiv.2601.05539"
     23   },
     24   "checklist": {
     25     "claims_and_evidence": {
     26       "abstract_claims_supported": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All numerical claims in the abstract (Top-3 0.64, MAP 0.48, 64.1% improvement, 92.5% cost reduction) are directly supported by Table V results with kimi-k2.",
     30         "source": "haiku"
     31       },
     32       "causal_claims_justified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Causal claims about component contributions are supported by ablation studies in Table VI that remove each component and quantify the performance impact.",
     36         "source": "haiku"
     37       },
     38       "generalization_bounded": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section V explicitly bounds results to Python LLM-integrated software from GitHub repositories and acknowledges this may not generalize to industrial codebases.",
     42         "source": "haiku"
     43       },
     44       "alternative_explanations_discussed": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper does not consider whether improvements stem from additional LLM calls, better prompting design, or domain-specific patterns—only the proposed architecture is discussed as the cause of the gains.",
     48         "source": "haiku"
     49       },
     50       "proxy_outcome_distinction": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Top-k file identification accuracy and MAP/MRR directly match the claimed task of defect file localization; no proxy outcome substitution occurs.",
     54         "source": "haiku"
     55       }
     56     },
     57     "limitations_and_scope": {
     58       "limitations_section_present": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section V contains dedicated subsections for 'Threats to Validity' and 'Limitations and Future Work' with substantive content beyond boilerplate.",
     62         "source": "haiku"
     63       },
     64       "threats_to_validity_specific": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Specific threats named: Python-only support, popular-framework dependency limiting annotation coverage, GitHub-only focus, and dataset selection bias with mitigation via inter-annotator agreement (κ=0.9351).",
     68         "source": "haiku"
     69       },
     70       "scope_boundaries_stated": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper explicitly states Python-only scope, GitHub repositories, popular LLM frameworks only, and notes industrial codebase generalization is unvalidated.",
     74         "source": "haiku"
     75       }
     76     },
     77     "conflicts_of_interest": {
     78       "funding_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No funding acknowledgment section appears anywhere in the paper despite multi-institutional authorship.",
     82         "source": "haiku"
     83       },
     84       "affiliations_disclosed": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Author affiliations are explicitly listed: Sun Yat-sen University, Singapore Management University, and Monash University.",
     88         "source": "haiku"
     89       },
     90       "funder_independent_of_outcome": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No funding is disclosed, making funder independence impossible to assess.",
     94         "source": "haiku"
     95       },
     96       "financial_interests_declared": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No competing interests or financial disclosure statement appears anywhere in the paper.",
    100         "source": "haiku"
    101       }
    102     },
    103     "scope_and_framing": {
    104       "key_terms_defined": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section II.A defines 'LLM-integrated software' and Section II.B defines and categorizes 'LLM integration defects' into four types with representative examples.",
    108         "source": "haiku"
    109       },
    110       "intended_contribution_clear": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Three contributions are explicitly enumerated: the knowledge graph approach, the LIDL multi-agent implementation, and evaluation on 146 real-world defects.",
    114         "source": "haiku"
    115       },
    116       "engagement_with_prior_work": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section II.C systematically analyzes why existing approaches (SWE-agent, Agentless, AutoCodeRover, RepoGraph) fail for LLM integration defects, explaining what LIDL specifically adds.",
    120         "source": "haiku"
    121       }
    122     }
    123   },
    124   "type_checklist": {
    125     "empirical": {
    126       "artifacts": {
    127         "code_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper states 'All benchmark data and our implementation are publicly available at: https://github.com/IntelligentDDS/LIDL'.",
    131           "source": "haiku"
    132         },
    133         "data_released": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "Benchmark data is claimed publicly available at the GitHub URL; source datasets Hydrangea and AgentIssue-Bench are cited prior published work.",
    137           "source": "haiku"
    138         },
    139         "environment_specified": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The OS and hardware are specified (Ubuntu 24.04, Intel Xeon Gold 6326, 128GB RAM, NVIDIA A40) and Python 3.10.16 is mentioned, but no requirements.txt, Dockerfile, or dependency list is provided.",
    143           "source": "haiku"
    144         },
    145         "reproduction_instructions": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No step-by-step reproduction instructions appear in the paper; only a GitHub URL is provided without describing repository contents or how to run experiments.",
    149           "source": "haiku"
    150         }
    151       },
    152       "statistical_methodology": {
    153         "confidence_intervals_or_error_bars": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No confidence intervals or error bars are reported for any results in Tables V or VI; all metrics are single-point estimates.",
    157           "source": "haiku"
    158         },
    159         "significance_tests": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No statistical significance tests are applied to comparative results despite making quantitative comparative claims across six models and five baselines.",
    163           "source": "haiku"
    164         },
    165         "effect_sizes_reported": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Percentage improvements over baselines are reported (64.1%, 120.7%, 92.5%, etc.) alongside absolute metric values, providing effect size context.",
    169           "source": "haiku"
    170         },
    171         "sample_size_justified": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "The 146-instance dataset size is not justified and no power analysis is provided to support sufficiency for the statistical comparisons made.",
    175           "source": "haiku"
    176         },
    177         "variance_reported": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "No variance or standard deviation across multiple runs is reported; temperature=0.0 reduces but does not eliminate LLM variance, and no run-to-run statistics appear.",
    181           "source": "haiku"
    182         }
    183       },
    184       "evaluation_design": {
    185         "baselines_included": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Five baselines included: SWE-agent, Agentless, AutoCodeRover, and RepoGraph-enhanced variants SWE-agent* and Agentless*.",
    189           "source": "haiku"
    190         },
    191         "baselines_contemporary": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "All baselines are from 2024-2025: SWE-agent (NeurIPS 2024), Agentless (2024), AutoCodeRover (ISSTA 2024), RepoGraph (ICLR 2025).",
    195           "source": "haiku"
    196         },
    197         "ablation_study": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Table VI presents ablation removing each of four components (direct extraction, symptom inference, annotation retrieval, validator) with quantified performance impact.",
    201           "source": "haiku"
    202         },
    203         "multiple_metrics": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Six evaluation metrics used: Top-1, Top-3, MAP, MRR, $Cost, and #Tokens (input/output separately).",
    207           "source": "haiku"
    208         },
    209         "human_evaluation": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "Human evaluation is not applicable for automated defect file localization benchmarked against ground-truth file labels.",
    213           "source": "haiku"
    214         },
    215         "held_out_test_set": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "15 defects used for hyperparameter pilot tuning are part of the 146-instance total; it is not stated whether these are excluded from the final evaluation.",
    219           "source": "haiku"
    220         },
    221         "per_category_breakdown": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Fig. 7 provides per-category performance breakdown across all four defect categories for all six methods on Top-1, Top-3, MAP, and MRR.",
    225           "source": "haiku"
    226         },
    227         "failure_cases_discussed": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "The paper discusses what baselines fail on but does not analyze the 36% of cases where LIDL itself fails to find the correct file in Top-3.",
    231           "source": "haiku"
    232         },
    233         "negative_results_reported": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "The paper reports that RepoGraph-enhanced methods (SWE-agent*, Agentless*) show no improvement; SWE-agent* drops from 0.29 to 0.21 Top-3, a negative result.",
    237           "source": "haiku"
    238         }
    239       },
    240       "setup_transparency": {
    241         "model_versions_specified": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Specific model versions are named: Llama3.3-70B-Instruct, Qwen2.5-72B-Instruct, DeepSeek-V3.2, Kimi-K2, GPT-5.1, Claude-Sonnet-4.5, BGE-M3.",
    245           "source": "haiku"
    246         },
    247         "prompts_provided": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "The paper describes prompting strategies conceptually (e.g., 'the LLM is prompted to perform three reasoning steps') but no actual prompt text is provided.",
    251           "source": "haiku"
    252         },
    253         "hyperparameters_reported": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Key parameters reported: temperature=0.0, ks=10, kh=1, ke=5, ki=kr=5, wc=0.7, wd=0.3, all determined through a described pilot study.",
    257           "source": "haiku"
    258         },
    259         "scaffolding_described": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section III describes all three agents in algorithmic detail: Code Knowledge Graph Constructor, Defect Analyzer (three sub-components), and Context-aware Validator.",
    263           "source": "haiku"
    264         },
    265         "data_preprocessing_documented": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Data cleaning criteria explicitly documented: removal of missing repo versions, incomplete info, uncertain categories; two independent annotators with Cohen's kappa 0.9351.",
    269           "source": "haiku"
    270         }
    271       },
    272       "data_integrity": {
    273         "raw_data_available": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The paper states all benchmark data is publicly available at the GitHub repository.",
    277           "source": "haiku"
    278         },
    279         "data_collection_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Data sourced from Hydrangea (888 defects, 105 GitHub apps) and AgentIssue-Bench (50 defects, 16 agent systems), with filtering reducing to 146 instances.",
    283           "source": "haiku"
    284         },
    285         "recruitment_methods_described": {
    286           "applies": false,
    287           "answer": false,
    288           "justification": "No human participants; data collected from public GitHub repositories using prior published benchmarks.",
    289           "source": "haiku"
    290         },
    291         "data_pipeline_documented": {
    292           "applies": true,
    293           "answer": true,
    294           "justification": "Pipeline from source datasets through filtering (three removal criteria) and annotation (two independent annotators, kappa measurement, conflict resolution) is documented.",
    295           "source": "haiku"
    296         }
    297       },
    298       "contamination": {
    299         "training_cutoff_stated": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No training data cutoffs are stated for any of the six LLMs evaluated, despite the benchmark using GitHub issues from 2023-2025 that could appear in training data.",
    303           "source": "haiku"
    304         },
    305         "train_test_overlap_discussed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The paper does not discuss whether GitHub issues in the benchmark were seen during LLM training, a real concern given issues predate some model training cutoffs.",
    309           "source": "haiku"
    310         },
    311         "benchmark_contamination_addressed": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "The benchmark uses publicly accessible GitHub issues dating to 2023-2025; contamination risk from models trained on GitHub data is not addressed.",
    315           "source": "haiku"
    316         }
    317       },
    318       "human_studies": {
    319         "pre_registered": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "haiku"
    324         },
    325         "irb_or_ethics_approval": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "haiku"
    330         },
    331         "demographics_reported": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "haiku"
    336         },
    337         "inclusion_exclusion_criteria": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "haiku"
    342         },
    343         "randomization_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "haiku"
    348         },
    349         "blinding_described": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "haiku"
    354         },
    355         "attrition_reported": {
    356           "applies": false,
    357           "answer": false,
    358           "justification": "No human participants in this study.",
    359           "source": "haiku"
    360         }
    361       },
    362       "cost_and_practicality": {
    363         "inference_cost_reported": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Average cost per instance is reported in Table V for all six LLMs across all methods (e.g., LIDL with kimi-k2: $0.008, with claude-sonnet-4.5: $0.086).",
    367           "source": "haiku"
    368         },
    369         "compute_budget_stated": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "Hardware is specified but total compute budget for the full experiment suite is not stated; only per-instance API costs are reported.",
    373           "source": "haiku"
    374         }
    375       }
    376     }
    377   },
    378   "claims": [
    379     {
    380       "claim": "LIDL achieves Top-3 accuracy of 0.64 and MAP of 0.48, outperforming the best baseline (AutoCodeRover) by 64.1% in Top-3 accuracy",
    381       "evidence": "Table V shows LIDL with kimi-k2 achieves Top-3=0.64, MAP=0.48 vs AutoCodeRover's Top-3=0.39, MAP=0.28",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "LIDL reduces per-instance cost by 92.5% compared to AutoCodeRover ($0.008 vs $0.106 with kimi-k2)",
    386       "evidence": "Table V directly reports per-instance costs for all methods across all models; the comparison holds across all LLMs tested",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Generic repository graph augmentation (RepoGraph) provides no benefit for LLM integration defect localization",
    391       "evidence": "SWE-agent* drops from Top-3 0.29 to 0.21 (-27.6%) and Agentless* drops from 0.38 to 0.36 (-5.3%) when RepoGraph is added",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "LIDL uniquely localizes 18 defects (12.3%) that all five baselines miss in Top-3",
    396       "evidence": "Fig. 8 overlap Venn diagram shows 18 cases exclusively found by LIDL; all baselines combined contribute 0 unique cases not found by LIDL",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Annotation-based retrieval is the most critical component, with its removal causing a 17.2% Top-3 performance drop",
    401       "evidence": "Table VI: LIDL w/o R achieves Top-3=0.53 vs full LIDL's 0.64 with kimi-k2",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "Structured reasoning reduces dependence on model capability compared to lightweight methods",
    406       "evidence": "LIDL improves 36.2% from weakest to strongest model (0.47→0.64), while Agentless improves 58.3% (0.24→0.38); interpreted as structured stages compensating for weaker models",
    407       "supported": "moderate"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval",
    412     "case-study"
    413   ],
    414   "key_findings": "LIDL achieves 64.1% improvement in Top-3 defect localization accuracy over the best baseline by combining domain-specific knowledge graph annotations with multi-source evidence fusion and counterfactual reasoning. The framework demonstrates that generic repository graphs without LLM-specific semantic annotations provide no benefit and sometimes degrade performance. LIDL achieves accuracy gains while reducing cost by 92.5%, costing only $0.008 per localization task with kimi-k2. Ablation confirms annotation-based retrieval and counterfactual validation are the two most critical components, with generic code search contributing least.",
    415   "red_flags": [
    416     {
    417       "flag": "No statistical significance testing",
    418       "detail": "All comparative results in Tables V and VI are single-point estimates with no confidence intervals, error bars, or significance tests despite making quantitative comparative claims across six models and five baselines."
    419     },
    420     {
    421       "flag": "Pilot set potentially included in test set",
    422       "detail": "15 defects used for hyperparameter tuning are part of the 146-instance dataset; the paper does not state whether these are excluded from the final evaluation, creating potential overfitting to the chosen parameters."
    423     },
    424     {
    425       "flag": "No actual prompts provided",
    426       "detail": "Despite the framework being heavily prompt-dependent, no actual prompt text is shown—only conceptual descriptions of what the LLM is asked to do, making exact reproduction impossible."
    427     },
    428     {
    429       "flag": "Training data contamination unaddressed",
    430       "detail": "GitHub issues in the benchmark (2023-2025) may appear in training data of LLMs evaluated (GPT-5.1, Claude-Sonnet-4.5, Kimi-K2); no contamination analysis is performed despite this being a real risk."
    431     },
    432     {
    433       "flag": "No variance across runs",
    434       "detail": "With LLM-based components, temperature=0.0 reduces but doesn't eliminate variance; no multiple-run statistics are reported, and a single run cannot establish reliability."
    435     },
    436     {
    437       "flag": "LIDL failure cases unanalyzed",
    438       "detail": "The 36% of cases where LIDL fails to find the correct file in Top-3 are not analyzed; no error analysis of LIDL's own failure modes is provided."
    439     }
    440   ],
    441   "cited_papers": [
    442     {
    443       "title": "Are LLMs Correctly Integrated into Software Systems? (Hydrangea / Shao et al., ICSE 2025)",
    444       "relevance": "Primary data source providing 888 LLM integration defects from 105 GitHub repositories used in LIDL's benchmark"
    445     },
    446     {
    447       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    448       "relevance": "Key baseline for repository-level defect localization using LLM agents; NeurIPS 2024"
    449     },
    450     {
    451       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    452       "relevance": "Hierarchical defect localization baseline representing cost-efficient approach; lowest-cost comparison point"
    453     },
    454     {
    455       "title": "AutoCodeRover: Autonomous Program Improvement",
    456       "relevance": "Best-performing baseline with code search APIs; LIDL claims 64.1% improvement over it"
    457     },
    458     {
    459       "title": "RepoGraph: Enhancing AI Software Engineering with Repository-Level Code Graph",
    460       "relevance": "Graph-augmented baseline whose failure demonstrates limitations of generic repository graphs for LLM integration defects"
    461     },
    462     {
    463       "title": "Can Agents Fix Agent Issues? (AgentIssue-Bench)",
    464       "relevance": "Secondary data source providing 50 agent-based defect instances used in LIDL's benchmark"
    465     },
    466     {
    467       "title": "Defining and Detecting the Defects of the Large Language Model-based Autonomous Agents",
    468       "relevance": "Prior characterization of LLM agent defect taxonomy that LIDL builds upon"
    469     },
    470     {
    471       "title": "CodexGraph: Bridging Large Language Models and Code Repositories via Code Graph Databases",
    472       "relevance": "Related repository graph approach for LLM-based code reasoning; compared in related work"
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 3,
    478       "justification": "Directly addresses defect localization in LLM-integrated software with open-source code, immediately useful for teams building AI applications."
    479     },
    480     "surprise_contrarian": {
    481       "score": 1,
    482       "justification": "Confirms the expected result that domain-specific knowledge graphs outperform generic ones; the negative result on RepoGraph is mildly surprising but not major."
    483     },
    484     "fear_safety": {
    485       "score": 1,
    486       "justification": "Addresses software reliability rather than AI safety risks; bug localization failures cause reliability issues but not headline safety threats."
    487     },
    488     "drama_conflict": {
    489       "score": 1,
    490       "justification": "Incremental SE research improvement; no controversial claims or conflict with established results."
    491     },
    492     "demo_ability": {
    493       "score": 2,
    494       "justification": "Code publicly available on GitHub and can be run against real LLM-integrated software repositories; requires LLM API access but is otherwise runnable."
    495     },
    496     "brand_recognition": {
    497       "score": 1,
    498       "justification": "Academic authors from Sun Yat-sen University, Singapore Management University, and Monash University; no famous lab affiliation or high-profile industry partnership."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [],
    503     "top_points": 0,
    504     "total_points": 0,
    505     "total_comments": 0
    506   }
    507 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs