scan-v5.json (23960B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LIDL: LLM Integration Defect Localization via Knowledge Graph-Enhanced Multi-Agent Analysis", 6 "authors": [ 7 "Gou Tan", 8 "Zilong He", 9 "Min Li", 10 "Pengfei Chen", 11 "Jieke Shi", 12 "Zhensu Sun", 13 "Ting Zhang", 14 "Danwen Chen", 15 "Lwin Khin Shar", 16 "Chuanfu Zhang", 17 "David Lo" 18 ], 19 "year": 2026, 20 "venue": "arXiv.org", 21 "arxiv_id": "2601.05539", 22 "doi": "10.48550/arXiv.2601.05539" 23 }, 24 "checklist": { 25 "claims_and_evidence": { 26 "abstract_claims_supported": { 27 "applies": true, 28 "answer": true, 29 "justification": "All numerical claims in the abstract (Top-3 0.64, MAP 0.48, 64.1% improvement, 92.5% cost reduction) are directly supported by Table V results with kimi-k2.", 30 "source": "haiku" 31 }, 32 "causal_claims_justified": { 33 "applies": true, 34 "answer": true, 35 "justification": "Causal claims about component contributions are supported by ablation studies in Table VI that remove each component and quantify the performance impact.", 36 "source": "haiku" 37 }, 38 "generalization_bounded": { 39 "applies": true, 40 "answer": true, 41 "justification": "Section V explicitly bounds results to Python LLM-integrated software from GitHub repositories and acknowledges this may not generalize to industrial codebases.", 42 "source": "haiku" 43 }, 44 "alternative_explanations_discussed": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper does not consider whether improvements stem from additional LLM calls, better prompting design, or domain-specific patterns—only the proposed architecture is discussed as cause of gains.", 48 "source": "haiku" 49 }, 50 "proxy_outcome_distinction": { 51 "applies": true, 52 "answer": true, 53 "justification": "Top-k file identification accuracy and MAP/MRR directly match the claimed task of defect file localization; no proxy outcome substitution occurs.", 54 "source": "haiku" 55 } 56 }, 57 "limitations_and_scope": { 58 "limitations_section_present": { 59 "applies": true, 60 "answer": true, 61 "justification": "Section V contains dedicated subsections for 'Threats to Validity' and 'Limitations and Future Work' with substantive content beyond boilerplate.", 62 "source": "haiku" 63 }, 64 "threats_to_validity_specific": { 65 "applies": true, 66 "answer": true, 67 "justification": "Specific threats named: Python-only support, popular-framework dependency limiting annotation coverage, GitHub-only focus, and dataset selection bias with mitigation via inter-annotator agreement (κ=0.9351).", 68 "source": "haiku" 69 }, 70 "scope_boundaries_stated": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper explicitly states Python-only scope, GitHub repositories, popular LLM frameworks only, and notes industrial codebase generalization is unvalidated.", 74 "source": "haiku" 75 } 76 }, 77 "conflicts_of_interest": { 78 "funding_disclosed": { 79 "applies": true, 80 "answer": false, 81 "justification": "No funding acknowledgment section appears anywhere in the paper despite multi-institutional authorship.", 82 "source": "haiku" 83 }, 84 "affiliations_disclosed": { 85 "applies": true, 86 "answer": true, 87 "justification": "Author affiliations are explicitly listed: Sun Yat-sen University, Singapore Management University, and Monash University.", 88 "source": "haiku" 89 }, 90 "funder_independent_of_outcome": { 91 "applies": false, 92 "answer": false, 93 "justification": "No funding is disclosed, making funder independence impossible to assess.", 94 "source": "haiku" 95 }, 96 "financial_interests_declared": { 97 "applies": true, 98 "answer": false, 99 "justification": "No competing interests or financial disclosure statement appears anywhere in the paper.", 100 "source": "haiku" 101 } 102 }, 103 "scope_and_framing": { 104 "key_terms_defined": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section II.A defines 'LLM-integrated software' and Section II.B defines and categorizes 'LLM integration defects' into four types with representative examples.", 108 "source": "haiku" 109 }, 110 "intended_contribution_clear": { 111 "applies": true, 112 "answer": true, 113 "justification": "Three contributions are explicitly enumerated: the knowledge graph approach, the LIDL multi-agent implementation, and evaluation on 146 real-world defects.", 114 "source": "haiku" 115 }, 116 "engagement_with_prior_work": { 117 "applies": true, 118 "answer": true, 119 "justification": "Section II.C systematically analyzes why existing approaches (SWE-agent, Agentless, AutoCodeRover, RepoGraph) fail for LLM integration defects, explaining what LIDL specifically adds.", 120 "source": "haiku" 121 } 122 } 123 }, 124 "type_checklist": { 125 "empirical": { 126 "artifacts": { 127 "code_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper states 'All benchmark data and our implementation are publicly available at: https://github.com/IntelligentDDS/LIDL'.", 131 "source": "haiku" 132 }, 133 "data_released": { 134 "applies": true, 135 "answer": true, 136 "justification": "Benchmark data is claimed publicly available at the GitHub URL; source datasets Hydrangea and AgentIssue-Bench are cited prior published work.", 137 "source": "haiku" 138 }, 139 "environment_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Hardware is specified (Ubuntu 24.04, Intel Xeon Gold 6326, 128GB RAM, NVIDIA A40) and Python 3.10.16 mentioned, but no requirements.txt, Dockerfile, or dependency list is provided.", 143 "source": "haiku" 144 }, 145 "reproduction_instructions": { 146 "applies": true, 147 "answer": false, 148 "justification": "No step-by-step reproduction instructions appear in the paper; only a GitHub URL is provided without describing repository contents or how to run experiments.", 149 "source": "haiku" 150 } 151 }, 152 "statistical_methodology": { 153 "confidence_intervals_or_error_bars": { 154 "applies": true, 155 "answer": false, 156 "justification": "No confidence intervals or error bars are reported for any results in Tables V or VI; all metrics are single-point estimates.", 157 "source": "haiku" 158 }, 159 "significance_tests": { 160 "applies": true, 161 "answer": false, 162 "justification": "No statistical significance tests are applied to comparative results despite making quantitative comparative claims across six models and five baselines.", 163 "source": "haiku" 164 }, 165 "effect_sizes_reported": { 166 "applies": true, 167 "answer": true, 168 "justification": "Percentage improvements over baselines are reported (64.1%, 120.7%, 92.5%, etc.) alongside absolute metric values, providing effect size context.", 169 "source": "haiku" 170 }, 171 "sample_size_justified": { 172 "applies": true, 173 "answer": false, 174 "justification": "The 146-instance dataset size is not justified and no power analysis is provided to support sufficiency for the statistical comparisons made.", 175 "source": "haiku" 176 }, 177 "variance_reported": { 178 "applies": true, 179 "answer": false, 180 "justification": "No variance or standard deviation across multiple runs is reported; temperature=0.0 reduces but does not eliminate LLM variance, and no run-to-run statistics appear.", 181 "source": "haiku" 182 } 183 }, 184 "evaluation_design": { 185 "baselines_included": { 186 "applies": true, 187 "answer": true, 188 "justification": "Five baselines included: SWE-agent, Agentless, AutoCodeRover, and RepoGraph-enhanced variants SWE-agent* and Agentless*.", 189 "source": "haiku" 190 }, 191 "baselines_contemporary": { 192 "applies": true, 193 "answer": true, 194 "justification": "All baselines are from 2024-2025: SWE-agent (NeurIPS 2024), Agentless (2024), AutoCodeRover (ISSTA 2024), RepoGraph (ICLR 2025).", 195 "source": "haiku" 196 }, 197 "ablation_study": { 198 "applies": true, 199 "answer": true, 200 "justification": "Table VI presents ablation removing each of four components (direct extraction, symptom inference, annotation retrieval, validator) with quantified performance impact.", 201 "source": "haiku" 202 }, 203 "multiple_metrics": { 204 "applies": true, 205 "answer": true, 206 "justification": "Six evaluation metrics used: Top-1, Top-3, MAP, MRR, $Cost, and #Tokens (input/output separately).", 207 "source": "haiku" 208 }, 209 "human_evaluation": { 210 "applies": false, 211 "answer": false, 212 "justification": "Human evaluation is not applicable for automated defect file localization benchmarked against ground-truth file labels.", 213 "source": "haiku" 214 }, 215 "held_out_test_set": { 216 "applies": true, 217 "answer": false, 218 "justification": "15 defects used for hyperparameter pilot tuning are part of the 146-instance total; it is not stated whether these are excluded from the final evaluation.", 219 "source": "haiku" 220 }, 221 "per_category_breakdown": { 222 "applies": true, 223 "answer": true, 224 "justification": "Fig. 7 provides per-category performance breakdown across all four defect categories for all six methods on Top-1, Top-3, MAP, and MRR.", 225 "source": "haiku" 226 }, 227 "failure_cases_discussed": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper discusses what baselines fail on but does not analyze the 36% of cases where LIDL itself fails to find the correct file in Top-3.", 231 "source": "haiku" 232 }, 233 "negative_results_reported": { 234 "applies": true, 235 "answer": true, 236 "justification": "The paper reports that RepoGraph-enhanced methods (SWE-agent*, Agentless*) show no improvement; SWE-agent* drops from 0.29 to 0.21 Top-3, a negative result.", 237 "source": "haiku" 238 } 239 }, 240 "setup_transparency": { 241 "model_versions_specified": { 242 "applies": true, 243 "answer": true, 244 "justification": "Specific model versions are named: Llama3.3-70B-Instruct, Qwen2.5-72B-Instruct, DeepSeek-V3.2, Kimi-K2, GPT-5.1, Claude-Sonnet-4.5, BGE-M3.", 245 "source": "haiku" 246 }, 247 "prompts_provided": { 248 "applies": true, 249 "answer": false, 250 "justification": "The paper describes prompting strategies conceptually (e.g., 'the LLM is prompted to perform three reasoning steps') but no actual prompt text is provided.", 251 "source": "haiku" 252 }, 253 "hyperparameters_reported": { 254 "applies": true, 255 "answer": true, 256 "justification": "Key parameters reported: temperature=0.0, ks=10, kh=1, ke=5, ki=kr=5, wc=0.7, wd=0.3, all determined through a described pilot study.", 257 "source": "haiku" 258 }, 259 "scaffolding_described": { 260 "applies": true, 261 "answer": true, 262 "justification": "Section III describes all three agents in algorithmic detail: Code Knowledge Graph Constructor, Defect Analyzer (three sub-components), and Context-aware Validator.", 263 "source": "haiku" 264 }, 265 "data_preprocessing_documented": { 266 "applies": true, 267 "answer": true, 268 "justification": "Data cleaning criteria explicitly documented: removal of missing repo versions, incomplete info, uncertain categories; two independent annotators with Cohen's kappa 0.9351.", 269 "source": "haiku" 270 } 271 }, 272 "data_integrity": { 273 "raw_data_available": { 274 "applies": true, 275 "answer": true, 276 "justification": "The paper states all benchmark data is publicly available at the GitHub repository.", 277 "source": "haiku" 278 }, 279 "data_collection_described": { 280 "applies": true, 281 "answer": true, 282 "justification": "Data sourced from Hydrangea (888 defects, 105 GitHub apps) and AgentIssue-Bench (50 defects, 16 agent systems), with filtering reducing to 146 instances.", 283 "source": "haiku" 284 }, 285 "recruitment_methods_described": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants; data collected from public GitHub repositories using prior published benchmarks.", 289 "source": "haiku" 290 }, 291 "data_pipeline_documented": { 292 "applies": true, 293 "answer": true, 294 "justification": "Pipeline from source datasets through filtering (three removal criteria) and annotation (two independent annotators, kappa measurement, conflict resolution) is documented.", 295 "source": "haiku" 296 } 297 }, 298 "contamination": { 299 "training_cutoff_stated": { 300 "applies": true, 301 "answer": false, 302 "justification": "No training data cutoffs are stated for any of the six LLMs evaluated, despite the benchmark using GitHub issues from 2023-2025 that could appear in training data.", 303 "source": "haiku" 304 }, 305 "train_test_overlap_discussed": { 306 "applies": true, 307 "answer": false, 308 "justification": "The paper does not discuss whether GitHub issues in the benchmark were seen during LLM training, a real concern given issues predate some model training cutoffs.", 309 "source": "haiku" 310 }, 311 "benchmark_contamination_addressed": { 312 "applies": true, 313 "answer": false, 314 "justification": "The benchmark uses publicly accessible GitHub issues dating to 2023-2025; contamination risk from models trained on GitHub data is not addressed.", 315 "source": "haiku" 316 } 317 }, 318 "human_studies": { 319 "pre_registered": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants in this study.", 323 "source": "haiku" 324 }, 325 "irb_or_ethics_approval": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants in this study.", 329 "source": "haiku" 330 }, 331 "demographics_reported": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants in this study.", 335 "source": "haiku" 336 }, 337 "inclusion_exclusion_criteria": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants in this study.", 341 "source": "haiku" 342 }, 343 "randomization_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants in this study.", 347 "source": "haiku" 348 }, 349 "blinding_described": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants in this study.", 353 "source": "haiku" 354 }, 355 "attrition_reported": { 356 "applies": false, 357 "answer": false, 358 "justification": "No human participants in this study.", 359 "source": "haiku" 360 } 361 }, 362 "cost_and_practicality": { 363 "inference_cost_reported": { 364 "applies": true, 365 "answer": true, 366 "justification": "Average cost per instance is reported in Table V for all six LLMs across all methods (e.g., LIDL with kimi-k2: $0.008, with claude-sonnet-4.5: $0.086).", 367 "source": "haiku" 368 }, 369 "compute_budget_stated": { 370 "applies": true, 371 "answer": false, 372 "justification": "Hardware is specified but total compute budget for the full experiment suite is not stated; only per-instance API costs are reported.", 373 "source": "haiku" 374 } 375 } 376 } 377 }, 378 "claims": [ 379 { 380 "claim": "LIDL achieves Top-3 accuracy of 0.64 and MAP of 0.48, outperforming the best baseline (AutoCodeRover) by 64.1%", 381 "evidence": "Table V shows LIDL with kimi-k2 achieves Top-3=0.64, MAP=0.48 vs AutoCodeRover's Top-3=0.39, MAP=0.28", 382 "supported": "strong" 383 }, 384 { 385 "claim": "LIDL reduces per-instance cost by 92.5% compared to AutoCodeRover ($0.008 vs $0.106 with kimi-k2)", 386 "evidence": "Table V directly reports per-instance costs for all methods across all models; the comparison holds across all LLMs tested", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Generic repository graph augmentation (RepoGraph) provides no benefit for LLM integration defect localization", 391 "evidence": "SWE-agent* drops from Top-3 0.29 to 0.21 (-27.6%) and Agentless* drops from 0.38 to 0.36 (-5.3%) when RepoGraph is added", 392 "supported": "strong" 393 }, 394 { 395 "claim": "LIDL uniquely localizes 18 defects (12.3%) that all five baselines miss in Top-3", 396 "evidence": "Fig. 8 overlap Venn diagram shows 18 cases exclusively found by LIDL; all baselines combined contribute 0 unique cases not found by LIDL", 397 "supported": "strong" 398 }, 399 { 400 "claim": "Annotation-based retrieval is the most critical component, with its removal causing a 17.2% Top-3 performance drop", 401 "evidence": "Table VI: LIDL w/o R achieves Top-3=0.53 vs full LIDL's 0.64 with kimi-k2", 402 "supported": "strong" 403 }, 404 { 405 "claim": "Structured reasoning reduces dependence on model capability compared to lightweight methods", 406 "evidence": "LIDL improves 36.2% from weakest to strongest model (0.47→0.64), while Agentless improves 58.3% (0.24→0.38); interpreted as structured stages compensating for weaker models", 407 "supported": "moderate" 408 } 409 ], 410 "methodology_tags": [ 411 "benchmark-eval", 412 "case-study" 413 ], 414 "key_findings": "LIDL achieves 64.1% improvement in Top-3 defect localization accuracy over the best baseline by combining domain-specific knowledge graph annotations with multi-source evidence fusion and counterfactual reasoning. The framework demonstrates that generic repository graphs without LLM-specific semantic annotations provide no benefit and sometimes degrade performance. LIDL achieves accuracy gains while reducing cost by 92.5%, costing only $0.008 per localization task with kimi-k2. Ablation confirms annotation-based retrieval and counterfactual validation are the two most critical components, with generic code search contributing least.", 415 "red_flags": [ 416 { 417 "flag": "No statistical significance testing", 418 "detail": "All comparative results in Tables V and VI are single-point estimates with no confidence intervals, error bars, or significance tests despite making quantitative comparative claims across six models and five baselines." 419 }, 420 { 421 "flag": "Pilot set potentially included in test set", 422 "detail": "15 defects used for hyperparameter tuning are part of the 146-instance dataset; the paper does not state whether these are excluded from the final evaluation, creating potential overfitting to the chosen parameters." 423 }, 424 { 425 "flag": "No actual prompts provided", 426 "detail": "Despite the framework being heavily prompt-dependent, no actual prompt text is shown—only conceptual descriptions of what the LLM is asked to do, making exact reproduction impossible." 427 }, 428 { 429 "flag": "Training data contamination unaddressed", 430 "detail": "GitHub issues in the benchmark (2023-2025) may appear in training data of LLMs evaluated (GPT-5.1, Claude-Sonnet-4.5, Kimi-K2); no contamination analysis is performed despite this being a real risk." 431 }, 432 { 433 "flag": "No variance across runs", 434 "detail": "With LLM-based components, temperature=0.0 reduces but doesn't eliminate variance; no multiple-run statistics are reported, and a single run cannot establish reliability." 435 }, 436 { 437 "flag": "LIDL failure cases unanalyzed", 438 "detail": "The 36% of cases where LIDL fails to find the correct file in Top-3 are not analyzed; no error analysis of LIDL's own failure modes is provided." 439 } 440 ], 441 "cited_papers": [ 442 { 443 "title": "Are LLMs Correctly Integrated into Software Systems? (Hydrangea / Shao et al., ICSE 2025)", 444 "relevance": "Primary data source providing 888 LLM integration defects from 105 GitHub repositories used in LIDL's benchmark" 445 }, 446 { 447 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 448 "relevance": "Key baseline for repository-level defect localization using LLM agents; NeurIPS 2024" 449 }, 450 { 451 "title": "Agentless: Demystifying LLM-based Software Engineering Agents", 452 "relevance": "Hierarchical defect localization baseline representing cost-efficient approach; lowest-cost comparison point" 453 }, 454 { 455 "title": "AutoCodeRover: Autonomous Program Improvement", 456 "relevance": "Best-performing baseline with code search APIs; LIDL claims 64.1% improvement over it" 457 }, 458 { 459 "title": "RepoGraph: Enhancing AI Software Engineering with Repository-Level Code Graph", 460 "relevance": "Graph-augmented baseline whose failure demonstrates limitations of generic repository graphs for LLM integration defects" 461 }, 462 { 463 "title": "Can Agents Fix Agent Issues? (AgentIssue-Bench)", 464 "relevance": "Secondary data source providing 50 agent-based defect instances used in LIDL's benchmark" 465 }, 466 { 467 "title": "Defining and Detecting the Defects of the Large Language Model-based Autonomous Agents", 468 "relevance": "Prior characterization of LLM agent defect taxonomy that LIDL builds upon" 469 }, 470 { 471 "title": "CodexGraph: Bridging Large Language Models and Code Repositories via Code Graph Databases", 472 "relevance": "Related repository graph approach for LLM-based code reasoning; compared in related work" 473 } 474 ], 475 "engagement_factors": { 476 "practical_relevance": { 477 "score": 3, 478 "justification": "Directly addresses defect localization in LLM-integrated software with open-source code, immediately useful for teams building AI applications." 479 }, 480 "surprise_contrarian": { 481 "score": 1, 482 "justification": "Confirms expected result that domain-specific knowledge graphs outperform generic ones; the negative result on RepoGraph is mildly surprising but not major." 483 }, 484 "fear_safety": { 485 "score": 1, 486 "justification": "Addresses software reliability rather than AI safety risks; bug localization failures cause reliability issues but not headline safety threats." 487 }, 488 "drama_conflict": { 489 "score": 1, 490 "justification": "Incremental SE research improvement; no controversial claims or conflict with established results." 491 }, 492 "demo_ability": { 493 "score": 2, 494 "justification": "Code publicly available on GitHub and can be run against real LLM-integrated software repositories; requires LLM API access but is otherwise runnable." 495 }, 496 "brand_recognition": { 497 "score": 1, 498 "justification": "Academic authors from Singapore Management University and Sun Yat-sen University; no famous lab affiliation or high-profile industry partnership." 499 } 500 }, 501 "hn_data": { 502 "threads": [], 503 "top_points": 0, 504 "total_points": 0, 505 "total_comments": 0 506 } 507 }