scan-v5.json (25572B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Large Language Models for Fault Localization: An Empirical Study", 6 "authors": [ 7 "YingJian Xiao", 8 "Rongqun Hu", 9 "Weiwei Gong", 10 "Hongwei Li", 11 "AnQuan Jie" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2510.20521", 16 "doi": "10.48550/arXiv.2510.20521" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All abstract claims are supported by experimental results in Tables 3–8: Gemini outperforms others, bug context helps, few-shot shows diminishing returns, CoT depends on model ability.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Causal claims like 'bug report context improves performance' are justified through controlled empirical comparisons (with/without context). Table 4 clearly shows the effect across all models.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "Section 5 (Threats to External Validity) explicitly bounds conclusions to Java, two specific datasets, four models, and statement-level localization. Claims appropriately scoped.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "Paper lacks discussion of *why* CoT hurts GPT-4.1 on HumanEval but helps on Defects4J, or what mechanisms underlie model differences. 
Findings are reported without mechanistic explanation.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "Paper measures statement-level fault localization but extrapolates implications for program repair effectiveness without discussing the gap between locating bugs and successfully fixing them.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 5 'THREATS TO VALIDITY' comprehensively covers internal, construct, and external validity threats with three subsections.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Threats are concrete: prompt phrasing effects, 'perfect match' strictness overestimating errors, potential data contamination with Gemini's Jan 2025 cutoff, and Java-only generalization limits.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "Explicit boundaries: Java only, statement-level localization, two datasets, 13 trials per condition, synthetic vs real-world comparison. 
Scope clearly delineated.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding or grant information provided anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations with Nanchang Institute of Technology and Jiangxi Normal University are listed, though not affiliations with evaluated product companies.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding disclosed; criterion does not apply.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial conflicts statement present.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": false, 101 "justification": "Terms like 'fault localization,' 'few-shot learning,' and 'chain-of-thought' are used without formal definitions; paper assumes domain expertise.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 1 explicitly states three contributions: (1) empirical evaluation on two datasets, (2) exploration of prompting strategies, (3) time/cost analysis. Intent is unambiguous.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2 covers traditional methods, deep learning approaches, LLMs in fault localization, and prompt engineering. 
Engagement is broad though somewhat surface-level.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "No code, scripts, or prompts released. Paper describes experiments but provides no reproducible implementation or prompt templates.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "Experiments use two public benchmark datasets: HumanEval-Java and Defects4J v1.2.0, both available for download.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "No environment specifications provided: no Python version, library versions, dependency lists, API configuration details, or reproducibility instructions.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "Paper describes experimental design and results but does not provide step-by-step instructions to reproduce. Actual prompts are hidden behind 'standardized templates.'", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "applies": true, 148 "answer": false, 149 "justification": "Tables 3–8 report point estimates only with no confidence intervals, error bars, or variance measures. Pass@k computed over 13 trials but variance not reported.", 150 "source": "haiku" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "No p-values or significance tests applied to performance differences in main results. Scott-Knott ESD test mentioned for time analysis but results not shown in paper.", 156 "source": "haiku" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": false, 161 "justification": "Effect sizes not formally reported. 
Percentage-point differences shown (e.g., 65.03% vs 46.63%) but not quantified as standardized effect sizes.", 162 "source": "haiku" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "Use of 13 trials per condition not justified. No power analysis or sample-size rationale provided.", 168 "source": "haiku" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": false, 173 "justification": "Main results (Tables 3–8) show single point estimates with no standard deviation, variance, or confidence intervals despite running 13 trials.", 174 "source": "haiku" 175 }, 176 "evaluation_design": { 177 "applies": true, 178 "answer": true, 179 "justification": "Study includes model comparisons, ablations across zero-shot/few-shot/CoT, five performance metrics, plus time and cost analysis across two datasets.", 180 "source": "haiku" 181 }, 182 "baselines_included": { 183 "applies": true, 184 "answer": true, 185 "justification": "Four different models compared as baselines; each prompting strategy (standard, few-shot variants, CoT) serves as ablation against the others.", 186 "source": "haiku" 187 }, 188 "baselines_contemporary": { 189 "applies": true, 190 "answer": true, 191 "justification": "All models released 2024–2025 (GPT-4.1 mini Apr 2025, Qwen Nov 2024, DeepSeek Dec 2024, Gemini Apr 2025). Baselines are current.", 192 "source": "haiku" 193 }, 194 "ablation_study": { 195 "applies": true, 196 "answer": true, 197 "justification": "Systematic ablations: zero-shot, one/two/three-shot, and CoT. 
Each strategy tested independently to isolate effect.", 198 "source": "haiku" 199 }, 200 "multiple_metrics": { 201 "applies": true, 202 "answer": true, 203 "justification": "Five performance metrics (Top@5, Top@10, Pass@1, Pass@5, Pass@10) plus time and cost dimensions provide multi-faceted evaluation.", 204 "source": "haiku" 205 }, 206 "human_evaluation": { 207 "applies": false, 208 "answer": false, 209 "justification": "Automated benchmark evaluation; human evaluation not applicable for this task. Fault localization correctness is objectively verifiable.", 210 "source": "haiku" 211 }, 212 "held_out_test_set": { 213 "applies": true, 214 "answer": true, 215 "justification": "Both HumanEval-Java and Defects4J are established benchmark datasets with fixed test sets by design.", 216 "source": "haiku" 217 }, 218 "per_category_breakdown": { 219 "applies": true, 220 "answer": false, 221 "justification": "Results grouped by dataset and model but not by bug complexity, type, or category (e.g., logic errors vs type errors). No granular failure analysis.", 222 "source": "haiku" 223 }, 224 "failure_cases_discussed": { 225 "applies": true, 226 "answer": false, 227 "justification": "Paper does not analyze specific failure modes, provide examples of mislocalizations, or discuss what types of bugs each model struggles with.", 228 "source": "haiku" 229 }, 230 "negative_results_reported": { 231 "applies": true, 232 "answer": true, 233 "justification": "Negative findings explicitly reported: CoT hurts GPT-4.1 on HumanEval (Table 7), three-shot sometimes underperforms two-shot (Tables 5–6), some models degrade on Defects4J.", 234 "source": "haiku" 235 }, 236 "setup_transparency": { 237 "applies": true, 238 "answer": false, 239 "justification": "Model versions and knowledge cutoff dates specified in Table 2, but actual prompts not provided. 
Temperature, top-p, max_tokens, and other inference hyperparameters not disclosed.", 240 "source": "haiku" 241 }, 242 "model_versions_specified": { 243 "applies": true, 244 "answer": true, 245 "justification": "Exact model versions listed: GPT-4.1 mini, Qwen2.5-coder-32b-instruct, Gemini-2.5-flash, DeepSeek-V3. Table 2 includes knowledge cutoff dates.", 246 "source": "haiku" 247 }, 248 "prompts_provided": { 249 "applies": true, 250 "answer": false, 251 "justification": "Paper mentions 'standardized templates' and 'prompt design' but never includes actual prompts used. Reproduction impossible without them.", 252 "source": "haiku" 253 }, 254 "hyperparameters_reported": { 255 "applies": true, 256 "answer": false, 257 "justification": "No sampling parameters (temperature, top-p, top-k), max_tokens limits, or API configuration details reported.", 258 "source": "haiku" 259 }, 260 "scaffolding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "Direct API calls to LLMs; no agentic scaffolding (planning, memory, tool use) employed. Not applicable.", 264 "source": "haiku" 265 }, 266 "data_preprocessing_documented": { 267 "applies": true, 268 "answer": false, 269 "justification": "HumanEval-Java and Defects4J properties described, but specific preprocessing, filtering, or feature extraction steps not documented.", 270 "source": "haiku" 271 }, 272 "data_integrity": { 273 "applies": true, 274 "answer": true, 275 "justification": "Public benchmarks used; collection procedures for HumanEval-Java (synthetic bugs injected) and Defects4J (real projects) briefly described.", 276 "source": "haiku" 277 }, 278 "raw_data_available": { 279 "applies": true, 280 "answer": true, 281 "justification": "Both datasets publicly available. 
HumanEval-Java and Defects4J can be downloaded independently.", 282 "source": "haiku" 283 }, 284 "data_collection_described": { 285 "applies": true, 286 "answer": false, 287 "justification": "High-level description (164 bugs with JUnit tests, 395 bugs from 6 projects) but insufficient detail on selection criteria, curation process, or representativeness.", 288 "source": "haiku" 289 }, 290 "recruitment_methods_described": { 291 "applies": false, 292 "answer": false, 293 "justification": "No human participants; criterion does not apply.", 294 "source": "haiku" 295 }, 296 "data_pipeline_documented": { 297 "applies": true, 298 "answer": false, 299 "justification": "Pipeline from benchmark to experiment not fully documented: how bugs were selected, how test cases were validated, how output was parsed and matched.", 300 "source": "haiku" 301 }, 302 "contamination": { 303 "applies": true, 304 "answer": true, 305 "justification": "Training cutoff dates explicitly stated in Table 2. Train-test overlap discussed in Section 5 (Threats to Construct Validity); potential contamination acknowledged but not fully ruled out for Gemini (Jan 2025 cutoff).", 306 "source": "haiku" 307 }, 308 "training_cutoff_stated": { 309 "applies": true, 310 "answer": true, 311 "justification": "Table 2 lists knowledge cutoff for each model: GPT-4.1 mini (2024-06), Qwen (2023-10), Gemini-2.5-flash (2025-01), DeepSeek-V3 (2024-07).", 312 "source": "haiku" 313 }, 314 "train_test_overlap_discussed": { 315 "applies": true, 316 "answer": true, 317 "justification": "Construct validity section acknowledges 'data contamination cannot be completely ruled out' despite using newer datasets and considering cutoff dates.", 318 "source": "haiku" 319 }, 320 "benchmark_contamination_addressed": { 321 "applies": true, 322 "answer": true, 323 "justification": "Paper explicitly notes that HumanEval-Java was created to avoid contamination, and acknowledges potential risk for other models and datasets. 
Limitation acknowledged.", 324 "source": "haiku" 325 }, 326 "human_studies": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants; all questions in this category do not apply.", 330 "source": "haiku" 331 }, 332 "cost_and_practicality": { 333 "applies": true, 334 "answer": true, 335 "justification": "Section 4.4.2 provides detailed API cost analysis with explicit dollar/yuan costs. Section 4.4.1 reports time overhead with means and max values.", 336 "source": "haiku" 337 }, 338 "inference_cost_reported": { 339 "applies": true, 340 "answer": true, 341 "justification": "Table 9–10 show per-call costs in USD and CNY for each model on each dataset. Costs range $0.024–$1.917 per call, thoroughly documented.", 342 "source": "haiku" 343 }, 344 "compute_budget_stated": { 345 "applies": true, 346 "answer": false, 347 "justification": "Individual call times and costs reported, but total computational budget (aggregate tokens, cumulative cost, total compute hours) not calculated or summed.", 348 "source": "haiku" 349 } 350 } 351 }, 352 "claims": [ 353 { 354 "claim": "Gemini-2.5-flash outperforms other models on fault localization across both HumanEval-Java and Defects4J", 355 "evidence": "Tables 3–4: Gemini achieves Top@5=65.03% on HumanEval-Java (vs GPT 50%, Qwen 46.6%, DeepSeek 52.1%) and Top@5=23.67% on Defects4J with bug context (vs GPT 15.15%, Qwen 13.75%, DeepSeek 11.56%)", 356 "supported": "strong" 357 }, 358 { 359 "claim": "Providing bug report context significantly improves fault localization performance for all models", 360 "evidence": "Table 4 shows dramatic gains on Defects4J: GPT-4.1 mini improves from 3.90% to 15.15% Top@5; Qwen from 4.90% to 13.75%; Gemini from 6.08% to 23.67%; DeepSeek from 2.10% to 11.56%", 361 "supported": "strong" 362 }, 363 { 364 "claim": "Few-shot learning improves performance but exhibits clear diminishing marginal returns beyond two examples", 365 "evidence": "Tables 5–6 show two-shot typically peaks, 
three-shot often regresses. E.g., Qwen on HumanEval: one-shot Pass@5=49.09%, two-shot=47.61%, three-shot=47.28%. Similar pattern across models.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Chain-of-thought effectiveness depends on task complexity and model reasoning ability, not uniformly beneficial", 370 "evidence": "Tables 7–8: CoT hurts GPT-4.1 on HumanEval (Top@5 drops 50% → 34.36%), but helps DeepSeek on Defects4J (Top@5 rises 11.56% → 19.11%). Pattern is model and dataset dependent.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Model inference latency ranges from 1-2 seconds (GPT-4.1 mini) to 20-30+ seconds (Gemini-2.5-flash) depending on task and prompting strategy", 375 "evidence": "Section 4.4.1: GPT-4.1 mini averages 1-3s, Qwen 2-5s, DeepSeek 3-9s, Gemini 9-30+ seconds. CoT introduces largest latency overhead for Gemini.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "API call costs vary from $0.024 (GPT-4.1 mini) to $1.917 (GPT on Defects4J) per call, with open-source models generally cheaper in local currency", 380 "evidence": "Tables 9–10: GPT-4.1 mini $0.024–$1.917, Qwen ¥0.144–¥8.642, DeepSeek ¥0.153–¥10.541, Gemini $0.030–$1.344", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Bugs in Defects4J (real projects) are substantially harder to localize than HumanEval-Java (synthetic), with performance drops of 2-3× across all models", 385 "evidence": "Table 3 vs Table 4: Gemini drops from 65.03% to 23.67% Top@5; GPT from 50% to 15.15%; performance consistently 2–3× lower on real-world Defects4J", 386 "supported": "strong" 387 } 388 ], 389 "methodology_tags": [ 390 "benchmark-eval", 391 "observational", 392 "comparative" 393 ], 394 "key_findings": "The paper systematically evaluated four LLMs (open and closed-source) on Java fault localization across synthetic (HumanEval-Java, 164 bugs) and real-world (Defects4J, 395 bugs) datasets. 
Gemini-2.5-flash achieved strongest overall performance (65% Top@5 on synthetic, 24% on real), while bug report context provided roughly 2.8–5.5× Top@5 improvements on Defects4J across all models. Few-shot learning showed modest gains peaking at two-shot examples with diminishing returns; chain-of-thought had inconsistent effects—hurting GPT-4.1 mini on synthetic tasks but aiding DeepSeek on real-world bugs. Trade-offs between accuracy (24%–65% depending on task), inference latency (1–30 seconds), and API cost ($0.024–$1.917 per call for USD-priced models, ¥0.144–¥10.541 per call for CNY-priced models) suggest practical model selection requires balancing budget, speed, and precision.", 395 "red_flags": [ 396 { 397 "flag": "No confidence intervals or variance", 398 "detail": "Main results (Tables 3–8) report only point estimates from 13 trials. Impossible to assess statistical significance or confidence bounds on performance differences." 399 }, 400 { 401 "flag": "Prompts not released", 402 "detail": "Paper mentions 'standardized templates' but never provides actual prompt text. Reproducibility critically impaired without exact prompts used." 403 }, 404 { 405 "flag": "Data contamination risk underaddressed", 406 "detail": "Gemini-2.5-flash has Jan 2025 knowledge cutoff; paper written in 2025. Potential overlap with benchmarks not fully verified. Acknowledged in threats but not resolved." 407 }, 408 { 409 "flag": "Strict 'perfect match' evaluation may inflate error rates", 410 "detail": "Paper requires complete line-by-line match; partial credit not reported. Hides whether models are close but wrong, and lacks granularity." 411 }, 412 { 413 "flag": "Limited to Java only", 414 "detail": "All experiments on Java code. Generalization to C++, Python, JavaScript, etc., unverified." 415 }, 416 { 417 "flag": "CoT findings unexplained", 418 "detail": "CoT hurts GPT-4.1 on HumanEval but helps DeepSeek on Defects4J. Paper acknowledges model and task dependency but offers no mechanistic explanation." 
419 }, 420 { 421 "flag": "No per-category failure analysis", 422 "detail": "No breakdown by bug type (logic, type error, boundary, etc.). Unclear which models/strategies fail on which bug categories." 423 }, 424 { 425 "flag": "Sample size not justified", 426 "detail": "13 trials chosen without power analysis or justification. Unclear if sufficient for stable estimates." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Software testing with large language models: Survey, landscape, and vision", 432 "relevance": "Broad survey of LLM applications in software testing; contextualizes fault localization within larger testing automation landscape", 433 "authors": "Wang et al.", 434 "year": 2024 435 }, 436 { 437 "title": "A Survey of LLMs for Software Engineering", 438 "relevance": "Comprehensive review of LLM capabilities across software engineering tasks including program repair and code analysis", 439 "authors": "Chen et al.", 440 "year": 2023 441 }, 442 { 443 "title": "Code generation with LLMs: Evaluation, challenges and opportunities", 444 "relevance": "Evaluation methodology and metrics for LLM code tasks; informs design of benchmarks and evaluation protocols", 445 "authors": "Xu et al.", 446 "year": 2024 447 }, 448 { 449 "title": "Large language models in fault localization", 450 "relevance": "Directly related prior work evaluating ChatGPT on Defects4J; establishes baseline for LLM fault localization", 451 "authors": "Wu et al.", 452 "year": 2023 453 }, 454 { 455 "title": "LLMAO: LLMs for Test-Free Fault Localization", 456 "relevance": "Test-free fault localization approach using LLMs; alternative methodology to the bug-report-context baseline", 457 "authors": "Yang et al.", 458 "year": 2024 459 }, 460 { 461 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 462 "relevance": "Foundational paper on CoT prompting; motivates RQ3 exploration of CoT effectiveness for code reasoning", 463 "authors": "Wei et al.", 464 "year": 2022 465 
}, 466 { 467 "title": "Evaluating fault localization and program repair capabilities of existing closed-source general-purpose LLMs", 468 "relevance": "Comparative study of GPT-4 and Claude on fault localization and repair; overlapping research question and models", 469 "authors": "Jiang et al.", 470 "year": 2024 471 }, 472 { 473 "title": "GitBug-Java: A reproducible benchmark of recent Java bugs", 474 "relevance": "Recent Java bug dataset designed for reproducibility; related to Defects4J benchmark selection", 475 "authors": "Silva et al.", 476 "year": 2024 477 } 478 ], 479 "engagement_factors": { 480 "practical_relevance": { 481 "score": 2, 482 "justification": "Fault localization is a real industrial problem. Results are practical (time/cost analysis) but no novel methods proposed—purely observational comparison of existing APIs." 483 }, 484 "surprise_contrarian": { 485 "score": 1, 486 "justification": "Findings confirm conventional wisdom: Gemini is strong, context helps, few-shot shows diminishing returns, CoT is inconsistent. No surprising results that challenge prior understanding." 487 }, 488 "fear_safety": { 489 "score": 0, 490 "justification": "Paper is purely empirical evaluation on safe benchmarks. No AI safety, adversarial, or risk-related concerns raised or explored." 491 }, 492 "drama_conflict": { 493 "score": 0, 494 "justification": "Straightforward technical comparison. No controversy, debate, or conflicting findings presented." 495 }, 496 "demo_ability": { 497 "score": 1, 498 "justification": "Approaches are doable (call public APIs with different prompts) but no released code, prompts, or notebooks make demo/reproduction difficult." 499 }, 500 "brand_recognition": { 501 "score": 2, 502 "justification": "Tests well-known models from major companies (OpenAI GPT-4, Google Gemini, Alibaba Qwen, DeepSeek) but no novel models or lesser-known labs." 
503 } 504 }, 505 "hn_data": { 506 "threads": [ 507 { 508 "hn_id": "38424009", 509 "title": "Does GPT-4 Pass the Turing Test?", 510 "points": 60, 511 "comments": 88, 512 "url": "https://news.ycombinator.com/item?id=38424009" 513 }, 514 { 515 "hn_id": "38093289", 516 "title": "Does GPT-4 Pass the Turing Test?", 517 "points": 5, 518 "comments": 1, 519 "url": "https://news.ycombinator.com/item?id=38093289" 520 }, 521 { 522 "hn_id": "10444607", 523 "title": "From F to DOT: Type Soundness Proofs with Definitional Interpreters", 524 "points": 2, 525 "comments": 2, 526 "url": "https://news.ycombinator.com/item?id=10444607" 527 } 528 ], 529 "top_points": 60, 530 "total_points": 67, 531 "total_comments": 91 532 } 533 }