scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30438B)
      1 {
      2   "paper": {
      3     "title": "Large Language Models for Fault Localization: An Empirical Study",
      4     "authors": [
      5       "Yingjian Xiao",
      6       "Weiwei Gong",
      7       "Rongqun Hu",
      8       "Hongwei Li",
      9       "Anquan Jie"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2510.20521",
     14     "doi": "10.48550/arXiv.2510.20521"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "Gemini-2.5-flash consistently outperforms GPT-4.1 mini, Qwen2.5-coder-32b-instruct, and DeepSeek-V3 on statement-level fault localization across both HumanEval-Java and Defects4J datasets. Providing bug report context yields the largest performance boost (e.g., Gemini Top@5 jumps from 6.08% to 23.67% on Defects4J). Few-shot learning shows diminishing returns beyond 2 examples, and chain-of-thought reasoning is model-dependent—it degrades GPT-4.1 mini on HumanEval-Java but substantially improves DeepSeek-V3 on Defects4J.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No code repository, GitHub link, or archive is provided anywhere in the paper. The experimental scripts, prompt templates, and evaluation pipeline are not released."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper uses two publicly available datasets: HumanEval-Java (164 bugs, reference [28]) and Defects4J v1.2.0 (395 bugs). Both are standard public benchmarks."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No environment specifications, dependency lists, or hardware details are provided. The paper does not describe what system the experiments ran on or what software versions were used."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No reproduction instructions, README, or step-by-step guide is provided. A reader could not reproduce the experiments without substantial guesswork about prompts, parsing, and evaluation pipeline."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Main performance tables (Tables 3–8) report only point estimates for Top@k and Pass@k. No confidence intervals or error bars accompany the primary accuracy results. Box plots appear only in the time overhead analysis."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper uses Scott-Knott ESD tests for time overhead analysis (Section 4.4.1), but the core performance comparison claims in RQ1–RQ3 (e.g., 'Gemini-2.5-flash performs best') are made purely by comparing numbers in tables without any significance test."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Tables 3–8 report absolute percentages for all models and conditions, allowing readers to compute the magnitude of differences (e.g., Gemini Top@5=65.03% vs Qwen=46.63% on HumanEval-Java, and with/without bug report comparisons on Defects4J)."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No power analysis or justification for using 164 bugs (HumanEval-Java) or 395 bugs (Defects4J). No discussion of whether these sizes are sufficient for the statistical claims made."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper runs 13 rounds per experiment and computes Pass@k, but does not report standard deviation, IQR, or any spread measure for the main accuracy results. Box plots in the time analysis show distributions, but the core performance tables are point estimates only."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Zero-shot standard prompting serves as the baseline, with Few-shot and CoT strategies compared against it. Multiple models are compared against each other across all conditions."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All four models are recent (2024–2025 releases per Table 2): GPT-4.1 mini (Apr 2025), Qwen2.5-coder-32b-instruct (Nov 2024), Gemini-2.5-flash (Apr 2025), DeepSeek-V3 (Dec 2024)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "RQ1 tests with/without bug report context, RQ2 systematically varies shot count (1/2/3), and RQ3 adds/removes CoT reasoning. These controlled variations show which prompting components contribute to performance."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Five metrics are used: Top@5, Top@10, Pass@1, Pass@5, Pass@10 (Section 3.4)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation of model outputs is performed. Evaluation is entirely automated via exact matching of predicted fault locations against ground truth."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Models are evaluated in zero-shot/few-shot mode without fine-tuning. The full datasets (HumanEval-Java, Defects4J) serve as test sets that were not used for model training or selection."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by model, dataset, prompting strategy (zero-shot/few-shot/CoT), and shot count (1/2/3). Defects4J results are further split by with/without bug report. Time analysis separates successful vs failed localizations."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper discusses where approaches fail: CoT degrades GPT-4.1 mini performance on HumanEval-Java, 3-shot causes regression for some models, and failed localizations take significantly longer (Section 4.4.1). Finding 1 of RQ3 explicitly discusses when CoT does not help."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative results are reported: CoT hurts GPT-4.1 mini and DeepSeek-V3 on HumanEval-Java (Table 7), 3-shot degrades performance for multiple models, and increasing examples shows diminishing or negative returns (RQ2 findings)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims that bug report context significantly improves performance (supported by Table 4), few-shot has diminishing returns (supported by Tables 5–6), and CoT depends on model reasoning ability (supported by Tables 7–8). All are backed by experimental data."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims like 'providing bug report context significantly improves performance' are supported by controlled single-variable manipulation (with vs without bug report, same models and dataset). Similarly, few-shot and CoT claims are based on adding/removing specific prompt components while holding other variables constant."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title 'Large Language Models for Fault Localization' is broad, but the study tests only 4 models on 2 Java datasets. While the threats section notes the Java-only limitation, the title and the findings language ('大语言模型在缺陷定位任务中的性能特征', i.e. 'the performance characteristics of large language models in fault localization tasks') generalize beyond the tested scope."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The threats section discusses methodological concerns (prompt bias, API variability, strict matching) but does not discuss alternative explanations for specific findings. For example, no discussion of why Gemini outperforms (training data? architecture? inference strategy?) or why CoT helps some models but not others beyond stating it 'depends on reasoning ability.'"
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures statement-level fault localization accuracy (Top@k, Pass@k) and claims to evaluate fault localization ability. The measurements directly correspond to the claims without proxy gaps."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Models are named (GPT-4.1 mini, Qwen2.5-coder-32b-instruct, Gemini-2.5-flash, DeepSeek-V3) with release dates and parameter counts in Table 2, but no API version identifiers or snapshot dates are provided. The schema explicitly states marketing names like 'Gemini-2.5' without a snapshot date do not count."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper describes prompting strategies (Standard, Few-shot, CoT) at a conceptual level but never provides the actual prompt text used in experiments. Readers cannot reconstruct the exact prompts sent to the models."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "The number of runs (n=13) and k values (1, 5, 10) are stated, but model inference hyperparameters (temperature, top-p, max tokens) are not reported anywhere in the paper."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The paper sends prompts directly to model APIs without any multi-step reasoning framework, tool use, or feedback loops."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not describe how buggy code files were formatted into prompts, how bug reports were extracted and formatted for Defects4J, how model outputs were parsed, or how the exact-match evaluation was implemented at the code level."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 5 ('THREATS TO VALIDITY') provides substantive discussion of internal, construct, and external validity threats across three subsections."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The threats section includes specific concerns: prompt wording bias, strict complete-match criterion potentially overestimating error rates, format-parsing false negatives (missing brackets/semicolons), potential data leakage despite newer datasets, and coverage of only Java and limited defect types."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "External validity section explicitly states results are limited to Java and two specific datasets, that models evolve quickly and may invalidate findings, and that tested defect types do not cover complex industrial defects like concurrency bugs or security vulnerabilities."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No raw experimental outputs (model responses, parsed results, per-bug success/failure logs) are released. Only aggregated metrics appear in the tables."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Both datasets are well-characterized: HumanEval-Java (164 manually injected Java bugs with JUnit tests, from reference [28]) and Defects4J v1.2.0 (395 bugs from 6 open-source projects with bug reports and tests). Table 1 summarizes dataset properties."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. The study uses standard public benchmarks (HumanEval-Java, Defects4J) as data sources."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The pipeline from raw datasets to final metrics is not documented. Missing details include: how code was extracted and formatted into prompts, how model responses were parsed, how line-number matching was performed, and how Pass@k was computed from the 13 rounds."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding information, grants, or acknowledgments section appears in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Nanchang Institute of Technology (Computer Information Engineering School) and Jiangxi Normal University (School of Artificial Intelligence). No authors are affiliated with the companies whose models are evaluated."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Cannot determine funder independence because no funding is disclosed. The absence of a funding disclosure does not confirm independence."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial disclosure statement is provided."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Table 2 lists knowledge cutoff dates for all four models: GPT-4.1 mini (2024-06), Qwen2.5-coder-32b-instruct (2023-10), Gemini-2.5-flash (2025-01), DeepSeek-V3 (2024-07)."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "The paper notes HumanEval-Java was specifically created to avoid data leakage (Section 3.2) and acknowledges in Section 5 that 'data leakage cannot be completely ruled out' for existing benchmarks whose code may have appeared in training data."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "The paper deliberately selects HumanEval-Java because it was designed to prevent contamination. For Defects4J (2018, predating all model cutoffs), the construct validity threats acknowledge that 'if test data or similar code fragments appeared in model training data, the model may exhibit overly strong generalization.'"
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. It is a benchmark evaluation of LLMs on code datasets."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The study evaluates models on public code benchmarks."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Section 4.4.2 and Tables 9–10 report per-round API costs for each model on each dataset, including input/output token counts and pricing. Time overhead per inference is also reported in Section 4.4.1."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Per-call costs are reported but the total computational budget (total API spend across all 13 rounds × all conditions × all models) is not stated. No hardware specifications are provided."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "The paper runs 13 rounds per experiment to compute Pass@k, but does not report result variation across runs or discuss seed sensitivity. No standard deviation or spread measure accompanies the main accuracy metrics."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Section 3.4 explicitly states n=13 total samples per experiment and k=1,5,10 for the Pass@k computation."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search is described. The paper does not discuss how prompt templates were designed or whether alternative templates were tried. Temperature and other inference parameters are not reported."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The paper does not explain how the specific prompt templates, few-shot examples, or CoT instructions were selected. The threats section acknowledges that 'the choice of examples, subtle wording differences, and chain-of-thought guidance' may affect results, but no selection process is documented."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper makes numerous pairwise comparisons across 4 models × multiple settings but applies no correction for multiple comparisons. Scott-Knott ESD is used only for time analysis grouping, not for the main accuracy claims."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": true,
    327         "justification": "The authors evaluate third-party models (GPT, Qwen, Gemini, DeepSeek) they did not create. No self-comparison bias exists since no novel system is proposed."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "RQ4 explicitly analyzes time overhead and API cost alongside accuracy results. Tables 9–10 report per-model costs, and the time analysis (Figures 1–2) shows inference time distributions, enabling cost-performance comparisons."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not discuss whether HumanEval-Java (synthetic bugs) or Defects4J (historical bugs from 6 projects) adequately measure real-world fault localization ability. The difference in results between the two benchmarks (65% vs 24% Top@5 for Gemini) suggests a validity gap, but this is not analyzed."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is used. Models are prompted directly via APIs with no multi-step framework or tool integration."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "The paper lists model training cutoffs (Table 2) and explicitly selects HumanEval-Java because it was created in September 2023 to avoid data leakage. The construct validity section acknowledges that Defects4J (2018) predates all model cutoffs, creating leakage risk."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the prompt format (providing full code, error messages, or bug reports) leaks information beyond what would be available in realistic debugging scenarios."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether HumanEval-Java examples share structural patterns or whether Defects4J bugs from the same project are independent. Non-independence between same-project bugs could inflate reported metrics."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The paper relies only on the temporal argument that HumanEval-Java was designed to avoid contamination."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Gemini-2.5-flash outperforms all other models on both datasets, demonstrating strong adaptability and generalization for fault localization.",
    371       "evidence": "Tables 3–4: Gemini achieves Top@5=65.03% on HumanEval-Java and Top@5=23.67% (with bug report) on Defects4J, highest among all models in all metrics.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "Bug report context significantly improves fault localization performance across all models.",
    376       "evidence": "Table 4: All models show large improvements with bug report context on Defects4J (e.g., Gemini Top@5 from 6.08% to 23.67%, GPT-4.1 mini from 3.90% to 15.15%).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Few-shot learning shows diminishing returns, with 2-shot typically being optimal and 3-shot sometimes degrading performance.",
    381       "evidence": "Tables 5–6: GPT-4.1 mini peaks at 2-shot (56.44% Top@10) then drops at 3-shot (51.81%); Qwen degrades steadily from 1-shot onward. DeepSeek-V3 is an exception, peaking at 3-shot.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Chain-of-thought reasoning effectiveness is model-dependent: it helps some models on complex defects but degrades others.",
    386       "evidence": "Tables 7–8: CoT degrades GPT-4.1 mini on HumanEval-Java (Top@5 drops from 50% to 34.36%) but improves DeepSeek-V3 on Defects4J (Top@5 rises from 11.56% to 19.11%). Gemini is stable.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Failed localization tasks take significantly longer, especially on the Defects4J dataset.",
    391       "evidence": "Section 4.4.1 with box plots: Failed tasks show higher median inference times across all models. DeepSeek-V3 averages 22.96s on failed Defects4J tasks vs 3–9s on successes.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Closed-source models generally have higher API costs than open-source models.",
    396       "evidence": "Tables 9–10 report per-round costs. On Defects4J, GPT-4.1 mini costs $1.917 and Gemini $1.344, while Qwen costs ¥8.642 (~$1.19) and DeepSeek ¥10.541 (~$1.45). The cost difference is modest and currency-dependent.",
    397       "supported": "weak"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Prompts not provided",
    403       "detail": "The actual prompt text for Standard, Few-shot, and CoT strategies is never shown. Readers cannot verify what the models were actually asked, making reproduction impossible. Prompt design is acknowledged as a threat to validity."
    404     },
    405     {
    406       "flag": "No code or artifacts released",
    407       "detail": "No experimental code, evaluation scripts, or raw results are released. Combined with missing prompts, this makes the study completely non-reproducible."
    408     },
    409     {
    410       "flag": "No significance tests on main results",
    411       "detail": "The core performance comparisons (RQ1–RQ3) rely entirely on comparing numbers in tables. Claims like 'Gemini outperforms all models' have no statistical backing. Scott-Knott ESD is used only for time overhead analysis."
    412     },
    413     {
    414       "flag": "Missing inference hyperparameters",
    415       "detail": "Temperature, top-p, and max tokens are not reported for any model. These settings significantly affect LLM output and could explain performance differences."
    416     },
    417     {
    418       "flag": "Duplicate references",
    419       "detail": "References [6] and [18] are the same paper (Wu et al. 2023), and references [8]/[9] and [19]/[24] overlap. The paper itself notes '[6] 和18 相同' ('[6] and 18 are the same'), suggesting incomplete editing."
    420     },
    421     {
    422       "flag": "Few-shot example selection not described",
    423       "detail": "The paper states few-shot examples were 'carefully selected' but does not describe the selection criteria, source, or whether they come from the same dataset being tested. This could introduce data leakage or selection bias."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Software testing with large language models: Survey, landscape, and vision",
    429       "authors": ["Junjie Wang", "Yuchao Huang", "Chunyang Chen"],
    430       "year": 2024,
    431       "relevance": "Comprehensive survey of LLMs for software testing tasks, directly relevant to understanding LLM capabilities in code-related tasks."
    432     },
    433     {
    434       "title": "Large language models in fault localisation",
    435       "authors": ["Yonghao Wu", "Zheng Li", "Jie M. Zhang"],
    436       "year": 2023,
    437       "arxiv_id": "2308.15276",
    438       "relevance": "Directly evaluates LLMs (ChatGPT) for fault localization on Defects4J, the most closely related prior work."
    439     },
    440     {
    441       "title": "Large language models for test-free fault localization",
    442       "authors": ["Aidan Z. H. Yang", "Claire Le Goues", "Ruben Martins"],
    443       "year": 2024,
    444       "relevance": "Proposes LLMAO framework applying LLMs to fault localization without test cases, demonstrating LLM potential in this domain."
    445     },
    446     {
    447       "title": "Evaluating fault localization and program repair capabilities of existing closed-source general-purpose LLMs",
    448       "authors": ["Shuai Jiang", "Jie Zhang", "Wei Chen"],
    449       "year": 2024,
    450       "relevance": "Systematic comparison of closed-source LLMs (GPT-4, Claude) for fault localization and repair, closely related evaluation methodology."
    451     },
    452     {
    453       "title": "A quantitative and qualitative evaluation of LLM-based explainable fault localization",
    454       "authors": ["Sungmin Kang", "Gabin An", "Shin Yoo"],
    455       "year": 2024,
    456       "relevance": "Evaluates LLM-based fault localization with chain-of-thought reasoning, directly relevant to this paper's RQ3."
    457     },
    458     {
    459       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    460       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    461       "year": 2022,
    462       "relevance": "Foundational work on chain-of-thought prompting, the core technique studied in this paper's RQ3."
    463     },
    464     {
    465       "title": "Language models are Few-shot learners",
    466       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    467       "year": 2020,
    468       "relevance": "Foundational work on few-shot learning with LLMs, the core technique studied in this paper's RQ2."
    469     },
    470     {
    471       "title": "Hybrid automated program repair by combining large language models and program analysis",
    472       "authors": ["Fengjie Li", "Jiajun Jiang", "Jia Sun"],
    473       "year": 2025,
    474       "relevance": "GiantRepair method combining LLMs with program analysis for automated program repair, demonstrating LLM-augmented debugging."
    475     },
    476     {
    477       "title": "Context-aware prompting for LLM-based program repair",
    478       "authors": ["Yuxiang Li", "Muchen Cai", "Jianan Chen"],
    479       "year": 2025,
    480       "relevance": "CodeCorrector uses test failure information to construct targeted prompts for LLM-based program repair, relevant to prompting strategy design."
    481     },
    482     {
    483       "title": "ThinkRepair: Self-directed automated program repair",
    484       "authors": ["Xin Yin", "Chao Ni", "Shaohua Wang"],
    485       "year": 2024,
    486       "relevance": "Two-phase automated repair using few-shot learning and iterative testing with LLMs, demonstrating reasoning-enhanced repair workflows."
    487     },
    488     {
    489       "title": "Impact of code language models on automated program repair",
    490       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier"],
    491       "year": 2023,
    492       "relevance": "Introduces HumanEval-Java dataset used in this study, designed specifically to avoid data leakage in LLM evaluation."
    493     },
    494     {
    495       "title": "Gitbug-java: A reproducible benchmark of recent java bugs",
    496       "authors": ["André Silva", "Nuno Saavedra", "Martin Monperrus"],
    497       "year": 2024,
    498       "relevance": "New Java fault localization benchmark with better reproducibility and temporal properties, relevant to evaluation methodology."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 2,
    504       "justification": "Practitioners choosing LLMs for automated debugging could use the model comparison and prompting strategy findings to guide selection."
    505     },
    506     "surprise_contrarian": {
    507       "score": 1,
    508       "justification": "The finding that chain-of-thought reasoning can degrade performance is mildly surprising but not strongly contrarian."
    509     },
    510     "fear_safety": {
    511       "score": 0,
    512       "justification": "No safety, security, or AI risk implications in this study of debugging tool accuracy."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "No controversy or provocative claims; straightforward empirical comparison."
    517     },
    518     "demo_ability": {
    519       "score": 0,
    520       "justification": "No code, tool, or demo is released. Results cannot be tried or replicated."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "Evaluates well-known models (GPT, Gemini, DeepSeek) but authors are from lesser-known Chinese universities."
    525     }
    526   }
    527 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs