scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (35662B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Explainable Automated Debugging via Large Language Model-driven Scientific Debugging",
      6     "authors": [
      7       "Sungmin Kang",
      8       "Bei Chen",
      9       "Shin Yoo",
     10       "Jian-Guang Lou"
     11     ],
     12     "year": 2023,
     13     "venue": "Empirical Software Engineering",
     14     "arxiv_id": "2304.02195",
     15     "doi": "10.1007/s10664-024-10594-x"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims are well-supported: 'performs competitively' (Tables 1-2), 'can indicate when confident' (Figure 3), 'accuracy improved for five out of six real-world bugs' (Figure 5), '70% answered they wanted explanations' and '55% satisfied' (Figure 6).",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The main causal claim (explanations improve accuracy) is supported by a within-subjects randomized design where participants see 3 bugs with and 3 without explanations in random order. The ablation study uses controlled single-variable manipulation (debugger vs no debugger).",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Claims are generally bounded to the tested settings. The paper specifies 'on three program repair benchmarks' and reports specific numbers like 'five out of six real-world bugs studied.' Limitations explicitly note 'our technique can only handle single-method bugs' (Section 6.2).",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The threats to validity section (6.1) discusses potential confounds like incorrect implementations and biased responses, but does not substantively discuss alternative explanations for the results — e.g., whether the performance gain is from extra compute rather than the scientific debugging structure, or whether explanation effects are driven by anchoring bias.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper measures patch correctness review accuracy as a proxy for real-world APR utility, and explicitly discusses the distinction between student vs professional performance and between lab study and real deployment. The patch review task is directly motivated by actual industry practice at Meta and Bloomberg (Section 2.1).",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6 'Discussion' contains both 6.1 'Threats to Validity' and 6.2 'Limitations' with substantive discussion of multiple issues.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats are discussed: potential training data overlap mitigated by constructing ARHE (external validity), the 5x time cost (Section 6.2), single-method bug limitation, method-level FL assumption, and the risk of credibility lending to incorrect patches.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Explicit scope boundaries: 'our technique can only handle single-method bugs as of now' (Section 6.2), 'we evaluated in the setting where method-level FL was done' (Section 6.2), and the maximum iteration limit s=3 constraining debugging depth.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No formal funding or acknowledgments section is present. The footnote mentions 'This work was done as part of an internship at Microsoft Research Asia' but there is no explicit funding disclosure.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly listed: Sungmin Kang and Shin Yoo at KAIST, Bei Chen and Jian-Guang Lou at Microsoft Research Asia. The internship arrangement is also disclosed.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Two of four authors are at Microsoft Research Asia, and the work was done during an MSRA internship. Microsoft is a major investor in OpenAI, whose ChatGPT/Codex models are the primary tools evaluated. Microsoft has a financial interest in demonstrating LLM utility for developer tools.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is present in the paper. Microsoft's investment in OpenAI and potential commercial interest in LLM-based developer tools are not disclosed.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Scientific Debugging is formally defined via Zeller's five-step process (Section 2.2), 'plausible patch' vs. 'correct patch' are explicitly distinguished, and LLM capabilities are contextualized.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 1 lists four explicit contributions: AutoSD technique, empirical APR evaluation, developer user study, and qualitative feedback guidelines.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2.1 provides substantive engagement with prior APR and explainability work, directly motivating AutoSD by showing that existing techniques lack human-like reasoning traces and no prior APR tools provided explanations.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "Section 6.1 states 'we plan to make our implementation and repair results publicly available for scrutiny' — this is a promise of future release, not an actual release. No repository URL is provided.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The ARHE dataset (200 mutated bugs from HumanEval) is newly constructed but not released — only promised for future availability. Defects4J and BugsInPy are public benchmarks, but the human study data and ARHE mutations are not available.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions using jdb, pdb, ChatGPT, Codex, and CodeGen but does not specify library versions or environment setup.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are provided. The technique pipeline is described at a conceptual level (Section 3) and the full prompt is in the appendix, but there are no runnable scripts or setup instructions.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "The template-based baseline reports mean ± std (85.77 ± 4.20 in Table 1), but the main LLM-based results (LLM-Base and AutoSD) are point estimates only with no confidence intervals or error bars.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "For the human study time comparison, the paper states 'There is no case where the difference is statistically significant,' implying significance tests were run on the time data. However, no significance tests are reported for the benchmark APR performance comparisons.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Effect sizes are provided throughout: absolute numbers (187 vs 177 correct, Table 1), percentage point differences ('12.4%p more likely to be plausible,' Section 5.2), and per-bug accuracy breakdowns in the human study (Figure 5).",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The human study uses N=20 with no power analysis or justification for why 20 participants were chosen. The benchmark sizes (200 ARHE bugs, Defects4J) are standard but not justified either.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Only the template-based baseline reports variance (100 reruns with std). The LLM-based methods (AutoSD and LLM-Base) report single-run results with 10 patches per bug but no variance across runs or seeds.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Multiple baselines are compared: LLM-Base (direct LLM fix), template-based APR baseline (reverse mutators), Recoder (DL-based APR), and finetuned InCoder from Jiang et al. (Tables 1-2).",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Baselines include Recoder and InCoder results from Jiang et al. (2023) and Codex results from Xia et al. (2022), which are contemporary state-of-the-art APR techniques at the time of writing.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "RQ2 provides a debugger ablation study comparing AutoSD with real debugger execution vs. 'hallucinated' observations, showing that real execution raises the plausible patch rate from 63% to 73% and that removing it inverts the <DONE> confidence signal.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Multiple metrics are reported: plausible patches, correct patches (Tables 1-2), developer accuracy, developer time, and subjective helpfulness ratings (Figure 5) in the human study.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "A human study with 20 participants (including 6 professional developers) evaluates patch review accuracy and time with and without AutoSD explanations (Section 4.2.2, RQ4-RQ5).",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The ARHE dataset was specifically constructed to avoid data contamination by mutating HumanEval, which was designed to avoid overlap with training data. Defects4J is a standard held-out benchmark with separate test suites.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down per benchmark (ARHE, D4J v1.2, v2.0 in Tables 1-2), per bug in the human study (Figure 5), by <DONE> vs no-<DONE> status (Figure 3), and by participant group (students vs professionals, Figure 6).",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "RQ6 (Section 5.6) provides qualitative failure analysis including a disliked example (BIP002) and systematic analysis of 25 failure cases, identifying that 13/25 failures were due to breakpoints not being hit.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Multiple negative results: explanations decreased accuracy for ARHE105 and BIP003 (Section 5.4), professional developers were largely unsatisfied with AutoSD (Figure 6b), and AutoSD takes 5x longer than LLM-Base (Section 6.2).",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "Codex is specified as 'code-davinci-002' but the default model ChatGPT is described only as 'a sibling model to InstructGPT' without a specific version, snapshot date, or API version. CodeGen is identified only by size (6B) without a specific checkpoint.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243         "justification": "The full Scientific Debugging description prompt is provided in Section 4 of the appendix, including detailed examples for hypotheses, predictions, experiments, observations, and conclusions. The fix generation prompt template is also given in Section 3.3.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "The maximum iteration limit s=3 and 10 patches per bug are stated, but critical LLM API parameters (temperature, top-p, max tokens, frequency penalty) are not reported for any of the models used.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The AutoSD pipeline is described in detail in Section 3: prompt construction (3.1), hypothesize-observe-conclude loop (3.2) with DSL commands (REPLACE, ADD, DEL, RUN), debugger interface (jdb/pdb), and fix suggestion (3.3). Figure 1 provides a complete pipeline diagram.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "ARHE construction is documented: 7 mutators applied to HumanEval solutions yielding 200 bugs (Appendix Table 1). Human study design is documented: 12 bugs sampled, divided into 2 groups of 6, randomly assigned to participants (Section 4.2.2).",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Raw data (generated patches, debugging traces, human study responses) are not available. The paper only promises future release: 'we plan to make our implementation and repair results publicly available for scrutiny' (Section 6.1).",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "ARHE construction via 7 mutators on HumanEval is detailed (Section 4.2.1, Appendix Section 2). Human study procedure is described: recruitment channels, task structure, practice problem, 30-40 minute sessions, post-questionnaire, and 5-minute interviews (Section 4.2.2).",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Section 4.2.2: 'we advertised the task to both undergraduate and graduate students with at least 1 year of Python experience, as well as professional developers at a company that specializes in software testing techniques.' Resulting sample: 8 undergrad, 6 graduate, 6 professional developers.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The ARHE construction pipeline is documented with mutator breakdown (Appendix Table 1, 200 bugs). The human study pipeline is documented: 12 bugs sampled → divided into 2 groups of 6 → randomly assigned participants → 6 problems per participant with 3 having explanations.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training data cutoff dates are stated for any of the models used (ChatGPT, Codex, CodeGen). This is relevant because the Defects4J benchmarks could be in the training data.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section 6.1 (External Validity) explicitly discusses the concern: 'A particular concern when using large language models is that their training data may include segments of the evaluation data.' The ARHE dataset was specifically constructed to mitigate this using HumanEval, which was designed to avoid contamination.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "The paper constructs ARHE from HumanEval specifically because 'HumanEval was explicitly made by Chen et al. to avoid data contamination when evaluating their LLM' (Section 4.2.1). However, contamination for Defects4J is only acknowledged as a threat, not actively tested.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No mention of pre-registration for the human study. No link to OSF, AsPredicted, or other pre-registration platform.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": true,
    320           "answer": true,
    321           "justification": "Section 4.2.2 states: 'Our human study received IRB review exemption (IRB-23-054).'",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": true,
    326           "answer": true,
    327           "justification": "Section 4.2.2 reports: 'eight undergraduate and six graduate students, as well as six professional developers whose career span from 3 to 10 years.' Python experience requirement (≥1 year) is also stated.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": true,
    332           "answer": true,
    333           "justification": "Inclusion criteria stated: 'undergraduate and graduate students with at least 1 year of Python experience, as well as professional developers at a company that specializes in software testing techniques' (Section 4.2.2).",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": true,
    338           "answer": true,
    339           "justification": "Randomization is described: bugs divided into 2 groups, participants 'randomly assigned' to a group, explanations provided for 'a randomly selected three of the six cases,' and problems solved 'in a randomized order' (Section 4.2.2).",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": true,
    344           "answer": false,
    345           "justification": "No blinding is described. Participants could see whether an explanation was available or not (the explanation panel was visibly present or absent). No mention of blinding for evaluators assessing patch correctness.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": true,
    350           "answer": false,
    351           "justification": "No explicit attrition reporting. Twenty participants were recruited and results appear to include all 20, but the paper does not explicitly state that all participants completed the study or report dropout.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "Only relative time is mentioned: 'AutoSD could take about five times longer to generate a patch when compared to LLM-Base' (Section 6.2). No absolute API costs, token counts, or per-example costs are reported.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No total computational budget is stated. The paper does not report total API spend, number of API calls, or total tokens consumed across the experiments.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "The template-based baseline is run 100 times with std reported, but the LLM-based methods (AutoSD and LLM-Base) do not report results across multiple seeds or runs. LLM stochasticity is not addressed.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": true,
    379           "justification": "Section 4.2.1 states: 'for each dataset we provide AutoSD with the buggy method and generate 10 patches.' The template-based baseline uses '100 reruns' (Table 1).",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "No hyperparameter search budget is reported. The paper states ChatGPT was chosen because 'we empirically found the best performance' (Section 5.3) but does not describe the selection process or configurations tried.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "ChatGPT is selected as the default model because it showed 'best performance' (Section 5.3), but the selection criteria, validation set, and process for this determination are not described.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": true,
    396           "answer": false,
    397           "justification": "The human study compares accuracy across 12 bugs with and without explanations, and the paper reports significance tests for time differences, but no correction for multiple comparisons is mentioned.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors implement both AutoSD and LLM-Base themselves. They compare against external baselines (Recoder, InCoder from Jiang et al.) but do not acknowledge author-evaluation bias in their own implementation comparisons.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": false,
    409           "justification": "AutoSD takes '5 times longer' than LLM-Base (Section 6.2) due to iterative LLM and debugger calls, but performance is not compared at matched compute budgets. The performance comparison ignores this significant compute difference.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The paper does not discuss whether plausible/correct patch counts on ARHE and Defects4J adequately measure the claimed 'competitive repair performance.' The ARHE dataset's construct validity (mutation-based bugs vs real-world bugs) is not analyzed.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": true,
    420           "answer": true,
    421           "justification": "In RQ3 (Figure 4), the same scaffolding (AutoSD) is used across different LLMs (CodeGen, Codex, ChatGPT) and compared against the same LLM-Base scaffold, isolating the model variable from the scaffold variable.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "While ARHE is constructed from HumanEval to mitigate contamination, temporal leakage is not explicitly discussed. Defects4J bugs (2014-era) predate all model training cutoffs but this temporal relationship is not analyzed.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "No discussion of whether the evaluation setup leaks information. The buggy function, failing test, and error message are all provided to the model, but whether this mirrors realistic information availability is not discussed.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "No discussion of whether benchmark bugs share structural similarities that could inflate results, or whether ARHE bugs drawn from the same HumanEval problems introduce non-independence.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No concrete leakage detection method is applied. ARHE construction mitigates contamination by design, but no canary strings, membership inference, or n-gram overlap tests are used to verify the absence of leakage.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "AutoSD achieves competitive automated program repair performance compared to LLM-Base and prior techniques (Recoder, InCoder) on Defects4J v1.2/v2.0 and ARHE benchmarks.",
    456       "evidence": "Tables 1–2: AutoSD fixes 187/189 ARHE bugs (vs. 177/179 for LLM-Base) and 76/113 on Defects4J v1.2/v2.0 (vs. 87/110 for LLM-Base).",
    457       "supported": "strong"
    458     },
    459     {
    460       "claim": "The <DONE> token reliably signals higher precision: <DONE>-predicted solutions are 12.4pp more likely to be plausible and 89% of plausible+<DONE> patches are correct.",
    461       "evidence": "Figure 3 and Section 5.2 ablation comparing <DONE> vs. NoDone conditions across ARHE and Defects4J.",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Removing real code execution (having LLM hallucinate observations) degrades performance from 73% to 63% plausible and inverts the <DONE> signal from +12.4pp to -11pp.",
    466       "evidence": "Section 5.2 ablation on ARHE dataset with debugger removed.",
    467       "supported": "strong"
    468     },
    469     {
    470       "claim": "AutoSD explanations improve developer accuracy for patch correctness judgment in 5 of 6 real-world bugs without increasing review time.",
    471       "evidence": "Figure 5 per-bug accuracy and time comparisons across 20 participants; improvements concentrated in BugsInPy real-world bugs.",
    472       "supported": "moderate"
    473     },
    474     {
    475       "claim": "Professional developers are substantially less satisfied with AutoSD than student participants (1/6 vs. majority of 14).",
    476       "evidence": "Figure 6 post-questionnaire results broken down by group; interviews attribute this to lack of specification linkage and interface limitations.",
    477       "supported": "moderate"
    478     },
    479     {
    480       "claim": "70% of all participants want explanations when using APR tools, and 55% are satisfied with the Scientific Debugging formulation.",
    481       "evidence": "Figure 6 aggregated post-questionnaire responses (n=20).",
    482       "supported": "moderate"
    483     },
    484     {
    485       "claim": "AutoSD performance improves as the underlying LLM scales from CodeGen-6B to Codex to ChatGPT.",
    486       "evidence": "Figure 4 showing plausible patch count on ARHE across three models for both AutoSD and LLM-Base.",
    487       "supported": "moderate"
    488     }
    489   ],
    490   "methodology_tags": [
    491     "benchmark-eval",
    492     "case-study",
    493     "qualitative"
    494   ],
    495   "key_findings": "AutoSD uses LLMs to emulate Scientific Debugging—generating hypotheses, running real debugger experiments, and reaching conclusions—enabling competitive automated program repair while producing human-readable explanations. On three benchmarks, AutoSD matches or exceeds LLM-Base and outperforms prior DL-based techniques (Recoder, InCoder) despite having weaker fault localization assumptions. A human study with 20 participants shows explanations improve patch correctness judgment accuracy for 5 of 6 real-world bugs without increasing review time, though professional developers (5/6 unsatisfied) are notably more critical than students, pointing to unmet needs for specification linkage and IDE integration.",
    496   "red_flags": [
    497     {
    498       "flag": "Small human study",
    499       "detail": "20 participants each evaluating 6 bugs (3 with and 3 without explanations) is insufficient for subgroup analysis; statistical significance of accuracy gains is not demonstrated despite the claim of improvement."
    500     },
    501     {
    502       "flag": "Code not released",
    503       "detail": "The implementation is only 'planned' for release at time of publication; results cannot be independently reproduced."
    504     },
    505     {
    506       "flag": "ChatGPT version unspecified",
    507       "detail": "The primary model is described as 'a sibling model to InstructGPT' without a snapshot date or model ID, making replication impossible as ChatGPT has changed significantly over time."
    508     },
    509     {
    510       "flag": "No LLM sampling parameters",
    511       "detail": "Temperature, top-p, and other hyperparameters are not reported for any model, a significant reproducibility gap for stochastic LLM-based systems."
    512     },
    513     {
    514       "flag": "Single-run LLM results",
    515       "detail": "Tables 1–2 report single-run results for LLM-Base and AutoSD with no variance, despite LLM generation being stochastic."
    516     },
    517     {
    518       "flag": "Microsoft co-author conflict",
    519       "detail": "Two of four authors are at Microsoft Research Asia, the institutional home of the technique; no competing interests statement is provided."
    520     }
    521   ],
    522   "cited_papers": [
    523     {
    524       "title": "Impact of Code Language Models on Automated Program Repair",
    525       "relevance": "Primary empirical comparison baseline; provides Recoder and InCoder results on Defects4J used to benchmark AutoSD."
    526     },
    527     {
    528       "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)",
    529       "relevance": "Source of HumanEval benchmark used to construct ARHE dataset; Codex is one of the LLMs evaluated in AutoSD."
    530     },
    531     {
    532       "title": "Trust Enhancement Issues in Program Repair",
    533       "relevance": "Motivates the need for explanations in APR; cited for developer expectation that explanations including root causes are the most wanted APR output."
    534     },
    535     {
    536       "title": "Practitioners' Expectations on Automated Fault Localization",
    537       "relevance": "Key motivation: 85%+ of developers want rationales for automated debugging results."
    538     },
    539     {
    540       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    541       "relevance": "Foundational technique motivating LLMs' ability to generate step-by-step reasoning traces."
    542     },
    543     {
    544       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    545       "relevance": "Related approach of interleaving LLM reasoning with tool use/environment interaction."
    546     },
    547     {
    548       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    549       "relevance": "Primary APR benchmark used for evaluation."
    550     },
    551     {
    552       "title": "BugsInPy: A Database of Existing Bugs in Python Programs",
    553       "relevance": "Python benchmark used for human study bugs."
    554     },
    555     {
    556       "title": "Towards Developer-Centered Automatic Program Repair: Findings from Bloomberg",
    557       "relevance": "Industrial adoption study motivating the need for developer-friendly APR explanations."
    558     }
    559   ],
    560   "engagement_factors": {
    561     "practical_relevance": {
    562       "score": 2,
    563       "justification": "AutoSD is a practical debugging technique that could be integrated into developer workflows, but requires LLM API access and debugger infrastructure to deploy."
    564     },
    565     "surprise_contrarian": {
    566       "score": 1,
    567       "justification": "The idea that LLMs can emulate scientific debugging is a novel combination but not contrarian — it confirms the expectation that LLMs can follow structured reasoning processes."
    568     },
    569     "fear_safety": {
    570       "score": 0,
    571       "justification": "No AI safety or security concerns raised; the paper is about developer tooling for debugging."
    572     },
    573     "drama_conflict": {
    574       "score": 0,
    575       "justification": "No controversy or conflict; the paper presents a new technique with balanced evaluation."
    576     },
    577     "demo_ability": {
    578       "score": 0,
    579       "justification": "Code is not released (only promised), so no one can try the tool immediately."
    580     },
    581     "brand_recognition": {
    582       "score": 2,
    583       "justification": "Microsoft Research Asia is a well-known lab, and the paper uses ChatGPT/OpenAI models which are widely recognized."
    584     }
    585   },
    586   "hn_data": {
    587     "threads": [
    588       {
    589         "hn_id": "43578430",
    590         "title": "DeepSeek: Inference-Time Scaling for Generalist Reward Modeling",
    591         "points": 163,
    592         "comments": 35,
    593         "url": "https://news.ycombinator.com/item?id=43578430"
    594       },
    595       {
    596         "hn_id": "22875937",
    597         "title": "Air-ViBeR: Exfiltrating Data from Air-Gapped Computers via Covert Vibrations",
    598         "points": 9,
    599         "comments": 0,
    600         "url": "https://news.ycombinator.com/item?id=22875937"
    601       },
    602       {
    603         "hn_id": "39941576",
    604         "title": "Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks",
    605         "points": 3,
    606         "comments": 1,
    607         "url": "https://news.ycombinator.com/item?id=39941576"
    608       },
    609       {
    610         "hn_id": "37040795",
    611         "title": "Retroformer: Retrospective Large Language Agents",
    612         "points": 1,
    613         "comments": 1,
    614         "url": "https://news.ycombinator.com/item?id=37040795"
    615       },
    616       {
    617         "hn_id": "38765461",
    618         "title": "SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis",
    619         "points": 1,
    620         "comments": 0,
    621         "url": "https://news.ycombinator.com/item?id=38765461"
    622       },
    623       {
    624         "hn_id": "26728012",
    625         "title": "Revisiting Rashomon: A Comment on “The Two Cultures”",
    626         "points": 1,
    627         "comments": 0,
    628         "url": "https://news.ycombinator.com/item?id=26728012"
    629       },
    630       {
    631         "hn_id": "22896956",
    632         "title": "Exfiltrating Data from Air-Gapped Computers via Covert Surface ViBrAtIoNs",
    633         "points": 1,
    634         "comments": 0,
    635         "url": "https://news.ycombinator.com/item?id=22896956"
    636       }
    637     ],
    638     "top_points": 163,
    639     "total_points": 179,
    640     "total_comments": 37
    641   }
    642 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs