scan.json (33234B)
1 { 2 "paper": { 3 "title": "Explainable Automated Debugging via Large Language Model-driven Scientific Debugging", 4 "authors": [ 5 "Sungmin Kang", 6 "Bei Chen", 7 "Shin Yoo", 8 "Jian-Guang Lou" 9 ], 10 "year": 2023, 11 "venue": "Empirical Software Engineering", 12 "arxiv_id": "2304.02195", 13 "doi": "10.1007/s10664-024-10594-x" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval", "qualitative"], 18 "key_findings": "AutoSD, which uses LLMs to emulate Scientific Debugging with iterative hypothesize-observe-conclude loops and real debugger interaction, achieves competitive automated program repair performance (187 correct on ARHE, 113 on Defects4J v2.0) while generating explainable reasoning traces. A human study with 20 participants showed explanations improved patch review accuracy on 5 of 6 real-world bugs without increasing review time. The <DONE> token reliably indicates higher confidence in generated patches, and debugger ablation reduced plausible patch rate from 73% to 63%. Professional developers were notably less satisfied with explanations than students, suggesting a need for tighter integration with development workflows.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "Section 6.1 states 'we plan to make our implementation and repair results publicly available for scrutiny' — this is a promise of future release, not an actual release. No repository URL is provided." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The ARHE dataset (200 mutated bugs from HumanEval) is newly constructed but not released — only promised for future availability. Defects4J and BugsInPy are public benchmarks, but the human study data and ARHE mutations are not available."
30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions using jdb, pdb, ChatGPT, Codex, and CodeGen but does not specify library versions or environment setup." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions are provided. The technique pipeline is described at a conceptual level (Section 3) and the full prompt is in the appendix, but there are no runnable scripts or setup instructions." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "The template-based baseline reports mean ± std (85.77 ± 4.20 in Table 1), but the main LLM-based results (LLM-Base and AutoSD) are point estimates only with no confidence intervals or error bars." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": true, 51 "justification": "For the human study time comparison, the paper states 'There is no case where the difference is statistically significant,' implying significance tests were run on the time data. However, no significance tests are reported for the benchmark APR performance comparisons." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Effect sizes are provided throughout: absolute numbers (187 vs 177 correct, Table 1), percentage point differences ('12.4%p more likely to be plausible,' Section 5.2), and per-bug accuracy breakdowns in the human study (Figure 5)." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The human study uses N=20 with no power analysis or justification for why 20 participants were chosen. The benchmark sizes (200 ARHE bugs, Defects4J) are standard but not justified either." 
62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "Only the template-based baseline reports variance (100 reruns with std). The LLM-based methods (AutoSD and LLM-Base) report single-run results with 10 patches per bug but no variance across runs or seeds." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Multiple baselines are compared: LLM-Base (direct LLM fix), template-based APR baseline (reverse mutators), Recoder (DL-based APR), and finetuned InCoder from Jiang et al. (Tables 1-2)." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Baselines include Recoder and InCoder results from Jiang et al. (2023) and Codex results from Xia et al. (2022), which are contemporary state-of-the-art APR techniques at the time of writing." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "RQ2 provides a debugger ablation study comparing AutoSD with real debugger execution vs. 'hallucinated' observations, showing that real execution improves plausible patch rate from 63% to 73%, and that the <DONE> confidence signal turns from positive to negative when observations are hallucinated." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Multiple metrics are reported: plausible patches, correct patches (Tables 1-2), developer accuracy, developer time, and subjective helpfulness ratings (Figure 5) in the human study." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": true, 93 "justification": "A human study with 20 participants (including 6 professional developers) evaluates patch review accuracy and time with and without AutoSD explanations (Section 4.2.2, RQ4-RQ5)."
94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "The ARHE dataset was specifically constructed to avoid data contamination by mutating HumanEval, which was designed to avoid overlap with training data. Defects4J is a standard held-out benchmark with separate test suites." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down per benchmark (ARHE, D4J v1.2, v2.0 in Tables 1-2), per bug in the human study (Figure 5), by <DONE> vs no-<DONE> status (Figure 3), and by participant group (students vs professionals, Figure 6)." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "RQ6 (Section 5.6) provides qualitative failure analysis including a disliked example (BIP002) and systematic analysis of 25 failure cases, identifying that 13/25 failures were due to breakpoints not being hit." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Multiple negative results: explanations decreased accuracy for ARHE105 and BIP003 (Section 5.4), professional developers were largely unsatisfied with AutoSD (Figure 6b), and AutoSD takes 5x longer than LLM-Base (Section 6.2)." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims are well-supported: 'performs competitively' (Tables 1-2), 'can indicate when confident' (Figure 3), 'accuracy improved for five out of six real-world bugs' (Figure 5), '70% answered they wanted explanations' and '55% satisfied' (Figure 6)." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The main causal claim (explanations improve accuracy) is supported by a within-subjects randomized design where participants see 3 bugs with and 3 without explanations in random order. 
The ablation study uses controlled single-variable manipulation (debugger vs no debugger)." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "Claims are generally bounded to the tested settings. The paper specifies 'on three program repair benchmarks' and reports specific numbers like 'five out of six real-world bugs studied.' Limitations explicitly note 'our technique can only handle single-method bugs' (Section 6.2)." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The threats to validity section (6.1) discusses potential confounds like incorrect implementations and biased responses, but does not substantively discuss alternative explanations for the results — e.g., whether the performance gain is from extra compute rather than the scientific debugging structure, or whether explanation effects are driven by anchoring bias." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper measures patch correctness review accuracy as a proxy for real-world APR utility, and explicitly discusses the distinction between student vs professional performance and between lab study and real deployment. The patch review task is directly motivated by actual industry practice at Meta and Bloomberg (Section 2.1)." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Codex is specified as 'code-davinci-002' but the default model ChatGPT is described only as 'a sibling model to InstructGPT' without a specific version, snapshot date, or API version. CodeGen is identified only by size (6B) without a specific checkpoint." 
148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "The full Scientific Debugging description prompt is provided in the appendix (Section 4 of the appendix), including detailed examples for hypotheses, predictions, experiments, observations, and conclusions. The fix generation prompt template is also given in Section 3.3." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "The maximum iteration limit s=3 and 10 patches per bug are stated, but critical LLM API parameters (temperature, top-p, max tokens, frequency penalty) are not reported for any of the models used." 158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "The AutoSD pipeline is described in detail in Section 3: prompt construction (3.1), hypothesize-observe-conclude loop (3.2) with DSL commands (REPLACE, ADD, DEL, RUN), debugger interface (jdb/pdb), and fix suggestion (3.3). Figure 1 provides a complete pipeline diagram." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "ARHE construction is documented: 7 mutators applied to HumanEval solutions yielding 200 bugs (Appendix Table 1). Human study design is documented: 12 bugs sampled, divided into 2 groups of 6, randomly assigned to participants (Section 4.2.2)." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 6 'Discussion' contains both 6.1 'Threats to Validity' and 6.2 'Limitations' with substantive discussion of multiple issues." 
175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Specific threats are discussed: potential training data overlap mitigated by constructing ARHE (external validity), the 5x time cost (Section 6.2), single-method bug limitation, method-level FL assumption, and the risk of credibility lending to incorrect patches." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Explicit scope boundaries: 'our technique can only handle single-method bugs as of now' (Section 6.2), 'we evaluated in the setting where method-level FL was done' (Section 6.2), and the maximum iteration limit s=3 constraining debugging depth." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "Raw data (generated patches, debugging traces, human study responses) are not available. The paper only promises future release: 'we plan to make our implementation and repair results publicly available for scrutiny' (Section 6.1)." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "ARHE construction via 7 mutators on HumanEval is detailed (Section 4.2.1, Appendix Section 2). Human study procedure is described: recruitment channels, task structure, practice problem, 30-40 minute sessions, post-questionnaire, and 5-minute interviews (Section 4.2.2)." 197 }, 198 "recruitment_methods_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 4.2.2: 'we advertised the task to both undergraduate and graduate students with at least 1 year of Python experience, as well as professional developers at a company that specializes in software testing techniques.' Resulting sample: 8 undergrad, 6 graduate, 6 professional developers." 
202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The ARHE construction pipeline is documented with mutator breakdown (Appendix Table 1, 200 bugs). The human study pipeline is documented: 12 bugs sampled → divided into 2 groups of 6 → randomly assigned participants → 6 problems per participant with 3 having explanations." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No formal funding or acknowledgments section is present. The footnote mentions 'This work was done as part of an internship at Microsoft Research Asia' but there is no explicit funding disclosure." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: Sungmin Kang and Shin Yoo at KAIST, Bei Chen and Jian-Guang Lou at Microsoft Research Asia. The internship arrangement is also disclosed." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "Two of four authors are at Microsoft Research Asia, and the work was done during an MSRA internship. Microsoft is a major investor in OpenAI, whose ChatGPT/Codex models are the primary tools evaluated. Microsoft has a financial interest in demonstrating LLM utility for developer tools." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present in the paper. Microsoft's investment in OpenAI and potential commercial interest in LLM-based developer tools are not disclosed." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "No training data cutoff dates are stated for any of the models used (ChatGPT, Codex, CodeGen). 
This is relevant because the Defects4J benchmarks could be in the training data." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Section 6.1 (External Validity) explicitly discusses the concern: 'A particular concern when using large language models is that their training data may include segments of the evaluation data.' The ARHE dataset was specifically constructed to mitigate this using HumanEval, which was designed to avoid contamination." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": true, 245 "justification": "The paper constructs ARHE from HumanEval specifically because 'HumanEval was explicitly made by Chen et al. to avoid data contamination when evaluating their LLM' (Section 4.2.1). However, contamination for Defects4J is only acknowledged as a threat, not actively tested." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": true, 251 "answer": false, 252 "justification": "No mention of pre-registration for the human study. No link to OSF, AsPredicted, or other pre-registration platform." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": true, 256 "answer": true, 257 "justification": "Section 4.2.2 states: 'Our human study received IRB review exemption (IRB-23-054).'" 258 }, 259 "demographics_reported": { 260 "applies": true, 261 "answer": true, 262 "justification": "Section 4.2.2 reports: 'eight undergraduate and six graduate students, as well as six professional developers whose career span from 3 to 10 years.' Python experience requirement (≥1 year) is also stated." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": true, 266 "answer": true, 267 "justification": "Inclusion criteria stated: 'undergraduate and graduate students with at least 1 year of Python experience, as well as professional developers at a company that specializes in software testing techniques' (Section 4.2.2)." 
268 }, 269 "randomization_described": { 270 "applies": true, 271 "answer": true, 272 "justification": "Randomization is described: bugs divided into 2 groups, participants 'randomly assigned' to a group, explanations provided for 'a randomly selected three of the six cases,' and problems solved 'in a randomized order' (Section 4.2.2)." 273 }, 274 "blinding_described": { 275 "applies": true, 276 "answer": false, 277 "justification": "No blinding is described. Participants could see whether an explanation was available or not (the explanation panel was visibly present or absent). No mention of blinding for evaluators assessing patch correctness." 278 }, 279 "attrition_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "No explicit attrition reporting. Twenty participants were recruited and results appear to include all 20, but the paper does not explicitly state that all participants completed the study or report dropout." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "Only relative time is mentioned: 'AutoSD could take about five times longer to generate a patch when compared to LLM-Base' (Section 6.2). No absolute API costs, token counts, or per-example costs are reported." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No total computational budget is stated. The paper does not report total API spend, number of API calls, or total tokens consumed across the experiments." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "The template-based baseline is run 100 times with std reported, but the LLM-based methods (AutoSD and LLM-Base) do not report results across multiple seeds or runs. LLM stochasticity is not addressed." 
302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": true, 306 "justification": "Section 4.2.1 states: 'for each dataset we provide AutoSD with the buggy method and generate 10 patches.' The template-based baseline uses '100 reruns' (Table 1)." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search budget is reported. The paper states ChatGPT was chosen because 'we empirically found the best performance' (Section 5.3) but does not describe the selection process or configurations tried." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": false, 316 "justification": "ChatGPT is selected as the default model because it showed 'best performance' (Section 5.3), but the selection criteria, validation set, and process for this determination are not described." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "The human study compares accuracy across 12 bugs with and without explanations, and the paper reports significance tests for time differences, but no correction for multiple comparisons is mentioned." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors implement both AutoSD and LLM-Base themselves. They compare against external baselines (Recoder, InCoder from Jiang et al.) but do not acknowledge author-evaluation bias in their own implementation comparisons." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": false, 331 "justification": "AutoSD takes 'about five times longer' than LLM-Base (Section 6.2) due to iterative LLM and debugger calls, but performance is not compared at matched compute budgets. The performance comparison ignores this significant compute difference."
332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "The paper does not discuss whether plausible/correct patch counts on ARHE and Defects4J adequately measure the claimed 'competitive repair performance.' The ARHE dataset's construct validity (mutation-based bugs vs real-world bugs) is not analyzed." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": true, 340 "answer": true, 341 "justification": "In RQ3 (Figure 4), the same scaffolding (AutoSD) is used across different LLMs (CodeGen, Codex, ChatGPT) and compared against the same LLM-Base scaffold, isolating the model variable from the scaffold variable." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "While ARHE is constructed from HumanEval to mitigate contamination, temporal leakage is not explicitly discussed. Defects4J bugs (2014-era) predate all model training cutoffs but this temporal relationship is not analyzed." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the evaluation setup leaks information. The buggy function, failing test, and error message are all provided to the model, but whether this mirrors realistic information availability is not discussed." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether benchmark bugs share structural similarities that could inflate results, or whether ARHE bugs drawn from the same HumanEval problems introduce non-independence." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No concrete leakage detection method is applied. ARHE construction mitigates contamination by design, but no canary strings, membership inference, or n-gram overlap tests are used to verify the absence of leakage." 
364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "AutoSD achieves competitive program repair performance compared to non-explainable APR techniques.", 370 "evidence": "Table 1: 187 correct on ARHE vs 177 for LLM-Base. Table 2: 113 correct on D4J v2.0 vs 110 for LLM-Base, and outperforms Recoder (11) and InCoder (28). Also outperforms Codex with 200 candidates under comparable FL settings (Section 5.1).", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "The <DONE> token prediction reliably indicates higher confidence and better repair performance.", 375 "evidence": "Figure 3: Higher plausible and correct patch proportions when <DONE> is predicted on both ARHE and Defects4J. 89% of <DONE> plausible patches were correct vs 82% without <DONE>. The signal reverses when debugger is ablated (Section 5.2).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Using real debugger execution rather than hallucinated observations improves reliability.", 380 "evidence": "Section 5.2: Individual runs were 73% plausible with real execution vs 63% with hallucinated observations. <DONE> token becomes a negative signal (11%p less likely plausible) under hallucination, vs a positive signal (12.4%p more likely) with real execution.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "AutoSD performance rapidly improves with stronger language models.", 385 "evidence": "Figure 4: Performance increases from near-zero (CodeGen-6B) to competitive levels (ChatGPT) as model capability improves. CodeGen-6B fails in zero-shot but fixes 44 bugs few-shot (Section 5.3).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Explanations from AutoSD improve developer accuracy on real-world bug patch review.", 390 "evidence": "Figure 5: Accuracy improved in 5 of 6 BugsInPy (real-world) bugs when explanations were available. However, accuracy decreased for ARHE105 and BIP003. 
Time to review was roughly constant (Section 5.4).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "70% of participants want explanations when using APR tools, and 55% find Scientific Debugging formulation satisfactory.", 395 "evidence": "Figure 6: Post-questionnaire results show 70% agreed explanations are important for APR. 55% were satisfied with the Scientific Debugging details. Notable split between students (more satisfied) and professional developers (more critical) in Section 5.5.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "Conflict of interest not disclosed", 402 "detail": "Two of four authors are at Microsoft Research Asia, and the work was done during an MSRA internship. Microsoft is a major investor in OpenAI, whose ChatGPT/Codex models are the primary evaluated tools. This financial relationship is not disclosed or discussed as a potential conflict." 403 }, 404 { 405 "flag": "No variance for main LLM results", 406 "detail": "AutoSD and LLM-Base results on ARHE and Defects4J are presented as single-run point estimates (Tables 1-2). Given LLM output stochasticity, these numbers may vary significantly across runs, but no variance or confidence intervals are reported." 407 }, 408 { 409 "flag": "Small sample for subgroup analysis", 410 "detail": "The human study (N=20) is split into students (N=14) and professional developers (N=6) for subgroup analysis (Figure 6). N=6 is too small for reliable subgroup conclusions, yet the student-vs-professional comparison is a major discussion point in RQ5." 411 }, 412 { 413 "flag": "Unmatched compute comparison", 414 "detail": "AutoSD takes 5x longer than LLM-Base (Section 6.2) due to iterative LLM and debugger calls, but the performance comparison does not control for this compute difference. LLM-Base with 5x more samples could potentially outperform AutoSD." 
415 }, 416 { 417 "flag": "ChatGPT version unspecified", 418 "detail": "The default model 'ChatGPT' is described only as 'a sibling model to InstructGPT' without a specific version or API snapshot date. ChatGPT evolved significantly over 2023, making results non-reproducible without version information." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "Impact of Code Language Models on Automated Program Repair", 424 "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"], 425 "year": 2023, 426 "arxiv_id": "2302.05020", 427 "relevance": "Large-scale empirical study evaluating multiple LLMs and learning-based APR techniques, providing the baselines (Recoder, InCoder) compared against in this paper." 428 }, 429 { 430 "title": "Evaluating large language models trained on code", 431 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 432 "year": 2021, 433 "arxiv_id": "2107.03374", 434 "relevance": "Introduces Codex and HumanEval benchmark; HumanEval is the basis for the ARHE dataset and Codex is one of the evaluated models." 435 }, 436 { 437 "title": "GPT-4 Technical Report", 438 "authors": ["OpenAI"], 439 "year": 2023, 440 "arxiv_id": "2303.08774", 441 "relevance": "Describes GPT-4 capabilities and evaluation methodology, referenced for data contamination mitigation context." 442 }, 443 { 444 "title": "ReAct: Synergizing reasoning and acting in language models", 445 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 446 "year": 2022, 447 "arxiv_id": "2210.03629", 448 "relevance": "Demonstrates LLMs using external tools to improve performance via interleaved reasoning and acting, foundational to AutoSD's tool-use approach." 449 }, 450 { 451 "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models", 452 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 453 "year": 2022, 454 "relevance": "Introduces chain-of-thought prompting for LLM reasoning, a key technique underlying AutoSD's hypothesis generation approach."
455 }, 456 { 457 "title": "PAL: Program-aided Language Models", 458 "authors": ["Luyu Gao", "Aman Madaan", "Shuyan Zhou"], 459 "year": 2022, 460 "arxiv_id": "2211.10435", 461 "relevance": "Shows LLMs can use program execution as external tools to improve reasoning, directly relevant to AutoSD's debugger interaction." 462 }, 463 { 464 "title": "Practical Program Repair in the Era of Large Pre-trained Language Models", 465 "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"], 466 "year": 2022, 467 "arxiv_id": "2210.14179", 468 "relevance": "Evaluates LLM-based program repair including Codex on Defects4J, providing a reference point for AutoSD's performance comparison." 469 }, 470 { 471 "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning", 472 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 473 "year": 2022, 474 "relevance": "Demonstrates zero-shot LLM program repair capabilities, directly relevant to AutoSD's zero-shot approach to automated debugging." 475 }, 476 { 477 "title": "Training language models to follow instructions with human feedback", 478 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 479 "year": 2022, 480 "arxiv_id": "2203.02155", 481 "relevance": "Describes RLHF training methodology used for ChatGPT/InstructGPT, the primary model underlying AutoSD." 482 }, 483 { 484 "title": "Language models are few-shot learners", 485 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 486 "year": 2020, 487 "relevance": "Foundational paper on LLM few-shot and zero-shot capabilities, supporting AutoSD's approach of zero-shot scientific debugging." 
488 }, 489 { 490 "title": "Trust Enhancement Issues in Program Repair", 491 "authors": ["Yannic Noller", "Ridwan Shariffdeen", "Xiang Gao", "Abhik Roychoudhury"], 492 "year": 2022, 493 "doi": "10.1145/3510003.3510040", 494 "relevance": "Developer study on APR trust and expectations, finding that explanations including root cause are the most commonly desired output from APR tools." 495 }, 496 { 497 "title": "On The Introduction of Automatic Program Repair in Bloomberg", 498 "authors": ["Serkan Kirbas", "Etienne Windels", "Olayori McBello"], 499 "year": 2021, 500 "doi": "10.1109/MS.2021.3071086", 501 "relevance": "Industrial deployment of APR at Bloomberg, demonstrating that patches require human review and developers want rationales." 502 } 503 ], 504 "engagement_factors": { 505 "practical_relevance": { 506 "score": 2, 507 "justification": "AutoSD is a practical debugging technique that could be integrated into developer workflows, but requires LLM API access and debugger infrastructure to deploy." 508 }, 509 "surprise_contrarian": { 510 "score": 1, 511 "justification": "The idea that LLMs can emulate scientific debugging is a novel combination but not contrarian — it confirms the expectation that LLMs can follow structured reasoning processes." 512 }, 513 "fear_safety": { 514 "score": 0, 515 "justification": "No AI safety or security concerns raised; the paper is about developer tooling for debugging." 516 }, 517 "drama_conflict": { 518 "score": 0, 519 "justification": "No controversy or conflict; the paper presents a new technique with balanced evaluation." 520 }, 521 "demo_ability": { 522 "score": 0, 523 "justification": "Code is not released (only promised), so no one can try the tool immediately." 524 }, 525 "brand_recognition": { 526 "score": 2, 527 "justification": "Microsoft Research Asia is a well-known lab, and the paper uses ChatGPT/OpenAI models which are widely recognized." 528 } 529 } 530 }