scan.json (29459B)
1 { 2 "paper": { 3 "title": "Leveraging Mutation Analysis for LLM-based Repair of Quantum Programs", 4 "authors": [ 5 "Chihiro Yoshida", 6 "Yuta Ishimoto", 7 "Olivier Nourry", 8 "Masanari Kondo", 9 "Makoto Matsushita", 10 "Yasutaka Kamei", 11 "Yoshiki Higo" 12 ], 13 "year": 2026, 14 "venue": "arXiv.org", 15 "arxiv_id": "2601.12273", 16 "doi": "10.48550/arXiv.2601.12273" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "Incorporating mutation analysis results into LLM prompts alongside dynamic runtime information (S+D+M configuration) achieves 94.4% repair success rate on 18 buggy quantum programs from Bugs4Q, compared to 77.8% with static information alone. S+D+M also achieved 100% success on Wrong Output bugs and the best explanation quality scores in 6 of 9 evaluation items. However, mutation analysis alone (S+M) underperforms dynamic information alone (S+D), suggesting mutation analysis is most valuable as a complement to runtime outputs rather than a substitute.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "Section VIII states 'All data, benchmarks, scripts, and prompts used in this study are publicly available in our replication package [29]' with a Zenodo DOI (10.5281/zenodo.17626083)." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The study uses the public Bugs4Q benchmark (with GitHub replication package cited), and the authors' own replication package [29] is available via Zenodo." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions GPT-5 via OpenAI API 'with default settings' and QMutPy but does not list library versions or dependency details sufficient to recreate the environment." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper does not include step-by-step reproduction instructions. A replication package is referenced but no README with commands or a 'Reproducing Results' section appears in the paper itself." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "Only point estimates are reported (e.g., 94.4%, 88.9%). No confidence intervals or error bars accompany any results in Tables I or II." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims S+D+M is the most effective configuration based solely on comparing raw percentages across 4 configurations. No statistical significance tests (e.g., McNemar's, Fisher's exact) are applied to any comparison." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Table I provides absolute repair success rates for all configurations (S=77.8%, S+D=88.9%, S+M=83.3%, S+D+M=94.4%), allowing readers to compute effect sizes. Baseline context is given by comparing against the S-only configuration." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The sample of 18 programs (from 42 in Bugs4Q) is explained by practical filtering criteria (bug reproduction, mutant generation), but no power analysis or statistical justification for whether 18 is sufficient to support the comparative claims is provided." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "Five generations were produced per configuration, but results are reported only as 'at least one of five successful.' No standard deviation, IQR, or per-run success counts are provided, making it impossible to assess result stability across runs." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Four prompt configurations (S, S+D, S+M, S+D+M) serve as controlled comparisons. Prior work is also referenced: ChatGPT repaired only 17% of quantum bugs (Guo et al.), and HornBro is discussed as the state-of-the-art." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The baselines include Guo et al. (2024) using ChatGPT and Tan et al. (2025) proposing HornBro, both recent and representing the current state-of-the-art in quantum APR." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "The four prompt configurations (S, S+D, S+M, S+D+M) form a systematic ablation of three components: static information, dynamic information, and mutation analysis results. Each combination isolates the contribution of individual components." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "RQ1 evaluates repair success rate, and RQ2 evaluates explanation quality across 9 binary items (3 elements × 3 criteria: correctness, completeness, complexity for position, cause, and change)." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": true, 96 "justification": "Two authors independently evaluated explanation quality using 9 binary judgments per explanation (Section III-D). Inter-rater agreement is reported (79.2% agreement, Cohen's κ=0.48), with disagreements resolved through discussion." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": false, 101 "justification": "All 18 programs are used for both development of the approach and evaluation. There is no explicit separation of a development set used for prompt design from an evaluation set." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table I breaks down repair success rates by bug type: Throw Exception (TE, 8 programs) and Wrong Output (WO, 10 programs). Figure 2 shows per-configuration unique repairs." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section IV-A notes that 1 of 18 programs could not be repaired by any configuration, and Section V-A discusses the Venn diagram of successful repairs. The discussion of TE bugs showing identical results across configurations is also informative about failure patterns." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "S+M achieved a lower success rate (83.3%) than S+D (88.9%), which the authors explicitly note: 'S+M achieved a lower total repair success rate than S+D.' The TE bug category showed no improvement from dynamic/mutation information." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims 94.4% repair success rate (supported by Table I), improved explanation quality (supported by Table II showing S+D+M best on 6/9 items), and that mutation analysis provides valuable contextual information (supported by the S+D+M results)." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The four prompt configurations form a controlled factorial design where only one component varies between adjacent configurations (e.g., S vs S+D adds only dynamic info). This controlled single-variable manipulation supports causal claims about each component's contribution." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": true, 133 "justification": "Section VI (External Validity) explicitly states limitations: 'conducted using quantum programs on the Qiskit simulator,' 'limited to a single benchmark (Bugs4Q) and a single quantum framework (Qiskit),' and acknowledges that 'a full validation using a wider range of experimental subjects... is needed.'" 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper does not consider alternative explanations for why S+D+M outperforms other configurations. For example, it could be that providing more context of any kind improves performance (prompt length confound), rather than mutation analysis specifically. The threats to validity discuss only generic threats (stochasticity, subjectivity) rather than alternative explanations for the observed effects." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section VI (Construct Validity) explicitly acknowledges: 'we defined a successful repair as one that passes all tests, but this does not guarantee that the repair fully matches the developer's intended modification.' This distinguishes the proxy (test-passing) from the actual outcome (correct repair)." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "The paper states 'GPT-5' with a reference to OpenAI's announcement [22], but no specific version, snapshot date, or API version is provided. Only 'GPT-5 via the OpenAI API with default settings' is stated." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Section III-B states 'All prompts used in our study are available in our replication package [29]' (Zenodo DOI). The paper also describes the prompt structure and information types in detail (Figure 1, Section III-B)." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "The paper states only 'GPT-5 via the OpenAI API with default settings' without specifying temperature, top-p, max tokens, or other sampling parameters. 'Default settings' is insufficient as defaults may change over time." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. The approach consists of single-turn prompts to GPT-5 without tool use, retry logic, or multi-step workflows." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section III-A documents the filtering pipeline from 42 Bugs4Q programs: 19 excluded (bugs not reproducible), 5 excluded (no mutants or inaccessible URLs), yielding 18 experimental targets. Static and dynamic information extraction methods are described in Section III-B." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section VI 'Threats to Validity' contains three subsections (Construct, Internal, External) with substantive discussion of specific limitations." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section VI discusses specific threats: test-passing ≠ intended repair (construct), stochastic GPT-5 outputs despite 5 generations (internal), limited domain knowledge of evaluators affecting explanation assessment (internal), single benchmark/framework/LLM (external), and simulator vs. real quantum hardware (external)." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section VI (External Validity) explicitly states: 'limited to a single benchmark (Bugs4Q) and a single quantum framework (Qiskit), a full validation using a wider range of experimental subjects (i.e., different bug benchmarks, other frameworks, or LLMs other than GPT-5) is needed.'" 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section VIII states all data, benchmarks, scripts, and prompts are in the replication package [29] (Zenodo DOI: 10.5281/zenodo.17626083). Bugs4Q is also publicly available." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section III-A describes the data source (Bugs4Q benchmark), selection criteria, and filtering process. Section III-B describes how static information was manually collected by two authors with disagreement resolution, and how dynamic/mutation information was obtained." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data comes from the Bugs4Q benchmark, a standard public quantum bug benchmark." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The pipeline is documented: 42 Bugs4Q programs → clone and reproduce bugs → exclude 19 non-reproducible → exclude 5 (no mutants or inaccessible URLs) → 18 programs → 4 prompt configurations × 5 generations = 360 repairs. Filtering criteria are stated at each stage." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "The Acknowledgment section lists five funding sources: JSPS Grant-in-Aid (JP25K03102, JP24H00692, JP23K24823), JST ASPIRE (JPMJAP2415), and Inamori Research Institute fellowship." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: University of Osaka and Kyushu University. The authors are not affiliated with OpenAI (whose GPT-5 is evaluated), so no product-related conflict exists." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": true, 226 "justification": "Funding comes from JSPS (Japanese government research grants), JST (government science agency), and Inamori Research Institute — none have a financial stake in GPT-5 performance or quantum APR outcomes." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper. Absence of disclosure is not absence of conflict." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No mention of GPT-5's training data cutoff date. The paper evaluates GPT-5 on Bugs4Q programs without stating when GPT-5's training data ends." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of whether Bugs4Q programs or their fixes appeared in GPT-5's training data. The bugs originate from GitHub, Stack Overflow, and Stack Exchange — public sources highly likely included in GPT-5's training set." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "Bugs4Q was published in 2023 (before GPT-5's likely training cutoff). The bug fixes from GitHub/StackOverflow are public and plausibly in GPT-5's training data. This contamination risk is not discussed." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in the study. The two authors who evaluated explanations are researchers performing analysis, not study participants." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants. The study evaluates automated program repair on a public benchmark." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in the study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in the study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in the study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in the study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in the study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No API costs, token counts, or latency figures are reported despite making 360 API calls to GPT-5 (18 programs × 4 configurations × 5 generations)." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No total compute budget, API spend, or hardware specifications are stated. The mutation analysis compute (QMutPy runs) and GPT-5 API usage are both unquantified." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "Five outputs were generated per configuration, but only the aggregate 'at least one of five successful' metric is reported. No analysis of sensitivity across the 5 runs (e.g., how many of the 5 succeeded per program) is provided." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": true, 309 "justification": "Section III-C explicitly states: 'we generated outputs five times for each prompt configuration' yielding '18 programs × 4 prompt configurations × 5 generations = 360 generated repairs.'" 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "The paper uses 'default settings' for GPT-5 without reporting what those defaults are or whether any hyperparameter exploration was conducted. No search budget is stated." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "All four prompt configurations are reported in Table I, not just the best-performing one. The reader can see the full comparison and make their own judgment." 320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "The paper compares four prompt configurations across multiple metrics but applies no statistical tests at all, let alone corrections for multiple comparisons." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors designed the prompt configurations and evaluated them without acknowledging the bias of evaluating their own system design. No independent evaluation or discussion of author-evaluation bias is present." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "S+D+M requires running mutation analysis (QMutPy) in addition to the LLM call, adding compute cost. This cost difference between configurations is not discussed or quantified." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "No discussion of whether Bugs4Q (18 selected bugs from a 42-bug dataset) adequately represents real-world quantum programming bugs or whether test-passing is a valid measure of repair quality beyond the construct validity note in Section VI." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "No scaffolding is used. The approach consists of direct single-turn prompts to GPT-5." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "Bugs4Q bugs were collected from GitHub, Stack Overflow, and Stack Exchange — public sources predating GPT-5's training. The temporal relationship between benchmark creation (2023) and model training is not discussed." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "Prompts include bug descriptions and expected behavior from source URLs. Whether GPT-5 has already seen these source URLs (and their solutions) during training is not discussed." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether the 18 selected programs share structural similarities, come from the same repositories, or have other dependencies that could inflate apparent success rates." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No contamination detection method is applied despite the high risk that Bugs4Q's public source material appears in GPT-5's training data." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Combining dynamic information with mutation analysis results (S+D+M) yields the highest repair success rate at 94.4%.", 373 "evidence": "Table I shows S+D+M achieves 94.4% total, compared to S (77.8%), S+D (88.9%), and S+M (83.3%). Section IV-A.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "S+D+M achieves 100% repair success rate for Wrong Output bugs.", 378 "evidence": "Table I WO column: S+D+M = 100%, S = 70%, S+D = 90%, S+M = 80%. Based on 10 WO programs.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "Mutation analysis results combined with dynamic information achieve the best explanation quality scores in 6 of 9 evaluation items.", 383 "evidence": "Table II shows S+D+M has bold (best) values in 6 of 9 cells. Section IV-B.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Mutation analysis alone (S+M) is less effective than dynamic information alone (S+D) for repair.", 388 "evidence": "Table I: S+M = 83.3% vs S+D = 88.9%. The authors note 'S+M achieved a lower total repair success rate than S+D.' Section IV-A.", 389 "supported": "weak" 390 }, 391 { 392 "claim": "For Throw Exception bugs, dynamic information and mutation analysis results have no impact on repair.", 393 "evidence": "Table I TE column shows identical results (87.5%) across all configurations, with the same 7 of 8 programs repaired. Section IV-A.", 394 "supported": "strong" 395 }, 396 { 397 "claim": "S+D+M is the only configuration that repairs 17 of 18 programs, encompassing all programs repaired by any configuration.", 398 "evidence": "Figure 2 Venn diagram shows S+D+M uniquely repairs 1 program, while all 14 commonly-fixed programs are included. Section V-A.", 399 "supported": "strong" 400 } 401 ], 402 "red_flags": [ 403 { 404 "flag": "Tiny sample size", 405 "detail": "Only 18 buggy programs are used (from a 42-program benchmark with 24 excluded). With n=18, the difference between 77.8% (14/18) and 94.4% (17/18) is just 3 programs. No statistical tests are applied, making it impossible to determine if the observed differences are due to chance." 406 }, 407 { 408 "flag": "No uncertainty quantification", 409 "detail": "Five generations per configuration are produced but only 'at least one of five' success is reported. Per-run success counts, variance, and confidence intervals are absent. The stochastic nature of GPT-5 is acknowledged as a threat but not mitigated analytically." 410 }, 411 { 412 "flag": "Unaddressed contamination risk", 413 "detail": "Bugs4Q programs originate from GitHub, Stack Overflow, and Stack Exchange (published 2023), all highly likely to be in GPT-5's training data (released 2025). GPT-5 may have memorized the correct fixes, inflating all repair rates. The 94.4% rate may reflect memorization rather than the value of mutation analysis." 414 }, 415 { 416 "flag": "Moderate inter-rater agreement on explanations", 417 "detail": "Cohen's κ = 0.48 for explanation quality evaluation indicates only moderate agreement. The differences between configurations in Table II are small (1-3 programs), and with this level of evaluator disagreement, the claimed rankings may not be reliable." 418 }, 419 { 420 "flag": "Prompt length confound", 421 "detail": "S+D+M has the most context (static + dynamic + mutation), and S has the least. The improvement may be due to providing more contextual information of any kind rather than mutation analysis specifically. No control for prompt length or information quantity is applied." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "On repairing quantum programs using chatgpt", 427 "authors": ["X. Guo", "J. Zhao", "P. Zhao"], 428 "year": 2024, 429 "relevance": "Directly evaluates ChatGPT's capability for automated repair of quantum programs, finding only 17% success on quantum bugs." 430 }, 431 { 432 "title": "InferFix: End-to-end program repair with LLMs", 433 "authors": ["M. Jin", "S. Shahriar", "M. Tufano", "X. Shi", "S. Lu", "N. Sundaresan", "A. Svyatkovskiy"], 434 "year": 2023, 435 "relevance": "LLM-based program repair approach that enhances prompts with static analysis results, directly relevant to prompt engineering for APR." 436 }, 437 { 438 "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 439 "authors": ["C. S. Xia", "L. Zhang"], 440 "year": 2024, 441 "relevance": "Demonstrates conversational LLM-based program repair with cost analysis, a key prior work in LLM-based APR." 442 }, 443 { 444 "title": "RepairAgent: An autonomous, LLM-based agent for program repair", 445 "authors": ["I. Bouzenia", "P. Devanbu", "M. Pradel"], 446 "year": 2025, 447 "relevance": "Autonomous LLM-based agent for program repair, representing the agentic approach to APR." 448 }, 449 { 450 "title": "Hierarchical knowledge injection for improving LLM-based program repair", 451 "authors": ["R. Ehsani", "E. Parra", "S. Haiduc", "P. Chatterjee"], 452 "year": 2025, 453 "relevance": "Studies how hierarchical prompt enrichment with bug/repository/project knowledge improves LLM-based repair, directly comparable prompt engineering approach." 454 }, 455 { 456 "title": "A survey of learning-based automated program repair", 457 "authors": ["Q. Zhang", "C. Fang", "Y. Ma", "W. Sun", "Z. Chen"], 458 "year": 2023, 459 "relevance": "Comprehensive survey of learning-based APR techniques providing broader context for LLM-based repair approaches." 460 }, 461 { 462 "title": "Evaluating explanations for software patches generated by large language models", 463 "authors": ["D. Sobania", "A. Geiger", "J. Callan", "A. Brownlee", "C. Hanna", "R. Moussa", "M. Z. López", "J. Petke", "F. Sarro"], 464 "year": 2023, 465 "relevance": "Evaluates LLM-generated patch explanations using correctness, completeness, and complexity criteria adopted by this paper." 466 }, 467 { 468 "title": "A quantitative and qualitative evaluation of LLM-based explainable fault localization", 469 "authors": ["S. Kang", "G. An", "S. Yoo"], 470 "year": 2024, 471 "relevance": "Evaluates LLM explanations for fault localization with inter-rater agreement methodology directly comparable to this work." 472 }, 473 { 474 "title": "Exploring LLM-driven explanations for quantum algorithms", 475 "authors": ["G. d'Aloisio", "S. Fortz", "C. Hanna", "D. Fortunato", "A. Bensoussan", "E. Mendiluze Usandizaga", "F. Sarro"], 476 "year": 2024, 477 "relevance": "Studies LLM-generated explanations for quantum algorithms, a closely related domain combining LLMs and quantum computing." 478 } 479 ], 480 "engagement_factors": { 481 "practical_relevance": { 482 "score": 1, 483 "justification": "Quantum programming is a niche domain; the technique requires QMutPy and quantum-specific tooling, limiting practical applicability to quantum developers." 484 }, 485 "surprise_contrarian": { 486 "score": 1, 487 "justification": "The finding that mutation analysis helps LLM repair is novel but not deeply surprising — providing more diagnostic information to an LLM improving output is expected." 488 }, 489 "fear_safety": { 490 "score": 0, 491 "justification": "No AI safety, security, or risk implications in the work." 492 }, 493 "drama_conflict": { 494 "score": 0, 495 "justification": "No controversy, no claims challenging established methods or critiquing competitors." 496 }, 497 "demo_ability": { 498 "score": 1, 499 "justification": "Replication package is available on Zenodo but requires quantum programming setup (Qiskit, QMutPy) and GPT-5 API access." 500 }, 501 "brand_recognition": { 502 "score": 1, 503 "justification": "Uses GPT-5 (recognizable) but authors are from Japanese universities without high AI brand recognition." 504 } 505 } 506 }