scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27230B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Leveraging Mutation Analysis for LLM-based Repair of Quantum Programs",
      6     "authors": [
      7       "Chihiro Yoshida",
      8       "Yuta Ishimoto",
      9       "Olivier Nourry",
     10       "Masanari Kondo",
     11       "Makoto Matsushita",
     12       "Yasutaka Kamei",
     13       "Yoshiki Higo"
     14     ],
     15     "year": 2026,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2601.12273",
     18     "doi": "10.48550/arXiv.2601.12273"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All abstract claims are supported: low repair success in prior work (Guo et al. 17%, HornBro 249-gate cost) is cited; mutation analysis effectiveness shown in Table I (94.4% for S+D+M); explanation quality improvements shown in Table II.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper uses a controlled ablation design (4 prompt configurations: S, S+D, S+M, S+D+M) isolating the effect of mutation analysis. This design is appropriate for causal inference within the LLM prompt engineering context.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Scope is explicitly bounded: 18 bugs from Bugs4Q, Qiskit only, simulator only, GPT-5 only. External validity section (VI) clearly lists limitations regarding broader applicability to different frameworks, LLMs, and real quantum hardware.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Paper discusses why S+M underperforms (mutation analysis requires tests, so D is available anyway) and why WO vs TE bugs respond differently (runtime vs error info). However, limited discussion of why mutation analysis mechanistically helps the LLM.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Repair success is measured as 'passes all tests', which directly tests the repair outcome. Explanation quality is measured against ground-truth patches (correctness, completeness, complexity), matching the claim about explanation improvement.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section VI 'Threats to Validity' provides dedicated discussion of construct, internal, and external validity with specific threats rather than boilerplate disclaimers.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Specific threats include: test-passing repairs may not match developer intent (construct); stochastic LLM outputs and manual evaluation subjectivity (internal); simulator vs hardware, single benchmark, single framework, single LLM (external).",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Explicit boundaries: 18 bugs (not 42), Qiskit only, simulator only, GPT-5 only, no human subjects, no real quantum hardware. All stated in introduction and threats section.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Acknowledgments disclose 5 funding sources: JSPS Grants-in-Aid (grants JP25K03102, JP24H00692, JP23K24823), JST ASPIRE (JPMJAP2415), and Inamori Research Institute fellowship.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All authors list institutional affiliations: University of Osaka (4 authors) and Kyushu University (3 authors). No undisclosed affiliations with quantum computing vendors or OpenAI.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Funders are Japanese government and academic organizations (JSPS, JST, Inamori), independent of OpenAI/GPT-5 and quantum computing companies. Authors evaluate external tools they do not control.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No explicit competing interests statement or declaration of patents, equity, or consulting relationships. While likely none exist, absence of explicit statement is a gap.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms defined: APR (Automated Program Repair), mutation analysis ('evaluates how small changes affect execution'), LLM (GPT-5), quantum concepts (qubits, gates, superposition, entanglement).",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Three contributions explicitly stated: (1) first evidence mutation analysis improves LLM-based quantum APR; (2) dynamic info + mutation analysis yields highest repair rate; (3) mutation analysis improves explanation quality.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section II systematically engages prior work: classical APR (InferFix, hierarchical knowledge injection), quantum APR (Guo et al. 17%, UnitAR, HornBro 249-gate cost), and explains why LLM-based approach chosen over synthesis.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Data availability statement (VIII) claims 'all data, benchmarks, scripts, and prompts' available in replication package [29] with Zenodo DOI 10.5281/zenodo.17626083.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Bugs4Q benchmark is publicly available. QMutPy outputs are deterministic. Static information and prompts promised in replication package. Mutation analysis results documented in Fig 1.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Mentions Python, Qiskit, QMutPy, GPT-5 API but no requirements.txt, Dockerfile, or version pinning. Hyperparameters use 'default settings' (vague). Insufficient for reproducibility.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "Process is documented: clone Bugs4Q, apply QMutPy, construct 4 prompt configs per Fig 1, call GPT-5 API 5 times per config, evaluate outputs. Referred to replication package for exact prompts/scripts.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Table I reports success rates (%) with no confidence intervals. Table II reports counts with no error bars or uncertainty quantification. The study generated 5 outputs per config but reports only a binary success outcome.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Differences between configurations (77.8% S vs 88.9% S+D vs 94.4% S+D+M) are not tested for statistical significance. No p-values or significance thresholds reported.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Effect sizes implicit in percentages (e.g., 94.4% vs 77.8% = 16.6pp improvement). Not formally reported as Cohen's d or odds ratios, but directional magnitudes visible.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "18 bugs selected from 42 due to reproducibility/criteria constraints, but no power analysis or sample size justification. Single failure case (1 of 18 never repaired) reduces effective n further.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Generated 5 outputs per prompt configuration but reports 'at least one successful' binary outcome, not variance in success across runs. No std dev or variance metrics.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Baselines are ablations: S (static), S+D (static+dynamic), S+M (static+mutation), S+D+M (full). Guo et al. and HornBro results cited but not directly compared on same bugs.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Ablations of the authors' own approach are contemporary. Prior-work baselines (Guo et al. 17% repair rate on quantum bugs, HornBro 249 added gates) are from recent papers (2024-2025) but were not directly retested.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Systematic ablation over four prompt configurations: S (static only), S+D (S plus dynamic), S+M (S plus mutation), and S+D+M (full). Results show S+D+M best at 94.4%, isolating contribution of each component.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "RQ1: repair success rate. RQ2: correctness/completeness/complexity across position/cause/change (9 metrics). Covers repair and explanation hypotheses.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Two authors independently evaluated 72 LLM-generated explanations (9 criteria each, 648 total judgments) for correctness, completeness, complexity. Cohen's κ = 0.48 (moderate agreement).",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Bugs4Q provides test scripts for each bug. Generated repairs evaluated against held-out test suites. Results on test set, not training set.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Table I breaks down by bug type (WO vs TE). Table II breaks down by explanation element (position/cause/change) and criterion (correctness/completeness/complexity).",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "One program (1 of 18) could not be repaired by any configuration. Figure 2 shows this. Discussed: S+D+M fixed 17/18, unique successful repairs isolated.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Reports S+M worse than S+D (83.3% vs 88.9%), with explanation provided. For explanations, S+D+M worse at cause element than S. Cohen's κ = 0.48 moderate, not high.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "Only stated 'GPT-5 via OpenAI API with default settings'. No model snapshot date, parameter count, or specific version. 'Default settings' is vague for reproducibility.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "All prompts stated to be 'available in our replication package [29]'. System prompt shared across all configs, specific content of each prompt (S/D/M components) described in Section III.B.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Only 'default settings' for GPT-5. No temperature, top-p, max_tokens, stop sequences, or other API parameters specified.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Prompt construction detailed: static info (code + description + expected behavior), dynamic info (execution results), mutation analysis (25 operators, 4 status types, line/operator/traceback per mutation).",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Static info manually collected from source URLs (consensus reached on disagreements). Bugs reproduced before inclusion. Mutation analysis applied with QMutPy [15]. Preprocessing steps documented.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Bugs4Q publicly available with test scripts. QMutPy outputs deterministic. 360 generated repairs promised in replication package [29] with DOI.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Bugs4Q: cloned from GitHub, confirmed reproducible. Static info: two authors inspected source URLs, extracted from GitHub/Stack Overflow/Stack Exchange. Disagreements resolved by consensus.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "Not a human subjects study. Uses existing Bugs4Q dataset of real-world bugs. Recruitment N/A.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "Overall pipeline shown in Figure 1: Bugs4Q → QMutPy → 4 prompt configs → GPT-5 → evaluation. Described in Section III with data types and formats.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "GPT-5 training data cutoff not stated. Paper uses GPT-5 released Nov 2025 (reference 22). Training cutoff likely mid-2024 but not explicitly provided.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of whether Bugs4Q bugs (from GitHub/Stack Overflow) overlap with GPT-5 training data. No mitigation for potential contamination.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "Bugs4Q is real-world (GitHub/Stack Overflow), created before GPT-5 training. No explicit discussion of whether examples appeared in training data.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "Not a human subjects study. No human participants. N/A.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "Not a human subjects study. No IRB approval needed. N/A.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "Not a human subjects study. No participant demographics. N/A.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "Not a human subjects study. For bug selection: included if reproducible and had mutants + accessible repo. N/A for human study criteria.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "Not a human subjects study. No randomization of participants. N/A.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "Not a human subjects study. For explanation evaluation, two authors evaluated without blinding to prompt config. Not applicable category.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "Not a human subjects study. No attrition to report. N/A.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No inference cost, API cost per repair, or latency reported. Running 360 repair attempts on GPT-5 API would have substantial cost, not disclosed.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No total computational budget, wall-clock time, or resource usage (GPUs, API calls, cost) reported.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "Mutation analysis results improve repair success rate of buggy quantum programs",
    377       "evidence": "Table I: S+D+M (with mutation) achieves 94.4% vs S (static only) 77.8%. For WO bugs: S+D+M 100% vs S+D 90%.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Combining static, dynamic, and mutation information yields best repair rate",
    382       "evidence": "S+D+M outperforms all other configurations in Table I: 94.4% total, 100% on WO, 87.5% on TE.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Mutation analysis improves quality of LLM-generated explanations",
    387       "evidence": "Table II: S+D+M achieves best scores in 6 of 9 evaluation items (correctness/completeness/complexity), especially for position element.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Dynamic information and mutation results are not helpful for TE (exception-throwing) bugs",
    392       "evidence": "Table I, TE column: all configurations achieve 87.5% (7 of 8). Static information alone sufficient because error is obvious.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "S+M underperforms S+D because mutation analysis requires running tests, making D available",
    397       "evidence": "Section IV.A: S+M 83.3% < S+D 88.9%. Explained: 'availability of M implies D also available, thus lower success of S+M not concerning.'",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Mutation analysis particularly effective at positional descriptions in explanations",
    402       "evidence": "Table II: for position element, S+D+M achieves best correctness (14/18), completeness (15/18), complexity (1/18, best).",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "Mutation analysis less effective for explaining cause of bugs",
    407       "evidence": "Table II: for cause element, S achieves best correctness (14/18) and completeness (18/18), not S+D+M.",
    408       "supported": "moderate"
    409     },
    410     {
    411       "claim": "LLM-based approach is preferable to synthesis-based for quantum APR due to flexibility and minimal gate insertion",
    412       "evidence": "Section III.C: LLM handles wider range of bugs (API + gate), avoids excessive gate counts (HornBro added 249 gates), provides explanations.",
    413       "supported": "moderate"
    414     }
    415   ],
    416   "methodology_tags": [
    417     "benchmark-eval"
    418   ],
    419   "key_findings": "Incorporating mutation analysis results into prompts for LLM-based automated quantum program repair significantly improves success rates from 77.8% (static only) to 94.4% (static + dynamic + mutation), with 100% success on Wrong Output bugs. The approach also enhances explanation quality, particularly for accurately describing bug locations, though static information remains more valuable for explaining root causes.",
    420   "red_flags": [
    421     {
    422       "flag": "Small sample size",
    423       "detail": "Only 18 bugs from 42-bug benchmark. One bug unfixed by all approaches. Statistical power is limited for detecting true differences."
    424     },
    425     {
    426       "flag": "No significance testing",
    427       "detail": "Differences between configurations (77.8% → 94.4%) not tested for statistical significance. Results could be within noise."
    428     },
    429     {
    430       "flag": "Simulator-only evaluation",
    431       "detail": "All experiments on Qiskit simulator, not real quantum hardware. External validity to actual quantum computers unclear."
    432     },
    433     {
    434       "flag": "Single LLM tested",
    435       "detail": "Only GPT-5 used. Generalizability to other LLMs (Claude, Llama, open-source) unknown."
    436     },
    437     {
    438       "flag": "Hyperparameters vague",
    439       "detail": "GPT-5 'default settings' not specified. Temperature, top-p, max_tokens, stop sequences not documented. Reproducibility affected."
    440     },
    441     {
    442       "flag": "Moderate inter-rater agreement",
    443       "detail": "Cohen's κ = 0.48 for explanation evaluation. Authors acknowledge 'some subjective judgment unavoidable.' Not strong reliability."
    444     },
    445     {
    446       "flag": "No external baselines",
    447       "detail": "Only compared prompt configurations (ablations) of own approach. Did not re-run Guo et al. or HornBro on same bugs for direct comparison."
    448     },
    449     {
    450       "flag": "Training data contamination unaddressed",
    451       "detail": "GPT-5 training cutoff not stated. Bugs4Q from GitHub/Stack Overflow may have overlap with training data."
    452     },
    453     {
    454       "flag": "TE bug success ceiling",
    455       "detail": "Throw Exception bugs plateau at 87.5% across all configs. Mutation info doesn't help error-throwing bugs; static info sufficient."
    456     },
    457     {
    458       "flag": "Causality inference limited",
    459       "detail": "Ablation design is good for isolating components but doesn't explain WHY mutation information helps the LLM mechanistically."
    460     }
    461   ],
    462   "cited_papers": [
    463     {
    464       "title": "On repairing quantum programs using chatgpt",
    465       "relevance": "Directly competitive baseline: Guo et al. achieved only 17% repair on quantum bugs, motivating the current LLM-based approach."
    466     },
    467     {
    468       "title": "Automatic repair of quantum programs via unitary operation",
    469       "relevance": "UnitAR synthesis-based repair. Paper contrasts LLM flexibility vs synthesis limitations (gate-only, excessive complexity)."
    470     },
    471     {
    472       "title": "HornBro: Homotopy-like method for automated quantum program repair",
    473       "relevance": "SOTA synthesis method achieving higher success than ChatGPT but adds 249 gates, reducing maintainability. Paper's key motivation."
    474     },
    475     {
    476       "title": "Mutation testing of quantum programs: A case study with qiskit",
    477       "relevance": "Core methodology paper defining 25 quantum + classical mutation operators used in this study (QMutPy framework)."
    478     },
    479     {
    480       "title": "A comprehensive study of bug fixes in quantum programs",
    481       "relevance": "Identifies common quantum bug types (API, gate-related) that the LLM approach claims to handle broadly."
    482     },
    483     {
    484       "title": "InferFix: End-to-end program repair with llms",
    485       "relevance": "Classical APR baseline showing context in prompts improves LLM repair. Paper adapts this pattern to quantum domain."
    486     },
    487     {
    488       "title": "A survey of learning-based automated program repair",
    489       "relevance": "Comprehensive review of LLM-based APR for classical programs, positioning quantum APR as underexplored."
    490     },
    491     {
    492       "title": "Bugs4Q: A benchmark of existing bugs for quantum program testing and debugging",
    493       "relevance": "The experimental benchmark used: 42 real-world quantum bugs from GitHub/Stack Overflow with test suites."
    494     },
    495     {
    496       "title": "Evaluating mutation-based fault localization for quantum programs",
    497       "relevance": "Recent work (2025) showing mutation analysis effective for quantum fault localization, supporting its use here."
    498     },
    499     {
    500       "title": "Quantum software engineering: Roadmap and challenges ahead",
    501       "relevance": "Broad context: quantum software engineering challenges, technical debt, code smells motivating APR tooling."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 2,
    507       "justification": "Useful for Qiskit developers but only on 18 tested bugs, requires expensive GPT-5 API access, simulator-only tested. Moderate practical applicability."
    508     },
    509     "surprise_contrarian": {
    510       "score": 1,
    511       "justification": "Finding that more context helps LLM performance is expected. Mutation analysis improving prompt quality is incremental rather than surprising."
    512     },
    513     "fear_safety": {
    514       "score": 0,
    515       "justification": "No AI safety concerns. Automated program repair for quantum computing is not a safety-sensitive application."
    516     },
    517     "drama_conflict": {
    518       "score": 0,
    519       "justification": "No controversy, no conflicting claims, no social/ethical angle. Technical contribution without drama."
    520     },
    521     "demo_ability": {
    522       "score": 1,
    523       "justification": "Requires Bugs4Q, Qiskit, QMutPy, GPT-5 API access. Reproducible from replication package but not trivially runnable."
    524     },
    525     "brand_recognition": {
    526       "score": 2,
    527       "justification": "Authors from reputable Japanese universities (Osaka, Kyushu). Uses GPT-5 (OpenAI brand). Not top-tier AI lab but solid institutions."
    528     }
    529   },
    530   "hn_data": {
    531     "threads": [
    532       {
    533         "hn_id": "46837037",
    534         "title": "Proc3D: Procedural 3D Generation and Parametric Editing of 3D Shapes with LLMs",
    535         "points": 5,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=46837037"
    538       },
    539       {
    540         "hn_id": "46859436",
    541         "title": "Hybrid Concolic Testing with Large Language Models for Guided Path Exploration",
    542         "points": 1,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=46859436"
    545       }
    546     ],
    547     "top_points": 5,
    548     "total_points": 6,
    549     "total_comments": 0
    550   }
    551 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs