scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32405B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing LLM-based Quantum Code Generation with Multi-Agent Optimization and Quantum Error Correction",
      6     "authors": [
      7       "Charlie Campbell",
      8       "Hao Chen",
      9       "Wayne Luk",
     10       "Hongxiang Fan"
     11     ],
     12     "year": 2025,
     13     "venue": "Design Automation Conference",
     14     "arxiv_id": "2504.14557",
     15     "doi": "10.1109/DAC63849.2025.11133316"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The abstract claims 'structured CoT significantly improve the generation of quantum algorithms by up to 50%' but the results section reports SCoT improvement of 40 percentage points (Section V-C). The 50% figure does not appear in the results.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Causal claims like 'CoT improves accuracy' are supported by controlled single-variable ablation: each technique is evaluated independently on the same base model and test suite (Figure 3).",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title claims 'LLM-based Quantum Code Generation' broadly, but experiments are limited to StarCoder-3B/7B on Qiskit only. The conclusion states 'accurate, fault-tolerant quantum computing code generation using Large Language Models' without bounding to the tested models or framework.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section V-E discusses why RAG failed (outdated Qiskit documentation, basic splitting technique), why CoT outperformed RAG (more direct model guidance vs. inference from limited data), and why multi-pass had diminishing returns (root cause is import/deprecation errors).",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper measures syntactic and semantic accuracy on a test suite and frames results in terms of code generation accuracy. Claims match the granularity of measurements without overreaching into broader framing like 'developer productivity.'",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section V-E 'Observation and Future Directions' discusses multiple limitations: limited dataset sizes, RAG documentation issues, topology-specific QEC decoder. While not titled 'Limitations', it contains substantive limitation discussion.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats are discussed: limited dataset evidenced by 'poor increase in the fine-tuned models' pass@1 accuracy', RAG technique limited by 'basic RAG splitting technique which does not take into account code structure', topology-specific QEC decoder 'requires retraining each time you want to run the code on a different device.'",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what the results do NOT show. Limitations are framed as 'future work' rather than explicit scope boundaries. No statements like 'our results are limited to Qiskit/StarCoder and should not be generalized to other quantum frameworks or models.'",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Acknowledgment section lists UK EPSRC grants (EP/W032635/1, EP/V028251/1, EP/S030069/1, EP/X036006/1) and support from Intel and AMD.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are affiliated with the Department of Computing, Imperial College London. They are not evaluating their own commercial product.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "EPSRC is an independent research council. Intel and AMD provide support but the paper evaluates open-source models (StarCoder), not Intel/AMD products. Funders have no direct financial stake in the results.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is included in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined: multi-agent frameworks (Section II-A), CoT/RAG (Section II-B), quantum error correction (Section II-C), semantic vs syntactic accuracy (Section V-C). Some definitions are informal but adequate.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three explicit contributions listed: (1) novel multi-agent framework with three agents, (2) effectiveness analysis of prompt engineering techniques, (3) automatic QEC integration. Contributions are clearly stated.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section II provides background and related work on LLMs, multi-agent frameworks, CoT/RAG, and quantum computing. The paper positions itself as the first to explore multi-agent code generation for quantum computing, building on IBM's Qiskit Code Assistant (46% on Qiskit HumanEval).",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No repository URL, code archive, or supplementary materials link is provided anywhere in the paper.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The training dataset (scraped GitHub Qiskit repos) and custom test suite are not released. No download links provided.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper mentions using the transformers library and LoRA adapter but provides no requirements.txt, Dockerfile, or library versions. Not enough detail to recreate the environment.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No reproduction instructions, README, or scripts are provided. The paper describes the methodology but not step-by-step reproduction steps.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Figure 3 and Table I are point estimates with no confidence intervals or error bars.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Claims like 'CoT leading to an increase of 32%' and 'SCoT an increase of 40%' are made without any statistical significance tests.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Percentage improvements are reported with baseline context: fine-tuned model achieves 28%, CoT adds 32pp, SCoT adds 40pp. Table I shows absolute accuracy figures for each configuration (17.9% to 46.5%).",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The test suite composition is described (47% basic, 24% intermediate, 29% advanced) but the total number of test prompts is never stated, and no justification for the sample size is provided.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run point estimates.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The fine-tuned StarCoder model without optimizations serves as a baseline. Table I also compares against the IBM Granite-20B-CODE Qiskit Assistant model.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "IBM Qiskit Code Assistant (2024) and StarCoder2 (2024) are both contemporary baselines.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Each optimization technique (RAG, CoT, SCoT, multi-pass) is evaluated independently in Figure 3, isolating the contribution of each component.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Results are reported using both syntactic accuracy and semantic accuracy (Section V-C: '45.7% syntactic accuracy, but only 33.8% semantic accuracy'), as well as pass@k on the Qiskit HumanEval benchmark.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "No human evaluation is included. All evaluation is automated via test suites and pass@k metrics.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": false,
    211           "justification": "The custom test suite and Qiskit HumanEval benchmark are used for evaluation, but there is no explicit discussion of whether these were kept separate from any development/tuning decisions.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Figure 3 breaks down results by technique, and Section V-C separates syntactic vs semantic accuracy. The test suite is categorized into basic (47%), intermediate (24%), and advanced (29%) tiers.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section V-C and V-D discuss failure modes: RAG's failure attributed to outdated documentation, multi-pass diminishing returns due to import/deprecation errors, and incorrect CoT prompt generation causing semantically invalid code.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "RAG yielding only 4% improvement is prominently reported as a negative finding. Multi-pass inference showing diminishing returns beyond triple passes is also reported as a limitation.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Primary models are specified: StarCoder-3B, Starcoder2-7B, IBM Granite-20B-CODE (Section V-A, Table I). These are specific open-source model checkpoints with deterministic weights. GPT-4o used for CoT prompt generation lacks a snapshot date but is not the evaluated model.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Prompts are described in natural language ('we manually created the first 5 prompts from our testing set using the same techniques demonstrated in previous code generation work') but the actual prompt text is not provided. The multi-pass prompt template is described but not shown.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Training hyperparameters are reported (1500 steps, batch size 4, learning rate 3×10⁻⁴, 100 warmup steps, cosine scheduler, FIM rate 0.1, LoRA). However, inference parameters (temperature, top-p, max tokens) for code generation are not stated despite using pass@k which requires sampling.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The multi-agent framework is described in Section III with Figure 1: three agents (code generator, semantic analyzer, QEC decoder), their roles, and interaction flow including multi-pass refinement with error traces.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section III-B describes the pipeline: GitHub scraping with open-source filter → date filtering (post-Feb 2024) → Qiskit import filtering → notebook splitting by sentinel tokens → upsampling from 3M to 9M tokens with priority weighting.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "No raw data (training set, test prompts, model outputs) is made available for independent verification.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section III-B describes data collection: scraping GitHub repositories with open-source licenses, filtering by update date (after Feb 2024), filtering for Qiskit imports, splitting notebooks by sentinel tokens.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants. Data comes from GitHub repositories, not a standard benchmark, but the scraping methodology is described in Section III-B.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "While the filtering criteria are described, counts at each stage are missing. The paper jumps from 'scraping Github repositories' to '3M tokens' without stating how many repositories were found, how many survived each filter, or how many files were retained.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The fine-tuning data is filtered to after Feb 2024, but the StarCoder base model's pre-training data cutoff is not stated. It is unclear what the base model may have seen.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of whether Qiskit HumanEval benchmark problems or their custom test prompts could have appeared in StarCoder's pre-training data.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "Qiskit HumanEval was published in 2024. StarCoder2 was also released in 2024. No discussion of whether benchmark problems were in the training data.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No inference cost, latency, or token consumption is reported for the multi-agent framework despite using multi-pass inference and multiple agents.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Training is described (1500 steps, batch size 4) but no GPU hours, hardware specs, wall-clock time, or total compute budget is stated.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No mention of multiple random seeds. Results appear to be from single runs.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": false,
    379           "justification": "The paper mentions using pass@k and creating 'a sample of results' but does not state the exact number of runs or samples generated.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "The paper states 'We found that the optimal FIM rate was 0.1' without reporting how many configurations were tried or the search method used.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "The FIM rate of 0.1 is stated as 'optimal' without explaining the selection process or validation set used. Model choice (StarCoder-3B) is justified only as 'suitable for adapting to new programming languages.'",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": false,
    396           "answer": false,
    397           "justification": "No statistical significance tests are performed, so correction for multiple comparisons is not applicable.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors evaluate their own framework against IBM's model without acknowledging author-evaluation bias. Their test suite was created by them for their system.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": false,
    409           "justification": "Multi-pass inference requires multiple forward passes but the compute cost is not quantified. No performance-vs-compute comparison is provided despite the multi-agent framework clearly requiring more compute than a single-pass baseline.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The paper creates a custom test suite and uses Qiskit HumanEval without discussing whether these benchmarks adequately measure quantum code generation capability. No construct validity analysis.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": true,
    420           "answer": false,
    421           "justification": "Table I compares StarCoder2-7B variants (with RAG/CoT) against IBM Granite-20B-CODE without addressing that differences could stem from model size (7B vs 20B), training data, or scaffolding rather than the techniques themselves.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "No discussion of whether the StarCoder base model's pre-training data could have included Qiskit HumanEval problems or algorithms tested in the custom suite.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "No discussion of whether the evaluation setup leaks information, e.g., whether CoT prompts generated by GPT-4o could contain implicit solutions.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "No discussion of potential overlap between scraped GitHub training data and test suite prompts, which may test algorithms present in the training repositories.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No leakage detection or prevention method (canary strings, membership inference, decontamination) is applied.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "Structured Chain-of-Thought prompting improves quantum code generation by up to 50%",
    456       "evidence": "Figure 3 and Section V-C show SCoT achieves a 40-percentage-point improvement on the custom test suite (plain CoT: 32pp), short of the claimed 50%; 16.9pp absolute improvement on Qiskit HumanEval (41.4% vs 24.5%)",
    457       "supported": "moderate"
    458     },
    459     {
    460       "claim": "RAG provides limited improvement (only 4% accuracy gain)",
    461       "evidence": "Table I shows RAG yields 33.8% vs 24.5% baseline (9.3pp absolute); Section V-C notes semantic accuracy improved from 24.5% to 33.8%, but 4% figure not explicitly matched to a single metric",
    462       "supported": "weak"
    463     },
    464     {
    465       "claim": "Multi-pass inference can improve code generation by fixing errors iteratively",
    466       "evidence": "Section V-D reports accuracy improves to 34% with triple passes; diminishing returns after 3 passes noted",
    467       "supported": "moderate"
    468     },
    469     {
    470       "claim": "Quantum Error Correction decoder reduces quantum noise in generated programs",
    471       "evidence": "Figure 4 shows QEC example on Deutsch-Jozsa oracle with simulated results; paper states 'we simulated our results... using a lower error probability than IBM Brisbane' rather than real hardware validation",
    472       "supported": "weak"
    473     },
    474     {
    475       "claim": "Prompt engineering techniques have different effectiveness for quantum vs general-purpose code",
    476       "evidence": "Section V-E contrasts quantum results (CoT highly effective, RAG ineffective) with general-purpose findings; attributes difference to quantum-specific algorithmic knowledge requirements",
    477       "supported": "strong"
    478     },
    479     {
    480       "claim": "Multi-agent framework with semantic analyzer and QEC decoder is necessary for fault-tolerant quantum code",
    481       "evidence": "Framework described in Section III and tested in V; QEC shown in one example; ablations provided for techniques but not for each agent component separately",
    482       "supported": "moderate"
    483     }
    484   ],
    485   "methodology_tags": [
    486     "benchmark-eval",
    487     "case-study"
    488   ],
    489   "key_findings": "The paper proposes a three-agent LLM framework for quantum code generation with inference-time optimization techniques. Structured Chain-of-Thought prompting yields the strongest improvements (16.9pp on Qiskit HumanEval benchmark), while Retrieval-Augmented Generation provides minimal benefit. Multi-pass inference plateaus after three iterations. A Quantum Error Correction decoder component is demonstrated on one algorithm via simulation, though not validated on actual quantum hardware. The approach achieves 41.4% accuracy on the Qiskit HumanEval benchmark, approaching but not exceeding IBM's Granite-20B model (46.5%).",
    490   "red_flags": [
    491     {
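    492       "flag": "Test suite size not reported",
    493       "detail": "The custom test suite's composition is given (47% basic, 24% intermediate, 29% advanced) but the total number of prompts is never stated (for comparison, HumanEval has 164 problems). Variance and statistical power cannot be assessed, and results may not generalize."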
    494     },
    495     {
    496       "flag": "No error bars or significance testing",
    497       "detail": "All results reported as point estimates. No confidence intervals, p-values, or variance reported. Statistical rigor absent."
    498     },
    499     {
    500       "flag": "QEC validation is simulated",
    501       "detail": "Key claim about error reduction relies on simulation with 'lower error probability than IBM Brisbane' rather than real hardware experiments. Practical validity unproven."
    502     },
    503     {
    504       "flag": "Prompts not released",
    505       "detail": "CoT and SCoT prompts not included in paper. Critical for reproducibility of inference-time optimization claims."
    506     },
    507     {
    508       "flag": "Training data not released",
    509       "detail": "GitHub-scraped dataset (3M→9M tokens) not made available. Reproducibility severely limited."
    510     },
    511     {
    512       "flag": "Model snapshots not dated",
    513       "detail": "GPT-4o (used to generate CoT prompts) is cited without a snapshot date, and the StarCoder-3B checkpoint is given without a version hash or training date. Future reproductions may end up using different model versions."
    514     },
    515     {
    516       "flag": "Contamination risk not addressed",
    517       "detail": "Fine-tuned on GitHub Qiskit code, tested on Qiskit HumanEval. No discussion of potential overlap between training and test data."
    518     },
    519     {
    520       "flag": "Semantic accuracy assessment method unclear",
    521       "detail": "How 'semantic accuracy' was assessed (automated check vs. manual review) not specified. Reproducibility and objectivity unclear."
    522     },
    523     {
    524       "flag": "Limited scope generalization",
    525       "detail": "All experiments use Qiskit. No evidence framework works for other quantum languages or classical code generation."
    526     },
    527     {
    528       "flag": "Underperforms baseline",
    529       "detail": "Proposed approach (41.4%) underperforms IBM Granite-20B-CODE-QK (46.5%) on standard benchmark, limiting practical impact."
    530     }
    531   ],
    532   "cited_papers": [
    533     {
    534       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    535       "authors": "Wei et al.",
    536       "year": 2022,
    537       "relevance": "Core technique for improving LLM reasoning; directly tested in this paper's quantum code generation context"
    538     },
    539     {
    540       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    541       "authors": "Lewis et al.",
    542       "year": 2020,
    543       "relevance": "Baseline inference-time optimization technique; tested for quantum code generation and found ineffective"
    544     },
    545     {
    546       "title": "Evaluating Large Language Models Trained on Code",
    547       "authors": "Chen et al.",
    548       "year": 2021,
    549       "relevance": "HumanEval benchmark used as evaluation metric; establishes baseline for code generation evaluation"
    550     },
    551     {
    552       "title": "Code Llama: Open Foundation Models for Code",
    553       "authors": "Roziere et al.",
    554       "year": 2023,
    555       "relevance": "Baseline code generation model compared against; represents state-of-the-art general-purpose code LLM"
    556     },
    557     {
    558       "title": "Qiskit Code Assistant: Training LLMs for Generating Quantum Computing Code",
    559       "authors": "Dupuis et al.",
    560       "year": 2024,
    561       "relevance": "Directly relevant quantum-specific baseline achieving 46.5% on Qiskit HumanEval; IBM's domain-specific approach"
    562     },
    563     {
    564       "title": "Structured Chain-of-Thought Prompting for Code Generation",
    565       "authors": "Li et al.",
    566       "year": 2023,
    567       "relevance": "Refined CoT variant (SCoT) tested in paper; achieves strongest improvements on quantum code (40 percentage point gain)"
    568     },
    569     {
    570       "title": "AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and Optimisation",
    571       "authors": "Huang et al.",
    572       "year": 2023,
    573       "relevance": "Multi-agent code generation framework; establishes multi-agent paradigm for code tasks"
    574     },
    575     {
    576       "title": "Attention is All You Need",
    577       "authors": "Vaswani et al.",
    578       "year": 2017,
    579       "relevance": "Foundational transformer architecture underlying all LLMs used in this work"
    580     }
    581   ],
    582   "engagement_factors": {
    583     "practical_relevance": {
    584       "score": 2,
    585       "justification": "Framework addresses real quantum programming challenge (generating fault-tolerant code) but underperforms baseline (41.4% vs 46.5%) and QEC validation is simulated. Practical deployment limited."
    586     },
    587     "surprise_contrarian": {
    588       "score": 2,
    589       "justification": "Main finding that CoT helps but RAG doesn't for quantum code is somewhat surprising given prior work, but the paper positions this as a domain-specific insight rather than a general claim."
    590     },
    591     "fear_safety": {
    592       "score": 0,
    593       "justification": "No AI safety, security, or risk concerns raised. Quantum computing domain has no obvious dual-use risk in this context."
    594     },
    595     "drama_conflict": {
    596       "score": 1,
    597       "justification": "No controversy or conflict angle. Standard technical contribution with no social, ethical, or competitive narrative."
    598     },
    599     "demo_ability": {
    600       "score": 1,
    601       "justification": "Code and models not released. No live demo, no reproducible artifact. Results are accessible only by reading the paper."
    602     },
    603     "brand_recognition": {
    604       "score": 1,
    605       "justification": "Imperial College London is respected but not a top-tier AI lab brand. No famous authors or highly-cited group. Institutional recognition moderate."
    606     }
    607   },
    608   "hn_data": {
    609     "threads": [
    610       {
    611         "hn_id": "27075013",
    612         "title": "MarioNette: Self-Supervised Sprite Learning",
    613         "points": 47,
    614         "comments": 1,
    615         "url": "https://news.ycombinator.com/item?id=27075013",
    616         "created_at": "2021-05-07T12:09:34Z"
    617       },
    618       {
    619         "hn_id": "40157571",
    620         "title": "Retrieval Head Mechanistically Explains Long-Context Factuality",
    621         "points": 2,
    622         "comments": 0,
    623         "url": "https://news.ycombinator.com/item?id=40157571",
    624         "created_at": "2024-04-25T13:49:36Z"
    625       },
    626       {
    627         "hn_id": "44901674",
    628         "title": "An interstellar mission to test astrophysical black holes",
    629         "points": 1,
    630         "comments": 0,
    631         "url": "https://news.ycombinator.com/item?id=44901674",
    632         "created_at": "2025-08-14T15:34:05Z"
    633       },
    634       {
    635         "hn_id": "44306921",
    636         "title": "Large Language Models – The Future of Fundamental Physics?",
    637         "points": 1,
    638         "comments": 0,
    639         "url": "https://news.ycombinator.com/item?id=44306921",
    640         "created_at": "2025-06-18T05:35:09Z"
    641       },
    642       {
    643         "hn_id": "23771623",
    644         "title": "Politeness Transfer: A Tag and Generate Approach",
    645         "points": 1,
    646         "comments": 0,
    647         "url": "https://news.ycombinator.com/item?id=23771623",
    648         "created_at": "2020-07-08T16:46:23Z"
    649       }
    650     ],
    651     "top_points": 47,
    652     "total_points": 52,
    653     "total_comments": 1
    654   }
    655 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs