scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26734B)
      1 {
      2   "paper": {
      3     "title": "Enhancing LLM-based Quantum Code Generation with Multi-Agent Optimization and Quantum Error Correction",
      4     "authors": [
      5       "Charlie Campbell",
      6       "Hao (Mark) Chen",
      7       "Wayne Luk",
      8       "Hongxiang Fan"
      9     ],
     10     "year": 2025,
     11     "venue": "Design Automation Conference",
     12     "arxiv_id": "2504.14557",
     13     "doi": "10.1109/DAC63849.2025.11133316"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "A multi-agent framework for quantum code generation using StarCoder with CoT, SCoT, RAG, and QEC agents. Structured CoT prompting yields the largest improvement (~40 percentage points on a custom test suite), while RAG provides only marginal benefit (~4%). Multi-pass inference helps fix syntactic errors but has diminishing returns beyond 3 passes. QEC integration is demonstrated only in simulation, not on actual quantum hardware.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, code archive, or supplementary materials link is provided anywhere in the paper."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The training dataset (scraped GitHub Qiskit repos) and custom test suite are not released. No download links provided."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions using the transformers library and LoRA adapter but provides no requirements.txt, Dockerfile, or library versions. Not enough detail to recreate the environment."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No reproduction instructions, README, or scripts are provided. The paper describes the methodology but not step-by-step reproduction steps."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results in Figure 3 and Table I are point estimates with no confidence intervals or error bars."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Claims like 'CoT leading to an increase of 32%' and 'SCoT an increase of 40%' are made without any statistical significance tests."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Percentage improvements are reported with baseline context: fine-tuned model achieves 28%, CoT adds 32pp, SCoT adds 40pp. Table I shows absolute accuracy figures for each configuration (17.9% to 46.5%)."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The test suite composition is described (47% basic, 24% intermediate, 29% advanced) but the total number of test prompts is never stated, and no justification for the sample size is provided."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run point estimates."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The fine-tuned StarCoder model without optimizations serves as a baseline. Table I also compares against the IBM Granite-20B-CODE Qiskit Assistant model."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "IBM Qiskit Code Assistant (2024) and StarCoder2 (2024) are both contemporary baselines."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Each optimization technique (RAG, CoT, SCoT, multi-pass) is evaluated independently in Figure 3, isolating the contribution of each component."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Results are reported using both syntactic accuracy and semantic accuracy (Section V-C: '45.7% syntactic accuracy, but only 33.8% semantic accuracy'), as well as pass@k on the Qiskit HumanEval benchmark."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation is included. All evaluation is automated via test suites and pass@k metrics."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The custom test suite and Qiskit HumanEval benchmark are used for evaluation, but there is no explicit discussion of whether these were kept separate from any development/tuning decisions."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figure 3 breaks down results by technique, and Section V-C separates syntactic vs semantic accuracy. The test suite is categorized into basic (47%), intermediate (24%), and advanced (29%) tiers."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section V-C and V-D discuss failure modes: RAG's failure attributed to outdated documentation, multi-pass diminishing returns due to import/deprecation errors, and incorrect CoT prompt generation causing semantically invalid code."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "RAG yielding only 4% improvement is prominently reported as a negative finding. Multi-pass inference showing diminishing returns beyond triple passes is also reported as a limitation."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract claims 'structured CoT significantly improve the generation of quantum algorithms by up to 50%' but the results section reports SCoT improvement of 40 percentage points (Section V-C). The 50% figure does not appear in the results."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims like 'CoT improves accuracy' are supported by controlled single-variable ablation: each technique is evaluated independently on the same base model and test suite (Figure 3)."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims 'LLM-based Quantum Code Generation' broadly, but experiments are limited to StarCoder-3B and StarCoder2-7B on Qiskit only. The conclusion states 'accurate, fault-tolerant quantum computing code generation using Large Language Models' without bounding to the tested models or framework."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section V-E discusses why RAG failed (outdated Qiskit documentation, basic splitting technique), why CoT outperformed RAG (more direct model guidance vs. inference from limited data), and why multi-pass had diminishing returns (root cause is import/deprecation errors)."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures syntactic and semantic accuracy on a test suite and frames results in terms of code generation accuracy. Claims match the granularity of measurements without overreaching into broader framing like 'developer productivity.'"
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Primary models are specified: StarCoder-3B, StarCoder2-7B, IBM Granite-20B-CODE (Section V-A, Table I). These are specific open-source model checkpoints with deterministic weights. GPT-4o used for CoT prompt generation lacks a snapshot date but is not the evaluated model."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Prompts are described in natural language ('we manually created the first 5 prompts from our testing set using the same techniques demonstrated in previous code generation work') but the actual prompt text is not provided. The multi-pass prompt template is described but not shown."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Training hyperparameters are reported (1500 steps, batch size 4, learning rate 3×10⁻⁴, 100 warmup steps, cosine scheduler, FIM rate 0.1, LoRA). However, inference parameters (temperature, top-p, max tokens) for code generation are not stated despite using pass@k which requires sampling."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The multi-agent framework is described in Section III with Figure 1: three agents (code generator, semantic analyzer, QEC decoder), their roles, and interaction flow including multi-pass refinement with error traces."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section III-B describes the pipeline: GitHub scraping with open-source filter → date filtering (post-Feb 2024) → Qiskit import filtering → notebook splitting by sentinel tokens → upsampling from 3M to 9M tokens with priority weighting."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section V-E 'Observation and Future Directions' discusses multiple limitations: limited dataset sizes, RAG documentation issues, topology-specific QEC decoder. While not titled 'Limitations', it contains substantive limitation discussion."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Specific threats are discussed: limited dataset evidenced by 'poor increase in the fine-tuned models' pass@1 accuracy', RAG technique limited by 'basic RAG splitting technique which does not take into account code structure', topology-specific QEC decoder 'requires retraining each time you want to run the code on a different device.'"
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what the results do NOT show. Limitations are framed as 'future work' rather than explicit scope boundaries. No statements like 'our results are limited to Qiskit/StarCoder and should not be generalized to other quantum frameworks or models.'"
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw data (training set, test prompts, model outputs) is made available for independent verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section III-B describes data collection: scraping GitHub repositories with open-source licenses, filtering by update date (after Feb 2024), filtering for Qiskit imports, splitting notebooks by sentinel tokens."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data comes from GitHub repositories, not a standard benchmark, but the scraping methodology is described in Section III-B."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "While the filtering criteria are described, counts at each stage are missing. The paper jumps from 'scraping Github repositories' to '3M tokens' without stating how many repositories were found, how many survived each filter, or how many files were retained."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgment section lists UK EPSRC grants (EP/W032635/1, EP/V028251/1, EP/S030069/1, EP/X036006/1) and support from Intel and AMD."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All four authors are affiliated with the Department of Computing, Imperial College London. They are not evaluating their own commercial product."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "EPSRC is an independent research council. Intel and AMD provide support but the paper evaluates open-source models (StarCoder), not Intel/AMD products. Funders have no direct financial stake in the results."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The fine-tuning data is filtered to repositories updated after Feb 2024, but the StarCoder base model's pre-training data cutoff is not stated. It is unclear what the base model may have seen."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether Qiskit HumanEval benchmark problems or their custom test prompts could have appeared in StarCoder's pre-training data."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "Qiskit HumanEval was published in 2024. StarCoder2 was also released in 2024. No discussion of whether benchmark problems were in the training data."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, latency, or token consumption is reported for the multi-agent framework despite using multi-pass inference and multiple agents."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Training is described (1500 steps, batch size 4) but no GPU hours, hardware specs, wall-clock time, or total compute budget is stated."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds. Results appear to be from single runs."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper mentions using pass@k and creating 'a sample of results' but does not state the exact number of runs or samples generated."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper states 'We found that the optimal FIM rate was 0.1' without reporting how many configurations were tried or the search method used."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The FIM rate of 0.1 is stated as 'optimal' without explaining the selection process or validation set used. Model choice (StarCoder-3B) is justified only as 'suitable for adapting to new programming languages.'"
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical significance tests are performed, so correction for multiple comparisons is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors evaluate their own framework against IBM's model without acknowledging author-evaluation bias. Their test suite was created by them for their system."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Multi-pass inference requires multiple forward passes but the compute cost is not quantified. No performance-vs-compute comparison is provided despite the multi-agent framework clearly requiring more compute than a single-pass baseline."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper creates a custom test suite and uses Qiskit HumanEval without discussing whether these benchmarks adequately measure quantum code generation capability. No construct validity analysis."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "Table I compares StarCoder2-7B variants (with RAG/CoT) against IBM Granite-20B-CODE without addressing that differences could stem from model size (7B vs 20B), training data, or scaffolding rather than the techniques themselves."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the StarCoder base model's pre-training data could have included Qiskit HumanEval problems or algorithms tested in the custom suite."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks information, e.g., whether CoT prompts generated by GPT-4o could contain implicit solutions."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of potential overlap between scraped GitHub training data and test suite prompts, which may test algorithms present in the training repositories."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method (canary strings, membership inference, decontamination) is applied."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "Structured CoT significantly improves quantum code generation by up to 50%",
    370       "evidence": "Figure 3 and Section V-C show SCoT improvement of 40 percentage points over the fine-tuned model on the custom test suite. The 50% figure cited in the abstract does not appear in the results section.",
    371       "supported": "weak"
    372     },
    373     {
    374       "claim": "RAG shows limited improvement, yielding an accuracy increase of only 4%",
    375       "evidence": "Figure 3 and Section V-C confirm RAG's marginal impact. Attributed to outdated Qiskit documentation and basic RAG splitting technique.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Multi-pass inference improves accuracy to 34% using triple passes, with diminishing returns beyond that",
    380       "evidence": "Section V-D states this result. Root cause identified as import/deprecation errors that persist across passes.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "The QEC agent reduces quantum noise in generated programs",
    385       "evidence": "Figure 4 shows the Deutsch-Jozsa oracle example, but (c) is explicitly simulated: 'we simulated our results for (c) using a lower error probability than IBM Brisbane, corresponding to the new error rate after QEC.' Not demonstrated on actual hardware.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Fine-tuning on Qiskit data increases pass@1 by 10%, up to 28% overall",
    390       "evidence": "Section V-B reports this improvement on the custom test suite. Table I shows 17.9% → 24.5% on Qiskit HumanEval for StarCoder2-7B.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "CoT prompting approaches the performance of the much larger IBM Qiskit model (20B)",
    395       "evidence": "Table I shows StarCoder2-7B-QKCoT at 41.4% vs IBM Granite-20B-CODE-QK at 46.5% on Qiskit HumanEval, a gap of ~5 percentage points.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No error bars or variance on any result",
    402       "detail": "All results in Figure 3 and Table I are single point estimates with no confidence intervals, standard deviations, or variance across runs, making it impossible to assess result stability."
    403     },
    404     {
    405       "flag": "QEC results are simulated, not demonstrated on real hardware",
    406       "detail": "The paper states 'we simulated our results for (c) using a lower error probability than IBM Brisbane, corresponding to the new error rate after QEC.' The core QEC claim is based on simulation, not actual quantum error correction applied on hardware."
    407     },
    408     {
    409       "flag": "Abstract overstates improvement",
    410       "detail": "The abstract claims 'up to 50%' improvement from structured CoT, but the results section shows 40 percentage points. The 50% figure is not traceable to any specific result in the paper."
    411     },
    412     {
    413       "flag": "Test suite size unstated",
    414       "detail": "The total number of test prompts is never stated — only proportions (47% basic, 24% intermediate, 29% advanced). A small test suite could produce unreliable accuracy estimates."
    415     },
    416     {
    417       "flag": "Unfair cross-model comparison",
    418       "detail": "Table I compares a 7B parameter model (StarCoder2-7B with optimizations) against a 20B parameter model (IBM Granite), yet frames the ~5 percentage point gap as 'approaching' parity without acknowledging the nearly 3x size difference or controlling for compute."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "AgentCoder: Multiagent-code generation with iterative testing and optimisation",
    424       "authors": ["D. Huang", "Q. Bu", "J. M. Zhang", "M. Luck", "H. Cui"],
    425       "year": 2023,
    426       "arxiv_id": "2312.13010",
    427       "relevance": "Multi-agent framework for code generation, direct predecessor and inspiration for this paper's approach."
    428     },
    429     {
    430       "title": "Starcoder 2 and the stack v2: The next generation",
    431       "authors": ["A. Lozhkov", "R. Li", "L. Ben Allal"],
    432       "year": 2024,
    433       "arxiv_id": "2402.19173",
    434       "relevance": "Base code generation model used for fine-tuning in the paper's experiments."
    435     },
    436     {
    437       "title": "Code Llama: Open foundation models for code",
    438       "authors": ["B. Rozière", "J. Gehring", "F. Gloeckle"],
    439       "year": 2023,
    440       "arxiv_id": "2308.12950",
    441       "relevance": "Major open-source code generation model family discussed as related work."
    442     },
    443     {
    444       "title": "Evaluating large language models trained on code",
    445       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    446       "year": 2021,
    447       "arxiv_id": "2107.03374",
    448       "relevance": "Introduces HumanEval benchmark and pass@k metric used in this paper's evaluation."
    449     },
    450     {
    451       "title": "Qiskit Code Assistant: Training LLMs for generating quantum computing code",
    452       "authors": ["N. Dupuis", "L. Buratti"],
    453       "year": 2024,
    454       "arxiv_id": "2405.19495",
    455       "relevance": "IBM's quantum code generation model serving as the primary baseline in Table I."
    456     },
    457     {
    458       "title": "A survey on large language models for code generation",
    459       "authors": ["J. Jiang", "F. Wang", "J. Shen", "S. Kim", "S. Kim"],
    460       "year": 2024,
    461       "arxiv_id": "2406.00515",
    462       "relevance": "Survey of LLM code generation covering the broader landscape this paper contributes to."
    463     },
    464     {
    465       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    466       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    467       "year": 2022,
    468       "relevance": "Foundational prompting technique that produced the largest accuracy improvement in this paper's experiments."
    469     },
    470     {
    471       "title": "Structured chain-of-thought prompting for code generation",
    472       "authors": ["J. Li", "G. Li", "Y. Li", "Z. Jin"],
    473       "year": 2023,
    474       "relevance": "SCoT technique applied in the paper showing 40pp improvement in quantum code generation accuracy."
    475     },
    476     {
    477       "title": "LoRA: Low-rank adaptation of large language models",
    478       "authors": ["E. J. Hu", "Y. Shen", "P. Wallis"],
    479       "year": 2021,
    480       "arxiv_id": "2106.09685",
    481       "relevance": "Parameter-efficient fine-tuning method used to adapt StarCoder for quantum code generation."
    482     },
    483     {
    484       "title": "Qiskit HumanEval: An evaluation benchmark for quantum code generative models",
    485       "authors": ["S. Vishwakarma", "F. Harkins", "S. Golecha"],
    486       "year": 2024,
    487       "relevance": "Quantum-specific code generation benchmark used for evaluation in Table I."
    488     },
    489     {
    490       "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters",
    491       "authors": ["C. Snell", "J. Lee", "K. Xu", "A. Kumar"],
    492       "year": 2024,
    493       "arxiv_id": "2408.03314",
    494       "relevance": "Inference-time compute scaling research that motivates the paper's multi-pass optimization approach."
    495     },
    496     {
    497       "title": "GPT-4 technical report",
    498       "authors": ["OpenAI"],
    499       "year": 2023,
    500       "relevance": "Major LLM referenced as context for model capabilities; GPT-4o used for CoT prompt generation."
    501     }
    502   ]
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs