scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29735B)
      1 {
      2   "paper": {
      3     "title": "Model-Driven Quantum Code Generation Using Large Language Models and Retrieval-Augmented Generation",
      4     "authors": ["Nazanin Siavash", "Armin Moin"],
      5     "year": 2025,
      6     "venue": "ACM/IEEE International Conference on Model Driven Engineering Languages and Systems",
      7     "arxiv_id": "2508.21097",
      8     "doi": "10.1109/MODELS67397.2025.00031"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "case-study"],
     13   "key_findings": "Specific prompt engineering improves CodeBLEU scores by up to 4x (from 0.16 to 0.57–0.63) for LLM-based quantum code generation from UML model instances. Retrieval-Augmented Generation using 8 Qiskit GitHub repositories does not significantly improve code generation quality in any configuration. Near-perfect quantum-specific precision/recall is achievable with well-engineered prompts on GPT-4o, but overall recall (including non-quantum elements) remains low (~0.42) compared to rule-based approaches.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper provides a GitHub link: 'The source code is available at https://github.com/qas-lab/quantumcodegeneration' in the Software and Data Availability section."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The experimental data (UML model instances) adopted from Jiménez-Navajas et al. [6] are referenced as available at [34]. The paper states: 'The research data adopted from [6] are available at [34].'"
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed dependency listing is provided in the paper. The environment needed to run the RAG pipeline and LLM integration is not specified."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. There is no README description, commands to run, or 'Reproducing Results' section."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables I–IV report individual run results and averages across 10 runs, but no confidence intervals, error bars, or ± notation is provided for any metric."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims prompt engineering 'has a far greater impact' and that RAG 'does not significantly improve performance' but uses no statistical tests (no p-values, t-tests, or other significance tests) to support these comparative claims."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports effect sizes with baseline context: 'the average CodeBLEU score increases from 0.16 to 0.57' and 'Q-Recall improves from 0.63 to 0.99, Q-Precision from 0.96 to 1.00' (Section V-C), allowing readers to assess magnitude."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for using 7 model instances or 10 runs per configuration. No power analysis or discussion of whether this sample is sufficient for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "While per-run results across 10 runs are shown in tables, no standard deviation, IQR, or any spread measure is reported. Only individual runs and averages are shown, so the reader must compute spread themselves."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against the rule-based approach of Jiménez-Navajas et al. [6] and uses a factorial design comparing 4 configurations (generic/specific prompt × with/without RAG) as internal baselines."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Jiménez-Navajas et al. [6] is from 2025 and represents the most recent prior work on generating quantum code from UML models. Henderson et al. [25] is also cited as recent work in LLM-based quantum programming."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The 2×2 factorial design (generic vs specific prompt × with vs without RAG) across Tables I–IV serves as an ablation, isolating the contribution of prompt specificity and RAG independently."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Seven metrics are used: CodeBLEU, Q-Precision, Q-Recall, Q-F-measure, Precision, Recall, and F-measure, covering both syntactic/semantic code similarity and element-wise mapping accuracy."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All evaluation is automated using CodeBLEU and element-wise precision/recall metrics. No human expert review of generated quantum code quality, correctness, or executability is reported."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All 7 model instances are used for evaluation with no train/test split. The same instances are used across all configurations. There is no held-out set."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per model instance: 'We conduct experiments on all seven model instances' with a comparative summary across all instances (Section V-C), reporting best/worst per metric per instance."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses that RAG failed to improve performance: 'the selected Qiskit repositories do not offer sufficiently relevant context' and reports the lowest CodeBLEU of 0.10 for model instance 7 with generic prompt and RAG."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper explicitly reports that RAG did not help: 'This external context does not significantly improve performance across most metrics, hence not validating RQ2 so far' (Section V-C, Table II discussion)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'well-engineered prompts can improve CodeBLEU scores by up to a factor of four' which is supported by the 0.16→0.57 improvement for model instance 1 (Tables I vs III). The abstract appropriately hedges about future work."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The causal claims ('prompt engineering has a far greater impact than RAG') are supported by the 2×2 factorial design with 10 repeated runs per cell, which provides controlled single-variable manipulation for both factors."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Model-Driven Quantum Code Generation Using Large Language Models' but the study uses only GPT-4o, only Qiskit-targeted code, only 7 UML model instances from a single source, and only one RAG configuration. The broad framing significantly exceeds the narrow experimental scope."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper briefly notes the RAG repositories 'may not have been sufficiently relevant' but does not consider alternative explanations for the prompt engineering effect (e.g., whether the specific prompt simply provides the mapping rules that the model already knows, or whether the improvement is specific to Qiskit syntax rather than general quantum code generation)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper uses CodeBLEU as a measure of code quality without discussing its well-known limitations as a proxy for actual code correctness, executability, or functional equivalence. The generated code is not tested for execution or functional correctness."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper specifies 'GPT-4o' throughout but provides no snapshot date, API version, or specific model checkpoint identifier. Per schema criteria, marketing names without a snapshot date do not count."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The two prompt types (generic and specific) are described only in natural language: 'a more general prompt that simply instructs the LLM' and one 'incorporating detailed implementation requirements, such as quantum gate mapping strategies and constraints on syntax and gate behavior.' The actual prompt text is not provided."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No LLM API parameters (temperature, top-p, max tokens, etc.) are reported anywhere in the paper. These significantly affect output variability and are essential for reproduction."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The RAG pipeline architecture is described in Section IV-B with Figure 1 illustrating the overall pipeline: retriever identifying relevant documents from a knowledge base, generator using retrieved information, incorporating 8 Qiskit GitHub repositories. While high-level, the architecture is described."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper does not describe how the 8 Qiskit repositories were selected, how code was chunked or embedded for retrieval, what similarity metrics were used, or how the UML model instances were preprocessed into LLM input format."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions future work directions but does not substantively discuss limitations of the current study."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. There is no consideration of internal, external, or construct validity threats specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries are stated. The paper does not articulate what the results do NOT show — e.g., that results may not generalize beyond GPT-4o, Qiskit, these 7 model instances, or this specific RAG configuration."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The input model instances are available via [34], but the raw experimental outputs (generated code from each of the 10 runs per configuration) are not mentioned as available. Only computed metrics are shown in the paper (per-run values for model instance 1 in Tables I–IV, and summaries for the other instances)."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The model instances come from Jiménez-Navajas et al. [6], which is stated. However, the 8 Qiskit GitHub repositories used for RAG are not named, and their selection criteria are not described."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The data source is UML model instances from prior published work, essentially a standard benchmark."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from UML model instance → LLM prompt → generated code → evaluation metrics is described only at a high level. Key details are missing: how UML is serialized for the prompt, how RAG retrieval works, how CodeBLEU reference code is established."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is disclosed in the Acknowledgment section: 'This work is funded by a grant from the Colorado Office of Economic Development and International Trade (OEDIT).'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Both authors are from the Department of Computer Science, University of Colorado Colorado Springs (UCCS). They evaluate OpenAI's GPT-4o but are not affiliated with OpenAI."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "OEDIT is a Colorado state economic development office with no apparent financial interest in GPT-4o performance or quantum code generation outcomes."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not state GPT-4o's training data cutoff date. This is relevant because the UML model instances from [6] and Qiskit code could potentially appear in training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether GPT-4o's training data includes the UML model instances, the reference Qiskit code from [6], or the 8 Qiskit repositories used for RAG."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The model instances from Jiménez-Navajas et al. [6] and the associated code may have been publicly available before GPT-4o's training cutoff, which the paper does not state. The paper does not discuss whether GPT-4o may have seen this data during training, which would inflate performance metrics."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. The evaluation is entirely automated using metrics on generated code."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study evaluates LLM-generated code against reference implementations."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs, token counts, or latency measurements are reported. The paper runs GPT-4o 10 times for each of the 4 configurations across 7 model instances (4 × 7 × 10 = 280 total API calls) without reporting any cost information."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget, API spend, or hardware specifications are mentioned."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Tables I–IV show individual results for all 10 runs per configuration, revealing variability across runs (e.g., Table I Run 8 shows Q-Precision of 0.75 vs 0.99 for other runs). This constitutes reporting across multiple runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Explicitly stated: 'PERFORMANCE METRICS ACROSS 10 RUNS' in all four tables. The number 10 is clearly documented."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is described. The two prompts (generic and specific) appear manually designed, and no search budget for prompt variants, RAG configurations, or API parameters is reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "All 4 configurations (2×2 factorial) are reported across all 7 model instances. The paper does not selectively report only the best configuration; it transparently shows that RAG didn't help."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper makes multiple comparisons across 4 configurations and 7 model instances but performs no statistical tests at all, let alone corrections for multiple comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors compare their LLM-based approach against Jiménez-Navajas et al. [6] without acknowledging that their implementation and evaluation choices may systematically favor their approach."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The LLM approach uses GPT-4o API calls (presumably orders of magnitude more compute than the rule-based baseline [6]) but no compute comparison is provided. Performance is never discussed relative to computational cost."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "CodeBLEU is used as the primary quality metric without discussing its known limitations for measuring actual code correctness or executability. No discussion of whether CodeBLEU (a text-similarity metric) actually captures quantum code quality."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "When comparing against the rule-based approach of [6], the entire generation pipeline differs (LLM+RAG vs Epsilon Generation Language), making it impossible to attribute differences to any single factor. This confound is not discussed."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Not discussed. The UML model instances from [6] and Qiskit reference code may have been available online before GPT-4o's training cutoff, potentially inflating generation quality."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Not discussed. The specific prompt variant includes 'quantum gate mapping strategies and constraints on syntax and gate behavior' which may effectively give the model the answer structure."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. All 7 model instances come from a single source [6], likely sharing structural patterns. The 10 runs per configuration are also not independent (same model, same prompt)."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Well-engineered specific prompts improve CodeBLEU scores by up to a factor of four compared to generic prompts.",
    365       "evidence": "Tables I and III show average CodeBLEU for model instance 1 increasing from 0.16 (generic prompt) to 0.57 (specific prompt). Section V-C also reports model instance 3 achieving 0.63 with specific prompt vs generic baselines around 0.16.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "RAG pipeline with Qiskit GitHub repositories does not significantly improve quantum code generation quality.",
    370       "evidence": "Comparing Table I vs II (generic prompt: no change in average CodeBLEU at 0.16) and Table III vs IV (specific prompt: 0.57 vs 0.58). Section V-C states: 'This external context does not significantly improve performance across most metrics, hence not validating RQ2 so far.'",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Near-perfect quantum-specific metrics (Q-Precision, Q-Recall, Q-F-measure all near 1.0) are achievable with specific prompts.",
    375       "evidence": "Table III shows averages of Q-Recall 0.99, Q-Precision 1.00, Q-F-measure 0.99 for model instance 1 with specific prompt. Table IV shows similar results with RAG added.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Prompt engineering has far greater impact on code generation performance than external context through the RAG pipeline.",
    380       "evidence": "Section V-C: 'Comparing Table I with Table III, and Table II with Table IV further highlight that prompt-engineering has a far greater impact on the LLM's code generation performance than the provided external context through the RAG pipeline in the current setup.' Supported by the factorial comparison.",
    381       "supported": "moderate"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "Very small sample size",
    387       "detail": "Only 7 UML model instances from a single source are used, with detailed results shown only for model instance 1 (described as having 'best overall performance'). This is insufficient to generalize to quantum code generation broadly."
    388     },
    389     {
    390       "flag": "No statistical tests",
    391       "detail": "Comparative claims about prompt engineering vs RAG effectiveness are made purely by comparing averages across 10 runs, with no significance tests, confidence intervals, or variance measures reported."
    392     },
    393     {
    394       "flag": "Selective detailed reporting",
    395       "detail": "Detailed per-run results (Tables I–IV) are shown only for model instance 1, which 'demonstrated the best overall performance across all evaluation metrics.' Other instances get only a summary, hiding potentially poor performance."
    396     },
    397     {
    398       "flag": "No functional correctness evaluation",
    399       "detail": "Generated quantum code is never executed or tested for functional correctness. All evaluation is based on text-similarity metrics (CodeBLEU) and element-wise matching, which do not verify that the code actually runs or produces correct quantum circuits."
    400     },
    401     {
    402       "flag": "Missing critical experimental details",
    403       "detail": "No model version/snapshot, no API hyperparameters (temperature, top-p), no actual prompt text provided, and no description of which 8 Qiskit repositories were used for RAG or how they were selected."
    404     },
    405     {
    406       "flag": "Broad claims from narrow evidence",
    407       "detail": "The paper title claims 'Model-Driven Quantum Code Generation Using Large Language Models' but tests only GPT-4o, only Qiskit, only 7 instances from one prior paper, and only one RAG configuration."
    408     }
    409   ],
    410   "cited_papers": [
    411     {
    412       "title": "Large language models for software engineering: A systematic literature review",
    413       "authors": ["X. Hou", "Y. Zhao", "Y. Liu", "Z. Yang", "K. Wang", "L. Li", "X. Luo", "D. Lo", "J. Grundy", "H. Wang"],
    414       "year": 2024,
    415       "relevance": "Comprehensive SLR on LLMs in software engineering, directly relevant to the survey's scope on AI/LLM capabilities in SE."
    416     },
    417     {
    418       "title": "Large language models for software engineering: Survey and open problems",
    419       "authors": ["A. Fan", "B. Gokkaya", "M. Harman", "M. Lyubarskiy", "S. Sengupta", "S. Yoo", "J. M. Zhang"],
    420       "year": 2023,
    421       "relevance": "Survey on LLMs for SE identifying open problems, directly within the survey's scope on LLM programming capabilities."
    422     },
    423     {
    424       "title": "Model cascading for code: Reducing inference costs with model cascading for LLM based code generation",
    425       "authors": ["B. Chen", "M. Zhu", "B. Dolan-Gavitt", "M. Shafique", "S. Garg"],
    426       "year": 2024,
    427       "arxiv_id": "2405.15842",
    428       "relevance": "Proposes cost-efficient approaches to LLM-based code generation through model cascading, relevant to practical AI code generation."
    429     },
    430     {
    431       "title": "Rethinking AI code generation: a one-shot correction approach based on user feedback",
    432       "authors": ["K. T. Le", "A. Andrzejak"],
    433       "year": 2024,
    434       "relevance": "Novel approach to LLM code generation using feedback-based correction, relevant to improving AI code generation quality."
    435     },
    436     {
    437       "title": "CodeGeeX: A pre-trained model for code generation with multilingual benchmarking on HumanEval-X",
    438       "authors": ["Q. Zheng", "X. Xia", "X. Zou", "Y. Dong", "S. Wang"],
    439       "year": 2023,
    440       "relevance": "13B-parameter multilingual code generation model with benchmarking, directly relevant to LLM code generation evaluation."
    441     },
    442     {
    443       "title": "Programming quantum computers with large language models",
    444       "authors": ["E. R. Henderson", "J. M. Henderson", "J. Ange", "M. A. Thornton"],
    445       "year": 2025,
    446       "relevance": "Most directly comparable work — studies LLMs for quantum programming, though with the older GPT-4 model and without RAG."
    447     },
    448     {
    449       "title": "CodeBLEU: a method for automatic evaluation of code synthesis",
    450       "authors": ["S. Ren", "D. Guo", "S. Lu", "L. Zhou", "S. Liu", "D. Tang", "N. Sundaresan", "M. Zhou", "A. Blanco", "S. Ma"],
    451       "year": 2020,
    452       "arxiv_id": "2009.10297",
    453       "relevance": "Defines the CodeBLEU metric used for evaluating code generation quality, relevant to evaluation methodology in AI code generation."
    454     },
    455     {
    456       "title": "A survey on RAG meeting LLMs: Towards retrieval-augmented large language models",
    457       "authors": ["W. Fan", "Y. Ding", "L. Ning", "S. Wang", "H. Li", "D. Yin", "T.-S. Chua", "Q. Li"],
    458       "year": 2024,
    459       "relevance": "Survey on RAG for LLMs, relevant to understanding retrieval-augmented approaches for improving LLM code generation."
    460     },
    461     {
    462       "title": "Lost in translation: A study of bugs introduced by large language models while translating code",
    463       "authors": ["R. Pan", "A. R. Ibrahimzada", "R. Krishna", "D. Sankar", "L. P. Wassi", "M. Merler", "B. Sobolev", "R. Pavuluri", "S. Sinha", "R. Jabbarvand"],
    464       "year": 2024,
    465       "relevance": "Studies bugs introduced by LLMs during code translation, relevant to risks and quality issues in AI code generation."
    466     },
    467     {
    468       "title": "Exploring and unleashing the power of large language models in automated code translation",
    469       "authors": ["Z. Yang", "F. Liu", "Z. Yu", "J. W. Keung", "J. Li", "S. Liu", "Y. Hong", "X. Ma", "Z. Jin", "G. Li"],
    470       "year": 2024,
    471       "relevance": "Studies LLMs for automated code translation/transpilation, relevant to the broader theme of LLM-based code transformations."
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 1,
    477       "justification": "Quantum code generation from UML is a niche use case with very limited current practitioner adoption; the approach requires domain-specific UML models."
    478     },
    479     "surprise_contrarian": {
    480       "score": 1,
    481       "justification": "The finding that RAG didn't help is mildly surprising but not framed as challenging conventional wisdom; prompt engineering helping is expected."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI safety, security, or risk concerns are raised or relevant to this work."
    486     },
    487     "drama_conflict": {
    488       "score": 0,
    489       "justification": "No controversy, no challenge to established methods or claims beyond a straightforward comparison."
    490     },
    491     "demo_ability": {
    492       "score": 1,
    493       "justification": "Source code is released on GitHub but requires OpenAI API access, specific UML model instances, and quantum computing context to try."
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "Uses GPT-4o (OpenAI product) but the authors are from UCCS, not a well-known AI lab; the venue (MODELS) is established but niche."
    498     }
    499   }
    500 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs