scan.json (29904B)
1 { 2 "paper": { 3 "title": "PennyLang: Pioneering LLM-Based Quantum Code Generation with a Novel PennyLane-Centric Dataset", 4 "authors": [ 5 "Abdul Basit", 6 "Nouhaila Innan", 7 "Muhammad Haider Asif", 8 "Minghao Shao", 9 "Muhammad Kashif", 10 "Alberto Marchisio", 11 "Muhammad Shafique" 12 ], 13 "year": 2025, 14 "venue": "arXiv", 15 "arxiv_id": "2503.02497", 16 "doi": "10.48550/arXiv.2503.02497" 17 }, 18 "scan_version": 2, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "PennyLang is a 3,347-sample PennyLane-centric dataset curated from GitHub, textbooks, and documentation. When paired with RAG, it substantially boosts open-source model performance (Qwen 7B from 8.7% to 41.7% success rate), while commercial models like Claude 3.5 Sonnet (95.1% baseline) show no benefit from retrieval augmentation. The 75% context setting outperformed full-context retrieval for three of the four models, suggesting moderate retrieval reduces noise. No statistical tests, error bars, or contamination analysis were performed.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "The paper states 'We release both the PennyLang dataset and the evaluation framework' in the conclusion, but no repository URL, Zenodo archive, or HuggingFace link is provided anywhere in the paper text." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "The paper claims the dataset is 'open-source' and 'off-the-shelf' but provides no download URL or repository link for the PennyLang dataset itself. Promises of release without a working URL count as NO." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment specification is provided.
The paper mentions specific tools (LangChain, Chroma, OpenAI Embeddings) but not their versions or installation requirements." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions are provided. The methodology describes the pipeline conceptually but does not include commands, scripts, or a README for reproducing the experiments." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "Table VI reports only point estimates for success rates, Pass@1 through Pass@5, and working notebook counts. No confidence intervals, error bars, or uncertainty measures are provided." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims 'RAG substantially improves performance' and makes comparative claims across models and retrieval settings, but no statistical significance tests (t-tests, bootstrap, etc.) are performed. Differences are assessed by comparing raw numbers only." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "The paper reports improvements with baseline context: 'Qwen 7B's success rate rises from 8.71% to 41.66% (≈32.95% relative increase)' and 'LLaMa 4 improves from 78.78% to 84.84% (≈7.7% relative increase).' The from-to framing provides sufficient context to assess magnitude." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The benchmark uses 264 test cases and generates 1,320 notebooks per model (264 × 5). No justification is given for why 264 test cases were chosen or whether this is sufficient for the claims being made." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. 
Pass@5 involves 5 generations per test case as part of the metric, but no indication of running the full experiment multiple times to measure stability." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "The paper compares RAG-augmented vs. non-augmented settings (0%, 50%, 75%, 100% context) and evaluates across 4 models (Qwen 7B, LLaMa 4, GPT-4o-mini, Claude 3.5 Sonnet), providing baseline comparisons." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The evaluated models are contemporary: GPT-4o Mini, Claude 3.5 Sonnet, Qwen 2.5-7B-Instruct-Turbo, and LLaMa 4 Maverick 17B. These represent recent open-source and commercial models." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "The paper ablates the retrieval context coverage (0%, 50%, 75%, 100%), showing how different amounts of retrieved context affect performance. This reveals that 75% often outperforms 100%." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Table VI reports both 'Working Notebooks' (syntax/execution correctness — does the notebook run?) and 'Success Rate' / Pass@k (functional correctness — does it pass tests?). These capture different aspects of code quality." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "Evaluation is entirely automated via test execution (Pass@k). Human verification was used during dataset construction but not for evaluating LLM outputs. No human ratings of output quality, readability, or correctness beyond automated tests." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "A dedicated benchmark suite of 264 test cases was designed to evaluate PennyLane functionalities. These are separate from the 3,347 dataset samples used for RAG retrieval." 
102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": false, 106 "justification": "Table VI breaks down results by model and context level, but provides no per-category breakdown by quantum functionality type (e.g., gates vs. optimization vs. chemistry). Section IV analyzes dataset statistics by feature group, but evaluation results are only aggregate." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": false, 111 "justification": "No error analysis or qualitative examination of failure cases is provided. The paper notes that commercial models degrade with full retrieval but does not analyze specific failing test cases or common error patterns." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper reports that GPT-4o-mini and Claude 3.5 Sonnet performance degrades with full-context RAG (e.g., Claude drops from 95.07% without RAG to 89.01% with full retrieval). The paper also notes full context can introduce 'redundancy or irrelevant tokens.'" 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims are supported by Table VI: Qwen 7B rises from 8.71% to 41.66% (abstract says 8.7% to 41.7%), LLaMa 4 from 78.78% to 84.84% (abstract says 78.8% to 84.8%). The claim about commercial models not benefiting is also supported." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper's main causal claim is that RAG with PennyLang improves code generation. The study design (same models tested with and without RAG using the same test cases) is a controlled comparison adequate for this claim." 
129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title claims 'Pioneering LLM-Based Quantum Code Generation' broadly, but results are limited to PennyLane code with 4 specific models on 264 test cases. No other quantum frameworks (Qiskit, Cirq) are tested. The abstract claims to advance 'AI-assisted quantum development' generally from PennyLane-only evidence." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper discusses why commercial models don't benefit ('broad pretraining, which likely includes similar material') and why partial context outperforms full context ('focused and moderately sized context windows improve model grounding by avoiding redundancy and context saturation'). These address alternative explanations for observed patterns." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper measures functional correctness via test execution (Pass@k) and directly frames results as code correctness. The measurement matches the claim granularity without inflating to broader constructs like 'developer productivity.'" 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "The paper uses 'GPT-4o Mini' and 'Claude 3.5 Sonnet' without snapshot dates or API versions. While 'Qwen2.5-7B-Instruct-Turbo' and 'LLaMa 4 Maverick 17B' are more specific, the commercial model versions are insufficient per the schema (marketing names without snapshot dates)." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": false, 155 "justification": "The paper mentions using ChatPromptTemplate for 'structured prompts' (Table V) and describes the RAG pipeline conceptually, but never provides the actual prompt text sent to the models. 
Only descriptions like 'structured prompts for AI models to ensure consistency' are given." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "No temperature, top-p, max tokens, or other inference hyperparameters are reported for any of the four evaluated models. These significantly affect output quality and reproducibility." 161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The RAG pipeline is described in detail: LangChain for orchestration, Chroma vector database for storage, OpenAI Embeddings for encoding, MMR-based retrieval, and ChatPromptTemplate for formatting (Section III.C, Table V, Fig. 2). The full pipeline from query to response is diagrammed." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section III.A documents the full preprocessing pipeline: GitHub API search with filtering criteria (main branch only, no forks, Python files only, import statements checked), duplicate removal, PEP 8 formatting, manual review, and GPT-4o conversion to instruction-query format." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion mentions future directions (cross-framework generalization, lightweight fine-tuned models) but does not discuss limitations of the current work." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No threats to validity are discussed. The paper does not address potential issues such as test case representativeness, dataset bias toward certain PennyLane patterns, or generalizability concerns." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "No explicit scope boundaries are stated. 
The paper does not articulate what the results do NOT show, such as limitations to specific quantum computing paradigms, model sizes, or programming tasks." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "The 3,347 code samples and 264 test cases are claimed to be released but no download URL is provided. The generated notebooks (1,320 per model) are not available for independent verification." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section III.A describes data collection in detail: GitHub API search for repos with 'PennyLane' in name/description/README, filtering by license type, extraction from two specific quantum computing textbooks, and scraping official PennyLane documentation." 200 }, 201 "recruitment_methods_described": { 202 "applies": true, 203 "answer": true, 204 "justification": "Data source selection criteria are well-described: GitHub repos required permissive open-source licenses, PennyLane keyword presence, main branch at latest commit, no forks, Python files only with explicit import statements. Books and documentation sources are named specifically." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The pipeline from raw sources to final dataset is documented: collection (GitHub API, books, docs) → filtering (import checks, license verification) → preprocessing (license/metadata removal, PEP 8 formatting) → annotation (GPT-4o conversion) → deduplication. Table III shows sample counts by source (1,952 + 1,321 + 53 + 21 = 3,347)." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "The Acknowledgment section discloses funding from NYUAD Center for Quantum and Topological Systems (CQTS) via Tamkeen grant CG008, and Center for CyberSecurity (CCS) via Tamkeen grant G1104." 
217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly stated: NYU Abu Dhabi eBRAIN Lab and CQTS. The authors are not affiliated with PennyLane/Xanadu or the LLM providers being evaluated." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": true, 226 "justification": "Funders (Tamkeen/NYUAD research institutes) are academic funding bodies with no financial stake in PennyLane adoption, LLM performance, or quantum computing framework preferences." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No training data cutoff dates are stated for any of the four evaluated models (GPT-4o Mini, Claude 3.5 Sonnet, Qwen 7B, LLaMa 4). This is critical since PennyLane documentation is public and likely in their training data." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of whether the 264 test cases or PennyLane code patterns could have appeared in the models' training data. The dataset is curated from public sources (GitHub, official docs) highly likely to be in model training sets." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "No contamination analysis is performed despite the dataset being drawn from publicly available PennyLane documentation and GitHub repositories that these models were almost certainly trained on. This is a critical omission." 
249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study. It is a dataset curation and benchmark evaluation paper." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants. The study involves code collection and automated model evaluation only." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "The evaluation generates 1,320 notebooks per model (5,280 total) using both commercial APIs (GPT-4o Mini, Claude 3.5) and open-source models. No API costs, token counts, or inference latency is reported." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "The acknowledgment mentions NYU Abu Dhabi HPC resources, but no GPU hours, total API spend, or compute time is quantified for the experiments." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "No random seeds or seed sensitivity analysis is mentioned. 
LLM generation is stochastic (temperature-dependent), and results could vary across runs, but this is not examined." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": true, 309 "justification": "The paper explicitly states 'each model generates five candidate solutions per test' for Pass@5, and '1320 notebooks were generated and executed per model (264 tests × 5 completions).' The number of generations is clearly documented." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search is described for the RAG pipeline (e.g., chunk size, embedding model, MMR parameters) or for inference settings. The retrieval context percentages (0%, 50%, 75%, 100%) appear chosen without justification." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "All four context configurations (0%, 50%, 75%, 100%) are reported for all four models in Table VI. The paper does not selectively report only the best configuration — all results are visible." 320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No statistical tests are performed at all (no p-values, no hypothesis testing), so the question of correcting for multiple comparisons does not arise." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors designed both the dataset and the 264-test benchmark, then evaluated their own RAG pipeline against baselines. No acknowledgment of author-evaluation bias or independent evaluation is provided." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "Different retrieval settings (0% to 100% context) imply different token counts and compute costs, but performance is not analyzed as a function of compute budget. 
No token-level cost comparison across conditions." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "The paper does not discuss whether the 264 test cases adequately measure quantum code generation capability vs. PennyLane API memorization. No analysis of what the benchmark actually tests relative to the claimed evaluation goal." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": true, 344 "justification": "All four models are evaluated within the same RAG scaffold (LangChain + Chroma + MMR), ensuring that model comparisons are not confounded by different scaffolding. The same retrieval pipeline is used consistently." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of temporal leakage. The PennyLane documentation and GitHub repositories used to build the dataset have been public for years and are almost certainly in the training data of the evaluated commercial models." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the RAG-retrieved context provides information that would not be available in realistic deployment settings, or whether retrieved examples contain patterns too similar to the test cases." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether the 264 test cases and the 3,347 dataset samples share structural similarities, come from the same sources, or overlap in content. Both are drawn from the PennyLane ecosystem, creating potential non-independence." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No concrete leakage detection or prevention method is applied. 
No n-gram overlap analysis, deduplication check between dataset and test cases, or contamination testing for the evaluated models." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "RAG with PennyLang substantially improves Qwen 7B performance from 8.71% to 41.66% success rate on PennyLane code generation tasks.", 373 "evidence": "Table VI shows Qwen 7B at 0% context achieves 8.71% success rate (23 pass@5) vs. 41.66% (110 pass@5) at full context. Section V reports these numbers.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "LLaMa 4 Maverick improves from 78.78% to 84.84% success rate with full-context RAG augmentation.", 378 "evidence": "Table VI shows LLaMa 4 at 0% context = 78.78% (208 pass@5) and full context = 84.84% (224 pass@5). The improvement is modest at ~6 percentage points.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "Commercial models (GPT-4o-mini, Claude 3.5 Sonnet) do not benefit from RAG augmentation with PennyLang; their best performance is achieved without retrieval.", 383 "evidence": "Table VI shows GPT-4o-mini peaks at 85.60% without RAG vs. 80.30% with full RAG, and Claude 3.5 Sonnet peaks at 95.07% without RAG vs. 89.01% with full RAG. Both degrade with full-context retrieval.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "75% context retrieval often outperforms full-context (100%) retrieval, suggesting moderate retrieval reduces noise.", 388 "evidence": "Table VI: Qwen 7B 75% = 45.07% vs. full = 41.66%; GPT-4o-mini 75% = 84.84% vs. full = 80.30%; Claude 3.5 75% = 92.44% vs. full = 89.01%. Pattern is consistent across 3 of 4 models.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "PennyLang is the first PennyLane-centric dataset for LLM-based quantum code generation.", 393 "evidence": "Table I compares existing datasets, showing none are structured, curated PennyLane datasets. Existing PennyLane resources (documentation, GitHub repos) are unstructured. 
Table II positions PennyLang as a novel contribution.", 394 "supported": "moderate" 395 } 396 ], 397 "red_flags": [ 398 { 399 "flag": "No statistical rigor", 400 "detail": "All comparative claims are made by comparing raw numbers without statistical tests, confidence intervals, or variance measurements. With stochastic LLM generation and only 264 test cases, observed differences could be within noise." 401 }, 402 { 403 "flag": "Contamination risk unaddressed", 404 "detail": "PennyLane documentation and GitHub repositories (the dataset sources) are almost certainly in the training data of GPT-4o-mini and Claude 3.5 Sonnet. The high baseline performance of commercial models without RAG (85.6% and 95.1%) may reflect memorization, not capability. No contamination analysis is performed." 405 }, 406 { 407 "flag": "Circularity in dataset construction", 408 "detail": "GPT-4o was used to convert code snippets into instruction-query pairs, then GPT-4o-mini (a variant from the same model family) is evaluated on the resulting dataset. This creates potential circularity where the evaluation model benefits from the formatting model's patterns." 409 }, 410 { 411 "flag": "No artifacts despite open-source claims", 412 "detail": "The paper repeatedly claims the dataset and evaluation framework are 'open-source' and 'released,' but provides no URL, repository link, or download location anywhere in the paper." 413 }, 414 { 415 "flag": "Author-designed benchmark without independent validation", 416 "detail": "The 264 test cases were designed by the same team that built the dataset. No independent validation of the benchmark's coverage, difficulty distribution, or construct validity is provided." 417 }, 418 { 419 "flag": "No limitations section", 420 "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries. This omission is notable for a paper making broad claims about 'pioneering' quantum code generation." 
421 } 422 ], 423 "cited_papers": [ 424 { 425 "title": "Evaluating large language models trained on code", 426 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 427 "year": 2021, 428 "arxiv_id": "2107.03374", 429 "relevance": "Introduces HumanEval benchmark for LLM code generation evaluation, foundational to code generation assessment methodology." 430 }, 431 { 432 "title": "Competition-level code generation with AlphaCode", 433 "authors": ["Y. Li", "D. R. Zhou", "N. Kohl"], 434 "year": 2022, 435 "arxiv_id": "2203.07814", 436 "relevance": "Demonstrates LLM performance on competitive programming, relevant to understanding LLM code generation capabilities and limits." 437 }, 438 { 439 "title": "OpenAI Codex: Programming with Natural Language", 440 "authors": ["OpenAI"], 441 "year": 2021, 442 "relevance": "Pioneering work on LLM-driven code generation from natural language, establishing the paradigm this paper extends to quantum computing." 443 }, 444 { 445 "title": "Code LLaMA: Open Foundation Models for Code", 446 "authors": ["B. Rozière", "J. Gehring", "F. Gloeckle"], 447 "year": 2024, 448 "arxiv_id": "2308.12950", 449 "relevance": "Open-source code generation model relevant to understanding how specialized fine-tuning affects code generation quality." 450 }, 451 { 452 "title": "Qiskit Code Assistant: Training LLMs for Generating Quantum Computing Code", 453 "authors": ["N. Dupuis", "L. Buratti", "S. Vishwakarma"], 454 "year": 2024, 455 "relevance": "Most directly related prior work: adapts LLMs for quantum code generation using Qiskit, the closest precedent to PennyLang's PennyLane approach." 456 }, 457 { 458 "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", 459 "authors": ["Z. Feng", "D. Guo", "D. Tang"], 460 "year": 2020, 461 "arxiv_id": "2002.08155", 462 "relevance": "Pre-trained model for code understanding that established bidirectional transformer approaches for programming language processing." 
463 }, 464 { 465 "title": "InCoder: A Generative Model for Code Infilling and Synthesis", 466 "authors": ["D. Fried", "A. Aghajanyan", "J. Lin"], 467 "year": 2023, 468 "arxiv_id": "2204.05999", 469 "relevance": "Generative code model supporting both infilling and left-to-right synthesis, relevant to understanding code completion approaches." 470 }, 471 { 472 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 473 "authors": ["P. Lewis", "E. Perez", "A. Piktus"], 474 "year": 2021, 475 "arxiv_id": "2005.11401", 476 "relevance": "Foundational RAG paper that this work builds on to augment LLMs with domain-specific quantum computing knowledge." 477 }, 478 { 479 "title": "QCircuitNet: A Large-Scale Hierarchical Dataset for Quantum Algorithm Design", 480 "authors": ["R. Yang", "Y. Gu", "Z. Wang"], 481 "year": 2024, 482 "arxiv_id": "2410.07961", 483 "relevance": "Quantum computing dataset for LLM training (Qiskit-focused), directly comparable to PennyLang's contribution for PennyLane." 484 }, 485 { 486 "title": "Language Models are Few-Shot Learners", 487 "authors": ["T. B. Brown", "B. Mann", "N. Ryder"], 488 "year": 2020, 489 "relevance": "GPT-3 paper establishing few-shot learning capabilities of large language models, foundational to the LLM code generation paradigm." 490 }, 491 { 492 "title": "A Survey on Large Language Models for Code Generation", 493 "authors": ["J. Jiang", "F. Wang", "J. Shen"], 494 "year": 2024, 495 "arxiv_id": "2406.00515", 496 "relevance": "Recent survey of LLM code generation relevant to understanding the landscape of AI-assisted programming research." 497 } 498 ] 499 }