scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28999B)
      1 {
      2   "paper": {
      3     "title": "Qiskit HumanEval: An Evaluation Benchmark For Quantum Code Generative Models",
      4     "authors": [
      5       "Sanjay Vishwakarma",
      6       "Francis Harkins",
      7       "Siddharth Golecha",
      8       "Vishal Sharathchandra Bajpe",
      9       "Nicolas Dupuis",
     10       "Luca Buratti",
     11       "David Kremer",
     12       "Ismael Faro",
     13       "Ruchir Puri",
     14       "Juan Cruz-Benito"
     15     ],
     16     "year": 2024,
     17     "venue": "International Conference on Quantum Computing and Engineering (QCE)",
     18     "arxiv_id": "2406.14712",
     19     "doi": "10.1109/QCE60285.2024.00137"
     20   },
     21   "scan_version": 3,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "methodology_tags": ["benchmark-eval"],
     24   "key_findings": "The paper introduces Qiskit HumanEval (QHE), a 101-task benchmark for evaluating LLM quantum code generation using Qiskit. IBM's Granite-8B-Code-QK model, fine-tuned on Qiskit-specific data, achieves 46.53% pass@1 on QHE, outperforming all baselines, including models up to roughly 4x larger (34B parameters), with a 17.8 percentage point gain over the un-tuned Granite base model. No model passes either of the 2 difficult-level tasks. Fine-tuning for Qiskit slightly degrades general HumanEval performance (39.02% → 38.41%).",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper states 'the dataset will be made publicly available as open source in Q3 2024' and 'The complete dataset is scheduled for release in the upcoming months.' This is a promise of future release, not a working link or archive at publication time."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The QHE dataset was not released at time of publication; the paper only promises a future release. Section V states 'The preliminary public release will offer a set of 150 tests.' No download link or repository URL is provided."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper mentions 'compatible with Qiskit ≥1.0' and the base model is on HuggingFace, but no requirements.txt, Dockerfile, or detailed dependency list is provided. Library versions beyond Qiskit are not specified."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are provided. The paper describes the evaluation approach conceptually but does not provide scripts, commands, or a README for reproducing the experiments."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Tables II and III report only point estimates (e.g., '46.53%') with no confidence intervals, error bars, or uncertainty measures."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper claims 'Our Qiskit model GRANITE-8B-CODE-QK performs better than all the other baselines on QHE' based solely on comparing raw pass@1 numbers with no statistical significance test."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper reports 'The pass rate jumps 17.8 points compared with GRANITE-8B-CODE-BASE' with both baseline (28.71%) and tuned (46.53%) values, providing sufficient context for the magnitude of improvement."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The benchmark has 101 tasks with no justification for this sample size. The difficulty distribution (54 basic, 45 intermediate, 2 difficult) is described but not justified, and the 2-task difficult category is extremely small."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "Results are from greedy decoding (single deterministic pass). No variance across runs, seeds, or sampling strategies is reported."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Table II includes 5 baseline models: CodeLlama-34B-Python-hf, DeepSeek-Coder-33B-base, StarCoder2-15B, CodeGemma-7B, and Granite-8B-Code-Base."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The baselines (CodeLlama, DeepSeek-Coder, StarCoder2, CodeGemma, Granite) are all from the 2023-2024 era, which is contemporary for a mid-2024 paper."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The fine-tuning pipeline involves multiple stages (extended pre-training on Qiskit data, then instruction tuning with multiple data sources) but no ablation isolates the contribution of each component or data source."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Only Pass@1 with greedy decoding is reported. No Pass@5, Pass@10, or other metrics (code quality, efficiency, etc.) are used."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Evaluation is entirely automated via unit test pass/fail. No human evaluation of generated code quality, readability, or correctness beyond test passing."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The QHE dataset is described as 'hand-curated' and 'not part of the training dataset for the large language models being tested.' Tasks are stated to be 'original and not extracted from any existing tutorials, courses, books, or papers.' The benchmark functions as a held-out test set."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Table III provides per-difficulty breakdowns (basic, intermediate, difficult) for all models. Table I also categorizes the 101 tasks into 8 categories, though results are not broken down by these categories."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The paper notes 'No model is currently able to pass either of the 2 difficult tests' but provides no analysis of why models fail, what types of errors occur, or which categories are hardest."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Table II shows that Granite-8B-Code-QK's general HumanEval score slightly decreases from 39.02% to 38.41% after Qiskit fine-tuning, and all models fail both difficult tasks."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The abstract claims the work 'demonstrate[s] the feasibility of using LLMs for generating quantum code' and 'establish[es] a new benchmark.' The results (models achieving QHE pass@1 between 24.75% and 46.53%) support feasibility, and the benchmark is indeed introduced."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The main causal claim is that Qiskit-specific fine-tuning improves QHE performance ('The pass rate jumps 17.8 points... which shows the efficiency of our tuning approach'). The comparison of Granite-8B base vs. Granite-8B-QK (same architecture, different training) is a reasonable controlled comparison adequate for this claim."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The paper is titled specifically for 'Quantum Code Generative Models' and scoped to the Qiskit SDK. Claims stay within the tested domain. Section V discusses Qiskit-specific limitations and future expansion to OpenQASM 3.0."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No alternative explanations are discussed for the results. For instance, the paper does not consider whether the QHE tasks disproportionately favor Qiskit-specific patterns seen in IBM's fine-tuning data, or whether model size differences confound comparisons."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper measures Pass@1 (unit test pass rate) and frames it as evaluating 'the correctness of the generated solutions' and 'functional correctness.' These are well-matched — no proxy gap exists between what is measured and what is claimed."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Models are identified by specific HuggingFace model IDs (e.g., 'ibm-granite/granite-8b-code-base'). Section IV names CodeLlama-34B-Python-hf, DeepSeek-Coder-33B-base, StarCoder2-15B, CodeGemma-7B with specific identifiers."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The paper provides 5 complete task examples (Examples 1-5) including full prompt text with function signatures, docstrings, canonical solutions, and test functions. The dataset structure format is described in Section III.A, and these prompts are what is fed to the models in HumanEval-style evaluation."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Pass@1 is computed using greedy decoding (implying temperature=0). Fine-tuning hyperparameters are detailed: learning rate 1×10⁻⁵, cosine schedule, batch size 64, 3 epochs, 4096 context. Instruction tuning: learning rate 8×10⁻⁶, batch size 32, 3 epochs, 2048 sequence length."
    164       },
    165       "scaffolding_described": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "No agentic scaffolding is used. The evaluation is standard code completion (model generates function body from prompt)."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section IV describes the fine-tuning data pipeline: Qiskit data collected April 2024, license-filtered (Apache 2.0, MIT, etc.), date-filtered (post-2023), deduplicated, cleaned to ~50M tokens. Instruction tuning data sources and sizes are also specified (8k openassistant, 5k commitpackft, 2.7k synthetic Q&A, 1k synthetic prompt/code)."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "There is no dedicated limitations or threats-to-validity section. Section V (Discussion) focuses on future plans (OpenQASM 3.0 expansion, community contributions, version updates) rather than acknowledging methodological limitations."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as the small number of difficult tasks, the lack of statistical testing, or the potential for benchmark bias toward IBM's training data distribution."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper does not explicitly state what the results do NOT show. It does not discuss, for example, that pass@1 on 101 tasks does not measure real-world quantum programming productivity, or that results on Qiskit do not generalize to other quantum SDKs."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The QHE dataset and model outputs were not released at time of publication. Only 5 example tasks are shown. Independent verification of the 101 tasks and model outputs is not possible."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section III describes the dataset creation in detail: panel of quantum computing and Qiskit experts created tasks, each was peer-reviewed by every panel member, revised iteratively until consensus on 'correctness in terms of definition, feasibility, clarity, and reproducibility.'"
    203       },
    204       "recruitment_methods_described": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The expert panel composition is described: 'Qiskit advocates, members of the Qiskit community, members of IBM Quantum support and documentation, and quantum computing software developers.' All are IBM-affiliated, which is disclosed."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The task creation pipeline is documented: experts defined tasks → peer review by all panel members → revision based on feedback → consensus reached. The fine-tuning data pipeline is also documented with filtering stages and token counts."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding source is explicitly disclosed. The Acknowledgment section thanks specific individuals but does not mention grants, funding agencies, or corporate sponsorship."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "All authors are clearly listed with IBM affiliations: IBM Research (Yorktown Heights, Zurich) and IBM Quantum (Yorktown Heights, Gurugram)."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "All authors are IBM employees evaluating IBM's Granite model on IBM's Qiskit framework. IBM has a direct commercial interest in demonstrating that their model excels at Qiskit code generation. The funder (IBM) is not independent of the outcome."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial interests statement is present in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The Granite model's Qiskit fine-tuning data was 'collected in April 2024' and the base model was 'pretrained on 4 Trillion tokens,' but no training cutoff date is stated for the base model or any of the baseline models (CodeLlama, DeepSeek-Coder, StarCoder2, CodeGemma)."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Section III states: 'The dataset is not part of the training dataset for the large language models (LLMs) being tested' and 'All tasks included in the dataset are original and not extracted from any existing tutorials, courses, books, or papers.' Section III.D also notes tasks involve 'novel features for which there is even more limited documentation.'"
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": true,
    251         "justification": "The paper argues contamination is unlikely because tasks are hand-curated originals: 'While the experts responsible for creating the tasks may have been inspired by existing materials, they were required to write each task from scratch.' Some tasks test novel Qiskit 1.0 features with limited documentation."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study. It is a benchmark evaluation of LLMs."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in this study."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No inference cost, latency, or tokens consumed are reported for any of the evaluated models."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Fine-tuning hyperparameters (batch size, epochs, learning rate) are stated but total GPU hours, hardware used, or wall-clock training time are not reported."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "Results use greedy decoding only (deterministic). No exploration of seed sensitivity for training runs or alternative decoding strategies (temperature, nucleus sampling)."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Table II caption states 'Pass@1 computed using greedy decoding,' which implies a single deterministic evaluation pass per model. This is sufficient for the reader to know the number of runs."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Fine-tuning hyperparameters are stated (learning rate, batch size, epochs) but no search budget is reported — it is unclear how these values were selected or how many configurations were tried."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper presents a single configuration for fine-tuning without explaining how it was selected or whether alternatives were considered."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "Six models are compared across two benchmarks with no statistical tests applied, let alone multiple comparison correction."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "IBM employees evaluate IBM's Granite model against external baselines on IBM's benchmark. The inherent bias of evaluating one's own system on one's own benchmark is not acknowledged."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The IBM model (8B parameters) is compared against models up to 34B parameters. No discussion of compute-performance tradeoffs or whether the domain-specific fine-tuning data gives an unfair advantage unrelated to model architecture."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The paper does not discuss whether pass@1 on 101 hand-curated tasks actually measures the claimed capability of 'quantum code generation.' No analysis of whether the task distribution reflects real-world quantum programming needs or whether the 54/45/2 difficulty split is adequate."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "No scaffolding is involved — this is standard code completion evaluation."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Training cutoff dates are not stated for the baseline models. While the paper argues QHE tasks are novel, it does not explicitly analyze whether similar Qiskit code patterns in training data could enable models to solve tasks without genuine understanding."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether the evaluation setup (function signatures, docstrings, import statements) provides hints beyond what a practitioner would have in real usage."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No analysis of whether the 101 QHE tasks are independent or whether structural similarities between tasks (e.g., multiple circuit generation tasks sharing patterns) could inflate apparent performance."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No concrete leakage detection method is employed (no canary strings, membership inference, n-gram overlap analysis). The contamination argument relies solely on the assertion that tasks are original."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Granite-8B-Code-QK achieves 46.53% pass@1 on QHE, outperforming all baselines including models up to 34B parameters.",
    376       "evidence": "Table II shows QHE pass@1 scores: Granite-8B-Code-QK (46.53%) vs DeepSeek-Coder-33B (39.6%), StarCoder2-15B (37.62%), Granite-8B-base (28.71%), CodeLlama-34B (26.73%), CodeGemma-7B (24.75%).",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Domain-specific fine-tuning on Qiskit data improves QHE performance by 17.8 percentage points.",
    381       "evidence": "Table II: Granite-8B-Code-Base (28.71%) vs Granite-8B-Code-QK (46.53%). Section IV states 'The pass rate jumps 17.8 points.'",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "The QHE dataset is not contaminated by training data of the tested models.",
    386       "evidence": "Section III states tasks are 'original and not extracted from any existing tutorials, courses, books, or papers' and 'not part of the training dataset for the large language models being tested.' No verification method is used.",
    387       "supported": "weak"
    388     },
    389     {
    390       "claim": "No current model can solve the difficult-level tasks.",
    391       "evidence": "Table III shows 0/2 difficult tasks passed by all 6 models.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Qiskit-specific fine-tuning only slightly degrades general HumanEval performance.",
    396       "evidence": "Table II: Granite-8B-Code-Base HE score 39.02% vs Granite-8B-Code-QK 38.41%, a drop of 0.61 percentage points.",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Company evaluating own product on own benchmark",
    403       "detail": "All authors are IBM employees. They created the QHE benchmark, fine-tuned the Granite model on IBM's Qiskit data, and evaluated it against external competitors on their own benchmark. IBM has a direct commercial interest in demonstrating Granite excels at Qiskit code generation."
    404     },
    405     {
    406       "flag": "No statistical rigor in comparisons",
    407       "detail": "Model comparisons rely on single-run greedy decoding pass@1 with no error bars, confidence intervals, or significance tests. The claim that Granite-8B-QK 'performs better than all the other baselines' is based on raw number comparison only."
    408     },
    409     {
    410       "flag": "Tiny difficult category renders it meaningless",
    411       "detail": "Only 2 of 101 tasks are labeled 'difficult.' With all models scoring 0/2, this category has no discriminative power and the difficulty distribution is heavily skewed toward basic (54) and intermediate (45)."
    412     },
    413     {
    414       "flag": "Unfair model size comparison",
    415       "detail": "Granite-8B-QK (8B, domain-fine-tuned) is compared against generic models up to 34B parameters without domain-specific fine-tuning. The paper does not discuss whether the comparison is fair given the asymmetric fine-tuning."
    416     },
    417     {
    418       "flag": "Dataset and code not released at publication",
    419       "detail": "The benchmark is promised for 'Q3 2024' release but was unavailable at time of publication, preventing independent verification of all claims."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Evaluating Large Language Models Trained on Code",
    425       "authors": ["M. Chen"],
    426       "year": 2021,
    427       "relevance": "Introduced HumanEval, the code generation benchmark that QHE is modeled after."
    428     },
    429     {
    430       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    431       "authors": ["J. Liu"],
    432       "year": 2023,
    433       "relevance": "EvalPlus — improved version of HumanEval/MBPP with more rigorous test cases for code generation evaluation."
    434     },
    435     {
    436       "title": "Program synthesis with large language models",
    437       "authors": ["J. Austin"],
    438       "year": 2021,
    439       "relevance": "Introduced MBPP benchmark for program synthesis, a foundational code generation evaluation."
    440     },
    441     {
    442       "title": "Ds-1000: A natural and reliable benchmark for data science code generation",
    443       "authors": ["Y. Lai"],
    444       "year": 2022,
    445       "relevance": "Domain-specific code generation benchmark using Python data science SDKs, architecturally similar to QHE's approach."
    446     },
    447     {
    448       "title": "Multipl-e: A scalable and extensible approach to benchmarking neural code generation",
    449       "authors": ["F. Cassano"],
    450       "year": 2022,
    451       "relevance": "Multilingual code generation benchmark enabling cross-language evaluation of LLMs."
    452     },
    453     {
    454       "title": "Qiskit Code Assistant: Training LLMs for generating Quantum Computing Code",
    455       "authors": ["N. Dupuis"],
    456       "year": 2024,
    457       "arxiv_id": "2405.19495",
    458       "relevance": "Companion paper on training LLMs specifically for Qiskit code generation."
    459     },
    460     {
    461       "title": "OctoPack: Instruction Tuning Code Large Language Models",
    462       "authors": ["N. Muennighoff"],
    463       "year": 2023,
    464       "relevance": "Instruction tuning methodology for code LLMs, directly used for QK model's instruct-tuning stage."
    465     },
    466     {
    467       "title": "Granite code models: A family of open foundation models for code intelligence",
    468       "authors": ["M. Mishra"],
    469       "year": 2024,
    470       "arxiv_id": "2405.04324",
    471       "relevance": "Describes the base Granite-8B model family used as the foundation for the QK model."
    472     },
    473     {
    474       "title": "CruxEval: A benchmark for code reasoning, understanding and execution",
    475       "authors": ["A. Gu"],
    476       "year": 2024,
    477       "relevance": "Code reasoning benchmark testing LLM understanding and execution prediction capabilities."
    478     },
    479     {
    480       "title": "Can it edit? Evaluating the ability of large language models to follow code editing instructions",
    481       "authors": ["F. Cassano"],
    482       "year": 2023,
    483       "relevance": "Benchmark evaluating LLMs on code editing tasks, complementary to code generation evaluation."
    484     },
    485     {
    486       "title": "CrossCodeEval: A diverse and multilingual benchmark for cross-file code completion",
    487       "authors": ["Y. Ding"],
    488       "year": 2023,
    489       "relevance": "Cross-file code completion benchmark testing LLMs on multi-file context understanding."
    490     }
    491   ],
    492   "engagement_factors": {
    493     "practical_relevance": {
    494       "score": 1,
    495       "justification": "Quantum computing programming is a niche domain; the benchmark is useful only to researchers working at the intersection of LLMs and quantum computing."
    496     },
    497     "surprise_contrarian": {
    498       "score": 0,
    499       "justification": "Domain-specific fine-tuning improving domain-specific benchmark performance is entirely expected."
    500     },
    501     "fear_safety": {
    502       "score": 0,
    503       "justification": "No safety or security concerns raised; the paper is about quantum code generation quality."
    504     },
    505     "drama_conflict": {
    506       "score": 0,
    507       "justification": "No controversy or conflict; straightforward benchmark introduction and evaluation."
    508     },
    509     "demo_ability": {
    510       "score": 1,
    511       "justification": "The Granite base model is available on HuggingFace (ibm-granite/granite-8b-code-base) but the QHE dataset and fine-tuned QK model were not yet released."
    512     },
    513     "brand_recognition": {
    514       "score": 2,
    515       "justification": "IBM Quantum is a well-known brand in quantum computing, and Qiskit is a leading quantum SDK, though neither has mainstream AI/ML visibility."
    516     }
    517   }
    518 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs