scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18209B)
      1 {
      2   "paper": {
      3     "title": "CreativEval: Evaluating Creativity of LLM-Based Hardware Code Generation",
      4     "authors": ["Matthew DeLorenzo", "Vasudev Gohil", "Jeyavijayan Rajendran"],
      5     "year": 2024,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2404.08806"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub link provided (https://github.com/matthewdelorenzo/CreativEval/); the introduction describes the framework as open-source."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states datasets will be open-sourced at the same GitHub link. The prompts are sourced from publicly available HDLBits via AutoChip."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions Python 3.10, Icarus Verilog 10.3, and NVIDIA A100 GPU, but does not provide a requirements.txt, Dockerfile, or detailed dependency list sufficient to recreate the environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README or reproduction guide is described."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Table I reports only point estimates for all metrics. No confidence intervals or error bars are provided."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims GPT-3.5 is the most creative and makes comparative claims between models, but no significance tests are performed."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Only raw metric values are reported. No effect sizes with baseline context (e.g., Cohen's d or percentage improvement with denominators) are provided."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The dataset consists of 111 single-module prompts and 9 multi-module prompts. No justification is given for why these sizes are sufficient, and the elaboration set (9 prompts) is notably small."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported across the t=10 responses per prompt. Only aggregated metrics are shown."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple LLMs are compared against each other (CodeLlama-7B/13B, VeriGen-6B/16B, GPT-3.5, GPT-4), serving as mutual baselines."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The models evaluated (GPT-4, GPT-3.5, CodeLlama, VeriGen) were contemporary at the time of writing (2024)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is performed to evaluate which components of the creativity framework matter most, e.g., varying the equal weighting or the GNN4IP threshold."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Four creativity sub-metrics (fluency, flexibility, originality, elaboration) plus overall creativity and functionality are reported."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Creativity is inherently subjective, yet no human evaluation of the generated code's creativity is included. All evaluation is automated via GNN4IP similarity scores."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a training/tuning study. The framework evaluates pre-trained models on a fixed prompt set, so held-out test sets are not applicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table I provides per-subcategory breakdowns (fluency, flexibility, originality, elaboration) for each model."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses failure modes: CodeLlama producing direct copies for flexibility, VeriGen generating repeated similar implementations for fluency."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "GPT models performing worse on originality than smaller models is reported, and CodeLlama-7B scoring 0.0 on flexibility is noted."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims GPT-3.5 is the most creative model, which is supported by Table I showing GPT-3.5 with the highest creativity score (0.2201)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper suggests causal explanations (e.g., GPT models' lower originality 'could be due to its large size and training dataset') without adequate evidence. These are speculative."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims to evaluate 'creativity of LLM-based hardware code generation' broadly, but results are limited to 111 HDLBits prompts in Verilog with a specific GNN4IP-based metric. This narrow scope is not bounded in the claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations for the results are discussed. For example, the originality metric may reflect training data memorization rather than creativity, and the GNN4IP similarity threshold choice is not examined."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper says 'GPT-3.5' and 'GPT-4' without specifying API versions or snapshot dates. CodeLlama and VeriGen are specified by parameter count and HuggingFace links."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt examples are provided in Listings 1, 2, and 3 for fluency/originality, flexibility, and elaboration respectively."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section IV-A specifies temperature=0.3, max_tokens=1024, top_k=10, top_p=0.95 for all models."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The framework is direct prompt-response evaluation."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV-A describes that prompts are sourced from HDLBits via AutoChip, responses are trimmed to the first 'endmodule', and functionality is checked via Icarus Verilog testbenches."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do not show. The conclusion only mentions future work evaluating more LLMs and larger prompt sets."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (individual GNN4IP similarity scores, per-prompt results) is provided in the paper. Only aggregated metrics are reported in Table I."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section IV-A describes the prompt dataset (111 HDLBits prompts via AutoChip, 9 multi-module prompts), the LLM inference process, and the GNN4IP comparison procedure."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data source is a standard benchmark (HDLBits)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from prompt generation to LLM response to functionality check to GNN4IP similarity scoring to metric aggregation is documented in Sections III and IV."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgment section lists Purdue CSME and NSF grants (CNS-1822848, DGE-2039610)."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Authors are from Texas A&M University. They evaluate third-party models (GPT, CodeLlama, VeriGen), so there is no direct product conflict."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "NSF and Purdue CSME are independent of the LLM vendors evaluated. No funder has a stake in the results."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training cutoff dates are stated for any of the models evaluated. HDLBits prompts may be in the training data of GPT models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether HDLBits prompts or solutions appeared in the training data of the evaluated models."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HDLBits is publicly available and could be in the training data of GPT-3.5/4. This contamination risk is not addressed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, API spend, or wall-clock time is reported despite querying GPT-3.5/4 APIs and running local inference on A100 GPU."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total compute budget is stated. The hardware (A100 80GB) is mentioned but total GPU hours or API costs are not."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "GPT-3.5 is the most creative model for hardware code generation among those evaluated.",
    286       "evidence": "Table I shows GPT-3.5 achieves the highest overall creativity score of 0.2201, driven primarily by the highest flexibility score (0.1600).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "GPT-4 has the highest fluency in generating unique Verilog solutions.",
    291       "evidence": "Table I shows GPT-4 fluency of 0.1644, the highest among all models.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Creativity slightly drops for larger model sizes of GPT and VeriGen.",
    296       "evidence": "Table I: GPT-3.5 (0.2201) > GPT-4 (0.2107); VeriGen-6B (0.2026) > VeriGen-16B (0.1962). But differences are small and no statistical tests are performed.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "CodeLlama models produce direct copies when asked for alternative implementations, indicating flexibility depends on natural language understanding.",
    301       "evidence": "Section IV-B states CodeLlama 'produced results that were often direct copies of the provided module' with CodeLlama-7B scoring 0.0 on flexibility.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "CreativEval proposes a framework measuring four creativity dimensions (fluency, flexibility, originality, elaboration) for LLM-based Verilog code generation using GNN4IP circuit similarity. Across six models, GPT-3.5 scored highest overall (0.2201) primarily due to superior flexibility, while GPT-4 led in fluency. Larger model variants showed slightly lower creativity scores, though differences are small and not statistically tested. All creativity scores are low in absolute terms (max 0.22 out of 1.0).",
    307   "red_flags": [
    308     {
    309       "flag": "No statistical rigor",
    310       "detail": "All comparative claims between models are based on single point estimates with no confidence intervals, significance tests, or variance reporting across the t=10 responses."
    311     },
    312     {
    313       "flag": "Benchmark contamination risk",
    314       "detail": "HDLBits prompts are publicly available and likely in GPT-3.5/4 training data. This could confound creativity metrics, especially originality, which measures deviation from 'golden' solutions the model may have memorized."
    315     },
    316     {
    317       "flag": "No human validation of creativity metric",
    318       "detail": "The paper proposes to measure 'creativity' using an automated graph similarity metric (GNN4IP) but never validates whether this metric correlates with human judgments of creativity in hardware design."
    319     },
    320     {
    321       "flag": "Very small elaboration dataset",
    322       "detail": "The elaboration metric is measured on only 9 multi-module prompts, producing nearly identical scores for 5 of 6 models (0.3333), making the metric uninformative."
    323     },
    324     {
    325       "flag": "No limitations section",
    326       "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries."
    327     },
    328     {
    329       "flag": "Equal weighting assumption unjustified",
    330       "detail": "The four creativity sub-components are weighted equally (0.25 each) in the composite score with no justification or sensitivity analysis for this choice."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "Evaluating large language models trained on code",
    336       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    337       "year": 2021,
    338       "relevance": "Foundational HumanEval benchmark paper for evaluating LLM code generation capabilities."
    339     },
    340     {
    341       "title": "VeriGen: A large language model for verilog code generation",
    342       "authors": ["S. Thakur", "B. Ahmad", "H. Pearce"],
    343       "year": 2023,
    344       "relevance": "Demonstrates fine-tuning smaller models for hardware RTL code generation, directly evaluated in this paper."
    345     },
    346     {
    347       "title": "GPT-4 technical report",
    348       "authors": ["OpenAI"],
    349       "year": 2024,
    350       "relevance": "Technical report for GPT-4, one of the primary models evaluated for creativity."
    351     },
    352     {
    353       "title": "ChipNeMo: Domain-adapted LLMs for chip design",
    354       "authors": ["M. Liu", "T.-D. Ene", "R. Kirby"],
    355       "year": 2024,
    356       "relevance": "Domain adaptation of LLMs for hardware design tasks via fine-tuning."
    357     },
    358     {
    359       "title": "RTLCoder: Outperforming GPT-3.5 in design RTL generation with our open-source dataset and lightweight solution",
    360       "authors": ["S. Liu", "W. Fang", "Y. Lu"],
    361       "year": 2024,
    362       "relevance": "Open-source RTL generation model that outperforms GPT-3.5, relevant to LLM code generation benchmarking."
    363     },
    364     {
    365       "title": "AutoChip: Automating HDL generation using LLM feedback",
    366       "authors": ["S. Thakur", "J. Blocklove", "H. Pearce"],
    367       "year": 2023,
    368       "relevance": "LLM-based automated hardware design with feedback loops, source of the prompt dataset used in this paper."
    369     },
    370     {
    371       "title": "Assessing and understanding creativity in large language models",
    372       "authors": ["Y. Zhao", "R. Zhang", "W. Li"],
    373       "year": 2024,
    374       "relevance": "Evaluates LLM creativity using the same four cognitive sub-categories but for natural language tasks."
    375     },
    376     {
    377       "title": "VerilogEval: Evaluating large language models for Verilog code generation",
    378       "authors": ["M. Liu", "N. Pinckney", "B. Khailany"],
    379       "year": 2023,
    380       "relevance": "Benchmark for evaluating LLM functionality in Verilog generation."
    381     }
    382   ]
    383 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs