scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24309B)
      1 {
      2   "paper": {
      3     "title": "LAAG-RV: LLM Assisted Assertion Generation for RTL Design Verification",
      4     "authors": ["Karthik Maddala", "Bhabesh Mali", "Chandan Karfa"],
      5     "year": 2024,
      6     "venue": "2024 IEEE 8th International Test Conference India (ITC India)",
      7     "arxiv_id": "2409.15281",
      8     "doi": "10.1109/ITCIndia62949.2024.10651860"
      9   },
     10   "scan_version": 3,
     11   "active_modules": [],
     12   "methodology_tags": ["case-study"],
     13   "key_findings": "LAAG-RV uses GPT-4 with a one-time Verilog loop for signal synchronization to generate SystemVerilog Assertions from natural language specifications. Evaluated on 6 OpenTitan designs, the framework generates 6-14 assertions per design, with initial assertions often containing syntax or timing errors that can be corrected through iterative prompting with simulation error logs. Compared with ChIRAAG, LAAG-RV requires fewer prompts for most designs due to the signal synchronization step, and most generated assertions are functionally common between the two frameworks.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code archive, or link to the LAAG-RV framework is provided anywhere in the paper. The custom GPT-4 environment and test cases are not released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The evaluation uses designs from the publicly available OpenTitan Repository (referenced as [27], https://opentitan.org/). However, the custom test cases developed for validation are not released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions 'Synopsys VCS 2021.09' and 'OpenAI GPT-4' with '1280000 tokens of the context window', but provides no requirements.txt, Dockerfile, or detailed environment setup. Not enough to recreate the environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The framework flow (Fig. 1) gives a high-level overview but no specific commands, scripts, or procedures a researcher could follow."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Table I reports raw counts (number of assertions, simulation time) with no confidence intervals or error bars. No uncertainty quantification is provided."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims LAAG-RV takes 'relatively less number of iterations as compared to ChIRAAG' but provides no statistical significance tests. Comparisons are based solely on raw numbers from 6 designs."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Only raw counts are reported (e.g., number of assertions, number of prompts). No effect sizes, relative improvements with baseline context, or magnitude measures are provided."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Six OpenTitan designs are used with no justification for why these 6 were selected or whether this sample is sufficient for the claims being made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "LLMs can produce different outputs for the same input (acknowledged in Section III.B), but no variance across runs is reported. All results appear to be from single experimental runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares LAAG-RV against ChIRAAG [26] across all 6 designs (Table I, Section IV.B), comparing number of assertions generated and prompts required."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "ChIRAAG [26] (Mali et al., 2024) is a contemporary baseline from the same year, addressing the same problem of LLM-based SVA generation."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The system has multiple components (custom GPT-4 with domain knowledge, one-time Verilog loop, error-specific prompting), but no ablation study isolates their individual contributions. The one-time Verilog loop is claimed to reduce iterations but is not tested in isolation."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Table I reports number of assertions generated (LAAG-RV vs OT vs ChIRAAG), number of common assertions, and VCS simulation time. Fig. 3 compares number of prompts required."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Assertions are verified through automated simulation (Synopsys VCS), not systematic human evaluation. While manual inspection is mentioned for design issues, there is no structured human evaluation of assertion quality, completeness, or correctness."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No separation between development/tuning and testing. The same 6 designs appear to be used for both developing the prompting approach and reporting results."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table I provides per-design results for all 6 modules (RV Timer, PattGen, GPIO, ROM_Ctrl, sram_ctrl, adc_ctrl), showing assertion counts and simulation times individually."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section III.E discusses specific failures in detail: Assertion 1 had a timing issue, Assertion 2 required three iterations to fix, and the paper notes that 'wrong output may be encountered from the test cases' requiring manual inspection."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that initial assertions contain errors, some require multiple prompts to fix (Assertion 2 needed 3 iterations), and acknowledges 'it is not guaranteed that the assertions generated are enough to cover all the design aspects.' It also notes manual inspection may be needed."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims LLMs 'significantly simplify the process of generating assertions, making it efficient and less error-prone.' No quantitative comparison with manual assertion writing is provided. The evidence shows assertions require iterative manual prompting and simulation-based debugging, undermining the 'less error-prone' claim."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims the one-time Verilog loop is 'the main reason for rectifying SVA errors in fewer iterations' (Section V). This causal claim is not supported by a controlled comparison — no experiment removes the Verilog loop to measure its isolated effect."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'RTL Design Verification' broadly, but evaluation covers only 6 specific OpenTitan designs. Section IV.A claims 'LLMs are showcasing their promise to revolutionize verification' based on this limited sample without bounding generalization to the tested designs."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations are discussed. For example, GPT-4 may have seen OpenTitan designs during training, which could explain its ability to generate correct assertions. The role of human expertise in crafting prompts is not analyzed as a confound."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures number of assertions generated and prompts required, then frames this as the process being 'efficient and less error-prone' without discussing whether assertion count is a valid proxy for verification quality or whether prompt count captures actual human effort."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper states 'OpenAI's GPT-4' (Section IV) without specifying a version or snapshot date (e.g., gpt-4-0613). The context window is mentioned as '1280000 tokens' but this does not constitute a version specification."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper describes what information was fed to the LLM (design descriptions, block diagrams, error logs) but does not provide the actual prompt text used. Section III describes the prompting approach in natural language without reproducing exact prompts."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for the GPT-4 API calls. Only the context window capacity is mentioned."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "The framework is a manual human-in-the-loop prompting workflow, not agentic scaffolding. Each step (providing specifications, feeding error logs, crafting prompts) involves manual human intervention."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Section III.A mentions they 'focused on the basic understandable details of the designs, excluding information about registers, Verilog implementation' but does not specify the exact filtering criteria or what was included/excluded in a reproducible way."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section exists. The conclusion (Section V) mentions future work on consistency and completeness but does not substantively discuss limitations of the current approach."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. Issues like the small evaluation set, potential training data contamination, reliance on manual prompting expertise, and non-determinism of LLM outputs are not addressed."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries are stated. The paper does not clarify what types of designs, assertion complexity levels, or verification scenarios are out of scope for the approach."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data (generated assertions, prompt transcripts, simulation logs, test cases) is made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
     191         "justification": "Section III.A explains that design descriptions were collected from the OpenTitan Repository [27] and that each design has multiple modules with block diagrams, descriptions, and Verilog code. Section III.B describes the custom GPT-4 setup with domain-specific knowledge from references [2, 23-25, 28]."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are OpenTitan designs (publicly available repository)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The high-level flow is shown in Fig. 1, but the specific pipeline from input to final assertion suite is not documented with filtering counts, rejection rates, or intermediate steps. For example, how many raw assertions were generated before iterative refinement is not stated per design."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: all three authors are from Indian Institute of Technology Guwahati, India. No commercial product affiliation."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. The paper uses OpenAI's GPT-4 but authors are not affiliated with OpenAI."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper uses GPT-4 to generate assertions for OpenTitan designs but does not state GPT-4's training data cutoff date. OpenTitan is a public repository that could be in GPT-4's training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "OpenTitan is a well-known public repository. GPT-4 could have seen both the designs and existing assertions during training. This potential overlap is never discussed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The OpenTitan repository has been publicly available since before GPT-4's training. The paper does not address whether GPT-4's ability to generate correct assertions stems from having seen these specific designs and their documentation during training."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. The framework involves human-in-the-loop prompting but does not study human subjects."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
     284         "justification": "VCS simulation times are reported (80-460 ns, Table I), but LLM API costs, token usage, and inference latency for the GPT-4 calls are not reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget (API spend, number of tokens consumed, GPU hours) is stated."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "LAAG-RV can generate SVA from natural language specifications of hardware designs.",
    296       "evidence": "Demonstrated on 6 OpenTitan designs (Table I), generating 6-14 assertions per design that pass simulation after iterative refinement.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "The one-time Verilog loop for signal synchronization reduces the number of iterations needed to produce correct SVA.",
    301       "evidence": "Fig. 3 shows LAAG-RV requires fewer prompts than ChIRAAG for most designs (except ROM_Ctrl and sram_ctrl). The paper attributes this to signal name synchronization (Section IV.B).",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "LLMs significantly simplify the process of generating assertions, making it efficient and less error-prone.",
    306       "evidence": "No quantitative comparison with manual assertion writing effort is provided. The evidence shows initial assertions contain errors requiring multiple iterations of manual prompting and simulation-based debugging.",
    307       "supported": "unsupported"
    308     },
    309     {
    310       "claim": "Most assertions generated by LAAG-RV and ChIRAAG are functionally common, showing consistency of LLMs.",
    311       "evidence": "Table I shows 4-9 common assertions between the two frameworks across 6 designs. Section IV.B provides example comparisons (Assertions 11 and 12).",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "The LLM can identify and correct most assertion errors within a single prompt when given simulation error logs.",
    316       "evidence": "Section III.E shows Assertion 1 was corrected in one prompt, but Assertion 2 required three iterations. No aggregate statistics on single-prompt correction rates are provided.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "Very small evaluation set",
    323       "detail": "Only 6 OpenTitan designs are evaluated. This is too small for the broad claims about LLMs simplifying RTL verification. No justification is given for why these 6 were selected."
    324     },
    325     {
    326       "flag": "Potential training data contamination",
    327       "detail": "OpenTitan is a well-known public repository. GPT-4 could have seen the designs, their specifications, and existing assertions during training. The paper never addresses this, making it impossible to determine whether the LLM is generating assertions from specification understanding or pattern-matching from training data."
    328     },
    329     {
    330       "flag": "Overclaiming relative to evidence",
    331       "detail": "The abstract claims LLMs 'significantly simplify' and make the process 'less error-prone.' Section IV states LLMs 'are showcasing their promise to revolutionize verification.' These claims are unsupported — no comparison with manual effort is provided, and the results show assertions require multiple rounds of manual prompting and debugging."
    332     },
    333     {
    334       "flag": "Non-reproducible methodology",
    335       "detail": "The custom GPT-4 environment, exact prompts, test cases, and domain knowledge fed to the LLM are not released. Combined with acknowledged LLM non-determinism ('LLMs can generate different outputs for the same input'), results cannot be reproduced."
    336     },
    337     {
    338       "flag": "No statistical analysis",
    339       "detail": "All comparisons are based on raw counts from a single run on 6 designs, with no error bars, significance tests, or variance analysis despite acknowledged non-determinism of LLM outputs."
    340     },
    341     {
    342       "flag": "Completeness not assessed",
    343       "detail": "The paper acknowledges 'it is not guaranteed that the assertions generated are enough to cover all the design aspects' but provides no coverage metrics. The claim of usefulness rests on generating some correct assertions, not on achieving adequate verification coverage."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "Chip-chat: Challenges and opportunities in conversational hardware design",
    349       "authors": ["J. Blocklove"],
    350       "year": 2023,
    351       "relevance": "Early work on using LLMs conversationally for hardware design, directly relevant to LLM-assisted EDA."
    352     },
    353     {
    354       "title": "Evaluating large language models trained on code",
    355       "authors": ["M. Chen"],
    356       "year": 2021,
    357       "arxiv_id": "2107.03374",
    358       "relevance": "Foundational benchmark paper on LLM code generation capability (Codex/HumanEval)."
    359     },
    360     {
    361       "title": "Large language models are edge-case fuzzers: Testing deep learning libraries via FuzzGPT",
    362       "authors": ["Y. Deng"],
    363       "year": 2023,
    364       "arxiv_id": "2304.02014",
    365       "relevance": "Uses LLMs for automated software testing via fuzzing."
    366     },
    367     {
    368       "title": "Universal fuzzing via large language models",
    369       "authors": ["C. S. Xia"],
    370       "year": 2023,
    371       "arxiv_id": "2308.04748",
    372       "relevance": "Applies LLMs to universal fuzzing for software testing."
    373     },
    374     {
    375       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    376       "authors": ["C. S. Xia"],
    377       "year": 2022,
    378       "relevance": "Zero-shot LLM-based automated program repair, relevant to LLM coding capabilities."
    379     },
    380     {
    381       "title": "Impact of code language models on automated program repair",
    382       "authors": ["N. Jiang"],
    383       "year": 2023,
    384       "arxiv_id": "2302.05020",
    385       "relevance": "Evaluates the impact of code LLMs on automated program repair."
    386     },
    387     {
    388       "title": "Using LLMs to facilitate formal verification of RTL",
    389       "authors": ["M. Orenes-Vera"],
    390       "year": 2023,
    391       "relevance": "Directly related work on using LLMs for RTL formal verification and assertion generation."
    392     },
    393     {
    394       "title": "LLM-assisted generation of hardware assertions",
    395       "authors": ["R. Kande"],
    396       "year": 2023,
     397       "relevance": "Demonstrated that LLMs can generate hardware assertions from RTL, finding a 4.53% correctness rate."
    398     },
    399     {
    400       "title": "Automated generation of security assertions for RTL models",
    401       "authors": ["H. Witharana"],
    402       "year": 2023,
    403       "relevance": "Automated security assertion generation for hardware verification using LLMs."
    404     },
    405     {
    406       "title": "ChIRAAG: ChatGPT informed rapid and automated assertion generation",
    407       "authors": ["B. Mali"],
    408       "year": 2024,
    409       "relevance": "Direct baseline comparison — uses JSON-formatted specifications and ChatGPT for SVA generation."
    410     },
    411     {
    412       "title": "Code Llama: Open foundation models for code",
    413       "authors": ["B. Roziere"],
    414       "year": 2023,
    415       "arxiv_id": "2308.12950",
    416       "relevance": "Open-source code LLM relevant to understanding code generation capabilities."
    417     }
    418   ],
    419   "engagement_factors": {
    420     "practical_relevance": {
    421       "score": 1,
    422       "justification": "Addresses a real need in hardware verification but the framework is not released and requires proprietary tools (Synopsys VCS, GPT-4 API)."
    423     },
    424     "surprise_contrarian": {
    425       "score": 0,
    426       "justification": "Confirms the expected finding that LLMs can assist with code/assertion generation, with no surprising or contrarian results."
    427     },
    428     "fear_safety": {
    429       "score": 0,
    430       "justification": "No safety or AI risk concerns raised; the paper is about assisting verification engineers."
    431     },
    432     "drama_conflict": {
    433       "score": 0,
    434       "justification": "No controversy or conflict angle."
    435     },
    436     "demo_ability": {
    437       "score": 0,
    438       "justification": "No code, demo, or tool released. Framework cannot be tried."
    439     },
    440     "brand_recognition": {
    441       "score": 1,
     442       "justification": "Uses GPT-4, which is well known, but the paper's authors are from IIT Guwahati, with no major brand association."
    443     }
    444   }
    445 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs