scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24083B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Make Every Move Count: LLM-based High-Quality RTL Code Generation Using MCTS",
      6     "authors": [
      7       "Matthew DeLorenzo",
      8       "A. B. Chowdhury",
      9       "Vasudev Gohil",
     10       "Shailja Thakur",
     11       "Ramesh Karri",
     12       "Siddharth Garg",
     13       "Jeyavijayan Rajendran"
     14     ],
     15     "year": 2024,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2402.03289",
     18     "doi": "10.48550/arXiv.2402.03289"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Core claims (MCTS produces functionally correct code, 31.8% ADP improvement) directly supported by Table 2 (15/15 success vs 1/15 greedy, 4/15 beam search) and Table 3 (documented ADP improvements).",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Paper compares MCTS vs greedy vs beam search on identical LLM baseline (VeriGen-2B), and includes ablations (modularity in Table 1, reward parameter in Figure 4). Study design supports causal inference.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Claims bounded to 'adders, multipliers, and multiply-accumulate units' of specified bit widths (4-64). Paper acknowledges dataset of 15 problems and does not generalize beyond arithmetic circuits.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Paper shows MCTS outperforms baselines but does not discuss alternative explanations (e.g., implementation quality differences, variance across runs). No confound analysis.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Claims use direct metrics: functional correctness (compilation+correct output), area, delay, and ADP. These are not proxies—they are the actual quantities being optimized for hardware design.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No dedicated limitations or threats-to-validity section. Section 5 is 'Discussion and Future Work', which mentions MCTS is 'time-intensive' but lacks systematic scope analysis.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Discussion mentions time cost and small dataset casually, but does not systematically address threats like generalization to other RTL types, training-test overlap, or robustness across random seeds.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Paper implicitly bounds scope to arithmetic circuits and VeriGen-2B but does not explicitly state what results do NOT show (e.g., 'does not generalize to sequential circuits').",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Funding sources explicitly listed: Purdue Center for Secure Microelectronics Ecosystem, NSF CNS–1822848, NSF DGE–2039610, and Synopsys gift.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations clearly stated: Texas A&M and NYU. VeriGen baseline from prior work [19] openly cited. No undisclosed affiliations with evaluated product.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "NSF and Purdue are independent. Synopsys gift is not direct evaluation funding. Reasonable independence maintained.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "Acknowledgment section contains no competing interests statement. No patents, equity, or consulting relationships disclosed.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms (RTL, Verilog, PPA, MCTS, functional correctness) defined or explained contextually. Adequate for hardware engineering audience.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Contributions explicitly stated: (1) first MCTS technique for Verilog generation, (2) solves search/scalability challenges, (3) enables functional correctness on diverse circuits, (4) first PPA-optimized decoding.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 systematically reviews LLM code generation (2.1) and Verilog-specific work (2.2), citing 10+ papers. Positions work clearly against prior art.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "Paper states MCTS implemented in Python 3.8 but provides no GitHub repository or code availability statement.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "The 15 test Verilog problems are created by authors but not released or linked.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Specifies Python 3.8, RTX A5000 GPU, Icarus Verilog 10.3, Yosys, but no requirements.txt or dependency file. Custom Yosys scripts not provided.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction guide provided. Methodology described but not as executable instructions.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Tables 2–3 show single point estimates. No error bars, confidence intervals, or uncertainty quantification across runs.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No formal statistical significance tests. Functional correctness gap (15/15 vs 1/15) is stark but untested; ADP improvements lack p-values.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Functional correctness reported as success rates (15/15, 1/15, 4/15). ADP improvements quantified as percentages (5.69%, 14.27%, 31.8%).",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "15 test modules (5 bit-widths × 3 types), but no power analysis or justification for sample size adequacy.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Figures 4–5 show single curves, not distributions. No error bars or std dev across runs.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Two baselines: VeriGen with greedy search and beam search, directly compared in Tables 2–3.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "VeriGen from 2023 [19], paper from 2024. Beam search is standard. Appropriate baselines for RTL domain.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Multiple ablations: comment filtering (Section 3.3), modularity (Table 1), baseline reward (Figure 4), MCTS iterations (Figure 5).",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Evaluation uses: functional correctness (binary), area (µm²), delay (ps), and ADP. Multiple perspectives on code quality.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "RTL evaluation is fully automated (compilation, simulation, synthesis). Human evaluation not applicable.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "The 15 author-created test modules form a dedicated evaluation set; the paper does not verify that they are absent from VeriGen's training data.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results broken down by design type (adders, multipliers, MACs) and bit width (4–64) in Tables 2–3.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "Baseline failures shown (14/15 for greedy search) but failure modes not analyzed. Why does greedy search fail on 8-bit adder?",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "All results are positive. MCTS succeeds on all tasks; no failures or null results reported.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Model explicitly identified as 'VeriGen-2B LLM [19]'. Sufficient specificity.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "Example prompt shown in Figure 1, but full prompt templates not provided. Paper mentions 'hand-designed prompts' (Section 3.5) without sharing them.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Reward parameters given (α_NC=−1, α_NF=−0.1, α_B=0.5). Exploration constant c_PUCT mentioned in Eq. 3 but value not specified. LLM sampling hyperparameters (temperature, top-p) not stated.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "MCTS algorithm detailed (Section 3.2), modularity strategy explained (3.4), comment filtering described (3.3). Scaffolding is transparent.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": false,
    264           "justification": "No preprocessing pipeline documented. Problem specifications and expected outputs not detailed.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "15 test problems and synthesis results not released. No access to raw data.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": false,
    278           "justification": "Minimal: 'We created a dataset of 15 Verilog problems... with bit widths in {4, 8, 16, 32, 64}.' No detail on selection criteria or problem specification.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": false,
    290           "justification": "Pipeline described narratively and visually (Figure 2) but not formally documented with reproducible lineage.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "VeriGen-2B training cutoff not stated. VeriGen [19] is from 2023, likely trained in 2022–2023, but exact date unknown.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "Potential overlap not discussed. VeriGen trained on GitHub RTL; standard circuits (adders, multipliers) likely present in training set. Risk not addressed.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "Custom benchmark avoids standard contamination risk, but overlap with natural training distribution (similar circuits) not discussed.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Table 1 reports MCTS iteration rates (0.08–0.24 iterations/min), but total wall-clock time and compute cost are not reported. At 200 iterations for MAC units, this implies roughly 14 hours at the fastest reported rate, a figure the paper itself never states.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Hardware specified (RTX A5000) but total computational budget (GPU-hours, cost) not stated.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "MCTS-guided decoding produces functionally correct Verilog code for all 15 test modules (adders, multipliers, MACs)",
    377       "evidence": "Table 2: VeriGen+MCTS achieves 15/15 success vs VeriGen 1/15 and Beam Search 4/15",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "MCTS achieves 31.8% area-delay product improvement over beam search for 16-bit adder",
    382       "evidence": "Table 3: 16-bit adder ADP 94.39 (MCTS) vs 138.47 (Beam Search), (138.47−94.39)/138.47 = 31.8%",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Modularity improves MCTS iteration rate by 3× for 64-bit adders",
    387       "evidence": "Table 1: iteration rate 0.08 (without modularity) → 0.24 (with modularity) = 3× improvement",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Comment token filtering reduces search space complexity",
    392       "evidence": "Section 3.3 describes mechanism (filtering comment tokens) but effect not quantitatively measured in results",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "MCTS converges quickly for simple circuits (4–8 bit) but requires many iterations for complex ones (32–64 bit)",
    397       "evidence": "Figure 5: 16-bit designs converge within 50 iterations; MAC unit requires ~200 iterations",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Baseline reward parameter α_B balances exploration vs exploitation and affects functional correctness",
    402       "evidence": "Figure 4: functional correctness increases from 0% (α_B=0.1) to 100% (α_B=1.0) on 8-bit designs",
    403       "supported": "strong"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "benchmark-eval",
    408     "ablation-study",
    409     "case-study"
    410   ],
    411   "key_findings": "MCTS-guided token selection achieves 100% functional correctness on 15 Verilog generation tasks (adders, multipliers, MACs at 4–64 bit widths) versus <30% for greedy and beam search baselines. Two key algorithmic optimizations—filtering non-functional comment tokens and reusing optimized sub-modules—reduce search complexity and enable scalability to larger circuits. Achieves up to 31.8% area-delay product improvement over beam search on 16-bit adders. However, computational cost is substantial (0.08–0.24 MCTS iterations per minute), requiring ~14 hours for complex designs. Evaluation is limited to simple arithmetic circuits; generalization to sequential circuits or other RTL patterns unknown.",
    412   "red_flags": [
    413     {
    414       "flag": "Tiny evaluation set",
    415       "detail": "Only 15 test modules (5 bit-widths × 3 circuit types). No power analysis or sample size justification. Generalization to broader RTL design space unknown."
    416     },
    417     {
    418       "flag": "Weak baseline LLM",
    419       "detail": "VeriGen-2B used instead of VeriGen-16B to 'demonstrate potential of MCTS', but does not address whether better base models reduce dependence on expensive search."
    420     },
    421     {
    422       "flag": "Code and data not released",
    423       "detail": "No repository, artifact link, or dataset release. Prompts described as 'hand-designed' but not provided. Reproduction impossible."
    424     },
    425     {
    426       "flag": "Computational cost not quantified",
    427       "detail": "Table 1 shows very slow iteration rates (0.08–0.24 iterations/min for 64-bit adders). At 200 iterations for complex modules, that implies roughly 14 hours at 0.24 iterations/min (about 42 hours at 0.08), but total inference time is never stated."
    428     },
    429     {
    430       "flag": "Training-test contamination not addressed",
    431       "detail": "VeriGen likely trained on standard arithmetic circuits from GitHub. Overlap between training and test set (adders, multipliers) possible but not discussed."
    432     },
    433     {
    434       "flag": "No robustness analysis",
    435       "detail": "All results are point estimates from single runs. No error bars, confidence intervals, or sensitivity analysis across random seeds or problem instances."
    436     },
    437     {
    438       "flag": "Narrow domain generalization",
    439       "detail": "Evaluated only on arithmetic circuits (adders, multipliers, MACs). Generalization to sequential circuits, state machines, or complex RTL patterns untested."
    440     },
    441     {
    442       "flag": "Minimal optimization gains for simple circuits",
    443       "detail": "Figure 3's example optimization (8-bit adder) yields visually similar code with only a marginal PPA improvement (25462.87 → 25158.27 ADP, ~1.2%)."
    444     },
    445     {
    446       "flag": "Failure modes not analyzed",
    447       "detail": "Baseline methods fail frequently (14/15 for greedy, 11/15 for beam search) but failure modes not examined. Why does greedy fail on 8-bit but succeed on 4-bit?"
    448     },
    449     {
    450       "flag": "No human expert validation",
    451       "detail": "Code quality validated only by automated compilation and simulation, not by hardware engineers verifying design efficiency or correctness."
    452     }
    453   ],
    454   "cited_papers": [
    455     {
    456       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    457       "relevance": "Establishes RTL evaluation benchmark (156 tasks); baseline for LLM Verilog quality measurement"
    458     },
    459     {
    460       "title": "Benchmarking Large Language Models for Automated Verilog RTL Code Generation (VeriGen)",
    461       "relevance": "Direct baseline model (VeriGen-2B); shows fine-tuned LLM outperforms GPT-4 on RTL tasks"
    462     },
    463     {
    464       "title": "Competition-level code generation with AlphaCode",
    465       "relevance": "Large-scale sampling and beam search for code generation; demonstrates diversity improves LLM output quality"
    466     },
    467     {
    468       "title": "ChipNeMo: Domain-Adapted LLMs for Chip Design",
    469       "relevance": "Domain-specific fine-tuning for hardware (RTL, EDA scripts); shows value of domain adaptation"
    470     },
    471     {
    472       "title": "AutoChip: Automating HDL Generation Using LLM Feedback",
    473       "relevance": "Uses compilation errors as feedback loop to improve code generation; complements MCTS search strategy"
    474     },
    475     {
    476       "title": "Chip-Chat: Challenges and Opportunities in Conversational Hardware Design",
    477       "relevance": "Conversational interface for chip design; identifies LLM brittleness on error detection in generated RTL"
    478     }
    479   ],
    480   "engagement_factors": {
    481     "practical_relevance": {
    482       "score": 1,
    483       "justification": "Computational overhead (14+ hours per complex design) and lack of code release limit practical deployment for practitioners."
    484     },
    485     "surprise_contrarian": {
    486       "score": 1,
    487       "justification": "Somewhat surprising that standard baselines fail on most tasks (only 1/15 and 4/15 succeed), but finding aligns with known LLM brittleness; not deeply contrarian."
    488     },
    489     "fear_safety": {
    490       "score": 0,
    491       "justification": "No safety or AI risk implications discussed. RTL generation is technical capability improvement with no obvious safety angle."
    492     },
    493     "drama_conflict": {
    494       "score": 0,
    495       "justification": "Straightforward technical contribution with no controversy, debate, or conflicting interpretations."
    496     },
    497     "demo_ability": {
    498       "score": 0,
    499       "justification": "Code and prompts not released. No interactive demo or accessible reproduction path for readers to experiment."
    500     },
    501     "brand_recognition": {
    502       "score": 1,
    503       "justification": "Authors from respectable institutions (Texas A&M, NYU) with NSF/Synopsys support, but not top-tier AI labs. Moderate prestige."
    504     }
    505   },
    506   "hn_data": {
    507     "threads": [
    508       {
    509         "hn_id": "39275203",
    510         "title": "Bluesky and the AT Protocol: Usable decentralized social media",
    511         "points": 245,
    512         "comments": 276,
    513         "url": "https://news.ycombinator.com/item?id=39275203",
    514         "created_at": "2024-02-06T15:25:33Z"
    515       },
    516       {
    517         "hn_id": "39292705",
    518         "title": "Training-Free Consistent Text-to-Image Generation",
    519         "points": 2,
    520         "comments": 0,
    521         "url": "https://news.ycombinator.com/item?id=39292705",
    522         "created_at": "2024-02-07T18:59:52Z"
    523       },
    524       {
    525         "hn_id": "39526135",
    526         "title": "College Basketball: An In-Depth Study of the \"Foul Up 3\" Dilemma [pdf]",
    527         "points": 1,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=39526135",
    530         "created_at": "2024-02-27T16:38:27Z"
    531       }
    532     ],
    533     "top_points": 245,
    534     "total_points": 248,
    535     "total_comments": 276
    536   }
    537 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs