scan.json (21333B)
1 { 2 "paper": { 3 "title": "Ask-EDA: A Design Assistant Empowered by LLM, Hybrid RAG and Abbreviation De-hallucination", 4 "authors": ["Luyao Shi", "Michael Kazda", "Bradley Sears", "Nick Shropshire", "Ruchir Puri"], 5 "year": 2024, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2406.06575" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No source code repository URL is provided in the paper. The system references internal IBM tools and data, and no GitHub or Zenodo link is given." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The three evaluation datasets (q2a-100, cmds-100, abbr-100) are described as domain-specific to IBM chip design and are not publicly released." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions using LangChain, ChromaDB, and specific models but does not provide a requirements.txt, Dockerfile, or detailed library versions sufficient to recreate the environment." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a conceptual level but without runnable scripts or detailed setup guides." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results in Figures 3 and 4 are presented as bar charts with point estimates only. No confidence intervals, error bars, or uncertainty measures are reported." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims hybrid RAG yields 'significantly superior results' and ADH 'significantly boosts performance' but provides no statistical significance tests (no p-values, t-tests, or any formal test)." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The abstract reports percentage improvements with context: 'over a 40% improvement in Recall on the q2a-100 dataset and over a 60% improvement on the cmds-100 dataset compared to not using RAG' and 'over a 70% enhancement in Recall on the abbr-100 dataset.' Results are also shown in bar charts with baseline comparisons, providing enough context to gauge magnitude." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "Each evaluation dataset contains 100 examples. No justification is given for why 100 was chosen, and no power analysis is discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs with no indication of multiple experimental runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares hybrid RAG against no RAG, sparse-only RAG (BM25), and dense-only RAG (sentence transformer) as baselines (Section III.C, Figures 3 and 4)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "The baselines are component ablations (dense-only, sparse-only, no RAG) rather than comparisons against other contemporary RAG systems or design assistant tools. ChipNeMo is mentioned as prior art but not benchmarked against." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The evaluation structure functions as an ablation: comparing hybrid RAG vs. dense-only, sparse-only, and no RAG, plus testing with and without the ADH component (Figures 3 and 4)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Two metrics are used: ROUGE-Lsum F1 and ROUGE-Lsum Recall (Section III.C)." 
79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of Ask-EDA's outputs is reported. The paper mentions collecting feedback via Slack GUI but explicitly states 'the feedback data are not used in the evaluation study in this paper' (Section II.E). Given the system is a design assistant for engineers, human evaluation of response quality would be relevant." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "It is unclear whether the evaluation datasets were used for any tuning or development decisions. No explicit separation of dev and test splits is described." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down across three distinct datasets (q2a-100, cmds-100, abbr-100), each targeting a different task aspect, and across two models (Figures 3 and 4)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses failure cases: Llama2-13b-chat's lower F1 despite comparable Recall ('it is plausible that Llama2-13b-chat struggles to effectively extract the final answer'), and that neither model achieves 1.0 recall on abbr-100 despite abbreviation knowledge being in the prompt, attributing this to 'intrinsic limitations in LLMs' (Section III.C)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that Llama2-13b-chat does not show improvement from hybrid over sparse/dense in terms of F1 on q2a-100, and that neither model achieves perfect recall on abbreviations even with ADH (Section III.C)." 
104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims of 40%, 60%, and 70% improvements are directionally supported by the bar charts in Figures 3 and 4, though exact numbers are read from charts rather than stated in tables." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims that hybrid RAG and ADH improve performance. These are supported by controlled ablation experiments where components are added/removed while holding other factors constant (Figures 3 and 4). This single-variable manipulation constitutes adequate causal design for these claims." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The abstract and conclusion claim Ask-EDA 'can effectively respond to design-related inquiries' broadly, but evaluation is on three proprietary IBM chip design datasets with only two 13B-parameter models. No bounding to the specific design domain, document corpus, or model scale is provided." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations for the results are discussed. For example, the paper does not consider whether improvements could be due to the specific characteristics of the evaluation datasets rather than the methods, or whether ROUGE is an appropriate metric for measuring answer quality in this domain." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Specific model versions are provided: 'Granite-13b-chat-v2.1' and 'Llama2-13b-chat' (Section III.B). The sentence transformer is also specified: 'all-MiniLM-L6-v2'." 
133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "The system prompt is provided verbatim in Section III.B: 'You are a helpful AI language model. Your primary function is to assist users in answering questions...' The ADH prompt format is also specified in Section II.C." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Key hyperparameters are reported in Section III.B: context length 8192, max new tokens 4096, chunk size 2048, chunk overlap 256, n_dense = n_sparse = n_hybrid = 3, RRF constant k=60." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The RAG pipeline scaffolding is described in detail: document ingestion (Section II.B.1), retrieval with hybrid search and RRF (Section II.B.2), abbreviation matching and injection (Section II.C), LLM generation (Section II.D), and Slack interface (Section II.E). Diagrams are provided in Figures 1 and 2." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section II.A describes document sources (400MB, 10,200 command pages, 5,000 parameters, 30 Slack channels, 18,000 Q&A pairs). Section II.B.1 describes ingestion using LangChain document loaders with supported formats. Section III.A describes how each evaluation dataset was derived." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion (Section IV) briefly mentions future directions but does not discuss limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed. 
Issues such as the proprietary nature of datasets, small evaluation sizes, use of only ROUGE as a metric, and lack of human evaluation are not acknowledged." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit scope boundaries are stated. The paper does not discuss what its results do not show, or what settings/domains/models are excluded from its claims." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "The evaluation datasets and raw results are not made publicly available. The data is described as proprietary IBM chip design content." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section III.A describes how each dataset was collected: q2a-100 from a stack-overflow type system with expert-marked answers, cmds-100 from tool manual pages with command synopses turned into questions, abbr-100 randomly sampled from the 249-term abbreviation dictionary." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants are involved in the evaluation. The datasets are curated from existing internal documents and tools." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The data pipeline from document sources through ingestion (LangChain loaders, chunking, embedding, BM25 indexing) to retrieval and generation is documented in Sections II.A-II.D with accompanying diagrams (Figures 1 and 2)." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding source is disclosed. All authors are from IBM, and the Acknowledgment section thanks IBM management for 'support and guidance' but does not explicitly disclose funding." 
199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All author affiliations are clearly listed: IBM Research (Shi, Puri) and IBM Infrastructure (Kazda, Sears, Shropshire). The paper evaluates IBM's own Granite model alongside Llama2." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "IBM employees built and evaluated the system using IBM's own Granite model. IBM has a commercial interest in demonstrating the effectiveness of both the Ask-EDA system and the Granite model. The funder (IBM) is not independent of the outcome." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper. Given that the work evaluates IBM's own products, this omission is notable." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper states for Granite-13b-chat-v2.1 that 'There is no overlap between the LLM training data and our ingested data' (Section III.B), but does not state any training data cutoff date for either model." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "For Granite-13b-chat-v2.1, the paper explicitly states 'There is no overlap between the LLM training data and our ingested data' (Section III.B). No such statement is made for Llama2-13b-chat, but the data is proprietary IBM content unlikely to be in public training data." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "The evaluation datasets are curated from proprietary IBM internal systems (internal Q&A platform, internal tool manuals, internal abbreviation dictionary), making benchmark contamination unlikely. 
The paper notes 'There is no overlap between the LLM training data and our ingested data' for Granite." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants are involved in the evaluation study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants are involved in the evaluation study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in the evaluation study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in the evaluation study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in the evaluation study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in the evaluation study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in the evaluation study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, latency, or tokens consumed per query is reported, despite the system making multiple retrieval and LLM calls per query." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No information about hardware used, GPU hours, or computational resources is provided." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Hybrid RAG offers over a 40% improvement in Recall on q2a-100 compared to not using RAG", 286 "evidence": "Figure 3 shows bar chart comparisons of hybrid vs. 
no-RAG configurations on q2a-100 for both models (Section III.C).", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Hybrid RAG offers over a 60% improvement in Recall on cmds-100 compared to not using RAG", 291 "evidence": "Figure 3 shows that no-RAG achieves 0 Recall on cmds-100 while hybrid RAG achieves non-zero scores. Exact values are read from bar charts (Section III.C).", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "ADH yields over a 70% enhancement in Recall on abbr-100", 296 "evidence": "Figure 4 shows a comparison of hybrid RAG with and without ADH on abbr-100 (Section III.C). Results are shown only as bar charts without exact numbers or statistical tests.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Hybrid search RAG outperforms both sparse-only and dense-only RAG", 301 "evidence": "Figure 3 shows hybrid consistently achieving highest or comparable scores across datasets and models. For Granite on q2a-100, hybrid is best. On cmds-100, hybrid outperforms both sparse-only and dense-only RAG. For Llama2, the pattern holds for Recall but not F1 on q2a-100 (Section III.C).", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Adding ADH does not adversely affect performance on cmds-100 and q2a-100", 306 "evidence": "Figure 4 shows results for cmds-100 and q2a-100 with and without ADH, demonstrating comparable performance (Section III.C).", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval", "case-study"], 311 "key_findings": "Ask-EDA combines hybrid RAG (dense + sparse retrieval with reciprocal rank fusion) and an abbreviation de-hallucination component to assist EDA engineers. Hybrid RAG substantially improves ROUGE-Lsum scores over no RAG, sparse-only, and dense-only approaches on three proprietary IBM evaluation datasets. The ADH component substantially improves abbreviation resolution without degrading performance on other tasks. 
However, neither model achieves perfect abbreviation recall even when correct information is present in the prompt, suggesting intrinsic LLM limitations.", 312 "red_flags": [ 313 { 314 "flag": "Company evaluating own product", 315 "detail": "All authors are IBM employees evaluating IBM's Granite model alongside an open-source baseline. Granite is shown to outperform Llama2 on F1 across all conditions. No competing interests statement is provided." 316 }, 317 { 318 "flag": "No statistical tests despite 'significant' claims", 319 "detail": "The paper repeatedly uses the word 'significantly' (e.g., 'significantly superior results,' 'significantly boosts performance') without any statistical significance testing. All claims are based on visual comparison of bar charts." 320 }, 321 { 322 "flag": "No error bars or variance", 323 "detail": "All results appear to be from single runs with no error bars, confidence intervals, or multi-run variance reported." 324 }, 325 { 326 "flag": "Proprietary non-reproducible evaluation", 327 "detail": "All three evaluation datasets are IBM-internal and proprietary. No code is released. The results cannot be independently verified or reproduced." 328 }, 329 { 330 "flag": "No limitations section", 331 "detail": "The paper has no limitations or threats-to-validity section despite multiple methodological concerns (small datasets, single metric family, no human evaluation, proprietary data)." 332 }, 333 { 334 "flag": "ROUGE as sole metric family", 335 "detail": "Only ROUGE-Lsum variants (F1 and Recall) are used. ROUGE measures lexical overlap and may not capture semantic correctness, particularly problematic for design Q&A where paraphrased but correct answers would score low." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "GPT-4 Technical Report", 341 "authors": ["J. Achiam", "S. Adler", "S. 
Agarwal"], 342 "year": 2023, 343 "arxiv_id": "2303.08774", 344 "relevance": "Foundational LLM used in many agentic AI and code generation evaluations." 345 }, 346 { 347 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 348 "authors": ["P. Lewis", "E. Perez", "A. Piktus"], 349 "year": 2020, 350 "relevance": "Introduces RAG, a core technique for grounding LLM outputs used widely in agentic AI systems." 351 }, 352 { 353 "title": "ChipNeMo: Domain-Adapted LLMs for Chip Design", 354 "authors": ["M. Liu", "T.-D. Ene", "R. Kirby"], 355 "year": 2023, 356 "arxiv_id": "2311.00176", 357 "relevance": "Directly comparable domain-adapted LLM system for chip design assistance, evaluating LLM productivity tools for engineers." 358 }, 359 { 360 "title": "Granite Code Models: A Family of Open Foundation Models for Code Intelligence", 361 "authors": ["M. Mishra", "M. Stallone", "G. Zhang"], 362 "year": 2024, 363 "arxiv_id": "2405.04324", 364 "relevance": "Describes the Granite model family used in this evaluation, relevant to LLM code generation capabilities." 365 }, 366 { 367 "title": "Training Language Models to Follow Instructions with Human Feedback", 368 "authors": ["L. Ouyang", "J. Wu", "X. Jiang"], 369 "year": 2022, 370 "relevance": "Introduces RLHF for instruction-following, foundational technique for LLM alignment and safety." 371 } 372 ] 373 }