scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23146B)
      1 {
      2   "paper": {
      3     "title": "Automated C/C++ Program Repair for High-Level Synthesis via Large Language Models",
      4     "authors": [
      5       "Kangwei Xu",
      6       "Grace Li Zhang",
      7       "Xunzhao Yin",
      8       "Cheng Zhuo",
      9       "Ulf Schlichtmann",
     10       "Bing Li"
     11     ],
     12     "year": 2024,
     13     "arxiv_id": "2407.03889",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "No repository URL, code archive, or link to released code is provided anywhere in the paper. The framework and repair library are described but not made available."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The 24 benchmarks are described as coming from related work [6], [7], [9], [10] and LeetCode, but no dataset download link or archive of the specific benchmark suite is provided."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper mentions 'Intel(R) Xeon(R) Silver 4314 2.40 GHz CPU', Catapult HLS Tool, Synopsys Design Compiler with Nangate 45nm library, and GPT-4 Turbo via OpenAI API, but no software versions, Python environment, or dependency specifications are provided."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No step-by-step reproduction instructions, README, or scripts for replicating the experiments are provided. The methodology is described at a conceptual level but not in sufficient detail to reproduce without significant reverse engineering."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Results are reported as pass rates (percentages) without confidence intervals or error bars. Table II and Figures 6-8 show point estimates only."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper claims the proposed framework 'outperforms the baseline' and achieves 'much higher repair pass rates' but no statistical significance tests are performed to validate these comparative claims."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper reports effect sizes with baseline context: 'an average 23.33% and 13.89% increase in repair pass rate' (Section IV), and specific reductions of '36.57%, 33.03%, and 29.08% reduction in area, power, and minimum clock period' (Section III-C). Per-benchmark pass rates from baseline to proposed are shown in Table II."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Each experiment is repeated over n=15 instances, but no justification is given for why 15 was chosen. There is no power analysis or discussion of whether 15 repetitions are sufficient for reliable pass rate estimation."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Only pass rates (m/n with n=15) are reported without standard deviation, confidence intervals, or any spread measure across the 15 runs per benchmark."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Table II compares the proposed framework against two baselines: traditional scripts [6][7] and direct application of GPT-4 Turbo for repair."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "GPT-4 Turbo (2024) is used as the LLM baseline, which was state-of-the-art at submission time. The traditional scripts baseline [6][7] represents the prior non-LLM approach."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Figure 6 includes an ablation comparing 'GPT-4 Turbo with Scripts' (without RAG) against the full proposed framework (with RAG), demonstrating the RAG contribution. The joint LLM-script mechanism is also evaluated separately in Fig. 7(b)."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper uses compilation pass rate, simulation pass rate, area, power, minimum clock period, and LLM cost as evaluation metrics."
     86       },
     87       "human_evaluation": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "The system produces compilable and simulatable hardware code verified by HLS tools; human evaluation of code quality is not clearly relevant since correctness is verified by C-RTL co-simulation."
     91       },
     92       "held_out_test_set": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "This is not a machine learning training/testing setup. The benchmarks are real-world C/C++ programs that need repair; there is no train/test split involved."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Table II provides per-benchmark results for all 24 applications, organized by 8 error types (T1-T8). Figures 6-8 also show per-benchmark breakdowns."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "The paper shows that many benchmarks have less than 100% pass rate (e.g., AES at 60%, Edge Detection at 60%) but does not discuss why specific repairs fail, what error patterns are harder, or provide qualitative analysis of failure cases."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "All experiments show the proposed method improving over baselines. No negative results, failed approaches, or configurations that did not work are reported."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims 'much higher repair pass rates in 24 real-world applications compared with traditional scripts and direct application of LLMs' and a '23.33% increase in repair pass rate,' both supported by Table II and Section IV results."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
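    122         "justification": "The paper makes causal claims about RAG improving pass rate (13.89% increase over GPT-4 Turbo with scripts; 23.33% for the full framework over direct GPT-4 Turbo) and the joint LLM-script mechanism reducing cost (21.56%). The ablation study in Fig. 6 compares the GPT-4 Turbo baseline vs. GPT-4 Turbo with scripts vs. the proposed framework with RAG, and Fig. 7(b) compares LLM-only vs. joint LLM-script repair cost, providing controlled single-variable comparisons for these components."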
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The title claims 'Automated C/C++ Program Repair for High-Level Synthesis' broadly, but experiments use only one LLM (GPT-4 Turbo), one HLS tool (Catapult), and 24 benchmarks. No discussion of whether results generalize to other LLMs, HLS tools (Vitis HLS, Intel HLS), or larger programs."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "No alternative explanations for the results are discussed. For example, the improvement could partly be due to the specific correction templates chosen, or the particular benchmarks selected. No threats-to-validity or robustness checks are present."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper states 'GPT-4 Turbo Model was used as the LLM via OpenAI APIs' (Section IV) but does not specify a snapshot date or API version (e.g., 'gpt-4-turbo-2024-04-09'). 'GPT-4 Turbo' is a marketing name without a specific version."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper provides actual prompt text in Figures 10-13 (Appendix II), including system prompts ('You are an expert in HLS repair...'), problem description prompts, and correction template examples. While some content is 'omitted for brevity due to space limitations,' the core prompt structure and actual text are shown."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported. The paper gives '5 times to repair' as the iteration limit but reports no API settings."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The agentic scaffolding is described in detail: Section III covers the 5-stage pipeline (preprocessing, RAG-based repair, bit width optimization, equivalence verification, PPA optimization), including the iterative repair loop with compiler feedback, the RAG retrieval mechanism using sentence transformers, and the joint LLM-script interaction."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section IV states benchmarks come from related work [6][7][9][10], each repeated over n=15 instances with 5 LLM queries per instance. The preprocessing stage (Section III-A) describes how programs are first compiled to detect actual errors, then fed to the LLM for potential error detection."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "There is no limitations section, threats-to-validity section, or any substantive discussion of limitations in the paper. The conclusion (Section V) contains no caveats."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "No threats to validity are discussed anywhere in the paper."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No scope boundaries are stated. The paper does not discuss what settings the results do not apply to, such as other HLS tools, other LLMs, larger or more complex programs, or different error types."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No raw data (individual repair attempts, LLM outputs, specific error logs) is available. Only aggregate pass rates are reported."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section IV describes the experimental setup: 24 benchmarks from related work, GPT-4 Turbo via OpenAI API, Catapult HLS Tool, 15 repetitions per benchmark, 5 LLM queries per repair attempt. Benchmarks are organized by 8 error types."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants involved. Benchmarks are from prior work and LeetCode, which is a standard data source not involving recruitment."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The pipeline from input C/C++ program through preprocessing, RAG-based repair, bit width optimization, equivalence verification, and PPA optimization is documented in Section III with workflow diagrams (Fig. 2). The evaluation pipeline (compile, simulate, count pass/fail) is described in Section IV."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding information or acknowledgments section is present in the paper."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly listed: TUM, TU Darmstadt, Zhejiang University, and University of Siegen. These are academic institutions, not vendors of the evaluated tools."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding source is disclosed, so independence cannot be assessed. The paper uses OpenAI's GPT-4 Turbo and Siemens' Catapult HLS Tool but does not disclose whether any funding relationship exists with these companies."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is present in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "GPT-4 Turbo's training data cutoff is not stated. The benchmarks include LeetCode problems which are publicly available and likely in GPT-4's training data."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No discussion of whether GPT-4 Turbo may have seen the benchmark programs or similar HLS repair patterns during training. LeetCode problems ([10]) are widely available online."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The benchmarks include LeetCode problems and examples from prior work published before GPT-4's training cutoff. No contamination analysis is performed despite this clear risk."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants in this study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants in this study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Section III-D discusses cost explicitly: GPT-4 Turbo charges '$0.01 per 1K input tokens and $0.03 per 1K output tokens.' Fig. 7(b) shows normalized cost ratios for each error type, and the paper reports a 21.56% average cost reduction with the joint LLM-script approach."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No total computational budget is stated. The paper does not report total API spend, total tokens consumed, or total wall-clock time for the experiments."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "The proposed LLM-driven framework achieves an average 23.33% increase in repair pass rate compared to the GPT-4 Turbo baseline.",
    293       "evidence": "Section IV and Fig. 6 compare simulation pass rates across 24 benchmarks. The proposed method with RAG outperforms the direct GPT-4 Turbo baseline by 23.33% on average.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "RAG contributes a 13.89% improvement over GPT-4 Turbo with scripts (without RAG).",
    298       "evidence": "Fig. 6 shows the ablation comparing GPT-4 Turbo with scripts (no RAG) vs. the full proposed framework. The average improvement is 13.89%.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "Bit width optimization achieves average reductions of 36.57% in area, 33.03% in power, and 29.08% in minimum clock period.",
    303       "evidence": "Fig. 7(a) shows normalized performance ratios for 5 benchmarks (ET, GA, BS, CC, AES) after bit width optimization compared to default 32-bit int.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "The joint LLM-script repair mechanism reduces LLM usage cost by 21.56% on average.",
    308       "evidence": "Fig. 7(b) shows normalized cost ratios across 8 error types (T1-T8), comparing LLM-only vs. joint LLM-script repair.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "LLM-driven PPA optimization achieves average reductions of 14.02% in area, 11.50% in power, and 17.94% in minimum clock period.",
    313       "evidence": "Fig. 8 shows normalized performance for 6 benchmarks with performance bottlenecks before and after optimization.",
    314       "supported": "moderate"
    315     }
    316   ],
    317   "methodology_tags": [
    318     "benchmark-eval"
    319   ],
    320   "key_findings": "The paper proposes an LLM-driven framework for automatically repairing regular C/C++ programs to make them compatible with High-Level Synthesis (HLS) tools. Using RAG with a manually-built correction template library, the framework achieves an average 23.33% improvement in repair pass rate over direct GPT-4 Turbo application across 24 real-world benchmarks. An LLM-generated bit width optimization scheme reduces area, power, and clock period by 29-37%, while a joint LLM-script mechanism cuts LLM costs by 21.56%.",
    321   "red_flags": [
    322     {
    323       "flag": "No variance or uncertainty quantification",
    324       "detail": "Pass rates are computed from only 15 repetitions per benchmark without confidence intervals, error bars, or standard deviations. With n=15, a single additional success/failure changes the pass rate by 6.67 percentage points, making small differences unreliable."
    325     },
    326     {
    327       "flag": "Potential benchmark contamination",
    328       "detail": "Several benchmarks come from LeetCode, which is extensively represented in GPT-4's training data. The LLM may have memorized solutions or similar repair patterns, inflating pass rates for both baseline and proposed method. No contamination analysis is performed."
    329     },
    330     {
    331       "flag": "No limitations or threats to validity discussed",
    332       "detail": "The paper entirely omits any discussion of limitations, threats to validity, or scope boundaries. All results are presented positively with no negative results or failure analysis."
    333     },
    334     {
    335       "flag": "No significance testing for comparative claims",
    336       "detail": "Claims of superiority ('much higher repair pass rates') are based on comparing raw percentages across 15 trials without any statistical tests. Some reported differences are small enough to be within sampling noise."
    337     },
    338     {
    339       "flag": "Single LLM and single HLS tool",
    340       "detail": "All experiments use only GPT-4 Turbo and only Catapult HLS, but claims are framed generally ('LLM-driven program repair for HLS'). Generalizability to other LLMs or HLS tools (Vitis HLS, Intel HLS) is unknown."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "Examining Zero-Shot Vulnerability Repair with Large Language Models",
    346       "authors": ["Hammond Pearce", "Benjamin Tan", "Baleegh Ahmad", "Ramesh Karri", "Brendan Dolan-Gavitt"],
    347       "year": 2023,
    348       "relevance": "Evaluates LLMs' ability to automatically repair code vulnerabilities, directly relevant to LLM-based program repair methodology."
    349     },
    350     {
    351       "title": "Repair is nearly generation: multilingual program repair with LLMs",
    352       "authors": ["Harshit Joshi", "Jose Cambronero", "Sumit Gulwani", "Vu Le", "Ivan Radicek", "Gust Verbruggen"],
    353       "year": 2023,
    354       "relevance": "Studies LLM-based multilingual program repair, a core topic in automated program repair with LLMs."
    355     },
    356     {
    357       "title": "ITER: Iterative Neural Repair for Multi-Location Patches",
    358       "authors": ["He Ye", "Martin Monperrus"],
    359       "year": 2024,
    360       "relevance": "Proposes iterative LLM-based repair approach, directly relevant to iterative program repair methodology."
    361     },
    362     {
    363       "title": "Automated program repair in the era of large pre-trained language models",
    364       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    365       "year": 2023,
    366       "relevance": "Comprehensive study of LLM-based automated program repair, core reference for the field."
    367     },
    368     {
    369       "title": "InferFix: End-to-End Program Repair with LLMs",
    370       "authors": ["Matthew Jin", "Syed Shahriar", "Michele Tufano", "Xin Shi", "Shuai Lu", "Neel Sundaresan", "Alexey Svyatkovskiy"],
    371       "year": 2023,
    372       "relevance": "End-to-end LLM-based program repair system, relevant to automated code repair evaluation."
    373     },
    374     {
    375       "title": "AutoChip: Automating HDL Generation Using LLM Feedback",
    376       "authors": ["Shailja Thakur", "Jason Blocklove", "Hammond Pearce", "Benjamin Tan", "Siddharth Garg", "Ramesh Karri"],
    377       "year": 2023,
    378       "arxiv_id": "2311.04887",
    379       "relevance": "Uses LLM feedback loops for hardware design language generation, relevant to LLM-driven hardware design automation."
    380     },
    381     {
    382       "title": "Chip-Chat: Challenges and Opportunities in Conversational Hardware Design",
    383       "authors": ["Jason Blocklove", "Siddharth Garg", "Ramesh Karri", "Hammond Pearce"],
    384       "year": 2023,
    385       "relevance": "Explores conversational LLM-based hardware design, relevant to LLM applications in hardware/software co-design."
    386     },
    387     {
    388       "title": "GPT4AIGChip: Towards Next-Generation AI Accelerator Design Automation via Large Language Models",
    389       "authors": ["Yonggan Fu", "Yongan Zhang", "Zhongzhi Yu"],
    390       "year": 2023,
    391       "relevance": "Uses GPT-4 for AI chip design automation, relevant to LLM-driven hardware design methodology."
    392     },
    393     {
    394       "title": "Automatically Fixing RTL Syntax Errors with Large Language Model",
    395       "authors": ["Yun-Da Tsai", "Mingjie Liu", "Haoxing Ren"],
    396       "year": 2024,
    397       "relevance": "LLM-based automatic RTL syntax error repair, directly relevant to LLM-driven code repair in hardware design."
    398     },
    399     {
    400       "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model",
    401       "authors": ["Yao Lu", "Shang Liu", "Qijun Zhang", "Zhiyao Xie"],
    402       "year": 2024,
    403       "relevance": "Benchmark for LLM-based RTL generation, relevant to evaluating LLM capabilities in hardware design."
    404     },
    405     {
    406       "title": "Training language models to follow instructions with human feedback",
    407       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang", "Ryan Lowe"],
    408       "year": 2022,
    409       "relevance": "Foundational RLHF paper for instruction-following LLMs, relevant to understanding the LLM capabilities leveraged in this work."
    410     }
    411   ]
    412 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs