scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26396B)
      1 {
      2   "paper": {
      3     "title": "AutoVCoder: A Systematic Framework for Automated Verilog Code Generation using LLMs",
      4     "authors": [
      5       "Mingzhe Gao",
      6       "Jieru Zhao",
      7       "Zhe Lin",
      8       "Wenchao Ding",
      9       "Xiaofeng Hou",
     10       "Yu Feng",
     11       "Chao Li",
     12       "Minyi Guo"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2407.18333"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper describes AutoVCoder as 'open-source' in the abstract and introduction, but no GitHub link, Zenodo archive, or any repository URL is provided anywhere in the paper. A claim of being open-source without a working URL does not satisfy this criterion."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper describes constructing a high-quality hardware dataset from GitHub (~1M modules, ~50k after filtering) and a synthetic dataset, but no download links or data releases are provided. The benchmarks used (VerilogEval, RTLLM) are publicly available, but the paper's own generated datasets are not released."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using 'three Nvidia A100 GPUs' (Section IV-A) and names base models (Codellama-7B, DeepSeek-Coder-6.7B, CodeQwen1.5-7B) but provides no requirements.txt, Dockerfile, or detailed environment setup listing library versions. This is insufficient to recreate the environment."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README with commands, or scripts are provided. The methodology section describes the approach conceptually but does not include specific commands or procedures for replication."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables I, II, and III report only point estimates (e.g., '69.0%', '79.3%') with no confidence intervals, error bars, or uncertainty quantification."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes multiple comparative claims (e.g., 'AutoVCoder outperforms both industrial and academic LLMs') based solely on comparing raw percentages in Tables I-III without any statistical significance tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports improvements with baseline context: '0.5% and 2.2% improvement in functional correctness on the EvalMachine and EvalHuman benchmarks compared with BetterV' and '3.4% increase in syntax correctness and a 3.4% increase in functional correctness on the RTLLM benchmark compared with RTLCoder.' Tables provide both baseline and system scores allowing readers to compute absolute differences."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper uses n=10 for pass@k evaluation without justifying why 10 samples are sufficient. It also provides no power analysis or discussion of whether the benchmarks (100+ tasks for VerilogEval, 29 tasks for RTLLM) provide adequate statistical power for the claims made."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No standard deviation, variance, or spread measures are reported across experimental runs. The pass@k metric is computed from n=10 generations but no variance across seeds or repeated experiments is reported."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table I includes extensive baselines: GPT-3.5, GPT-4, ChipNeMo, VerilogEval, Codegen2, Starcoder, Thakur et al., RTLCoder (Mistral and DeepSeek variants), and BetterV (Codellama, DeepSeek, CodeQwen variants)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baselines include contemporary work: BetterV (ICML 2024), RTLCoder (2023), and GPT-4. These represent the state-of-the-art for Verilog code generation at the time of submission."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table II provides ablation of the two-round fine-tuning (no fine-tuning, round 1 only, round 2 only, both rounds). Table III provides ablation of the RAG components (no RAG, example retriever only, knowledge retriever only, both retrievers). These cover the fine-tuning and RAG components of the framework."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses multiple metrics: pass@1 and pass@5 for VerilogEval, and syntactic correctness and functional correctness for RTLLM. This provides evaluation from multiple angles."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of the generated Verilog code quality is included. All evaluation is automated via Icarus Verilog syntax checking and testbench execution. Human evaluation of code quality (readability, design patterns, synthesizability) would be relevant given claims about 'high-quality' code generation."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper uses established external benchmarks (VerilogEval and RTLLM V1.1) as test sets, which are separate from the training data (GitHub open-source database and ChatGPT-generated synthetic dataset). This provides a clear separation between training and evaluation data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by benchmark (VerilogEval EvalMachine, VerilogEval EvalHuman, RTLLM), by base model (Codellama, DeepSeek, CodeQwen), and by metric (syntax correctness, functional correctness, pass@1, pass@5) across all tables."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper discusses failure patterns: LLMs tend to 'use loops like those in Python and C++' leading to 'excessive usage of for loops when generating RTL code' (Section III-C). Section IV-D notes 'the example database is not well-matched to some of the difficult problems in RTLLM, due to the scarcity of large-scale Verilog design instances.'"
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table III shows that the knowledge retriever alone sometimes does not improve or even slightly reduces performance (e.g., AutoVCoder-DeepSeek-KR has lower EvalMachine pass@5 than AutoVCoder-DeepSeek-1&2: 77.5% vs 77.8%). The paper also notes that 'improvements on RTLLM question set are less notable' (Section IV-D)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
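    119         "justification": "The abstract claims '0.5% and 2.2% improvement in functional correctness on the EvalMachine and EvalHuman benchmarks compared with BetterV, and also achieves a 3.4% increase in syntax correctness and a 3.4% increase in functional correctness on the RTLLM benchmark compared with RTLCoder.' These specific numbers can be traced to Table I: the VerilogEval gains compare AutoVCoder-CodeQwen with BetterV-CodeQwen, and the 3.4% RTLLM functional-correctness gain compares AutoVCoder-DeepSeek with RTLCoder-DeepSeek. The 3.4% syntax-correctness gain, however, matches RTLCoder-Mistral (96.6%) rather than RTLCoder-DeepSeek (93.1%), so the two RTLLM figures appear to use different RTLCoder variants as the reference."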
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims about the contribution of each component (fine-tuning rounds, RAG retrievers). Tables II and III provide controlled ablation studies where single components are added or removed while keeping other variables constant, which is adequate for causal inference about component contributions."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper claims to be 'a systematic framework for Automated Verilog Code Generation' broadly, but results are limited to two benchmarks (VerilogEval with ~100 tasks, RTLLM with 29 tasks) and three base models (all 7B-class). The title and abstract do not bound claims to these specific settings. No discussion of whether results would hold for larger models, different HDL languages, or industrial-scale designs."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for the observed improvements. For instance, it does not consider whether improvements could be due to data leakage between the GitHub-sourced training data and the benchmarks, or whether the gains are simply from having more training data rather than the specific methodology."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper specifies base model names with sizes: 'Codellama-7B [25], DeepSeek-Coder-6.7B [26], and CodeQwen1.5-7B [27]' (Section IV-A). These are specific model identifiers that can be unambiguously located. ChatGPT-3.5 is used for data generation without a specific version, but it is not the system being evaluated."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper provides actual prompt text used for data generation: Fig. 3 shows the code scoring prompt, and Fig. 5 shows the problem-code pair generation prompt template with specific variable definitions ($level, $circuit_type, $problem_type). These are the actual prompts used in the pipeline."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section IV-A reports: learning rate γ = 2e-4 for LLM fine-tuning, learning rate 1e-5 for retriever training, 1 epoch for first-round fine-tuning, 3 epochs for second-round fine-tuning, top_p = 0.95, temperature = 0.8, n = 10 for pass@k, retriever chunk counts (2 for example retriever, 3 for knowledge retriever)."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The RAG scaffolding is described in detail in Section III-C: the example retriever and knowledge retriever are explained, including how they fetch information, how retrieved chunks are combined with user prompts, the contrastive learning training process (Fig. 7), and the inference pipeline (Fig. 2)."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section III-A describes the data pipeline: collection from ~20,000 GitHub repositories yielding ~1M raw modules, scoring via a code scorer trained on ChatGPT-3.5 scores, filtering threshold of 6.5 (21.7% retained), and synthetic dataset generation with code filter verification (syntax check via Icarus Verilog, functional correctness check via Python equivalents for combinational circuits). Fig. 6 illustrates the flow."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion (Section V) does not discuss limitations either."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity are discussed. The paper briefly notes in Section IV-D that 'the example database is not well-matched to some of the difficult problems in RTLLM,' but this is a result observation, not a systematic discussion of validity threats."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show — for example, no mention that results are limited to small benchmarks, 7B-class models only, or that the approach has not been validated on industrial-scale designs."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data is available for independent verification. The generated datasets (open-source database, synthetic dataset), model outputs, and per-task benchmark results are not released."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section III-A describes data collection in detail: GitHub search for .v files across ~20,000 repositories yielding ~1M modules, ChatGPT-3.5 scoring of 15,000 modules for code scorer training, code scorer applied to remaining modules with 6.5 threshold (21.7% retained). Synthetic dataset generated via ChatGPT-3.5 with verification pipeline (Fig. 6)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants involved. The paper uses automated benchmarks and LLM-generated data."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The data pipeline is documented across Section III-A and Figs. 4 and 6: GitHub scraping → module segmentation → code scoring (ChatGPT-3.5 → code scorer model) → threshold filtering → first-round fine-tuning data. Separately: ChatGPT-3.5 generation → syntax verification (Icarus Verilog) → functional verification (Python equivalence or testbench) → second-round fine-tuning data. However, exact counts of the final synthetic dataset size are not provided."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source is mentioned anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Shanghai Jiao Tong University, Sun Yat-sen University, and Fudan University. These are academic institutions, not companies whose products are being evaluated."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The absence of any funding disclosure is itself a concern."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper fine-tunes models (Codellama-7B, DeepSeek-Coder-6.7B, CodeQwen1.5-7B) and evaluates them on VerilogEval and RTLLM benchmarks, but does not state the training data cutoff dates for these base models. It is possible that the base models' pre-training data included VerilogEval benchmark problems."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the base models' pre-training data or the GitHub-sourced fine-tuning data overlaps with the VerilogEval or RTLLM benchmark problems. The open-source database was scraped from GitHub, and VerilogEval benchmarks were also derived from public sources, creating contamination risk."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "VerilogEval was published in 2023, and the base models were trained on data that may include these benchmarks. The paper does not address this contamination risk at all. Additionally, the GitHub-scraped training data could overlap with benchmark problems."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study. It is a benchmark evaluation of LLM-based Verilog code generation."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost, latency, or token consumption is reported. The RAG pipeline involves retriever queries plus LLM inference, and the cost implications of this are not discussed."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "The paper mentions using 'three Nvidia A100 GPUs' (Section IV-A) but does not report total GPU hours, training time, API costs for ChatGPT-3.5 data generation (which involved scoring 15,000 modules to train the code scorer and generating the synthetic dataset), or total computational budget."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "AutoVCoder outperforms both industrial and academic LLMs in Verilog code generation, showing 0.5% and 2.2% improvement in functional correctness on EvalMachine and EvalHuman compared with BetterV.",
    295       "evidence": "Table I shows AutoVCoder-CodeQwen achieves 79.9% and 55.9% pass@5 on EvalMachine and EvalHuman respectively, vs BetterV-CodeQwen at 79.4% and 53.7%. The differences are 0.5% and 2.2%.",
    296       "supported": "weak"
    297     },
    298     {
    299       "claim": "AutoVCoder achieves a 3.4% increase in syntax correctness and 3.4% increase in functional correctness on RTLLM compared with RTLCoder.",
    300       "evidence": "Table I shows AutoVCoder-DeepSeek achieves 100% syntax and 51.7% functional correctness on RTLLM, vs RTLCoder-DeepSeek at 93.1% syntax and 48.3% functional. The differences are 6.9% and 3.4%. The 3.4% syntax claim appears to compare against RTLCoder-Mistral (96.6%).",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Both rounds of fine-tuning are crucial, with the second round having more significant impact on correctness improvement.",
    305       "evidence": "Table II shows consistent improvements across all three base models. For CodeQwen: base 52.8% EvalMachine pass@5, round-1 only 65.4%, round-2 only 75.1%, both rounds 78.3%. The second round alone provides larger gains than the first round alone.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "The domain-specific RAG technique, combining example and knowledge retrievers, achieves the best performance in most cases.",
    310       "evidence": "Table III shows that ER&KR combined generally matches or exceeds individual retriever configurations. For AutoVCoder-CodeQwen: 1&2 baseline 78.3%, ER 79.1%, KR 79.3%, ER&KR 79.9% on EvalMachine pass@5.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "AutoVCoder outperforms GPT-4 on the EvalMachine question set.",
    315       "evidence": "Table I shows AutoVCoder-CodeQwen achieves 79.9% pass@5 on EvalMachine vs GPT-4 at 70.6%. However, GPT-4 has no fine-tuning on Verilog-specific data, making this an unfair comparison of a specialized fine-tuned 7B model against a general-purpose model.",
    316       "supported": "weak"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "AutoVCoder proposes a three-component framework for Verilog code generation: high-quality dataset generation from GitHub with automated scoring, two-round LoRA fine-tuning (general Verilog syntax then task-specific problem-code pairs), and domain-specific RAG with example and knowledge retrievers. On VerilogEval and RTLLM benchmarks, AutoVCoder achieves marginal improvements over prior SOTA (0.5-3.4% gains), with ablation studies confirming each component contributes. The improvements are small and reported without statistical significance testing or variance analysis, making it difficult to assess whether the gains are robust.",
    323   "red_flags": [
    324     {
    325       "flag": "No statistical significance testing",
    326       "detail": "All comparative claims are based on raw percentage differences (0.5%-3.4%) without any significance tests. Given that pass@k is estimated from only n=10 generations and RTLLM has only 29 tasks, the reported differences could easily be within noise."
    327     },
    328     {
    329       "flag": "No variance or error bars reported",
    330       "detail": "Results are reported as single numbers with no standard deviation, confidence intervals, or repeated runs. For stochastic LLM outputs (temperature=0.8, top_p=0.95), variance could be substantial."
    331     },
    332     {
    333       "flag": "Potential training-test contamination",
    334       "detail": "The open-source database is scraped from ~20,000 GitHub repositories. VerilogEval benchmarks are also derived from public Verilog sources. No analysis of whether training data overlaps with benchmark problems. Additionally, base model pre-training data may include the benchmarks."
    335     },
    336     {
    337       "flag": "No limitations section",
    338       "detail": "The paper has no dedicated limitations or threats-to-validity section, which is a significant omission for a paper making comparative performance claims."
    339     },
    340     {
    341       "flag": "Claimed 'open-source' but no code released",
    342       "detail": "The abstract and introduction describe AutoVCoder as 'open-source' but no repository URL, download link, or code archive is provided anywhere in the paper."
    343     },
    344     {
    345       "flag": "Very small improvement margins",
    346       "detail": "The headline improvements (0.5% on EvalMachine, 2.2% on EvalHuman vs BetterV) are extremely small and could be within measurement noise, especially with n=10 samples and no significance testing."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "Evaluating large language models trained on code",
    352       "authors": ["M. Chen"],
    353       "year": 2021,
    354       "arxiv_id": "2107.03374",
    355       "relevance": "Foundational work introducing the pass@k metric and HumanEval benchmark for code generation evaluation."
    356     },
    357     {
    358       "title": "Codegen: An open large language model for code with multi-turn program synthesis",
    359       "authors": ["E. Nijkamp"],
    360       "year": 2023,
    361       "relevance": "Open-source LLM for code generation, used as a baseline for comparison."
    362     },
    363     {
    364       "title": "GPT-4 technical report",
    365       "authors": ["OpenAI"],
    366       "year": 2024,
    367       "relevance": "Major commercial LLM used as an industrial baseline for Verilog code generation comparison."
    368     },
    369     {
    370       "title": "RTLLM: An open-source benchmark for design RTL generation with large language model",
    371       "authors": ["Y. Lu"],
    372       "year": 2024,
    373       "relevance": "One of two primary evaluation benchmarks used in this paper for RTL code generation."
    374     },
    375     {
    376       "title": "RTLCoder: Outperforming GPT-3.5 in design RTL generation with our open-source dataset and lightweight solution",
    377       "authors": ["S. Liu"],
    378       "year": 2023,
    379       "relevance": "Prior SOTA for Verilog code generation using synthetic data and fine-tuning, a key baseline."
    380     },
    381     {
    382       "title": "BetterV: Controlled Verilog generation with discriminative guidance",
    383       "authors": ["Z. Pei"],
    384       "year": 2024,
    385       "relevance": "Most recent SOTA baseline for Verilog code generation, published at ICML 2024."
    386     },
    387     {
    388       "title": "VerilogEval: Evaluating large language models for Verilog code generation",
    389       "authors": ["M. Liu", "N. Pinckney"],
    390       "year": 2023,
    391       "relevance": "Primary evaluation benchmark used in this paper, provides EvalMachine and EvalHuman task sets."
    392     },
    393     {
    394       "title": "ChipNeMo: Domain-adapted LLMs for chip design",
    395       "authors": ["M. Liu"],
    396       "year": 2023,
    397       "arxiv_id": "2311.00176",
    398       "relevance": "Nvidia's domain-adapted LLM for chip design using two-round fine-tuning, a key related work."
    399     },
    400     {
    401       "title": "Benchmarking large language models for automated Verilog RTL code generation",
    402       "authors": ["S. Thakur"],
    403       "year": 2023,
    404       "relevance": "Early benchmark evaluation of LLMs for Verilog generation, used as baseline."
    405     },
    406     {
    407       "title": "Code Llama: Open foundation models for code",
    408       "authors": ["B. Roziere"],
    409       "year": 2023,
    410       "relevance": "One of three base models used in AutoVCoder experiments (Codellama-7B)."
    411     },
    412     {
    413       "title": "DeepSeek-Coder: When the large language model meets programming - the rise of code intelligence",
    414       "authors": ["D. Guo"],
    415       "year": 2024,
    416       "arxiv_id": "2401.14196",
    417       "relevance": "One of three base models used in AutoVCoder experiments (DeepSeek-Coder-6.7B)."
    418     },
    419     {
    420       "title": "LoRA: Low-rank adaptation of large language models",
    421       "authors": ["E. J. Hu"],
    422       "year": 2022,
    423       "relevance": "Fine-tuning method used in AutoVCoder's two-round fine-tuning approach."
    424     }
    425   ]
    426 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs