scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22970B)
      1 {
      2   "paper": {
      3     "title": "Fuzz4All: Universal Fuzzing with Large Language Models",
      4     "authors": ["Chunqiu Steven Xia", "Matteo Paltenghi", "Jia Le Tian", "Michael Pradel", "Lingming Zhang"],
      5     "year": 2024,
      6     "venue": "ICSE 2024",
      7     "arxiv_id": "2308.04748",
      8     "doi": "10.1145/3597503.3639121"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Fuzz4All uses LLMs (GPT-4 for autoprompting, StarCoder for generation) as a universal fuzzer across six languages and nine systems under test. It achieves 36.8% higher average coverage than state-of-the-art language-specific fuzzers across all targets, despite lower validity rates. The approach found 98 bugs in widely-used systems (GCC, Clang, Z3, CVC5, Go, javac, Qiskit), with 64 confirmed as previously unknown.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Code and data available at https://doi.org/10.5281/zenodo.10456883 and https://github.com/fuzz4all/fuzz4all, stated in the DATA AVAILABILITY section."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Data available at the Zenodo archive linked in the DATA AVAILABILITY section."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Section 4.3 specifies: 64-core workstation, 256 GB RAM, Ubuntu 20.04.5 LTS, 4 NVIDIA RTX A6000 GPUs. Table 1 lists specific SUT versions. However, no requirements.txt or Dockerfile is mentioned in the paper itself."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper does not include step-by-step reproduction instructions. While code is released, no README or reproduction guide is described in the paper."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Figure 4 shows coverage trends with shaded areas indicating min/max across five runs."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Mann-Whitney U-test used for statistical significance (p < 0.05), indicated with * in Tables 2 and 4. Explicitly described in Section 4.3."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Table 2 reports percentage improvements over baselines with baseline context (e.g., '+18.8%' for GCC coverage from 167,453 to 198,927). Average improvement of 36.8% is reported."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Five repetitions for 24-hour runs and four for ablation are stated but not justified. No power analysis or justification for why these numbers are sufficient."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Figure 4 shows min/max ranges across five runs. The paper explicitly states experiments are repeated five times to account for variance."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 1 lists baselines for each language: GrayC and Csmith (C), YARPGen (C++), TypeFuzz (SMT2), go-fuzz (Go), Hephaestus (Java), MorphQ (Qiskit)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include recent tools: GrayC (2023), TypeFuzz (2021), Hephaestus (2022), MorphQ (2023). Csmith (2011) is older, but it is a classic compiler fuzzing baseline and is evaluated alongside the contemporary GrayC."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 5.3 presents a detailed ablation study examining autoprompting variants (no input, raw prompt, autoprompt) and fuzzing loop variants (w/o example, w/ example, full Fuzz4All). Table 4 shows results."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics reported: line coverage, validity rate (% valid), number of programs generated, hit rate for targeted fuzzing, and bug count."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant to fuzzing effectiveness claims; the evaluation is about coverage and bug detection, which are objectively measurable."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is a fuzzing/testing paper, not a prediction task. There is no train/test split concept applicable here."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results broken down per language/SUT in Tables 2, 3, 4, and 5. Per-target coverage, validity, and bug counts are reported individually."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses lower validity rates compared to baselines (56% average reduction), and acknowledges that quantum computing has especially low validity due to limited training data. Section 5.1.2 discusses these limitations."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Lower validity rates are reported honestly. Targeted fuzzing achieves lower overall coverage than general fuzzing (Table 3). The ablation shows no-input and raw-prompt variants performing worse."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of highest coverage across all languages (supported by Table 2), 36.8% average improvement (supported by Table 2), and 98 bugs with 64 confirmed (supported by Table 5) are all backed by results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims about component contributions are justified via ablation study (Section 5.3/Table 4) with controlled single-variable manipulation. The ablation isolates autoprompting and fuzzing loop components."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The title claims 'universal fuzzing' which could be overbroad, but the paper tests across six languages and nine SUTs, providing substantial breadth. The threats section (Section 6) acknowledges external validity concerns about evaluation targets."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 6 discusses threats: implementation correctness, evaluation targets, potential data shift over time, and LLM hallucination in autoprompting. The paper acknowledges that the StarCoder checkpoint may become less effective over time as its training data grows outdated."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper uses coverage and bug count as metrics and claims coverage and bug-finding effectiveness — no proxy gap. It also explicitly notes coverage is a widely adopted measure (Section 4.3) and discusses the relationship between valid/invalid inputs and bug finding."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.1 specifies 'gpt-4-0613 checkpoint' for the distillation LLM and 'Hugging Face implementation of the StarCoder model' for generation. Specific SUT versions listed in Table 1."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The autoprompting instruction is quoted: 'Please summarize the above information in a concise manner to describe the usage and functionality of the target'. Generation strategy instructions are provided in Figure 3. An example distilled prompt is shown in Figure 2."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1: temperature=1, batch size=30, max output length=1024, nucleus sampling with top-p=1, max_token=500 for GPT-4. Autoprompting uses 4 candidate prompts with 30 fuzzing inputs each. Greedy sampling at temperature=0 for first prompt."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The two-stage pipeline (autoprompting + fuzzing loop) is described in detail with algorithms (Algorithms 1 and 2), figures (Figures 1 and 3), and examples. The fuzzing loop with three generation strategies is clearly documented."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "User inputs (documentation, specifications, example code) for each SUT are described in Sections 4.2.1-4.2.5. The autoprompting distillation process is detailed. Validity checking and oracle definitions are specified per SUT."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 'Threats to Validity' discusses internal and external threats with substantive detail."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6 discusses specific threats: implementation bugs mitigated by code reviews, specific evaluation targets chosen, data shift risk with the StarCoder checkpoint over time, and LLM hallucination in autoprompting."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound which types of SUTs Fuzz4All may not work for, or discuss limitations around languages not in the LLM's training data beyond a brief mention of quantum computing."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Data available via Zenodo archive (https://doi.org/10.5281/zenodo.10456883). Bug reports are linked in the artifact."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4 details how fuzzing inputs are generated (LLM-based), how coverage is measured (line coverage via standard tools), how bugs are detected (oracles described per SUT), and the 24-hour campaign protocol."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from automated fuzzing of software systems using standard benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The full pipeline is documented: user input → autoprompting → generation LLM → SUT execution → oracle checking → bug reporting. Coverage measurement methodology follows prior work conventions."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgment section lists: NSF grants CCF-2131943 and CCF-2141474, Kwai Inc., European Research Council grant 851895, and German Research Foundation ConcSys and DeMoCo projects."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly stated: University of Illinois Urbana-Champaign and University of Stuttgart."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "NSF, ERC, and DFG are independent research funders. Kwai Inc. is a tech company but the paper does not evaluate any Kwai product, so no direct conflict."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Fuzz4All does not evaluate model capability on a benchmark. It uses LLMs to generate fuzzing inputs and evaluates the fuzzer's coverage/bug-finding, not the LLM's knowledge. The LLM is a component, not the subject."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not a benchmark evaluation of LLM capability. The LLM is used as a tool for input generation; there is no benchmark the LLM is being scored on."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same as above — no benchmark evaluation of model knowledge."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs or per-example costs reported. The paper uses GPT-4 API for autoprompting and local GPU inference for StarCoder but does not quantify monetary cost."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Hardware specified (64-core workstation, 4x RTX A6000, one GPU per run). Fuzzing budget is 24 hours per campaign, repeated 5 times. Autoprompting overhead stated as avg 2.3 minutes. Implementation is 872 LoC."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Results reported across 5 runs for 24-hour campaigns and 4 runs for ablation studies. Figure 4 shows min/max ranges across runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4.3: 'we repeat the experiment for both Fuzz4All and the baselines five times' for RQ1, 'repeat four times for the ablation study.'"
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The choice of hyperparameters (temperature=1, batch size=30, 4 candidate prompts, etc.) is stated but no search budget or justification for these specific values is provided."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The autoprompting algorithm (Algorithm 1) uses a validity-based scoring function to select the best prompt. Selection criterion is clearly defined and applied uniformly."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Mann-Whitney U-tests are applied across multiple SUT comparisons (6+ comparisons in Table 2, more in Table 4) but no multiple comparison correction is mentioned."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Authors run their own system and baselines from replication packages. No discussion of author-evaluation bias or potential for baseline implementations to underperform."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 4 shows coverage as a function of time (24-hour budget) for all tools. Table 2 shows number of programs generated. The time-matched comparison is fair — all tools get the same 24 hours."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Line coverage is used as the primary metric following prior work. Beyond noting in Section 4.3 that coverage is a widely adopted measure, the paper does not discuss whether coverage is a valid proxy for fuzzing effectiveness or whether it captures the right notion of 'universality.'"
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "Fuzz4All is evaluated as a complete tool — the scaffold IS the system being tested. No model-vs-scaffold separation claim is made."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "Not a benchmark evaluation of LLM capability. The LLM generates fuzzing inputs; it is not being scored on benchmark tasks."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "Same as above — no benchmark evaluation of model knowledge."
    349       },
    350       "non_independence_addressed": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "No train/test split concept applies to fuzzing."
    354       },
    355       "leakage_detection_method": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "Not applicable — fuzzing tool evaluation, not model benchmark evaluation."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Fuzz4All achieves the highest code coverage across all six languages, improving the previous state-of-the-art coverage by 36.8% on average.",
    365       "evidence": "Table 2 shows statistically significant coverage improvements across all 6 targets: GCC +18.8%, G++ +26.5%, CVC5 +24.9%, Go +13.7%, javac +60.9%, Qiskit +75.6%. Figure 4 shows 24-hour coverage trends.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Fuzz4All detected 98 bugs in widely used systems, with 64 confirmed by developers as previously unknown.",
    370       "evidence": "Table 5 breaks down bugs by SUT: GCC 30, Clang 27, CVC5 9, Z3 14, Go 4, Java 3, Qiskit 11. 64 confirmed, 26 pending, 6 won't fix, 2 known.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Targeted fuzzing achieves an average 83% hit rate for specific features.",
    375       "evidence": "Table 3 shows hit rates across 18 targeted fuzzing campaigns for 6 languages, with individual rates ranging from 54.79% to 97.20%.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The autoprompting stage and fuzzing loop with generation strategies each contribute to Fuzz4All's effectiveness.",
    380       "evidence": "Table 4 ablation study shows autoprompt > raw prompt > no input for coverage, and full Fuzz4All > w/ example > w/o example across all targets, with statistically significant differences marked.",
    381       "supported": "strong"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "No multiple comparison correction",
    387       "detail": "Mann-Whitney U-tests across 6+ comparisons in Table 2 and more in Table 4 without Bonferroni or similar correction. At 5% significance level with 6 independent tests, family-wise error rate is ~26%."
    388     },
    389     {
    390       "flag": "No competing interests statement",
    391       "detail": "Kwai Inc. is listed as a funder but there is no formal competing interests declaration."
    392     }
    393   ],
    394   "cited_papers": [
    395     {
    396       "title": "Evaluating large language models trained on code",
    397       "authors": ["Mark Chen"],
    398       "year": 2021,
    399       "arxiv_id": "2107.03374",
    400       "relevance": "Codex/HumanEval — foundational LLM code generation benchmark."
    401     },
    402     {
    403       "title": "Large Language Models are Zero-Shot Fuzzers: Fuzzing Deep-Learning Libraries via Large Language Models",
    404       "authors": ["Yinlin Deng", "Chunqiu Steven Xia"],
    405       "year": 2023,
    406       "relevance": "TitanFuzz — prior LLM-based fuzzing work that Fuzz4All builds upon and contrasts itself with; it is not among the evaluation baselines listed in Table 1."
    407     },
    408     {
    409       "title": "CODAMOSA: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models",
    410       "authors": ["Caroline Lemieux"],
    411       "year": 2023,
    412       "relevance": "Combines search-based testing with LLM code generation for test generation."
    413     },
    414     {
    415       "title": "StarCoder: may the source be with you!",
    416       "authors": ["Raymond Li"],
    417       "year": 2023,
    418       "arxiv_id": "2305.06161",
    419       "relevance": "The generation LLM used in Fuzz4All — open-source code model trained on 1T+ tokens."
    420     },
    421     {
    422       "title": "GPT-4 Technical Report",
    423       "authors": ["OpenAI"],
    424       "year": 2023,
    425       "arxiv_id": "2303.08774",
    426       "relevance": "The distillation LLM used for autoprompting in Fuzz4All."
    427     },
    428     {
    429       "title": "Adaptive Test Generation Using a Large Language Model",
    430       "authors": ["Max Schäfer", "Sarah Nadi"],
    431       "year": 2023,
    432       "arxiv_id": "2302.06527",
    433       "relevance": "TestPilot — LLM-based unit test generation, related approach for LLM-assisted testing."
    434     },
    435     {
    436       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    437       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    438       "year": 2023,
    439       "arxiv_id": "2304.00385",
    440       "relevance": "LLM-based automated program repair with cost analysis."
    441     },
    442     {
    443       "title": "Finding and understanding bugs in C compilers",
    444       "authors": ["Xuejun Yang"],
    445       "year": 2011,
    446       "relevance": "Csmith — classic compiler fuzzing baseline used in evaluation."
    447     },
    448     {
    449       "title": "GrayC: Greybox Fuzzing of Compilers and Analysers for C",
    450       "authors": ["Karine Even-Mendoza"],
    451       "year": 2023,
    452       "doi": "10.1145/3597926.3598130",
    453       "relevance": "State-of-the-art C compiler fuzzer used as primary baseline."
    454     }
    455   ]
    456 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs