scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20362B)
      1 {
      2   "paper": {
      3     "title": "On the Challenges of Fuzzing Techniques via Large Language Models",
      4     "authors": ["Linghan Huang", "Peizhou Zhao", "Lei Ma", "Huaming Chen"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2402.00350"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["survey_methodology"],
     11   "methodology_tags": ["meta-analysis"],
     12   "key_findings": "This survey reviews the intersection of LLMs and fuzz testing, categorizing LLM-based fuzzers into two types: those using LLMs for prompt engineering and seed mutation within traditional fuzzing pipelines, and those fine-tuning LLMs as dedicated fuzzers. The paper reports that LLM-based fuzzers achieve higher API/code coverage (e.g., TitanFuzz's 91.11% increase in TensorFlow API coverage over FreeFuzz), detect more complex vulnerabilities, and improve automation compared to traditional fuzzers. Seven challenges are identified, including hallucinations, computational efficiency, lack of standardized benchmarks, and insufficient pre-training data quality.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code or analysis scripts are released by the authors. The paper links to repositories of the surveyed tools (TitanFuzz, FuzzGPT, etc.) but provides no code for its own survey analysis."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset of surveyed papers or extracted data is released. The paper does not provide a downloadable corpus or structured dataset of its literature review."
     24       },
     25       "environment_specified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "This is a survey paper with no computational experiments, so environment specification is not applicable."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No instructions for reproducing the literature search or analysis are provided."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "This is a survey paper that does not run experiments or perform statistical aggregation. It only reports numbers from surveyed papers."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No experiments are conducted; the paper summarizes findings from other works without performing statistical tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "The survey does not conduct its own experiments. Effect sizes reported are from the surveyed papers."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No experiments are conducted; sample size justification is not applicable to this survey."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No experiments are conducted by the authors."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The survey does not compare itself against prior surveys or reviews of LLM-based fuzzing. It claims to be 'the first work that covers the intersection of three areas' but does not compare its coverage or methodology against existing fuzzing surveys."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No comparison against prior surveys is made."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "This is a survey paper with no system to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "The survey does not run its own evaluation, so multiple metrics are not applicable."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is not relevant to a literature survey's claims."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No experiments are conducted; held-out test sets are not applicable."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table I provides a breakdown of fuzzers by domain classification, models, benchmarks, and test type. The paper also categorizes fuzzers into 'Fuzzer by LLM' and 'Fine-Tuning Fuzzer' categories."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section V discusses challenges including LLM hallucinations (Section V-A), pre-training data quality issues (Section V-C), and computational efficiency limitations (Section V-D)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that only approximately 40% of GPT-4 synthesized fuzz drivers were error-free in the OSS-Fuzz project (Section V-A), and discusses limitations of LLM-based approaches."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims to provide a 'systematic overview' of LLM-fuzzing developments, which the paper delivers through its categorization and discussion of methods. Claims are appropriately hedged."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper does not make its own causal claims; it reports causal findings from surveyed papers."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes broad claims about LLM-based fuzzers having 'superior API and code coverage' and finding 'more complex bugs' without bounding these generalizations to the specific papers and settings reviewed. The title and claims suggest general conclusions from a limited set of ~15 papers."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not consider alternative explanations for why LLM-based fuzzers outperform traditional ones (e.g., newer software versions, different evaluation conditions, publication bias toward positive results)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "This is a survey with no original measurements."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "The survey does not use any models itself."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The survey does not use prompting."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No experiments are conducted."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used in this survey."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Section I.2 describes selection criteria (4 bullet points) and mentions 'predetermined criteria, manual screening, and snowballing methods' but provides no counts at each filtering stage, no search query terms, no databases searched, and no PRISMA-style flow."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. Section V discusses challenges of the surveyed technology but not limitations of the survey itself."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed for the survey methodology."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what is excluded from the survey scope beyond the four inclusion criteria. No explicit discussion of what the survey does NOT cover."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data (list of all papers screened, inclusion/exclusion decisions) is available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper mentions 'predetermined criteria, manual screening, and snowballing methods' but does not describe which databases were searched, what search terms were used, or what time period was covered."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; the data source is published literature. The equivalent check for a survey's methodology is covered under data_collection_described."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from initial search to final set of reviewed papers is not documented. No counts of papers at each stage are provided."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information is disclosed in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly stated: University of Sydney and University of Tokyo."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is provided."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this survey."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this survey."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this survey."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this survey."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this survey."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this survey."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this survey."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "This is a survey paper with no experiments."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "This is a survey paper with no experiments."
    289       }
    290     },
    291     "survey_methodology": {
    292       "prisma_or_structured_protocol": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "The paper mentions 'predetermined criteria, manual screening, and snowballing methods' but does not follow PRISMA or any structured protocol. No flow diagram, no search queries, no database listing, no counts at each screening stage."
    296       },
    297       "quality_assessment_of_sources": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The survey does not assess the methodological quality of the papers it reviews. All papers are treated equally regardless of their rigor — results from TitanFuzz, FuzzGPT, etc. are reported at face value without quality assessment."
    301       },
    302       "publication_bias_discussed": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No discussion of publication bias. The survey does not consider that published LLM-fuzzing papers may skew toward positive results."
    306       }
    307     }
    308   },
    309   "claims": [
    310     {
    311       "claim": "TitanFuzz's API coverage in TensorFlow and PyTorch increased by 91.11% and 24.09% respectively over FreeFuzz and DeepREL",
    312       "evidence": "Section IV-A, reporting results from the TitanFuzz paper's comparison with FreeFuzz and DeepREL baselines.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "CHATAFL achieves 5.8% more branch coverage than AFLNET and 6.7% more than NSFuzz",
    317       "evidence": "Section IV-A, citing CHATAFL's evaluation against AFLNET and NSFuzz.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "FuzzGPT detected 76 bugs total, 49 confirmed as previously unknown, including 11 high-priority security vulnerabilities",
    322       "evidence": "Section IV-C, reporting FuzzGPT paper's results.",
    323       "supported": "moderate"
    324     },
    325     {
    326       "claim": "Only approximately 40% of GPT-4 synthesized fuzz drivers were error-free in OSS-Fuzz",
    327       "evidence": "Section V-A, citing the OSS-Fuzz evaluation [66].",
    328       "supported": "moderate"
    329     },
    330     {
    331       "claim": "LLM-based fuzzers provide superior API/code coverage, detect more complex bugs, and improve automation compared to traditional fuzzers",
    332       "evidence": "Section IV, aggregating results from multiple surveyed papers without independent verification.",
    333       "supported": "weak"
    334     }
    335   ],
    336   "red_flags": [
    337     {
    338       "flag": "No quality assessment of sources",
    339       "detail": "The survey reports results from individual papers at face value without assessing their methodological quality. Claims like '91.11% API coverage increase' are taken directly from TitanFuzz without evaluating the rigor of that evaluation."
    340     },
    341     {
    342       "flag": "No structured review protocol",
    343       "detail": "Despite claiming to be a 'systematic overview,' the paper provides no PRISMA flow, no search queries, no database listing, and no counts of papers screened vs. included. The review methodology is opaque."
    344     },
    345     {
    346       "flag": "Very small corpus",
    347       "detail": "Table I lists only 15 LLM-based fuzzers. For a paper claiming comprehensive coverage of the field, the corpus is quite small, raising questions about completeness."
    348     },
    349     {
    350       "flag": "Uncritical reporting of comparative claims",
    351       "detail": "The paper reports performance comparisons from surveyed papers without noting that these comparisons often use different experimental conditions, benchmarks, and baselines, making cross-paper comparisons unreliable."
    352     },
    353     {
    354       "flag": "Publication bias not acknowledged",
    355       "detail": "All surveyed papers report positive results for LLM-based fuzzing. The survey does not consider that negative results may go unpublished."
    356     }
    357   ],
    358   "cited_papers": [
    359     {
    360       "title": "Large language models are zero-shot fuzzers: Fuzzing deep-learning libraries via large language models",
    361       "authors": ["Y. Deng", "C. S. Xia", "H. Peng", "C. Yang", "L. Zhang"],
    362       "year": 2023,
    363       "relevance": "Foundational work (TitanFuzz) on using LLMs for fuzzing deep learning libraries, directly relevant to LLM code generation capabilities."
    364     },
    365     {
    366       "title": "Large language models are edge-case fuzzers: Testing deep learning libraries via FuzzGPT",
    367       "authors": ["Y. Deng", "C. S. Xia", "C. Yang", "S. D. Zhang", "S. Yang", "L. Zhang"],
    368       "year": 2023,
    369       "arxiv_id": "2304.02014",
    370       "relevance": "FuzzGPT fine-tunes LLMs for fuzzing, demonstrating LLM-based code generation for security testing."
    371     },
    372     {
    373       "title": "Fuzz4All: Universal fuzzing with large language models",
    374       "authors": ["C. S. Xia", "M. Paltenghi", "J. L. Tian", "M. Pradel", "L. Zhang"],
    375       "year": 2024,
    376       "relevance": "Universal LLM-based fuzzer using autoprompting, relevant to automated code generation and testing."
    377     },
    378     {
    379       "title": "Large language model guided protocol fuzzing",
    380       "authors": ["R. Meng", "M. Mirchev", "M. Böhme", "A. Roychoudhury"],
    381       "year": 2024,
    382       "relevance": "CHATAFL uses LLMs for protocol fuzzing, demonstrating LLM-guided testing in network security."
    383     },
    384     {
    385       "title": "Software testing with large language models: Survey, landscape, and vision",
    386       "authors": ["J. Wang", "Y. Huang", "C. Chen", "Z. Liu", "S. Wang", "Q. Wang"],
    387       "year": 2024,
    388       "doi": "10.1109/TSE.2024.3368208",
    389       "relevance": "Broader survey of LLM-based software testing, providing context for LLM capabilities in code generation and testing."
    390     },
    391     {
    392       "title": "Evaluating large language models trained on code",
    393       "authors": ["M. Chen", "J. Tworek", "H. Jun", "Q. Yuan"],
    394       "year": 2021,
    395       "arxiv_id": "2107.03374",
    396       "relevance": "Codex/HumanEval paper, foundational to LLM code generation capabilities used in fuzzing."
    397     },
    398     {
    399       "title": "When fuzzing meets LLMs: Challenges and opportunities",
    400       "authors": ["Y. Jiang", "J. Liang", "F. Ma", "Y. Chen"],
    401       "year": 2024,
    402       "arxiv_id": "2404.16297",
    403       "doi": "10.1145/3663529.3663784",
    404       "relevance": "Closely related paper on the challenges of LLM-based fuzzing, including hallucination analysis in OSS-Fuzz."
    405     },
    406     {
    407       "title": "White-box compiler fuzzing empowered by large language models",
    408       "authors": ["C. Yang", "Y. Deng", "R. Lu", "J. Yao", "J. Liu", "R. Jabbarvand", "L. Zhang"],
    409       "year": 2023,
    410       "relevance": "WhiteFox uses LLMs for white-box compiler fuzzing, demonstrating LLM code understanding for security."
    411     },
    412     {
    413       "title": "Understanding large language model based fuzz driver generation",
    414       "authors": ["C. Zhang", "M. Bai", "Y. Zheng", "Y. Li"],
    415       "year": 2023,
    416       "arxiv_id": "2307.12469",
    417       "relevance": "Studies LLM-generated fuzz drivers, relevant to LLM code generation quality assessment."
    418     },
    419     {
    420       "title": "From LLMs to LLM-based agents for software engineering: A survey of current, challenges and future",
    421       "authors": ["H. Jin", "L. Huang", "H. Cai", "J. Yan", "B. Li", "H. Chen"],
    422       "year": 2024,
    423       "arxiv_id": "2408.02479",
    424       "relevance": "Survey of LLM-based agents for software engineering, directly relevant to agentic AI capabilities."
    425     },
    426     {
    427       "title": "LLM hallucinations in practical code generation: Phenomena, mechanism, and mitigation",
    428       "authors": ["Z. Zhang", "Y. Wang", "C. Wang", "J. Chen", "Z. Zheng"],
    429       "year": 2025,
    430       "arxiv_id": "2409.20550",
    431       "relevance": "Studies LLM hallucinations in code generation, a key challenge for LLM-based fuzzing reliability."
    432     }
    433   ]
    434 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs