scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32016B)
      1 {
      2   "paper": {
      3     "title": "ProphetFuzz: Fully Automated Prediction and Fuzzing of High-Risk Option Combinations with Only Documentation via Large Language Model",
      4     "authors": [
      5       "Dawei Wang",
      6       "Geng Zhou",
      7       "Li Chen",
      8       "Dan Li",
      9       "Yukai Miao"
     10     ],
     11     "year": 2024,
     12     "venue": "Conference on Computer and Communications Security",
     13     "arxiv_id": "2409.00922",
     14     "doi": "10.1145/3658644.3690231"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "ProphetFuzz uses GPT-4 Turbo with chain-of-thought prompts and bidirectional self-check to predict high-risk option combinations from documentation and automatically fuzz them, discovering 364 unique vulnerabilities across 52 programs (1.33× CarpetFuzz) with only 0.2× the commands at $8.69/program. The self-check mechanism raises constraint extraction precision from 23.41% to 94.00%. Persistent fuzzing on latest program versions uncovered 140 zero-day/half-day vulnerabilities, 93 confirmed by developers with 21 CVEs awarded.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper states 'We have open-sourced ProphetFuzz and the related datasets' and provides a GitHub URL (https://github.com/NASP-THU/ProphetFuzz) in footnote 3."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The evaluation dataset comprises 52 open-source programs from three prior public studies, and the authors state they have released related datasets alongside the code. Historical high-risk combinations used for few-shot corpus are also made available."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section 4 and 5.1 describe the environment: Ubuntu 20.04, Intel Xeon E5-2630 v3 @ 2.40GHz with 32 cores and 128GB RAM, Docker container with virtualenv, 33 pre-installed Python libraries and 36 command-line tools. The open-source release presumably includes environment specifications."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "While the code is open-sourced and the experimental setup is described in Sections 4-5, the paper itself does not include step-by-step reproduction instructions, scripts to replicate experiments, or a 'Reproducing Results' section."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Results in Table 2 are point estimates. Despite repeating experiments 5 times, the paper reports the union of results rather than averages with confidence intervals or error bars."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims ProphetFuzz discovers 1.33× more vulnerabilities than CarpetFuzz but performs no statistical significance tests (no p-values, t-tests, or other tests) to support claims of superiority."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports effect sizes as ratios and percentages with baseline context: '1.33 times more' vulnerabilities, '32.85% higher', '12.30% vs 1.50%' vulnerable combination ratios, and per-program comparisons in Table 2 showing both absolute numbers and ratios."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The 52 programs are drawn from 'datasets used in three previous studies' but no justification is given for why this sample size is adequate for the claims made. No power analysis or sample size rationale is discussed."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Despite repeating each fuzzing run 5 times, the paper reports the union of results ('we take the union of the results from the five repetitions') rather than averages with standard deviation or other spread measures. The reader cannot assess result stability."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "CarpetFuzz is used as the primary baseline, described as 'recognized as the state-of-the-art (SOTA) tool in the field' (Section 5.1). The out-of-the-box LLM (OBLLM) serves as an additional baseline in the ablation study."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "CarpetFuzz (2023, USENIX Security) is the most recent state-of-the-art tool for option-aware fuzzing. The paper also references other contemporary approaches (POWER 2022, ConfigFuzz 2023) in the related work."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Multiple ablation studies are conducted: ProphetFuzz_NSC (without self-check), ProphetFuzz_ZS (without few-shot), OBLLM (out-of-box LLM), ProphetFuzz_NV (without generated option values), and ProphetFuzz_NS (without generated seed files). Results in Tables 3 and 4."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics are used: edge coverage, unique vulnerabilities, exclusive vulnerabilities, vulnerable combination ratio, and for constraint extraction: precision and recall (Table 2, Table 6, Section 5.3)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Evaluation combines automated fuzzing results with manual annotation of extracted constraints (used to score constraint-extraction precision and recall). No human evaluation of the quality of predicted high-risk combinations or the system's outputs is conducted."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The 8 programs used to generate few-shot examples are drawn from 'all popular C/C++ projects with over 100 stars' while the 52 evaluation programs come from prior studies of popular open-source programs. The paper does not discuss whether these sets overlap or establish a clear train/test separation."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table 2 provides per-program breakdowns of all metrics (commands, unique vulnerabilities, exclusive vulnerabilities, vulnerable combination ratio, edge coverage) for all 52 programs."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 5.2 analyzes ProphetFuzz's failures in detail: 37 missed vulnerabilities from limited inference rounds, 38 from incorrect value assignment, and 4 from randomness. Specific programs (avconv, c++filt, tiffcrop, xmllint) are analyzed. The vim coverage failure is also discussed."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports CarpetFuzz finding 134 vulnerabilities that ProphetFuzz misses, ProphetFuzz's poor coverage on vim (0.22× CarpetFuzz), constraint extraction outliers with low precision (50-67%), and that ProphetFuzz finds fewer vulnerabilities than ProphetFuzz_NV on xmllint."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims of 1748 high-risk combinations, $8.69/program cost, 364 unique vulnerabilities, 12.30% hit rate, 32.85% improvement, 140 zero-day vulnerabilities, 93 confirmed, and 21 CVEs are all supported by Tables 2 and 5 and Section 5."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims about component contributions are justified through controlled ablation studies: removing self-check (ProphetFuzz_NSC), removing few-shot (ProphetFuzz_ZS), removing generated values (ProphetFuzz_NV), and removing generated seeds (ProphetFuzz_NS), each isolating a single variable."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims 'Fully Automated' prediction and fuzzing, but the system specifically requires Groff-formatted man pages as input (Section 4). The evaluation covers only C/C++ command-line programs with memory corruption vulnerabilities. These scope restrictions are not prominent in the title or abstract."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The ablation studies isolate the contribution of each component (self-check, few-shot, values, seeds). Failure analysis in Section 5.2 considers alternative reasons for missed vulnerabilities (limited inferences, incorrect values, randomness). The vim case analysis identifies the file mutation choice as a confound."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures unique vulnerabilities (deduplicated by the first three entries of each call stack) and edge coverage, framing these directly as fuzzing effectiveness metrics. The claims match the granularity of the measurements without inflated framing."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section 4 specifies 'GPT-4 Turbo (gpt-4-1106-preview)' — a specific API model version identifier."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Figures 4 and 6 provide the full prompt text for prediction, example generation, command assembly, and file generation. Table 1 provides the bidirectional self-check questions. The prompt for option separation is quoted in Section 3.2. Placeholders correspond to clearly defined data fields."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4 reports temperature settings (0.7 for diverse tasks, 0.2 for precision tasks) and the n parameter (10 for extraction/prediction, 1 for example generation, 3×N for assembly)."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The full pipeline is described in detail across Sections 3-4: document parsing (keyword matching), constraint extraction (bidirectional self-check), prediction (CoT with few-shot), command assembly (LLM-guided), file generation (Python code in sandbox), and fuzzing. Figure 1 provides an overview diagram."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 4 describes document parsing in detail: Groff control sequences (.TH, .SH, .TP, .PP, .RS, .sp) used for extraction, 'col -b' for text conversion, keyword matching for sections, and JSON formatting. Multi-option key separation is also described."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 'Limitation and Future Work' provides a dedicated section discussing multiple specific limitations."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6 discusses specific threats: constraint extraction relies on LLM reasoning quality, documentation correctness or absence limits capability, few-shot corpus is manually collected (one-time), cost vs. precision tradeoff in multi-option constraints, and input file mutation may not be optimal for all programs (e.g., vim)."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 6 states specific boundaries: ProphetFuzz focuses only on memory corruption vulnerabilities, requires documentation as input (cannot handle missing or incorrect documentation), currently only mutates input files not configuration files, and auto-generated examples may be less effective than expert-created ones."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The paper open-sources ProphetFuzz and related datasets at the GitHub repository (footnote 3). The constraint annotation data for recall evaluation is also referenced as available. The 52 evaluation programs are publicly available open-source software."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 5.1 describes the dataset: 52 programs from 40 packages, collected from three prior studies' datasets, covering 26 input formats. Few-shot data collection is described in Section 3.3: 29 historical high-risk combinations from GitHub Issues across 8 programs with >100 stars."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. The data sources are standard publicly available programs from prior study datasets."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The full pipeline is documented: documentation parsing → constraint extraction with self-check filtering → prediction → command assembly → file generation in sandbox → file-placeholder matching and filtering → corpus minimization → fuzzing. Filtering criteria at each stage are described (Section 3.5)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "The acknowledgment section thanks the shepherd and reviewers but does not mention any funding sources. Authors are affiliated with Zhongguancun Laboratory and Tsinghua University but no grants or sponsors are disclosed."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Zhongguancun Laboratory (Beijing) and Tsinghua University (Beijing). The system under evaluation is the authors' own tool; the underlying LLM (GPT-4 from OpenAI) is a third-party product, not the authors' own."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The authors use OpenAI's GPT-4 but are not affiliated with OpenAI."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The paper specifies 'gpt-4-1106-preview' but does not state GPT-4's training data cutoff date. Since the model is used to predict vulnerabilities in programs that may have known vulnerability histories in the training data, the cutoff is relevant."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The paper does not discuss whether GPT-4's training data includes vulnerability reports, documentation, or security analyses of the 52 evaluation programs. The model's prior knowledge of these programs' vulnerabilities is a potential confound that is never addressed."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "Many of the 52 programs are well-known open-source projects with publicly documented vulnerabilities. GPT-4 could have been trained on prior vulnerability reports, security advisories, or CVE databases for these programs, which would inflate prediction accuracy. This is not discussed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. The evaluation is entirely automated through fuzzing."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The study involves automated testing of open-source software."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "The paper reports 'an average cost of only $8.69 per program' for LLM API calls (Section 1, repeated in evaluation)."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "The total experiment consumed '10.44 CPU years' (Section 1, Section 5.1). Hardware is specified: Intel Xeon E5-2630 v3 @ 2.40GHz with 32 cores and 128GB RAM. Each fuzzing instance runs for 72 hours, repeated 5 times."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "Despite repeating experiments 5 times, the paper reports the union of results rather than analyzing sensitivity to random seeds or variance across runs."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Section 5.1 states 'Each fuzzing instance is run for 72 hours and repeated five times to mitigate the impact of the inherent randomness associated with fuzzing.'"
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Temperature settings (0.7, 0.2) and n values (10, 1, 3×N) are reported but no search over these hyperparameters is described. No justification for why these specific values were chosen."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The hyperparameter choices (temperature values, number of inferences) appear fixed rather than selected through systematic evaluation. No validation set or selection criterion is described."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
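    327         "justification": "The authors compare their own system against CarpetFuzz, apparently using the original tool, but do not acknowledge or discuss the bias inherent in authors evaluating their own system against a baseline. CarpetFuzz is also not an independent competitor: its author list (see cited_papers) has the same first author, Dawei Wang."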
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Both tools are run for 72 hours of fuzzing, but ProphetFuzz additionally consumes LLM API costs ($8.69/program) for prediction and command assembly that CarpetFuzz does not require. Performance is not reported as a function of compute budget, and no performance curves across compute levels are shown."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Unique vulnerabilities are defined by the first three entries in call stack reports (following prior work), but the paper does not discuss whether this deduplication method accurately reflects distinct vulnerabilities or whether it could overcount or undercount."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "The scaffold (ProphetFuzz's pipeline) IS the system being evaluated. The comparison with CarpetFuzz is between two fundamentally different tools, not the same model in different scaffolds."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "GPT-4 Turbo (gpt-4-1106-preview; training data through early 2023, a cutoff the paper itself does not state) could have been trained on vulnerability reports, CVE databases, and security discussions about these 52 well-known open-source programs. This temporal overlap is not discussed."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "The documentation provided to GPT-4 is the same documentation publicly available online, which the model was likely trained on. The model's familiarity with this documentation could influence prediction quality. Not discussed."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The 8 programs used for few-shot example generation may overlap with the 52 evaluation programs (both drawn from popular open-source C/C++ projects). This potential non-independence is not discussed."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection or prevention method is used. The paper does not test whether GPT-4's predictions correlate with publicly known vulnerability history."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "ProphetFuzz predicts 1748 high-risk option combinations across 52 programs at an average cost of $8.69 per program.",
    371       "evidence": "Table 2 shows 7614 commands assembled from predictions across 52 programs. Cost is stated in Section 1 and confirmed in the evaluation.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "ProphetFuzz discovers 364 unique vulnerabilities in 72 hours, 1.33× as many as CarpetFuzz's 274 (32.85% more).",
    376       "evidence": "Table 2 summarizes results: 364 vs 274 unique vulnerabilities. However, results are the union of 5 runs (best-case), not averages, and no statistical tests are performed.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "12.30% of ProphetFuzz's predicted high-risk combinations trigger vulnerabilities, vs 1.50% for CarpetFuzz (8.2× higher).",
    381       "evidence": "Table 2 shows vulnerable combination ratios per program. ProphetFuzz uses far fewer commands (7614 vs 38984), so higher hit rates are partly a consequence of fewer predictions.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "The self-check mechanism improves constraint extraction precision from 23.41% to 94.00%.",
    386       "evidence": "Section 5.3 compares ProphetFuzz_NSC (no self-check: 6682 constraints at 23.41% precision) with ProphetFuzz (633 constraints at 94.00%). Manual annotation was used for validation.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "ProphetFuzz finds 140 zero-day/half-day vulnerabilities in latest program versions, 93 confirmed by developers, 21 CVEs awarded.",
    391       "evidence": "Table 5 lists all 140 vulnerabilities by program and type, with confirmation and CVE counts. External developer confirmation provides strong independent validation.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Generated option values and seed files contribute 34.65% and 17.24% more vulnerabilities respectively.",
    396       "evidence": "Table 4 shows ablation results on 25 programs: ProphetFuzz finds 136 vulnerabilities vs 101 without values (ProphetFuzz_NV) and 116 without seeds (ProphetFuzz_NS). Union of 5 runs reported.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "ProphetFuzz's constraint extraction outperforms CarpetFuzz: 94.00% vs 76.06% overall precision, 633 vs 447 constraints.",
    401       "evidence": "Section 5.3 with manual annotation of all extracted constraints. Recall evaluation on 20 programs (Table 6): 97.97% precision and 80.56% recall vs CarpetFuzz's 77.78% and 46.67%.",
    402       "supported": "strong"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "Union-of-runs reporting",
    408       "detail": "Results are reported as the union of 5 repetitions ('we take the union of the results from the five repetitions'), which represents the best-case performance rather than expected performance. This inflates the reported numbers and hides variance across runs."
    409     },
    410     {
    411       "flag": "No statistical significance testing",
    412       "detail": "Claims of superiority over CarpetFuzz (1.33× more vulnerabilities) are based on raw number comparisons without statistical tests. Given the high variance in fuzzing outcomes, the observed differences may not be statistically significant."
    413     },
    414     {
    415       "flag": "Potential data contamination from LLM training data",
    416       "detail": "GPT-4 was likely trained on vulnerability reports, CVE databases, and security discussions about the 52 well-known evaluation programs. The model's predictions could be influenced by memorized vulnerability patterns rather than genuine reasoning from documentation, but this confound is never discussed."
    417     },
    418     {
    419       "flag": "Few-shot/evaluation set overlap not addressed",
    420       "detail": "The 8 programs used to generate few-shot examples (drawn from popular C/C++ projects) may overlap with the 52 evaluation programs (also popular open-source programs from prior studies). This potential data leakage is not analyzed."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "GPT-4 technical report",
    426       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    427       "year": 2023,
    428       "arxiv_id": "2303.08774",
    429       "relevance": "Foundation LLM used as the backbone of ProphetFuzz; relevant to understanding LLM capabilities in automated security testing."
    430     },
    431     {
    432       "title": "Language models are few-shot learners",
    433       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    434       "year": 2020,
    435       "relevance": "Foundational work on few-shot learning with LLMs, the prompting strategy central to ProphetFuzz's prediction method."
    436     },
    437     {
    438       "title": "Large language models are zero-shot fuzzers: Fuzzing deep-learning libraries via large language models",
    439       "authors": ["Yinlin Deng", "Chunqiu Steven Xia", "Haoran Peng"],
    440       "year": 2023,
    441       "relevance": "TitanFuzz — demonstrates LLMs as zero-shot fuzzers for DL libraries, directly related to LLM-based fuzzing approaches."
    442     },
    443     {
    444       "title": "Large language models are edge-case generators: Crafting unusual programs for fuzzing deep learning libraries",
    445       "authors": ["Yinlin Deng", "Chunqiu Steven Xia", "Chenyuan Yang"],
    446       "year": 2024,
    447       "relevance": "FuzzGPT — LLM-based edge-case generation for fuzzing, advancing the LLM+fuzzing research direction."
    448     },
    449     {
    450       "title": "Codamosa: Escaping coverage plateaus in test generation with pre-trained large language models",
    451       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri"],
    452       "year": 2023,
    453       "relevance": "LLM-augmented test generation that uses LLMs to escape coverage plateaus, relevant to LLM-aided software testing."
    454     },
    455     {
    456       "title": "Large language model guided protocol fuzzing",
    457       "authors": ["Ruijie Meng", "Martin Mirchev", "Marcel Böhme"],
    458       "year": 2024,
    459       "relevance": "ChatAFL — uses LLMs to analyze RFC documents for protocol fuzzing, closely related to using LLMs for document-based security testing."
    460     },
    461     {
    462       "title": "KernelGPT: Enhanced Kernel Fuzzing via Large Language Models",
    463       "authors": ["Chenyuan Yang", "Zijie Zhao", "Lingming Zhang"],
    464       "year": 2023,
    465       "arxiv_id": "2401.00563",
    466       "relevance": "LLM-based kernel syscall specification generation for fuzzing, extending LLM applications to OS-level security testing."
    467     },
    468     {
    469       "title": "Fuzz4all: Universal fuzzing with large language models",
    470       "authors": ["Chunqiu Steven Xia", "Matteo Paltenghi", "Jia Le Tian"],
    471       "year": 2024,
    472       "relevance": "Universal LLM-based fuzzing approach, directly relevant to the broader trend of applying LLMs to automated security testing."
    473     },
    474     {
    475       "title": "Prompt Fuzzing for Fuzz Driver Generation",
    476       "authors": ["Yunlong Lyu", "Yuxuan Xie", "Peng Chen"],
    477       "year": 2023,
    478       "arxiv_id": "2312.17677",
    479       "relevance": "PromptFuzz — uses LLMs to generate fuzz drivers, relevant to automated fuzzing pipeline construction."
    480     },
    481     {
    482       "title": "CarpetFuzz: Automatic Program Option Constraint Extraction from Documentation for Fuzzing",
    483       "authors": ["Dawei Wang", "Ying Li", "Zhiyu Zhang"],
    484       "year": 2023,
    485       "relevance": "State-of-the-art baseline for option-aware fuzzing; ProphetFuzz is directly compared against and extends this work."
    486     },
    487     {
    488       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    489       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    490       "year": 2022,
    491       "relevance": "Foundation of the chain-of-thought prompting strategy used in ProphetFuzz's prediction module."
    492     },
    493     {
    494       "title": "Augmenting greybox fuzzing with generative AI",
    495       "authors": ["Jie Hu", "Qian Zhang", "Heng Yin"],
    496       "year": 2023,
    497       "arxiv_id": "2306.06782",
    498       "relevance": "ChatFuzz — explores generative AI for augmenting fuzzing, directly relevant to LLM-based security testing."
    499     },
    500     {
    501       "title": "Fuzzing BusyBox: Leveraging LLM and Crash Reuse for Embedded Bug Unearthing",
    502       "authors": ["Yaroslav Oliinyk", "Michael Scott", "Ryan Tsang"],
    503       "year": 2024,
    504       "arxiv_id": "2403.03897",
    505       "relevance": "LLM-assisted fuzzing of embedded systems, extending LLM fuzzing to the IoT/embedded domain."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 3,
    511       "justification": "Open-source tool that security practitioners can directly use for automated option-aware fuzz testing of command-line programs."
    512     },
    513     "surprise_contrarian": {
    514       "score": 1,
    515       "justification": "Using LLMs for security testing is an active research direction; the specific application to option combination prediction is novel but not contrarian."
    516     },
    517     "fear_safety": {
    518       "score": 1,
    519       "justification": "Finds real software vulnerabilities (140 zero-days/half-days, 21 CVEs) but these are traditional memory corruption bugs, not AI-specific safety concerns."
    520     },
    521     "drama_conflict": {
    522       "score": 0,
    523       "justification": "No controversy or conflict; straightforward tool comparison with CarpetFuzz."
    524     },
    525     "demo_ability": {
    526       "score": 2,
    527       "justification": "Open-source GitHub repository available, but requires Docker setup, GPT-4 API key, and target program configuration to use."
    528     },
    529     "brand_recognition": {
    530       "score": 1,
    531       "justification": "Published at CCS (top security venue), from Zhongguancun Laboratory/Tsinghua University; uses GPT-4 but the paper is not about GPT-4 itself."
    532     }
    533   }
    534 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs