scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32655B)
      1 {
      2   "paper": {
      3     "title": "VulScribeR: Exploring RAG-based Vulnerability Augmentation with LLMs",
      4     "authors": [
      5       "Seyed Shayan Daneshvar",
      6       "Yu Nong",
      7       "Xu Yang",
      8       "Shaowei Wang",
      9       "Haipeng Cai"
     10     ],
     11     "year": 2025,
     12     "venue": "ACM Transactions on Software Engineering and Methodology",
     13     "arxiv_id": "2408.04125",
     14     "doi": "10.1145/3760775"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "VulScribeR proposes three LLM-based strategies (Mutation, Injection, Extension) for augmenting vulnerable code datasets, leveraging RAG for diversity and relevance. Injection and Extension consistently outperform SOTA baselines (VulGen, VGX, ROS) across 4 datasets, 3 DLVD models, and 3 LLMs, with Injection beating baselines by 15-28% in average F1-score at 5K samples. Unlike baselines that degrade beyond 5K samples, VulScribeR scales to 15K samples with continued improvement, at a cost of ~$1.88 per 1K samples.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The authors state 'We made our source code, experimental results, and the augmented datasets publicly available for future research [1]' with reference [1] pointing to https://github.com/shayandaneshvar/VulScribeR."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The replication package includes augmented datasets (reference [1]). Additionally, all base datasets used (Devign, Reveal, BigVul, PrimeVul) are publicly available, and the paper specifies the cleaned versions from VGX."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions hardware ('four 24GB Nvidia RTX 3090s') but does not list software dependencies, library versions, requirements.txt, Dockerfile, or conda environment specifications in the paper text."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided in the paper. While a replication package is linked, the paper itself does not contain a 'Reproducing Results' section or specific commands to run."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables 2-8 report point estimates (Precision, Recall, F1) without confidence intervals, error bars, or ± notation. Despite running Devign and Reveal models 5 times with different seeds, no uncertainty measures are reported."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "All claims of outperformance (e.g., 'Injection outperforms NoAug, VulGen, VGX, and ROS by 30.80%, 27.48%, 27.93%, and 15.41%') are based on comparing point estimates without any statistical significance tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper consistently reports percentage improvements with baseline context throughout, e.g., 'Injection outperforms NoAug, Vulgen, VGX, and ROS by 30.80%, 27.48%, 27.93%, and 15.41% on average F1-score.' Full result tables provide baseline and proposed method numbers for computing effect magnitudes."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification is given for why 5K, 10K, and 15K augmented samples were chosen as experimental points. No power analysis or reasoning for these specific quantities is provided."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "For Devign and Reveal DLVD models, the authors 'train both of these models 5 times with random seed values, test them and report the results with the highest F1-score achieved' (Section 4.3). Only the best-of-5 result is reported with no standard deviation or variance across runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Four baselines are included: NoAug (no augmentation), ROS (random oversampling), VulGen, and VGX. All are described in Section 4.6."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "VGX (Nong et al., 2024, ICSE '24) is contemporary and represents the latest state of the art in vulnerability augmentation. VulGen (2023, ICSE '23) is also recent."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "RQ2 (Section 5.2) provides ablation studies removing the Retriever component (w/o Retriever) and the clustering phase (w/o Clustering) for both Injection and Extension strategies, with results in Tables 3 and 4."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Three evaluation metrics are used: Precision, Recall, and F1-Score, reported in all result tables (Tables 2-8)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 6.4 reports a manual assessment of 150 randomly selected generated samples (50 per strategy) to determine whether the LLM followed instructions and generated truly vulnerable code, finding 72-90% success rates."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper trains on augmented Devign and tests on BigVul's test set and Reveal dataset. 'We specifically train and test the models with different datasets, following the settings of VulGen and VGX, to ensure there is no information leakage' (Section 4.3)."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by test dataset (Reveal, BigVul), DLVD model (Devign, Reveal, LineVul), and LLM (ChatGPT, CodeQwen) in all tables, providing 12 experimental instances per strategy."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 6.4 discusses failure patterns for each strategy: Mutation fails by changing code in unspecified ways, Injection produces wrong or half-complete injections, and Extension removes input code instead of extending it."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative results are reported: Mutation does not consistently beat ROS (Section 5.1), ensemble of strategies does not always outperform single strategies (Section 6.1, 'M+I+E always fail to beat all of the single strategies'), and free-form extension without clean samples 'does not work well, and tends to hallucinate' (Section 3.1.3)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract's specific numerical claims (e.g., '27.48%, 27.93%, and 15.41% in f1-score with 5K' and '53.84%, 54.10%, 69.90%, and 40.93% with 15K') are verified in Tables 2 and 5 respectively. The cost claim of '$1.88 per 1K samples' is confirmed in Section 6.3."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims about component contributions (e.g., 'RAG component makes a significant contribution') are supported by controlled ablation studies in RQ2, where single variables (Retriever, Clustering) are systematically removed to measure their impact (Tables 3-4)."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The conclusion states 'our method significantly outperforms SOTA methods, and is suitable for large-scale vulnerability augmentation' without bounding to C/C++ code, the specific DLVD families tested, or the specific LLM capabilities used. All experiments use C/C++ datasets only, but claims are framed for vulnerability augmentation broadly."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 6.5 (Threats to Validity) discusses specific alternative explanations: hyperparameter settings could yield different results, label noise in Devign/BigVul could affect results (addressed by PrimeVul experiment), and LLM hallucination could affect generated sample quality (addressed by verifier and manual inspection)."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures F1-score on vulnerability detection tasks and frames results in terms of DLVD model effectiveness. The measurement (F1 on held-out vulnerability detection benchmarks) directly matches the claim (improved vulnerability detection performance). No broader proxy gap exists."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper specifies 'gpt-3.5-turbo', 'CodeQwen1.5-7B-Chat', and 'GPT4o-mini' but provides no snapshot dates or API versions for the OpenAI models. The gpt-3.5-turbo endpoint has been updated multiple times with different behaviors, and no date qualifier is given."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Prompt templates are shown in Figures 2, 3, and 4. While the Mutation template truncates some transformation rules ('{...remaining_rules_are_hidden_to_save_space}'), the paper links to a public repository (reference [1]) containing the full prompts. Placeholders are data-dependent (code samples from datasets) and deterministically reconstructible."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4.5 reports temperature=0.5 for ChatGPT, default parameters for CodeQwen except max_new_tokens=4096, and GPT4o-mini temperature=0.5. Section 4.3 reports DLVD training settings (5 random seeds for Devign/Reveal, 10 epochs for LineVul)."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The system is a RAG-based generation pipeline with prompt templates, retrieval, and parsing-based filtering — not an agentic system with tool use, feedback loops, or memory management."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Data preprocessing is described in detail: VGX's cleaned dataset versions with duplicates removed (Section 4.2), SHA2 hash deduplication for PrimeVul, extraction of vulnerable lines via diff, clustering with CodeBERT embeddings and KMeans, BM-25 retrieval, and fuzzy parser filtering removing 2-13% of generated data."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6.5 'Threats to Validity' provides substantive discussion of internal and external validity threats across multiple paragraphs."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6.5 discusses study-specific threats: hyperparameter settings for DLVD models and LLMs not being tuned, label noise in Devign and BigVul datasets (citing references [11, 14, 54]), LLM hallucination affecting generated code quality, and limitation to two LLMs and three datasets for generalizability."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The external validity section explicitly identifies what was not tested: 'Even though we used two different LLMs... evaluated on three commonly used datasets, and covered three SOTA DLVD models... our findings may not generalize.' It specifically identifies gaps: encoder-decoder LLMs, other datasets, and other DLVD models."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The replication package (reference [1], GitHub) includes 'source code, experimental results, and the augmented datasets' making both generated data and results available for verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 4.2 describes dataset origins in detail: Devign from VGX's cleaned version, BigVul's training set with 6,610 samples that include vulnerable line metadata, Reveal as a testing set, and PrimeVul with SHA2 hash deduplication against Devign."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants in the study. All data comes from standard public vulnerability detection benchmarks (Devign, Reveal, BigVul, PrimeVul)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The full pipeline is documented: Algorithm 1 describes pair retrieval with clustering, Section 3 describes the Retriever→Formulator→Generator→Verifier pipeline, Section 4.7 describes the approach for each RQ including specific sample counts (e.g., generate 6K to yield 5K after filtering)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding or acknowledgments section is present in the paper text. Funding sources are not disclosed."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All author affiliations are clearly listed: University of Manitoba (Daneshvar, Yang, Wang), Washington State University (Nong), and University at Buffalo (Cai). No commercial affiliations with evaluated products."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Cannot assess funder independence since funding is not disclosed. Absence of disclosure prevents evaluation."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. LLMs (ChatGPT, CodeQwen) are used as generation tools, and DLVD models are trained/fine-tuned on specific datasets then tested on separate held-out sets. The contamination concern is handled by train/test separation, not by pre-training cutoff dates."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "Same rationale as for training_cutoff_stated — the paper trains DLVD models and tests them on separate datasets. Pre-trained model benchmark contamination is not the relevant concern here."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The study evaluates data augmentation effectiveness for training classifiers, not a pre-trained model's zero-shot or few-shot capability on benchmarks."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study. The manual inspection of 150 generated samples is researcher evaluation work, not a human subjects study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. All data is from public code repositories and vulnerability databases."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Section 6.3 reports 'generating every 1,000 vulnerable samples... costs about $1.88 with GPT3.5-Turbo and about 9 GPU hours with CodeQwen1.5-7B-Chat on two RTX3090s.' Also states 'less than $19 with ChatGPT to augment a dataset like Devign to twice its size.'"
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Section 4.8 states 'We conducted a total of 781 experiments for all our RQs, which cost close to 2000 GPU hours solely for training and testing DLVD models' on 'four 24GB Nvidia RTX 3090s.' Generation costs are also quantified in Section 6.3."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "For Devign and Reveal models, the paper trains '5 times with random seed values' but reports only 'the results with the highest F1-score achieved' (Section 4.3). No mean/std across seeds is reported, and the best-of-5 selection masks seed sensitivity."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Section 4.3 explicitly states '5 times with random seed values' for Devign and Reveal models, and '10 epochs' for LineVul with checkpoint selection on validation set."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search budget is reported. The paper states it followed settings from previous studies (VulGen, VGX) for DLVD models and used mostly default LLM parameters, but does not state how many configurations were explored."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "For Devign and Reveal models, the paper reports 'the results with the highest F1-score achieved' from 5 test-set runs — this is selection on the test set, not the validation set, which inflates reported performance. LineVul correctly selects on the validation set."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper makes numerous pairwise comparisons across strategies, models, datasets, and LLMs without applying any correction for multiple comparisons (no Bonferroni, Holm, or Benjamini-Hochberg mentioned)."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors do not acknowledge author-evaluation bias. While VGX/VulGen generated results are reused from original papers, the authors implement their own pipeline, DLVD training, and manual quality assessment without discussing self-comparison bias."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "No analysis of performance as a function of compute budget. The cost analysis in Section 6.3 reports generation cost but does not compare total compute (generation + training) across methods or discuss whether VulScribeR's improvements justify additional compute over baselines."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "Section 6.5 explicitly discusses that Devign and BigVul are 'known to be noisy datasets... meaning that a considerable number of vulnerable samples are incorrectly labeled, which can affect the results.' They address this by adding PrimeVul, 'a more recent dataset with considerably less label noise,' in RQ4."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved. The DLVD models are standard classifiers (graph-based and transformer-based), not scaffolded agents."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "The LLMs used for generation (ChatGPT, CodeQwen) may have seen the vulnerability datasets during pre-training, potentially meaning 'generated' samples are memorized. This temporal leakage concern is not discussed."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the augmentation process introduces feature leakage — e.g., whether generated samples carry artifacts that the DLVD models could exploit as shortcuts rather than learning genuine vulnerability patterns."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": true,
    359         "justification": "The paper explicitly addresses independence: 'we used the cleaned version from VGX, which had duplicates removed between the datasets' and 'we used the SHA2 hash to remove samples available in Devign... from all three sets of the PrimeVul dataset' (Section 4.2). They train and test on different datasets to prevent leakage."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "SHA2 hash deduplication is used to remove PrimeVul samples that overlap with Devign (Section 4.2), and VGX's cleaned versions with duplicates removed between datasets are used. These are concrete deduplication methods applied to prevent data leakage."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "VulScribeR's Injection strategy outperforms NoAug, VulGen, VGX, and ROS by 30.80%, 27.48%, 27.93%, and 15.41% in average F1-score with 5K generated samples.",
    371       "evidence": "Table 2 (Section 5.1) shows results across 12 experimental instances (3 DLVD models × 2 test datasets × 2 LLMs). Injection and Extension outperform all baselines in all 12 instances in terms of F1-score.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "RAG component contributes 4.99% (Injection) and 10.77% (Extension) improvement in average F1-score.",
    376       "evidence": "Tables 3-4 (Section 5.2) compare full pipeline vs. w/o Retriever and w/o Clustering ablations. Injection w/ full Retriever improves by 4.99% over w/o Retriever; Extension improves by 10.77%.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "VulScribeR scales to 15K augmented samples with continued improvement, while VulGen, VGX, and ROS degrade beyond 5K.",
    381       "evidence": "Table 5 and Figure 6 (Section 5.3) show Injection performance increasing from 5K to 15K in 11/12 instances, while VGX/VulGen/ROS decrease in 5/6, 4/6, and 4/6 instances respectively when going from 5K to 15K.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "On PrimeVul (heavily imbalanced, 1:35 ratio), Extension outperforms NoAug, VulGen, VGX, and ROS by 14.54%, 21.14%, 17.73%, and 3.39% in average F1-score.",
    386       "evidence": "Table 6 (Section 5.4) shows results across 3 DLVD models using GPT4o-mini. Extension outperforms all baselines in all instances.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "72-90% of generated samples are correctly vulnerable based on manual inspection (72% Mutation, 82% Injection, 90% Extension).",
    391       "evidence": "Section 6.4 reports manual assessment of 150 randomly selected samples (50 per strategy) from ChatGPT-3.5 RQ1 results.",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "Generating 1K vulnerable samples costs approximately $1.88 with GPT-3.5-Turbo.",
    396       "evidence": "Section 6.3 directly reports this cost based on their experiments.",
    397       "supported": "strong"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Best-of-5 test-set selection inflates results",
    403       "detail": "For Devign and Reveal DLVD models, the paper trains 5 times with different seeds and reports 'the results with the highest F1-score achieved' — this selects the best run on the test set rather than using the validation set for selection, inflating reported performance. Although the same protocol is applied to both VulScribeR and the baselines, the variance may differ across methods, so taking the maximum over runs can favor higher-variance methods and distort the comparison."
    404     },
    405     {
    406       "flag": "No statistical significance testing",
    407       "detail": "All claims of outperformance across 781 experiments rely entirely on comparing point estimates. With no significance tests and no reported variance, it is impossible to assess whether observed differences are meaningful or within noise ranges."
    408     },
    409     {
    410       "flag": "No variance or uncertainty reporting despite multiple runs",
    411       "detail": "Despite training models 5 times with different seeds, only the maximum F1 is reported. Standard deviation, IQR, or any spread measure is absent, making it impossible to assess result stability."
    412     },
    413     {
    414       "flag": "Manual quality assessment by paper authors with no inter-rater reliability",
    415       "detail": "The 150-sample quality assessment in Section 6.4 was conducted by the authors evaluating their own system's output. No independent evaluators, inter-rater reliability metrics, or blinding procedures are described."
    416     },
    417     {
    418       "flag": "LLM memorization of vulnerability datasets not addressed",
    419       "detail": "ChatGPT and CodeQwen may have seen Devign, BigVul, and other vulnerability datasets during pre-training. Generated 'new' vulnerable samples could be memorized rather than genuinely novel, which would inflate the apparent effectiveness of augmentation. This is not discussed."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "VULGEN: Realistic Vulnerability Generation Via Pattern Mining and Deep Learning",
    425       "authors": ["Yu Nong", "Yuzhe Ou", "Michael Pradel", "Feng Chen", "Haipeng Cai"],
    426       "year": 2023,
    427       "relevance": "Core baseline for vulnerability generation via pattern mining and injection, representing SOTA in single-statement vulnerability augmentation."
    428     },
    429     {
    430       "title": "VGX: Large-Scale Sample Generation for Boosting Learning-Based Software Vulnerability Analyses",
    431       "authors": ["Yu Nong", "Richard Fang", "Guangbei Yi", "Kunsong Zhao", "Xiapu Luo", "Feng Chen", "Haipeng Cai"],
    432       "year": 2024,
    433       "doi": "10.1145/3597503.3639116",
    434       "relevance": "Improved vulnerability generation method with larger-scale pattern mining; key baseline showing limitations of single-statement approaches."
    435     },
    436     {
    437       "title": "Vulnerability Detection with Code Language Models: How Far Are We?",
    438       "authors": ["Yangruibo Ding", "Yanjun Fu", "Omniyyah Ibrahim"],
    439       "year": 2024,
    440       "arxiv_id": "2403.18624",
    441       "relevance": "Introduces PrimeVul dataset and evaluates LLM-based vulnerability detection, directly relevant to understanding limitations of current approaches."
    442     },
    443     {
    444       "title": "LineVul: A Transformer-based Line-Level Vulnerability Prediction",
    445       "authors": ["Michael Fu", "Chakkrit Tantithamthavorn"],
    446       "year": 2022,
    447       "doi": "10.1145/3524842.3528452",
    448       "relevance": "SOTA transformer-based DLVD model used as one of three evaluation models in VulScribeR experiments."
    449     },
    450     {
    451       "title": "Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks",
    452       "authors": ["Yaqin Zhou", "Shangqing Liu", "Jingkai Siow", "Xiaoning Du", "Yang Liu"],
    453       "year": 2019,
    454       "relevance": "Foundational GNN-based vulnerability detection model used both as a DLVD evaluator and as the primary training dataset source."
    455     },
    456     {
    457       "title": "Deep Learning Based Vulnerability Detection: Are We There Yet?",
    458       "authors": ["Saikat Chakraborty", "Rahul Krishna", "Yangruibo Ding", "Baishakhi Ray"],
    459       "year": 2020,
    460       "relevance": "Introduces Reveal dataset and DLVD model, evaluating the state of deep learning for vulnerability detection."
    461     },
    462     {
    463       "title": "GRACE: Empowering LLM-based software vulnerability detection with graph structure and in-context learning",
    464       "authors": ["Guilong Lu", "Xiaolin Ju", "Xiang Chen", "Wenlong Pei", "Zhilong Cai"],
    465       "year": 2024,
    466       "doi": "10.1016/j.jss.2024.112031",
    467       "relevance": "LLM-based vulnerability detection using RAG and graph structures, representing the LLM-based alternative approach to DLVD."
    468     },
    469     {
    470       "title": "Vul-RAG: Enhancing LLM-based Vulnerability Detection via Knowledge-level RAG",
    471       "authors": ["Xueying Du", "Geng Zheng", "Kaixin Wang"],
    472       "year": 2024,
    473       "arxiv_id": "2406.11147",
    474       "relevance": "RAG-based approach for LLM vulnerability detection, closely related to VulScribeR's use of RAG for vulnerability understanding."
    475     },
    476     {
    477       "title": "LLM agents can autonomously exploit one-day vulnerabilities",
    478       "authors": ["Richard Fang", "Rohan Bindu", "Akul Gupta", "Daniel Kang"],
    479       "year": 2024,
    480       "arxiv_id": "2404.08144",
    481       "relevance": "Demonstrates LLM agents' capability in vulnerability exploitation, relevant to understanding LLMs' security capabilities."
    482     },
    483     {
    484       "title": "Does data sampling improve deep learning-based vulnerability detection? Yeas! and Nays!",
    485       "authors": ["Xu Yang", "Shaowei Wang", "Yi Li", "Shaohua Wang"],
    486       "year": 2023,
    487       "relevance": "Studies data sampling effects on DLVD models, establishing that random oversampling (ROS) can improve detection; provides the context for the ROS baseline."
    488     },
    489     {
    490       "title": "Uncovering the limits of machine learning for automatic vulnerability detection",
    491       "authors": ["Niklas Risse", "Marcel Böhme"],
    492       "year": 2024,
    493       "relevance": "Examines fundamental limitations of ML-based vulnerability detection, relevant to understanding the ceiling for data augmentation approaches."
    494     },
    495     {
    496       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    497       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    498       "year": 2020,
    499       "arxiv_id": "2002.08155",
    500       "relevance": "Pre-trained code model used for embedding code samples in VulScribeR's clustering and retrieval pipeline."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 2,
    506       "justification": "Provides a usable pipeline for augmenting vulnerability datasets with released code and datasets, applicable to practitioners training vulnerability detectors."
    507     },
    508     "surprise_contrarian": {
    509       "score": 0,
    510       "justification": "Results confirm the expected intuition that LLMs with RAG produce better code augmentation than simpler pattern-mining approaches."
    511     },
    512     "fear_safety": {
    513       "score": 1,
    514       "justification": "Generates vulnerable code at scale which has dual-use potential, though framed entirely for defensive improvement of vulnerability detectors."
    515     },
    516     "drama_conflict": {
    517       "score": 0,
    518       "justification": "Standard academic contribution with no controversy or conflict narrative."
    519     },
    520     "demo_ability": {
    521       "score": 2,
    522       "justification": "Source code and augmented datasets are publicly released on GitHub, enabling replication of the pipeline."
    523     },
    524     "brand_recognition": {
    525       "score": 0,
    526       "justification": "From University of Manitoba, Washington State University, and University at Buffalo — not high-profile AI labs."
    527     }
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs