ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32177B)


      1 {
      2   "paper": {
      3     "title": "DLAP: A Deep Learning Augmented Large Language Model Prompting Framework for Software Vulnerability Detection",
      4     "authors": [
      5       "Yanjing Yang",
      6       "Xin Zhou",
      7       "Runfeng Mao",
      8       "Jinwei Xu",
      9       "Lanxin Yang",
     10       "Yu Zhang",
     11       "Haifeng Shen",
     12       "He Zhang"
     13     ],
     14     "year": 2024,
     15     "venue": "Journal of Systems and Software",
     16     "arxiv_id": "2405.01202",
     17     "doi": "10.48550/arXiv.2405.01202"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "DLAP combines DL model predictions with LLM prompting via ICL and COT to improve vulnerability detection. Linevul was the most effective DL model plugin among three tested, outperforming Devign by ~10% MCC on average. DLAP outperformed four prompting baselines (PRol, PAux, PCot, GRACE) across four C/C++ projects, with higher F1 and MCC. Compared to LoRA fine-tuning of Vicuna-13B, DLAP achieved comparable average F1 at significantly lower computational cost but underperformed fine-tuning on larger projects.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "A GitHub URL is provided: https://github.com/Yang-Yanjing/DLAP.git, referenced in a footnote in Section 1 as 'Data and materials.'"
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The datasets are built from public open-source projects (Chrome, Linux, Android, Qemu) and the GitHub repository is stated to contain data and materials. The underlying vulnerability datasets from Fan et al. and Chakraborty et al. are publicly available."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Table 2 lists DL model hyperparameters including Java versions and specific tool versions (Joern 0.3.1, Neo4j), but no Python version, pip requirements, Docker configuration, or complete dependency list is provided in the paper."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper describes the DLAP algorithm (Algorithm 1) and experimental design, but provides no step-by-step reproduction instructions, no README description, and no commands to run the experiments."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Tables 3, 5, and 6 report only point estimates (e.g., '40.4', '73.3'). No confidence intervals, error bars, or ± notation appear anywhere in the results."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No statistical significance tests (p-values, t-tests, Wilcoxon, etc.) are reported. Claims like 'DLAP outperforms' are based solely on comparing raw metric values across frameworks."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports absolute percentage differences with baseline context: 'Linevul surpasses using Devign by an average of 7.2% and 10.5% on F1 and MCC' (Section 5.1), and '10% higher F1 score and a 20% higher MCC' (Section 1). Tables show both DLAP and baseline values."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Section 4.2 describes three criteria for project selection (researched by related work, >3000 functions, traceable) but provides no statistical power analysis or justification for why four projects or the specific dataset sizes are sufficient."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No standard deviations, variance across runs, or interquartile ranges are reported. All results appear to be from single experimental runs with no spread measures."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Four prompting baselines (PRol, PAux, PCot, GRACE) are compared in Table 5 (RQ2), and LoRA fine-tuning of Vicuna-13B is compared in Table 6 (RQ3). Three DL models are also compared in Table 3 (RQ1)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "GRACE (Lu et al. 2024) and the prompting frameworks from Zhang et al. (2023) are recent. LoRA (Hu et al. 2022) is the standard efficient fine-tuning technique. The baselines represent the state of the art at time of writing."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "RQ1 (Section 5.1, Table 3) systematically compares three DL model variants (Sysevr, Devign, Linevul) as plugins for DLAP, effectively ablating the DL model component to determine its impact."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Five evaluation metrics are used: Precision, Recall, F1-score, FPR, and MCC (Section 4.5). Results are reported across all five in Tables 3, 5, and 6."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation of DLAP's outputs is conducted. Figure 8 shows one qualitative example of DLAP output matching a fix commit, but there is no systematic human evaluation of detection quality or explanatory text."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 4.2 states: 'we divided the dataset into training and testing sets with the 8:2 proportion. The training set was used to build DL models, while the testing set was used to evaluate the performance of DLAP.'"
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are reported per-project (Chrome, Android, Linux, Qemu) across all metrics in Tables 3, 5, and 6, allowing comparison of performance variation across different project characteristics."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "No systematic failure case analysis is provided. Figure 8 shows one successful example. The paper does not discuss where DLAP produces incorrect predictions, what types of vulnerabilities it misses, or qualitative error analysis."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "RQ3 (Table 6) shows DLAP underperforms fine-tuning on Chrome (F1: 52.1 vs 82.0) and Linux (F1: 65.4 vs 70.3). RQ1 shows Sysevr performs poorly as a DLAP plugin (e.g., Chrome MCC 14.6% vs Linevul 37.6%)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The abstract claims DLAP 'outperforms state-of-the-art prompting frameworks...as well as fine-tuning on multiple metrics.' While DLAP does beat prompting baselines, Table 6 shows fine-tuning (Vicuna-13B) achieves higher F1 on Chrome (82.0 vs 52.1) and Linux (70.3 vs 65.4) and higher overall precision (88.7 vs 54.1). The abstract significantly overstates the fine-tuning comparison."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper makes central causal claims about 'implicit fine-tuning' of LLMs (Equations 3-5, Section 3.3) and that DLAP 'stimulates' LLMs to adapt. The authors acknowledge in construct validity (Section 7) that they 'can not strictly demonstrate that the stimulation produces gradient descent optimization loss.' The causal mechanism is theoretically argued but not empirically verified."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims 'Software Vulnerability Detection' generally, but experiments are limited to four C/C++ projects. Section 6.2 discusses generalization to other tasks (library identification, code smell detection) speculatively without bounding the actual results to the tested setting."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "Section 7 discusses threats to validity but focuses on methodological limitations (DL model choice, LLM choice, observability of internals) rather than alternative explanations for the results. For example, DLAP's improvement could simply be due to providing more contextual information rather than 'implicit fine-tuning,' but this alternative is not discussed."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures binary classification metrics (precision, recall, F1, FPR, MCC) for vulnerability detection and claims vulnerability detection performance. The measurement granularity matches the claims — they do not inflate function-level detection into broader security claims."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 5.2 specifies 'GPT-3.5-turbo-0125' (includes version date), and the fine-tuning comparison uses 'Llama-13b' / 'Vicuna-13B'. DL models are specified by name (Linevul, Devign, Sysevr) with architecture details in Table 2."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Full prompt text is provided for the PRol, PAux, and PCot baselines in Section 4.4. DLAP prompt examples with actual content are shown in Figures 2, 3, 4, and 5. The COT template library is referenced on GitHub."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Table 2 provides extensive DL model hyperparameters (learning rates, batch sizes, optimizers, etc.), but no LLM API parameters are reported for GPT-3.5-turbo (temperature, top-p, max tokens, frequency penalty). These significantly affect generation quality."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "DLAP is a prompt construction pipeline, not agentic scaffolding. The LLM is called once with a pre-assembled prompt; there is no agent loop, tool calling by the LLM, retry logic, or memory management."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.2 documents the preprocessing: project selection criteria (3 criteria listed), random undersampling of non-vulnerable samples for class balance, and 8:2 train/test split. Table 1 provides raw function and vulnerability counts per project."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 'Threats to Validity' is a dedicated section discussing internal validity, construct validity, and external validity, spanning approximately one page."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Internal validity discusses the specific risk of selecting an erroneous DL model. Construct validity specifically discusses inability to observe LLM internal outputs to verify implicit fine-tuning. External validity identifies GPT-3.5-turbo-0125 as a specific threat — different LLMs may yield different results."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound findings to C/C++ projects, function-level detection, or the specific LLM used. Section 6.2 instead speculates about broader applicability without stating scope boundaries."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The GitHub repository (https://github.com/Yang-Yanjing/DLAP.git) is stated to contain data and materials. The underlying vulnerability datasets are from publicly available sources (Chrome, Linux, Android, Qemu projects used by Chakraborty et al. and Fan et al.)."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4.2 describes three criteria for project selection: (1) researched by related work, (2) >3,000 functions, (3) traceable vulnerability fix records. Table 1 provides basic statistics. Source references (Chakraborty et al. [4], Fan et al. [12], Zhou et al. [49]) are cited."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. Data sources are standard public open-source projects commonly used as benchmarks in vulnerability detection research."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "The paper describes undersampling and 8:2 splitting but does not report the resulting dataset sizes after undersampling, how many examples were removed at each stage, or the final training/test set sizes per project."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed: Nanjing University (China) and Southern Cross University (Australia). The authors are academic researchers not affiliated with the evaluated products (GPT, Llama)."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, so independence cannot be assessed. Without a funding statement, this criterion cannot be verified."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests statement or financial interest declaration appears in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No mention of GPT-3.5-turbo's training data cutoff date. The model may have been trained on data containing the public vulnerability datasets used for evaluation."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of whether the Chrome, Linux, Android, or Qemu vulnerability data (all publicly available) could have appeared in GPT-3.5-turbo's training corpus."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "The vulnerability datasets from public projects were available online well before GPT-3.5-turbo's training cutoff. Vulnerability fix commits and CVE descriptions are widely indexed. This contamination risk is not addressed."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study. It is a benchmark evaluation of automated vulnerability detection."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Table 7 reports memory usage (MB), wall-clock time (hours), and GPU memory (GB) for both DLAP and LoRA fine-tuning across all four datasets. Section 5.2 mentions 'cost constraints associated with OpenAI API calls.'"
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Table 7 provides GPU memory requirements and training time for each dataset. DLAP ranges from 0.3-0.8 hours and 2.8-6.3 GB GPU, versus fine-tuning at 1.3-11.1 hours and 28.7-31.2 GB GPU."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of multiple random seeds, seed sensitivity analysis, or results across different initializations. All results appear to be single-run."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged across multiple runs."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Section 4.3 states that the hyperparameter settings 'referenced the parameters reported in the respective research papers.' No search budget, search method, or number of configurations tried is reported."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "RQ1 (Section 5.1) systematically compares three DL models on the test set and selects Linevul based on superior performance across all metrics and datasets. The selection criterion (best overall F1 and MCC) is clear."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Comparisons are made across 4 datasets × 5 metrics × multiple frameworks with no statistical tests performed at all, let alone corrections for multiple comparisons."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors implement their own versions of baseline prompting frameworks (PRol, PAux, PCot) and do not acknowledge the bias of evaluating their own system against their own baseline implementations. No independent evaluation or discussion of this bias."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Table 7 directly compares computational costs (memory, time, GPU) between DLAP and LoRA fine-tuning alongside performance results in Table 6, allowing readers to assess the cost-performance tradeoff."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "While Section 4.2 references Croft et al. [10] on labeling bias, the paper does not discuss whether binary classification of vulnerability presence at function level actually measures vulnerability detection capability, or whether the chosen metrics capture real-world detection utility."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "RQ3 compares DLAP (using GPT-3.5-turbo) against fine-tuning (using Vicuna-13B) — different models with different architectures and training. The performance difference cannot be attributed purely to prompting vs. fine-tuning since the underlying LLMs differ. This confound is not discussed."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of temporal leakage. The vulnerability datasets contain historical vulnerability fixes from Chrome, Linux, Android, and Qemu that predate GPT-3.5-turbo's training. The model may have memorized vulnerability patterns or fix commits."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of feature leakage. The ICL prompts provide similar code examples with DL model predictions (probabilities), which could leak information about the ground truth labels through the training-set examples."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of non-independence. Training and test examples come from the same projects, so functions may share structural similarities, common patterns, or even be near-duplicates from the same codebase."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are mentioned."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Linevul is the most effective DL model for augmenting DLAP, outperforming Devign by 7.2% F1 and 10.5% MCC on average, and Sysevr by 28.4% F1 and 34.0% MCC on average.",
    374       "evidence": "Table 3 (Section 5.1) shows per-project results across three DL models on all five metrics. Linevul achieves the highest CV in probability distribution (Table 4), indicating more discrete predictions.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "DLAP outperforms all state-of-the-art prompting frameworks (PRol, PAux, PCot, GRACE) across all metrics and datasets.",
    379       "evidence": "Table 5 (Section 5.2) shows DLAP achieves the highest F1 and MCC on all four projects. For example, Chrome F1: DLAP 52.1% vs GRACE 32.6%, Linux MCC: DLAP 56.4% vs PCot 6.5%.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "DLAP achieves performance comparable to fine-tuning at significantly lower computational cost.",
    384       "evidence": "Table 6 (Section 5.3) shows fine-tuning achieves higher F1 on Chrome (82.0 vs 52.1) and Linux (70.3 vs 65.4), but DLAP wins on Android (49.3 vs 46.7) and Qemu (66.7 vs 12.1). Table 7 shows DLAP requires ~5x less GPU memory and ~5-14x less time.",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "DLAP stimulates implicit fine-tuning of LLMs through ICL prompts, producing similar predictive distributions to explicit fine-tuning.",
    389       "evidence": "Theoretical analysis in Equations 3-5 and Appendix, plus Figure 7 showing similar probability distributions between DLAP and fine-tuning. Authors acknowledge in Section 7 they 'can not strictly demonstrate' the mechanism.",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "DL models with more discrete probability distributions (higher CV) are more suitable as DLAP plugins.",
    394       "evidence": "Table 4 shows Linevul has the highest CV (2.7 average) and Figure 6 shows its more discrete distribution. This correlates with Linevul's superior performance in Table 3.",
    395       "supported": "weak"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Confounded fine-tuning comparison",
    401       "detail": "RQ3 compares DLAP (GPT-3.5-turbo) against fine-tuning (Vicuna-13B) — different underlying LLMs. Performance differences could be due to model capability differences rather than prompting vs. fine-tuning. This makes the central claim about prompting rivaling fine-tuning unreliable."
    402     },
    403     {
    404       "flag": "No statistical significance testing",
    405       "detail": "All claims of superiority (across 4 datasets × 5 metrics × multiple comparisons) are based on comparing raw numbers with no significance tests, error bars, or variance reporting. With single-run results, observed differences may be within noise."
    406     },
    407     {
    408       "flag": "Abstract overclaims fine-tuning comparison",
    409       "detail": "The abstract states DLAP outperforms 'fine-tuning on multiple metrics,' but Table 6 shows fine-tuning achieves dramatically higher F1 on Chrome (82.0 vs 52.1) and Linux (70.3 vs 65.4). DLAP's advantage is limited to the smallest dataset (Qemu) and partially to Android."
    410     },
    411     {
    412       "flag": "Severe contamination risk",
    413       "detail": "GPT-3.5-turbo was likely trained on data containing the public vulnerability fix commits from Chrome, Linux, Android, and Qemu. The model may have memorized vulnerability patterns or even specific fix descriptions, inflating or confounding all results. This is never discussed."
    414     },
    415     {
    416       "flag": "Model selection on test set",
    417       "detail": "RQ1 selects Linevul as the best DL model based on test set performance (Table 3), then RQ2 evaluates DLAP with Linevul on the same test set. This introduces selection bias — the test set influenced the system configuration."
    418     },
    419     {
    420       "flag": "Single-run results with no variance",
    421       "detail": "DL model training involves random initialization, and undersampling involves randomness. All results appear to be from single runs with no seed sensitivity analysis or variance reporting, making reproducibility of exact numbers uncertain."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Deep learning based vulnerability detection: Are we there yet?",
    427       "authors": ["Saikat Chakraborty", "Rahul Krishna", "Yangruibo Ding", "Baishakhi Ray"],
    428       "year": 2022,
    429       "relevance": "Empirical study showing DL-based vulnerability detection performance drops 73% on real-world projects, demonstrating generalization challenges."
    430     },
    431     {
    432       "title": "Linevul: A transformer-based line-level vulnerability prediction",
    433       "authors": ["Michael Fu", "Chakkrit Tantithamthavorn"],
    434       "year": 2022,
    435       "relevance": "The DL model selected as the best plugin for DLAP; transformer-based vulnerability prediction achieving strong function-level and line-level results."
    436     },
    437     {
    438       "title": "An empirical study of deep learning models for vulnerability detection",
    439       "authors": ["Benjamin Steenhoek", "Md Mahbubur Rahman", "Richard Jiles", "Wei Le"],
    440       "year": 2023,
    441       "relevance": "Empirical study demonstrating variability between runs and low agreement among DL models for vulnerability detection."
    442     },
    443     {
    444       "title": "Prompt-enhanced software vulnerability detection using chatgpt",
    445       "authors": ["Chenyuan Zhang", "Hao Liu", "Jiutian Zeng", "Kejing Yang", "Yuhong Li", "Hui Li"],
    446       "year": 2023,
    447       "arxiv_id": "2308.12697",
    448       "relevance": "Baseline prompting framework using role-based and auxiliary information prompts for ChatGPT vulnerability detection."
    449     },
    450     {
    451       "title": "GRACE: Empowering LLM-based software vulnerability detection with graph structure and in-context learning",
    452       "authors": ["Guanjun Lu", "Xiaojun Ju", "Xiang Chen", "Wenhua Pei", "Zhiyong Cai"],
    453       "year": 2024,
    454       "relevance": "State-of-the-art baseline that combines graph structural information with ICL for LLM-based vulnerability detection."
    455     },
    456     {
    457       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    458       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma"],
    459       "year": 2022,
    460       "relevance": "Foundational work on chain-of-thought prompting, the core technique used in DLAP's COT component."
    461     },
    462     {
    463       "title": "Why can GPT learn in-context? Language models implicitly perform gradient descent as meta-optimizers",
    464       "authors": ["Damai Dai", "Yutao Sun", "Li Dong", "Yaru Hao"],
    465       "year": 2023,
    466       "relevance": "Theoretical basis for DLAP's claim that ICL produces implicit fine-tuning of LLM attention layers."
    467     },
    468     {
    469       "title": "LoRA: Low-rank adaptation of large language models",
    470       "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis", "Zeyuan Allen-Zhu"],
    471       "year": 2022,
    472       "relevance": "State-of-the-art efficient fine-tuning technique used as the fine-tuning baseline in RQ3."
    473     },
    474     {
    475       "title": "SySeVR: A framework for using deep learning to detect software vulnerabilities",
    476       "authors": ["Zhen Li", "Deqing Zou", "Shouhuai Xu", "Hai Jin"],
    477       "year": 2021,
    478       "relevance": "DL framework for vulnerability detection using syntactic/semantic code representation; one of three DL models tested in DLAP."
    479     },
    480     {
    481       "title": "Devign: Effective vulnerability identification by learning comprehensive program semantics via graph neural networks",
    482       "authors": ["Yaqin Zhou", "Shangqing Liu", "Jingkai Siow", "Xiaoning Du", "Yang Liu"],
    483       "year": 2019,
    484       "relevance": "Graph neural network approach to vulnerability detection; one of three DL models tested in DLAP."
    485     },
    486     {
    487       "title": "Software vulnerability detection using large language models",
    488       "authors": ["Moumita Das Purba", "Arpita Ghosh", "Benjamin J. Radford", "Bill Chu"],
    489       "year": 2023,
    490       "relevance": "Evaluation of LLMs for software vulnerability detection, establishing baseline LLM performance levels."
    491     },
    492     {
    493       "title": "Evaluation of ChatGPT model for vulnerability detection",
    494       "authors": ["Anton Cheshkov", "Pavel Zadorozhny", "Rodion Levichev"],
    495       "year": 2023,
    496       "arxiv_id": "2304.07232",
    497       "relevance": "Found that ChatGPT and GPT-3 do not outperform existing tools for Java vulnerability detection."
    498     },
    499     {
    500       "title": "Data quality for software vulnerability datasets",
    501       "authors": ["Roland Croft", "M. Ali Babar", "M. Mehdi Kholoosi"],
    502       "year": 2023,
    503       "relevance": "Identifies labeling bias in common vulnerability detection datasets, directly motivating DLAP's dataset selection criteria."
    504     },
    505     {
    506       "title": "Exploring distributional shifts in large language models for code analysis",
    507       "authors": ["Shushan Arakelyan", "Rocktim Jyoti Das", "Yi Mao", "Xiang Ren"],
    508       "year": 2023,
    509       "relevance": "Studies how distributional shifts affect LLM performance on code analysis tasks, relevant to prompt engineering effectiveness."
    510     }
    511   ],
    512   "engagement_factors": {
    513     "practical_relevance": {
    514       "score": 2,
    515       "justification": "DLAP provides a usable framework for vulnerability detection combining DL models with LLM prompting, with code available on GitHub."
    516     },
    517     "surprise_contrarian": {
    518       "score": 1,
    519       "justification": "Combines existing techniques (ICL, COT, DL models) in a novel way but does not challenge fundamental assumptions about LLM capabilities."
    520     },
    521     "fear_safety": {
    522       "score": 1,
    523       "justification": "Addresses software vulnerability detection which is security-relevant, but presents a defensive tool rather than demonstrating novel attack vectors."
    524     },
    525     "drama_conflict": {
    526       "score": 0,
    527       "justification": "No controversy, no criticism of existing tools or companies, straightforward technical contribution."
    528     },
    529     "demo_ability": {
    530       "score": 1,
    531       "justification": "GitHub repository exists but requires DL model training, GPT API access, and static tool setup — not trivially runnable."
    532     },
    533     "brand_recognition": {
    534       "score": 1,
    535       "justification": "Uses GPT-3.5-turbo (a recognizable product), but the work comes from a lesser-known university lab rather than a major AI company."
    536     }
    537   }
    538 }

Impressum · Datenschutz