scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30602B)
      1 {
      2   "paper": {
      3     "title": "Everything You Wanted to Know About LLM-based Vulnerability Detection But Were Afraid to Ask",
      4     "authors": [
      5       "Yue Li",
      6       "Xiao Li",
      7       "Hao Wu",
      8       "Minghui Xu",
      9       "Yue Zhang",
     10       "Xiuzhen Cheng",
     11       "Fengyuan Xu",
     12       "Sheng Zhong"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2504.13474",
     17     "doi": "10.48550/arXiv.2504.13474"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper states 'Our dataset and code are available' with an anonymous link (https://anonymous.4open.science/r/CORRECT) provided in §1. This is an anonymized review link but does provide a working URL."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The dataset of 2,000 vulnerable-patched program pairs is released at the same anonymous URL. The paper also builds on publicly available sources: MoreFixes, PrimeVul, and ReposVul."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment specifications (requirements.txt, Dockerfile, library versions) are mentioned in the paper. Only tools used are named (cflow, Joern) without version details."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided. The paper describes the framework methodology but does not include a README or scripts to replicate experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results are reported as point estimates (e.g., '67% accuracy', 'precision close to 0.8', 'F1-score exceeds 70%'). No confidence intervals or error bars appear in figures or tables."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper makes numerous comparative claims (e.g., 'significantly surpassing the random baseline', 'significantly improved performance') but no statistical significance tests (p-values, t-tests, etc.) are reported anywhere."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports improvements with baseline context: e.g., precision from ~0.5 (w/o context) to ~0.8 (Strict Mode), (1,0) proportion from 25% random baseline to 37%, accuracy from ~50-55% to 67%. These provide sufficient context for understanding effect magnitudes."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The CWE-1000 subset uses 50 pairs per top-level CWE, with three categories having only 30, 10, and 10 pairs respectively. No power analysis or justification for these sample sizes is provided."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Temperature is set to 0 for deterministic results, yielding single-run numbers. No variance, standard deviation, or spread measures are reported across experimental conditions."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper compares four evaluation settings: w/o context w/o revision (matching prior work), w/ context w/o revision, Lenient Mode, and Strict Mode. Random guessing baselines (50% accuracy, 25% (1,0) proportion, 0.5 precision) are also established."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The 13 evaluated models include contemporary SOTA: DeepSeek-R1, DeepSeek-V3, o3-mini, Qwen2.5 series, and Llama-3.3-70B. Prior work comparisons reference recent studies from 2024-2025."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The framework is evaluated with systematic ablation of its components: context (w/o vs w/ context), revision process (w/o revision vs Lenient vs Strict Mode), and scaling dimensions (model size, sequential vs parallel test-time scaling)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics are used: F1-score, accuracy, recall, precision (conventional metrics), plus pair-wise prediction proportions ((1,0), (1,1), (0,0), (0,1)) as introduced in §5.1."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Manual inspection was conducted for error attribution: the authors 'manually inspected cases where the two models produced inconsistent results on patched code' (§5.3, Table 5), classifying errors into categories. In addition, a manual audit of 50 pairs found 98% label accuracy."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The 400-pair CWE-1000 subset is used purely for evaluation. No models were fine-tuned or tuned on this data — all models are evaluated in zero-shot mode with temperature 0."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table 4 and Figure 6 provide detailed breakdowns across 10 top-level CWE categories, classified into Common and Rare groups. F1-scores are reported per CWE type across all models."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Extensive failure analysis in §5.3 and Table 5: error types categorized as Patch Ignored, Patch Deemed Insufficient (with subcategories: Minimum Reasoning, Procedural Error, Mis-Corrected Reasoning). Appendix H provides concrete examples."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Several negative findings: sequential scaling degrades recall (~10% drop), reasoning models exhibit overthinking/mis-corrected reasoning (Finding #4), test-time scaling has diminishing returns (Finding #6), and LLMs struggle with rare vulnerability types (Finding #2)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims are supported: 67% accuracy (Finding #1), >70% F1 on key CWEs (Table 4, CWE-664 and CWE-682), precision nearing 0.8 (Figure 4(f)), false positives from reasoning errors (Table 5), diminishing returns (Finding #6)."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The primary causal claim — that context improves performance — is supported by controlled comparison between w/o context and w/ context conditions on the same models and dataset. Model scaling claims are supported by systematic size comparisons across model families."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title ('Everything You Wanted to Know About LLM-based Vulnerability Detection') and abstract claim broad applicability, but the dataset is entirely C/C++ code from specific projects (Linux, FFmpeg, etc.) covering 99 CWEs. Results may not generalize to other languages or vulnerability types. This scope limitation is not explicitly bounded."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The Discussion section explains root causes of prior misconceptions but does not consider alternative explanations for CORRECT's own positive results. For instance, the prompts include ground-truth CWE descriptions, which provides the model with information about what vulnerability type to look for — this potential confound is not discussed."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "A key contribution of the paper is explicitly distinguishing between correct labels (binary prediction) and correct rationales (root-cause identification). The two evaluation modes (Lenient and Strict) address this gap, and the paper argues prior work conflated label accuracy with true detection capability."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Table 3 lists model names (Qwen2.5-7B-Inst, Llama-3.1-8B-Inst, DeepSeek-R1, etc.) but o3-mini and GPT-4o (used as LLM-as-a-judge) lack API snapshot dates or version identifiers. Per schema rules, marketing names without snapshot dates do not count as specified versions."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompt templates are provided in Appendix E (Figures 10, 11, 12): Context-Rich Vulnerability Assessment Prompt, Rationale Assessment Prompt for vulnerable input, and Rationale Assessment Prompt for patched input. The structural template with variable slots (CWE description, code) is complete."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Temperature set to 0 for all models (§5.1), temperature 0.6 for parallel scaling experiments, max_feedback_rounds = 4 for Strict Mode. These are the key inference hyperparameters."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. CORRECT is a prompting-based evaluation framework that prompts models directly and evaluates their responses; the only multi-turn element is its bounded revision loop (max_feedback_rounds = 4 in Strict Mode)."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The data pipeline is documented in §4.1: CVE record crawling → patch diff extraction → function-level commit filtering → Code Property Graph construction → vulnerability-related context extraction → context merging. Tools used include cflow and Joern."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "A 'Limitations' subsection exists within the Discussion (§6), discussing LLM-as-a-judge accuracy (92%) and the impossibility of gathering complete context."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Specific threats discussed: LLM-as-a-judge accuracy validated at 91/99 (92%) on a random sample of 50 cases reviewed on ds-v3 and ds-r1; the practical impossibility of gathering all context for certain vulnerabilities. These are specific to this study."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what the results do NOT show. No discussion of language limitations (all C/C++), project-type limitations (heavily Linux-kernel), or that results may not transfer to other vulnerability detection scenarios (e.g., web applications, smart contracts)."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Dataset and code are released at the anonymous repository URL. The 2,000 vulnerable-patched pairs with context are available for verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "§4.1 describes data collection: sourced from MoreFixes, PrimeVul, and ReposVul; CVE records and patch commits used; repositories cloned and function-level diffs extracted. 364 real-world projects, 99 CWEs."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data is sourced from established vulnerability datasets (MoreFixes, PrimeVul, ReposVul) which are standard benchmarks."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "While the pipeline stages are described (§4.1, steps ➀-➄), exact filtering counts are not provided. The paper does not document how many CVEs were initially collected, how many were filtered at each stage, or what proportion was excluded and why before arriving at 2,000 pairs."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding information or acknowledgments section appears in the paper. The absence of any funding disclosure is noted."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are listed: Nanjing University (National Key Lab for Novel Software Technology) and Shandong University. No conflict with any evaluated product or company."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding is disclosed, making it impossible to assess funder independence. The absence of a funding disclosure does not confirm independence."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement or financial disclosure appears in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates are stated for any of the 13 models evaluated. The CVEs used span multiple years and could overlap with model training data."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether the CVE patches, commit messages, or vulnerability descriptions in the dataset were present in the training data of the evaluated models."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The dataset is built from publicly available sources (MoreFixes, PrimeVul, ReposVul) containing well-known CVEs. Models trained on internet data could have seen these exact patches. This contamination risk is not addressed."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. It is a benchmark evaluation of LLMs on vulnerability detection."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. The study evaluates LLMs on code datasets."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "Thinking-token counts are discussed in the test-time scaling analysis (Figure 7), but no overall inference cost, API spend, or per-example cost is reported for the main evaluation across 13 models × 400 pairs × multiple modes."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total computational budget (GPU hours, API costs, wall-clock time) is stated despite the substantial evaluation: 13 models, 400+ pairs, 4 evaluation modes, plus scaling experiments."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "Temperature is set to 0 for deterministic results, but no seed sensitivity analysis is reported. The paper does not examine how results vary with different temperatures or sampling configurations for the main evaluation."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is not explicitly stated. Temperature=0 implies single deterministic runs for main results, but this is not explicitly documented. For parallel scaling, 3/5/8 samples are stated."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search budget reported. The choice of temperature=0 follows prior work [51], and max_feedback_rounds=4 is justified by Table 6, but no systematic search was conducted or reported."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "The choice of zero-shot CoT is justified: '(a) it avoids example selection biases present in few-shot approaches, and (b) advanced prompting techniques prove ineffective for SOTA LLMs' (§4.2). max_feedback_rounds=4 is justified by Table 6 analysis."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Numerous comparisons are made across 13 models, 10 CWE categories, 4 evaluation modes, and scaling conditions, but no multiple comparison correction (Bonferroni, etc.) is applied."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own CORRECT framework against prior evaluation methodologies but do not acknowledge or address the bias of evaluating their own system. No independent evaluation is conducted."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "Figure 7 explicitly plots accuracy, precision, and recall as functions of thinking tokens for both o3-mini (at three effort levels) and r1-qn-14b (sequential and parallel scaling). The power-law relationship is discussed."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "Construct validity is a central focus of the paper. Sections 3 and 6 extensively argue that prior benchmarks failed to measure real vulnerability detection capability due to missing context, and CORRECT is designed to address this validity gap."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding is involved. Models are evaluated directly via prompts without agentic scaffolding, so the scaffold confound does not apply."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "The paper does not discuss temporal leakage. The CVEs span multiple years, and many predate the training cutoffs of the evaluated models, so the models may have seen the exact vulnerabilities, patches, and CVE descriptions during training."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The evaluation prompt includes ground-truth CWE descriptions, telling the model what vulnerability type to look for. This provides information not available in real-world detection scenarios. This potential feature leakage is not discussed."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Multiple CVEs come from the same projects (Linux has 400+ samples per Figure 9c). Structural similarities between test examples from the same codebase are not discussed."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference tests, temporal splits, or decontamination pipelines are applied."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Prior evaluations significantly underestimated LLM vulnerability detection capabilities due to missing context. DeepSeek-R1 achieves 67% accuracy and a 37% (1,0) proportion under CORRECT.",
    372       "evidence": "Figure 4(i),(j) show Strict Mode results vs w/o context baseline (~50-55% accuracy). Figure 5 shows (1,0) proportion reaching 37% vs 25% random baseline. (§5.2, Finding #1)",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "SOTA LLMs achieve precision close to 0.8 in distinguishing vulnerable from patched code, with only ~10% (1,1) false positive proportion.",
    377       "evidence": "Figure 4(f) shows precision approaching 0.8 for 671B models under Strict Mode. Figure 5 shows all models exhibit ~10% (1,1) proportion. (§5.3, Finding #3)",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Only 9.5% of false positives in reasoning models result from failing to notice patches; most arise from incorrect inference that patches are insufficient.",
    382       "evidence": "Table 5: 'Patch Ignored' accounts for 2/21 (9.5%) of ds-r1's FP cases, vs 'Patch Deemed Insufficient' at 19/21 (90.5%). Manual inspection of ds-v3 vs ds-r1 inconsistent cases. (§5.3)",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Model size scaling laws exist for vulnerability detection when context is provided, but are invisible in context-free evaluations.",
    387       "evidence": "Figure 4(c): F1 clustered at 0.5-0.6 without context, no scaling trend. Figure 4(i): clear scaling trends in Strict Mode across Qwen and Llama families. (§5.4, Finding #5)",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Test-time scaling follows a power-law relationship, making it uneconomical: a 5x increase in thinking tokens yields an accuracy improvement of less than 0.05 (5 percentage points).",
    392       "evidence": "Figure 7(d): o3-mini aggregate accuracy shows power-law relationship. o3-mini-high with 5x tokens of medium achieves <0.05 accuracy improvement. (§5.4, Finding #6)",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Sequential scaling induces conservative bias that reduces recall by approximately 10%, while parallel scaling avoids this issue.",
    397       "evidence": "Figure 7(i)-(l): r1-qn-14b sequential scaling shows recall declining from ~0.4 to ~0.2 as tokens increase beyond 2k. Parallel scaling maintains or improves recall. (§5.4, Finding #7)",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "LLMs achieve F1-scores around 0.7 on common fixed-pattern vulnerabilities (CWE-664, CWE-682) but struggle with rare vulnerability types (CWE-435, CWE-697).",
    402       "evidence": "Table 4: CWE-664 max F1=0.700, CWE-682 max F1=0.713 vs CWE-435 max F1=0.556, CWE-697 max F1=0.400. Figure 6 shows per-CWE breakdowns. (§5.2, Finding #2)",
    403       "supported": "strong"
    404     }
    405   ],
    406   "methodology_tags": ["benchmark-eval"],
    407   "key_findings": "The paper demonstrates that prior negative assessments of LLM vulnerability detection were artifacts of context-deprived evaluations. Under the CORRECT framework with rich contextual information, SOTA models (DeepSeek-R1) achieve 67% accuracy with precision near 0.8, significantly exceeding random baselines. Most false positives stem from reasoning errors (judging patches as insufficient) rather than failing to notice patches. Test-time and model scaling improve performance but with diminishing returns, and sequential scaling introduces conservative biases that reduce recall.",
    408   "red_flags": [
    409     {
    410       "flag": "Ground-truth CWE descriptions provided in prompts",
    411       "detail": "The evaluation prompt includes the ground-truth CWE category description, telling the model what vulnerability type to look for. Real-world vulnerability detection does not have this information, making the evaluation setting more favorable than practical deployment. This is essentially feature leakage that inflates apparent capability."
    412     },
    413     {
    414       "flag": "No contamination analysis despite public CVE dataset",
    415       "detail": "The dataset is built from well-known CVEs in public repositories (Linux, FFmpeg, etc.) using established datasets (MoreFixes, PrimeVul, ReposVul). Models trained on internet data almost certainly encountered these specific vulnerabilities and patches during training. No contamination analysis is performed."
    416     },
    417     {
    418       "flag": "No statistical significance tests",
    419       "detail": "Despite making numerous comparative claims ('significantly surpassing', 'significantly improved'), no statistical significance tests are reported. All comparisons are based on point estimates without uncertainty quantification."
    420     },
    421     {
    422       "flag": "Very small sample sizes for some CWE categories",
    423       "detail": "CWE-435 and CWE-697 have only 10 pairs each, and CWE-693 has only 30 pairs. Drawing conclusions about model performance on these categories from such small samples is unreliable."
    424     },
    425     {
    426       "flag": "LLM-as-a-judge with 8% error rate",
    427       "detail": "GPT-4o is used as LLM-as-a-judge for rationale assessment with a validated accuracy of 92% (91/99). An 8% error rate could systematically bias results, especially given that this evaluation step determines the key distinction between Lenient and Strict modes."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "Vulnerability Detection with Code Language Models: How Far Are We?",
    433       "authors": ["Yangruibo Ding", "Yanjun Fu", "Omniyyah Ibrahim"],
    434       "year": 2024,
    435       "relevance": "Key prior work on LLM vulnerability detection benchmarking (PrimeVul), one of the 'consensus' papers this work challenges."
    436     },
    437     {
    438       "title": "LLMs cannot reliably identify and reason about security vulnerabilities (yet?): A comprehensive evaluation, framework, and benchmarks",
    439       "authors": ["Saad Ullah", "Mingji Han", "Saurabh Pujar"],
    440       "year": 2024,
    441       "relevance": "SecLLMHolmes evaluation framework for LLM security analysis, a primary baseline this paper argues is flawed due to missing context."
    442     },
    443     {
    444       "title": "To Err is Machine: Vulnerability Detection Challenges LLM Reasoning",
    445       "authors": ["Benjamin Steenhoek", "Md Mahbubur Rahman", "Monoshi Kumar Roy"],
    446       "year": 2024,
    447       "arxiv_id": "2403.17218",
    448       "relevance": "Evaluates LLM reasoning on vulnerability detection across model scales, provides 'Consensus #3' (plateaued performance) that this paper challenges."
    449     },
    450     {
    451       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    452       "authors": ["Daya Guo", "Dejian Yang"],
    453       "year": 2025,
    454       "arxiv_id": "2501.12948",
    455       "relevance": "DeepSeek-R1 reasoning LLM evaluated as SOTA model in this paper, achieving the best vulnerability detection performance."
    456     },
    457     {
    458       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    459       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    460       "year": 2022,
    461       "relevance": "Foundational CoT prompting technique used in this paper's prompt-driven reasoning approach for vulnerability detection."
    462     },
    463     {
    464       "title": "Scaling llm test-time compute optimally can be more effective than scaling model parameters",
    465       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu"],
    466       "year": 2024,
    467       "arxiv_id": "2408.03314",
    468       "relevance": "Test-time scaling theory that this paper empirically tests in the vulnerability detection domain."
    469     },
    470     {
    471       "title": "Top score on the wrong exam: On benchmarking in machine learning for vulnerability detection",
    472       "authors": ["Niklas Risse", "Marcel Böhme"],
    473       "year": 2024,
    474       "arxiv_id": "2408.12986",
    475       "relevance": "Critiques ML benchmarking for vulnerability detection, aligns with this paper's argument about flawed evaluation practices."
    476     },
    477     {
    478       "title": "VulEval: Towards repository-level evaluation of software vulnerability detection",
    479       "authors": ["Xin-Cheng Wen", "Xinchen Wang"],
    480       "year": 2024,
    481       "arxiv_id": "2404.15596",
    482       "relevance": "Repository-level vulnerability evaluation framework; performs some context augmentation with caller-callee relationships."
    483     },
    484     {
    485       "title": "LLM4Vuln: A unified evaluation framework for decoupling and enhancing llms' vulnerability reasoning",
    486       "authors": ["Yuqiang Sun", "Daoyuan Wu"],
    487       "year": 2024,
    488       "arxiv_id": "2401.16185",
    489       "relevance": "LLM vulnerability reasoning evaluation framework with RAG-based context augmentation, a direct comparison point."
    490     },
    491     {
    492       "title": "Reasoning with LLMs for Zero-Shot Vulnerability Detection",
    493       "authors": ["Arastoo Zibaeirad", "Marco Vieira"],
    494       "year": 2025,
    495       "arxiv_id": "2503.17885",
    496       "relevance": "VulnSage: zero-shot reasoning approach for vulnerability detection that introduced 'not sure' responses to address context limitations."
    497     },
    498     {
    499       "title": "Benchmarking LLMs and LLM-based Agents in Practical Vulnerability Detection for Code Repositories",
    500       "authors": ["Alperen Yildiz", "Sin G Teo"],
    501       "year": 2025,
    502       "arxiv_id": "2503.03586",
    503       "relevance": "JitVul: benchmarks LLM-based agents for repository-level vulnerability detection, directly relevant to evaluation methodology."
    504     },
    505     {
    506       "title": "s1: Simple test-time scaling",
    507       "authors": ["Niklas Muennighoff", "Zitong Yang"],
    508       "year": 2025,
    509       "arxiv_id": "2501.19393",
    510       "relevance": "Test-time scaling methodology ('Wait' appending technique) directly used in this paper's sequential scaling experiments."
    511     }
    512   ]
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs