scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (36596B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Everything You Wanted to Know About LLM-based Vulnerability Detection But Were Afraid to Ask",
      6     "authors": [
      7       "Yue Li",
      8       "Xiao Li",
      9       "Hao Wu",
     10       "Minghui Xu",
     11       "Yue Zhang",
     12       "Xiuzhen Cheng",
     13       "Fengyuan Xu",
     14       "Sheng Zhong"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2504.13474",
     19     "doi": "10.48550/arXiv.2504.13474"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Abstract claims are supported: 67% accuracy (Finding #1), >70% F1 on key CWEs (Table 4, CWE-664 and CWE-682), precision nearing 0.8 (Figure 4(f)), false positives from reasoning errors (Table 5), diminishing returns (Finding #6).",
     27         "source": "opus"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The primary causal claim — that context improves performance — is supported by controlled comparison between w/o context and w/ context conditions on the same models and dataset. Model scaling claims are supported by systematic size comparisons across model families.",
     33         "source": "opus"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The title ('Everything You Wanted to Know About LLM-based Vulnerability Detection') and abstract claim broad applicability, but the dataset is entirely C/C++ code from specific projects (Linux, FFmpeg, etc.) covering 99 CWEs. Results may not generalize to other languages or vulnerability types. This scope limitation is not explicitly bounded.",
     39         "source": "opus"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The Discussion section explains root causes of prior misconceptions but does not consider alternative explanations for CORRECT's own positive results. For instance, the prompts include ground-truth CWE descriptions, which provides the model with information about what vulnerability type to look for — this potential confound is not discussed.",
     45         "source": "opus"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "A key contribution of the paper is explicitly distinguishing between correct labels (binary prediction) and correct rationales (root-cause identification). The two evaluation modes (Lenient and Strict) address this gap, and the paper argues prior work conflated label accuracy with true detection capability.",
     51         "source": "opus"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "A 'Limitations' subsection exists within the Discussion (§6), discussing LLM-as-a-judge accuracy (92%) and the impossibility of gathering complete context.",
     59         "source": "opus"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Specific threats discussed: LLM-as-a-judge accuracy validated at 91/99 (92%) on a random sample of 50 cases reviewed on ds-v3 and ds-r1; the practical impossibility of gathering all context for certain vulnerabilities. These are specific to this study.",
     65         "source": "opus"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The paper does not explicitly state what the results do NOT show. No discussion of language limitations (all C/C++), project-type limitations (heavily Linux-kernel), or that results may not transfer to other vulnerability detection scenarios (e.g., web applications, smart contracts).",
     71         "source": "opus"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No funding information or acknowledgments section appears in the paper. The absence of any funding disclosure is noted.",
     79         "source": "opus"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations are listed: Nanjing University (National Key Lab for Novel Software Technology) and Shandong University. No conflict with any evaluated product or company.",
     85         "source": "opus"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No funding is disclosed, making it impossible to assess funder independence. The absence of a funding disclosure does not confirm independence.",
     91         "source": "opus"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests statement or financial disclosure appears in the paper.",
     97         "source": "opus"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "'Context' is explicitly defined (callee functions, type declarations, global variables, slicing paths); 'reasoning' is defined via System 1/System 2 framing in §2.1; 'vulnerability detection' evaluation modes are formally defined in §4.3.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Three contributions are explicitly listed: the CORRECT framework, a 2,000-pair context-rich dataset, and evidence refuting three community consensus beliefs.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Table 1 systematically compares CORRECT against eight prior evaluation frameworks along four dimensions, and §2.2 explains specific limitations of each prior approach that CORRECT addresses.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "The paper states 'Our dataset and code are available' with an anonymous link (https://anonymous.4open.science/r/CORRECT) provided in §1. This is an anonymized review link but does provide a working URL.",
    128           "source": "opus"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "The dataset of 2,000 vulnerable-patched program pairs is released at the same anonymous URL. The paper also builds on publicly available sources: MoreFixes, PrimeVul, and ReposVul.",
    134           "source": "opus"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No environment specifications (requirements.txt, Dockerfile, library versions) are mentioned in the paper. Only tools used are named (cflow, Joern) without version details.",
    140           "source": "opus"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No step-by-step reproduction instructions are provided. The paper describes the framework methodology but does not include a README or scripts to replicate experiments.",
    146           "source": "opus"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "All results are reported as point estimates (e.g., '67% accuracy', 'precision close to 0.8', 'F1-score exceeds 70%'). No confidence intervals or error bars appear in figures or tables.",
    154           "source": "opus"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "The paper makes numerous comparative claims (e.g., 'significantly surpassing the random baseline', 'significantly improved performance') but no statistical significance tests (p-values, t-tests, etc.) are reported anywhere.",
    160           "source": "opus"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "The paper reports improvements alongside their baselines: e.g., precision from ~0.5 (w/o context) to ~0.8 (Strict Mode), (1,0) proportion from 25% random baseline to 37%, accuracy from ~50-55% to 67%. These provide sufficient reference points for understanding effect magnitudes.",
    166           "source": "opus"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The CWE-1000 subset uses 50 pairs per top-level CWE, with three categories having only 30, 10, and 10 pairs respectively. No power analysis or justification for these sample sizes is provided.",
    172           "source": "opus"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Temperature is set to 0 for deterministic results, yielding single-run numbers. No variance, standard deviation, or spread measures are reported across experimental conditions.",
    178           "source": "opus"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The paper compares four evaluation settings: w/o context w/o revision (matching prior work), w/ context w/o revision, Lenient Mode, and Strict Mode. Random guessing baselines (50% accuracy, 25% (1,0) proportion, 0.5 precision) are also established.",
    186           "source": "opus"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "The 13 evaluated models include contemporary SOTA: DeepSeek-R1, DeepSeek-V3, o3-mini, Qwen2.5 series, and Llama-3.3-70B. Prior work comparisons reference recent studies from 2024-2025.",
    192           "source": "opus"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "The framework is evaluated with systematic ablation of its components: context (w/o vs w/ context), revision process (w/o revision vs Lenient vs Strict Mode), and scaling dimensions (model size, sequential vs parallel test-time scaling).",
    198           "source": "opus"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Multiple metrics are used: F1-score, accuracy, recall, precision (conventional metrics), plus pair-wise prediction proportions ((1,0), (1,1), (0,0), (0,1)) as introduced in §5.1.",
    204           "source": "opus"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Manual inspection was conducted for error attribution: the authors 'manually inspected cases where the two models produced inconsistent results on patched code' (§5.3, Table 5), classifying errors into categories. Separately, a manual audit of 50 pairs found 98% label accuracy.",
    210           "source": "opus"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "The 400-pair CWE-1000 subset is used purely for evaluation. No models were fine-tuned or tuned on this data — all models are evaluated in zero-shot mode with temperature 0.",
    216           "source": "opus"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Table 4 and Figure 6 provide detailed breakdowns across 10 top-level CWE categories, classified into Common and Rare groups. F1-scores are reported per CWE type across all models.",
    222           "source": "opus"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Extensive failure analysis in §5.3 and Table 5: error types categorized as Patch Ignored, Patch Deemed Insufficient (with subcategories: Minimum Reasoning, Procedural Error, Mis-Corrected Reasoning). Appendix H provides concrete examples.",
    228           "source": "opus"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "Several negative findings: sequential scaling degrades recall (~10% drop), reasoning models exhibit overthinking/mis-corrected reasoning (Finding #4), test-time scaling has diminishing returns (Finding #6), and LLMs struggle with rare vulnerability types (Finding #2).",
    234           "source": "opus"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "Table 3 lists model names (Qwen2.5-7B-Inst, Llama-3.1-8B-Inst, DeepSeek-R1, etc.) but o3-mini and GPT-4o (used as LLM-as-a-judge) lack API snapshot dates or version identifiers. Per schema rules, marketing names without snapshot dates do not count as specified versions.",
    242           "source": "opus"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Full prompt templates are provided in Appendix E (Figures 10, 11, 12): Context-Rich Vulnerability Assessment Prompt, Rationale Assessment Prompt for vulnerable input, and Rationale Assessment Prompt for patched input. The structural template with variable slots (CWE description, code) is complete.",
    248           "source": "opus"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Temperature set to 0 for all models (§5.1), temperature 0.6 for parallel scaling experiments, max_feedback_rounds = 4 for Strict Mode. These are the key inference hyperparameters.",
    254           "source": "opus"
    255         },
    256         "scaffolding_described": {
    257           "applies": false,
    258           "answer": false,
    259           "justification": "No agentic scaffolding is used. CORRECT is a prompting-based evaluation framework that sends single prompts to models and evaluates responses.",
    260           "source": "opus"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "The data pipeline is documented in §4.1: CVE record crawling → patch diff extraction → function-level commit filtering → Code Property Graph construction → vulnerability-related context extraction → context merging. Tools used include cflow and Joern.",
    266           "source": "opus"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Dataset and code are released at the anonymous repository URL. The 2,000 vulnerable-patched pairs with context are available for verification.",
    274           "source": "opus"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "§4.1 describes data collection: sourced from MoreFixes, PrimeVul, and ReposVul; CVE records and patch commits used; repositories cloned and function-level diffs extracted. 364 real-world projects, 99 CWEs.",
    280           "source": "opus"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "No human participants. Data is sourced from established vulnerability datasets (MoreFixes, PrimeVul, ReposVul) which are standard benchmarks.",
    286           "source": "opus"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": false,
    291           "justification": "While the pipeline stages are described (§4.1, steps ➀-➄), exact filtering counts are not provided. The paper does not document how many CVEs were initially collected, how many were filtered at each stage, or what proportion was excluded and why before arriving at 2,000 pairs.",
    292           "source": "opus"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "No training data cutoff dates are stated for any of the 13 models evaluated. The CVEs used span multiple years and could overlap with model training data.",
    300           "source": "opus"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "No discussion of whether the CVE patches, commit messages, or vulnerability descriptions in the dataset were present in the training data of the evaluated models.",
    306           "source": "opus"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": false,
    311           "justification": "The dataset is built from publicly available sources (MoreFixes, PrimeVul, ReposVul) containing well-known CVEs. Models trained on internet data could have seen these exact patches. This contamination risk is not addressed.",
    312           "source": "opus"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants in this study. It is a benchmark evaluation of LLMs on vulnerability detection.",
    320           "source": "opus"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants. The study evaluates LLMs on code datasets.",
    326           "source": "opus"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "opus"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "opus"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "opus"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "opus"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants.",
    356           "source": "opus"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "Token counts for thinking tokens are discussed in the test-time scaling analysis (Figure 7), but no overall inference cost, API spend, or per-example cost is reported for the main evaluation across 13 models × 400 pairs × multiple modes.",
    364           "source": "opus"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "No total computational budget (GPU hours, API costs, wall-clock time) is stated despite the substantial evaluation: 13 models, 400+ pairs, 4 evaluation modes, plus scaling experiments.",
    370           "source": "opus"
    371         }
    372       },
    373       "experimental_rigor": {
    374         "seed_sensitivity_reported": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "Temperature is set to 0 for deterministic results, but no seed sensitivity analysis is reported. The paper does not examine how results vary with different temperatures or sampling configurations for the main evaluation.",
    378           "source": "opus"
    379         },
    380         "number_of_runs_stated": {
    381           "applies": true,
    382           "answer": false,
    383           "justification": "The number of experimental runs is not explicitly stated. Temperature=0 implies single deterministic runs for main results, but this is not explicitly documented. For parallel scaling, 3/5/8 samples are stated.",
    384           "source": "opus"
    385         },
    386         "hyperparameter_search_budget": {
    387           "applies": true,
    388           "answer": false,
    389           "justification": "No hyperparameter search budget reported. The choice of temperature=0 follows prior work [51], and max_feedback_rounds=4 is justified by Table 6, but no systematic search was conducted or reported.",
    390           "source": "opus"
    391         },
    392         "best_config_selection_justified": {
    393           "applies": true,
    394           "answer": true,
    395           "justification": "The choice of zero-shot CoT is justified: '(a) it avoids example selection biases present in few-shot approaches, and (b) advanced prompting techniques prove ineffective for SOTA LLMs' (§4.2). max_feedback_rounds=4 is justified by Table 6 analysis.",
    396           "source": "opus"
    397         },
    398         "multiple_comparison_correction": {
    399           "applies": true,
    400           "answer": false,
    401           "justification": "Numerous comparisons are made across 13 models, 10 CWE categories, 4 evaluation modes, and scaling conditions, but no multiple comparison correction (Bonferroni, etc.) is applied.",
    402           "source": "opus"
    403         },
    404         "self_comparison_bias_addressed": {
    405           "applies": true,
    406           "answer": false,
    407           "justification": "The authors evaluate their own CORRECT framework against prior evaluation methodologies but do not acknowledge or address the bias of evaluating their own system. No independent evaluation is conducted.",
    408           "source": "opus"
    409         },
    410         "compute_budget_vs_performance": {
    411           "applies": true,
    412           "answer": true,
    413           "justification": "Figure 7 explicitly plots accuracy, precision, and recall as functions of thinking tokens for both o3-mini (at three effort levels) and r1-qn-14b (sequential and parallel scaling). The power-law relationship is discussed.",
    414           "source": "opus"
    415         },
    416         "benchmark_construct_validity": {
    417           "applies": true,
    418           "answer": true,
    419           "justification": "Construct validity is a central focus of the paper. Sections 3 and 6 extensively argue that prior benchmarks failed to measure real vulnerability detection capability due to missing context, and CORRECT is designed to address this validity gap.",
    420           "source": "opus"
    421         },
    422         "scaffold_confound_addressed": {
    423           "applies": false,
    424           "answer": false,
    425           "justification": "No scaffolding is involved. Models are evaluated directly via prompts without agentic scaffolding, so the scaffold confound does not apply.",
    426           "source": "opus"
    427         }
    428       },
    429       "data_leakage": {
    430         "temporal_leakage_addressed": {
    431           "applies": true,
    432           "answer": false,
    433           "justification": "No discussion of temporal leakage. The CVEs span multiple years and many predate the training-data cutoffs of the evaluated models. Models may have seen the exact vulnerabilities, patches, and CVE descriptions during training.",
    434           "source": "opus"
    435         },
    436         "feature_leakage_addressed": {
    437           "applies": true,
    438           "answer": false,
    439           "justification": "The evaluation prompt includes ground-truth CWE descriptions, telling the model what vulnerability type to look for. This provides information not available in real-world detection scenarios. This potential feature leakage is not discussed.",
    440           "source": "opus"
    441         },
    442         "non_independence_addressed": {
    443           "applies": true,
    444           "answer": false,
    445           "justification": "Multiple CVEs come from the same projects (Linux has 400+ samples per Figure 9c). Structural similarities between test examples from the same codebase are not discussed.",
    446           "source": "opus"
    447         },
    448         "leakage_detection_method": {
    449           "applies": true,
    450           "answer": false,
    451           "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference tests, temporal splits, or decontamination pipelines are applied.",
    452           "source": "opus"
    453         }
    454       }
    455     }
    456   },
    457   "claims": [
    458     {
    459       "claim": "Prior context-free evaluations significantly underestimated LLM vulnerability detection capability; SOTA models achieve 67% accuracy and 37% (1,0) proportion under CORRECT's Strict Mode versus near-random without context.",
    460       "evidence": "Figure 4(d) shows w/o context accuracy clustered 0.50–0.55; Figure 4(j) shows Strict Mode accuracy reaching 0.67 for DeepSeek-R1; Figure 5 shows (1,0) proportion of 37% vs 25% random baseline.",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "SOTA LLMs (671B) achieve precision approaching 0.8 in distinguishing vulnerable from patched code when provided with sufficient context.",
    465       "evidence": "Figure 4(f) shows DeepSeek-R1 and DeepSeek-V3 reaching precision ~0.8 in Strict Mode, while Figure 4(a) shows all models clustered near 0.5 without context.",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "Only ~9.5% of false positives from reasoning models reflect genuine failure to distinguish patched code; the majority arise because the model incorrectly deems the patch insufficient.",
    470       "evidence": "Table 5 manual analysis: 'Patch Ignored' accounts for 2 cases (R→Non-R direction) vs 19 for 'Patch Deemed Insufficient' in reasoning models; 9.5% figure cited in Finding #4.",
    471       "supported": "moderate"
    472     },
    473     {
    474       "claim": "A clear model size scaling law emerges for vulnerability detection only when context is provided; without context, smaller models perform comparably to 671B SOTA models.",
    475       "evidence": "Figure 4(c) shows F1 clustered 0.5–0.6 regardless of model size w/o context; Figure 4(g)(i) show clear positive correlation with model size under CORRECT.",
    476       "supported": "strong"
    477     },
    478     {
    479       "claim": "Test-time scaling follows a power-law relationship where 5x thinking tokens yields less than 5% accuracy improvement, making it economically unviable as the primary scaling strategy.",
    480       "evidence": "Figure 7(d) shows aggregate accuracy trend for o3-mini-low/medium/high; paper states 'o3-mini-high, with five times the thinking tokens of o3-mini-medium, achieves an accuracy improvement of less than 0.05'.",
    481       "supported": "strong"
    482     },
    483     {
    484       "claim": "LLMs fail to generalize to rare and out-of-distribution vulnerability types even with context, achieving F1 scores as low as 0.4 (CWE-697).",
    485       "evidence": "Table 4 shows rare CWEs (CWE-435, CWE-697, CWE-703) max F1 of 0.556, 0.400, 0.479 respectively, versus 0.700–0.713 for common CWEs.",
    486       "supported": "strong"
    487     }
    488   ],
    489   "methodology_tags": [
    490     "benchmark-eval"
    491   ],
    492   "key_findings": "The CORRECT framework demonstrates that three widely-held beliefs about LLM vulnerability detection (unreliable, insensitive to patches, plateaued with scale) are artifacts of context-free evaluations rather than fundamental model limitations. When provided with callee functions, type declarations, and execution context, SOTA models (DeepSeek-R1) achieve 67% accuracy and precision near 0.8 in Strict Mode, vastly outperforming prior evaluations' near-random results. The primary bottleneck is not binary classification but recall (~0.5 for SOTA), and most false positives arise from reasoning errors (deeming patches insufficient) rather than genuine inability to detect patches. Both model scaling and test-time scaling improve performance, but with diminishing returns and trade-offs: sequential scaling causes recall degradation via overthinking, while LLMs systematically fail on rare/out-of-distribution vulnerability types regardless of context.",
    493   "red_flags": [
    494     {
    495       "flag": "Contamination unaddressed",
    496       "detail": "The dataset consists of historical CVE records from public repositories (Linux, FFmpeg, ImageMagick, etc.) that almost certainly appear in the training corpora of all 13 evaluated models; no training cutoffs are stated and no decontamination analysis is performed."
    497     },
    498     {
    499       "flag": "CWE hints in prompts inflate performance",
    500       "detail": "Prompt I includes the ground-truth CWE description, effectively telling the model what vulnerability type to look for. This substantially aids detection and confounds the comparison with prior work that used blind evaluation."
    501     },
    502     {
    503       "flag": "LLM-as-a-judge circularity",
    504       "detail": "GPT-4o is used to evaluate rationale quality for GPT-4o-adjacent models, with judge accuracy (92%) validated on only 50 manually reviewed cases. This creates a circular evaluation loop for OpenAI models, and the validation sample is too small to establish the judge's reliability."
    505     },
    506     {
    507       "flag": "No statistical significance testing",
    508       "detail": "All comparative claims across 13 models and 4 evaluation modes are based on point estimates with no confidence intervals, error bars, or significance tests applied."
    509     },
    510     {
    511       "flag": "Anonymous code link not permanent",
    512       "detail": "The code and data link (anonymous.4open.science) points to an anonymous review platform; such links are typically temporary and may not persist after publication, undermining reproducibility claims."
    513     },
    514     {
    515       "flag": "400-pair sample unjustified",
    516       "detail": "Only 400 of 2,000 pairs are used for primary evaluation; the subsampling rationale (CWE-1000 classification) is described but no power analysis or sufficiency argument is made for this sample size."
    517     }
    518   ],
    519   "cited_papers": [
    520     {
    521       "title": "Vulnerability Detection with Code Language Models: How Far Are We? (PrimeVul)",
    522       "relevance": "Primary dataset source and key prior evaluation work that this paper challenges for context-free methodology"
    523     },
    524     {
    525       "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?): A Comprehensive Evaluation, Framework, and Benchmarks (SecLLMHolmes)",
    526       "relevance": "Key comparison establishing Consensus #1 (unreliability); CORRECT directly refutes its conclusions"
    527     },
    528     {
    529       "title": "To Err is Machine: Vulnerability Detection Challenges LLM Reasoning (Steenhoek et al.)",
    530       "relevance": "Establishes Consensus #3 (plateaued performance); CORRECT shows this is an artifact of missing context"
    531     },
    532     {
    533       "title": "LLM4Vuln: A Unified Evaluation Framework for Decoupling and Enhancing LLMs' Vulnerability Reasoning",
    534       "relevance": "Prior context-augmented framework compared in Table 1"
    535     },
    536     {
    537       "title": "VulEval: Towards Repository-Level Evaluation of Software Vulnerability Detection",
    538       "relevance": "Prior work on context augmentation via caller-callee relationships"
    539     },
    540     {
    541       "title": "MoreFixes: A Large-Scale Dataset of CVE Fix Commits Mined Through Enhanced Repository Discovery",
    542       "relevance": "One of three dataset sources for CORRECT's 2,000 program pairs"
    543     },
    544     {
    545       "title": "ReposVul: A Repository-Level High-Quality Vulnerability Dataset",
    546       "relevance": "One of three dataset sources for CORRECT's 2,000 program pairs"
    547     },
    548     {
    549       "title": "Top Score on the Wrong Exam: On Benchmarking in Machine Learning for Vulnerability Detection (Risse & Böhme)",
    550       "relevance": "Foundational critique of vulnerability detection benchmarking methodology that motivates this work"
    551     },
    552     {
    553       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    554       "relevance": "Best-performing model in evaluations; reasoning LLM whose capabilities the paper reassesses"
    555     },
    556     {
    557       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    558       "relevance": "Foundational prompting technique used in prior vulnerability detection evaluations this paper critiques"
    559     }
    560   ],
    561   "engagement_factors": {
    562     "practical_relevance": {
    563       "score": 3,
    564       "justification": "Directly addresses how to properly evaluate and deploy LLMs for security vulnerability detection, with actionable guidance on context provision and scaling strategy trade-offs."
    565     },
    566     "surprise_contrarian": {
    567       "score": 3,
    568       "justification": "Explicitly and successfully challenges three established community consensus beliefs, showing LLMs are actually much better at vulnerability detection than reported — a significant reversal."
    569     },
    570     "fear_safety": {
    571       "score": 2,
    572       "justification": "Security vulnerability detection is directly safety-relevant; the finding that prior evaluations were systematically wrong has implications for trust in deployed LLM-based security tools."
    573     },
    574     "drama_conflict": {
    575       "score": 2,
    576       "justification": "The framing as refuting community 'consensus beliefs' and the direct critiques of named prior papers create genuine scientific controversy."
    577     },
    578     "demo_ability": {
    579       "score": 2,
    580       "justification": "Code and dataset are available at an anonymous link, enabling practitioners to apply CORRECT to evaluate their own LLMs on vulnerability detection."
    581     },
    582     "brand_recognition": {
    583       "score": 1,
    584       "justification": "Authors are from Nanjing and Shandong universities without major lab brand recognition, though they evaluate prominent models like DeepSeek-R1 and o3-mini."
    585     }
    586   },
    587   "hn_data": {
    588     "threads": [
    589       {
    590         "hn_id": "27146649",
    591         "title": "Geometric Deep Learning: Grids, Groups, Graphs, Geodesics, and Gauges",
    592         "points": 3,
    593         "comments": 1,
    594         "url": "https://news.ycombinator.com/item?id=27146649",
    595         "created_at": "2021-05-13T20:00:26Z"
    596       },
    597       {
    598         "hn_id": "45166677",
    599         "title": "Geometric Deep Learning Grids, Groups, Graphs, Geodesics, and Gauges [pdf]",
    600         "points": 3,
    601         "comments": 0,
    602         "url": "https://news.ycombinator.com/item?id=45166677",
    603         "created_at": "2025-09-08T10:39:40Z"
    604       },
    605       {
    606         "hn_id": "42855137",
    607         "title": "Why a Race to Artificial Superintelligence Is Self-Defeating [pdf]",
    608         "points": 3,
    609         "comments": 0,
    610         "url": "https://news.ycombinator.com/item?id=42855137",
    611         "created_at": "2025-01-28T17:27:43Z"
    612       },
    613       {
    614         "hn_id": "43788230",
    615         "title": "Show HN: A new way to verify remote AI model execution (no TEEs, no ZK)",
    616         "points": 2,
    617         "comments": 0,
    618         "url": "https://news.ycombinator.com/item?id=43788230",
    619         "created_at": "2025-04-24T22:31:33Z"
    620       },
    621       {
    622         "hn_id": "44796040",
    623         "title": "From Large to Super-Tiny: End-to-End Optimization for Cost-Efficient LLMs",
    624         "points": 2,
    625         "comments": 0,
    626         "url": "https://news.ycombinator.com/item?id=44796040",
    627         "created_at": "2025-08-05T09:39:59Z"
    628       },
    629       {
    630         "hn_id": "44968425",
    631         "title": "Consumer Autonomy or Illusion? Rethinking Consumer Agency in Age of Algorithms",
    632         "points": 2,
    633         "comments": 1,
    634         "url": "https://news.ycombinator.com/item?id=44968425",
    635         "created_at": "2025-08-21T02:16:50Z"
    636       },
    637       {
    638         "hn_id": "45483510",
    639         "title": "A Convex Formulation of Compliant Contact Between Filaments and Rigid Bodies",
    640         "points": 2,
    641         "comments": 0,
    642         "url": "https://news.ycombinator.com/item?id=45483510",
    643         "created_at": "2025-10-05T17:33:41Z"
    644       },
    645       {
    646         "hn_id": "42836005",
    647         "title": "Autonomy-of-Experts Models (ArXiv)",
    648         "points": 2,
    649         "comments": 0,
    650         "url": "https://news.ycombinator.com/item?id=42836005",
    651         "created_at": "2025-01-27T00:43:16Z"
    652       },
    653       {
    654         "hn_id": "42008373",
    655         "title": "Geometric Deep Learning: Grids, Groups, Graphs, Geodesics, and Gauges",
    656         "points": 2,
    657         "comments": 0,
    658         "url": "https://news.ycombinator.com/item?id=42008373",
    659         "created_at": "2024-10-31T16:19:04Z"
    660       },
    661       {
    662         "hn_id": "30395596",
    663         "title": "Geometric Deep Learning: Grids, Groups, Graphs, Geodesics, and Gauges",
    664         "points": 2,
    665         "comments": 0,
    666         "url": "https://news.ycombinator.com/item?id=30395596",
    667         "created_at": "2022-02-19T09:10:06Z"
    668       }
    669     ],
    670     "top_points": 3,
    671     "total_points": 23,
    672     "total_comments": 2
    673   }
    674 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs