scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30540B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Everything You Wanted to Know About LLM-based Vulnerability Detection But Were Afraid to Ask",
      6     "authors": [
      7       "Yue Li",
      8       "Xiao Li",
      9       "Hao Wu",
     10       "Minghui Xu",
     11       "Yue Zhang",
     12       "Xiuzhen Cheng",
     13       "Fengyuan Xu",
     14       "Sheng Zhong"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2504.13474",
     19     "doi": "10.48550/arXiv.2504.13474"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "All major abstract claims are backed by experimental results: 67% accuracy and >70% F1 on key CWEs (Figure 4, Table 4), precision ~0.8 (Figure 4f), reasoning-error attribution of FPs (Table 5), and diminishing returns from scaling (Figure 7).",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper makes causal claims that context deprivation causes underestimation of model performance, supported by controlled ablation across four conditions (w/o context w/o revision → w/ context w/o revision → Lenient Mode → Strict Mode) on the same models and dataset.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper's dataset is exclusively C/C++ vulnerabilities from 364 projects, but claims like 'LLMs have been underestimated' and 'misconceptions' are stated without bounding findings to C/C++ or function-level detection specifically.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Section 6 (Root Causes Analysis) explicitly traces how UO(I) affects FP judgments and UO(II) affects FN judgments, providing mechanistic explanations for why prior consensus emerged from evaluation artifacts rather than model limitations.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper carefully distinguishes binary label prediction from rationale correctness, and introduces Lenient/Strict modes precisely to separate 'got the label right' from 'correctly reasoned about the root cause.'",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "A dedicated 'Limitations' subsection appears in Section 6 with two specific concerns: LLM-as-a-judge accuracy (evaluated at 92% on 50 cases) and inability to gather complete context in practice.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The limitations are specific: the LLM-as-a-judge was validated on 50 sampled rationales achieving 91/99 accuracy (92%), and the context-completeness limitation is acknowledged with concrete examples of what cannot be captured (postconditions).",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The paper does not explicitly state that results are bounded to C/C++ function-level detection, or that findings may not hold for other languages, vulnerability classes beyond the 99 CWEs tested, or non-open-source models.",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No acknowledgments or funding disclosure section is present in the provided paper text; no grants, institutions, or sponsors are mentioned.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations are clearly disclosed on the title page: Nanjing University (National Key Lab for Novel Software Technology) and Shandong University.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No funding disclosed; this criterion is not applicable.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests statement, patent declarations, or financial disclosures appear anywhere in the paper.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper precisely defines 'reasoning LLMs' vs 'non-reasoning LLMs,' 'System 1' vs 'System 2' thinking, 'sequential scaling' vs 'parallel scaling,' and the pair-wise prediction proportion metrics (1,0), (1,1), (0,0), (0,1) with formal notation.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Three explicit contributions are enumerated: the CORRECT framework + 2,000-pair dataset, empirical overturn of three community misconceptions, and identification of new failure modes (generalization limits, overthinking).",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Table 1 systematically compares CORRECT against 10 prior works on four dimensions, and Section 3.2 directly quotes quantitative results from Ding et al., Steenhoek et al., Khare et al., and others to establish what prior evaluations concluded.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "Code is linked via an anonymous review URL (anonymous.4open.science/r/CORRECT), which is a temporary peer-review link that will not persist after publication — this does not constitute a stable public release.",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "The processed dataset of 2,000 program pairs is linked at the same anonymous review URL; while source datasets (MoreFixes, PrimeVul, ReposVul) are public, the authors' curated context-rich dataset is not stably released.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No requirements file, Dockerfile, or dependency specification is mentioned in the paper; Joern (CPG tool) and cflow (call graph tool) are named but not versioned or packaged.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No step-by-step instructions for running the pipeline are provided in the paper; Appendix D describes data distributions and Appendix E shows prompts, but not how to reproduce the evaluation end-to-end.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "All results are reported as point estimates (F1, accuracy, precision, recall) without confidence intervals or error bars in any figure or table.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No statistical significance tests are applied to comparative claims (e.g., context vs. no context, model A vs. model B), despite making quantitative comparisons across conditions and model families.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Percentage improvements are consistently reported with baseline context (e.g., 5% accuracy gain from 5x more reasoning tokens, recall drop of ~10% from sequential scaling), enabling practical magnitude assessment.",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The 400-pair evaluation subset is explained structurally (50 pairs per top-level CWE with exceptions for rare CWEs), but no power analysis or statistical justification for sufficient sample size is provided.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Temperature is set to 0 for determinism, but no variance across runs, seeds, or repeated evaluations is reported; it is unclear whether the zero-temperature setting eliminates all stochasticity.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The 'w/o context, w/o revision' condition replicates prior work's evaluation methodology and serves as a direct baseline, with results shown to align with previously published numbers.",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Baselines include current SOTA models (DeepSeek-R1 671B, o3-mini, DeepSeek-V3) and recently published evaluation papers from 2024, not outdated references.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Four evaluation configurations are systematically compared — (1) w/o context w/o revision, (2) w/ context w/o revision, (3) Lenient Mode, (4) Strict Mode — isolating the contribution of context and rationale validation.",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Evaluation uses F1-score, accuracy, precision, recall, and pair-wise prediction proportions (1,0), (1,1), (0,0), (0,1) across all experimental conditions.",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Manual auditing of 50 pairs confirmed 98% label accuracy, and human inspection of cases in Table 5 was used to categorize reasoning error types (Patch Ignored, Minimum Reasoning, Procedural Error, Mis-Corrected).",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": false,
    214           "answer": false,
    215           "justification": "The paper evaluates pre-trained LLMs zero-shot; there is no training phase requiring a held-out test split, making this criterion not applicable.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Table 4 and Figure 6 provide detailed per-CWE breakdowns for all 10 top-level CWE categories, including F1-scores per model family and prevalence statistics.",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Appendix H provides concrete examples of three failure categories (Patch Ignored, Minimum Reasoning, Mis-Corrected Reasoning) with actual LLM outputs and ground truth CVE context.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "Diminishing returns from test-time scaling, recall degradation (~10% drop) from sequential scaling, poor generalization to rare vulnerability types, and the ongoing low recall (~0.5) for SOTA models are all reported as negative findings.",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "Table 3 provides model names (Qwen2.5-7B-Inst, Llama-3.1-8B-Inst, etc.) but API-accessed models like DeepSeek-R1, DeepSeek-V3, and o3-mini lack snapshot dates, which is critical given rapid model updates.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Appendix E (Figures 10, 11, 12) provides the full text of the Context-Rich Vulnerability Assessment Prompt and both variants of the Rationale Assessment Prompt, including all structural elements.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Temperature = 0 for all deterministic evaluations is stated; temperature = 0.6 for parallel scaling experiments; o3-mini reasoning effort levels (low/medium/high) are specified; max_feedback_rounds = 4 is documented.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "The CORRECT pipeline is described in detail: CPG construction with Joern, backward/forward slicing, two-layer callee depth restriction, dual-prompt design, LLM-as-a-judge with feedback loops, and Strict Mode iteration logic.",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "Section 4.1 documents the full preprocessing pipeline: repository cloning, function-level commit filtering, CPG construction, slicing path extraction, shared context merging, and precondition handling.",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "Raw data is linked via an anonymous review URL that is not a stable archival release; the CVE source data from NVD is public but the processed 2,000-pair dataset with extracted contexts is not independently verifiable.",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Section 4.1 Phase I describes the collection process in detail: CVE record retrieval, patch commit crawling, repository cloning, function-level filtering, and sources (MoreFixes, PrimeVul, ReposVul).",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "No human participants were recruited; data was collected from public CVE repositories and open-source software.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "Figure 3 and Section 4 together document the full pipeline from CVE collection (➀) through CPG construction (➂), context extraction (➃), merging (➄), and evaluation generation, with numbered stages.",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "Training data cutoffs are not stated for any of the 13 evaluated models, including API-accessed DeepSeek-R1, o3-mini, or the Qwen/Llama instruct variants.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "The dataset includes CVEs dating back to at least 2012 (CVE-2012-6689 shown in Appendix H), which almost certainly fall within training windows of all evaluated models; this is never discussed.",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": false,
    311           "justification": "Many CVEs in the dataset (e.g., from MoreFixes, PrimeVul) were publicly available before any evaluated model's training cutoff, but the paper does not acknowledge or address potential contamination.",
    312           "source": "haiku"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants in a study design requiring pre-registration.",
    320           "source": "haiku"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants.",
    326           "source": "haiku"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "haiku"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "haiku"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "haiku"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "haiku"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants.",
    356           "source": "haiku"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "No API costs, dollar amounts, or latency measurements are reported, despite using paid API models (o3-mini, GPT-4o as judge) at scale across 400+ pairs and 13 models.",
    364           "source": "haiku"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "Total computational budget for running all evaluations (including feedback loops up to 4 rounds per case) is not stated.",
    370           "source": "haiku"
    371         }
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "Context-free evaluations cause SOTA LLMs to appear near-random (F1 0.5–0.6); with CORRECT's context-rich evaluation, DeepSeek-R1 achieves 67% accuracy and 37% (1,0) proportion.",
    378       "evidence": "Figure 4(c/d) shows 0.5–0.6 F1 / 0.5–0.55 accuracy without context; Figure 4(i/j) shows 0.6 F1 / 67% accuracy in Strict Mode for DeepSeek-R1; Figure 5 shows (1,0) proportion.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "SOTA models (671B parameter) achieve precision approaching 0.8 and ~10% (1,1) proportion under CORRECT, compared to 0.5 precision in context-free settings.",
    383       "evidence": "Figure 4(a) shows precision 0.5–0.55 without context; Figure 4(f) shows precision ~0.8 in Strict Mode for DeepSeek-R1/V3; Figure 5 shows (1,1) ~10% for all models.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Most false positives on patched code arise from 'Patch Deemed Insufficient' reasoning errors (29 cases) rather than genuine failure to notice patches 'Patch Ignored' (8 cases).",
    388       "evidence": "Table 5 shows Patch Ignored accounts for 6+2=8 cases vs Patch Deemed Insufficient 10+19=29 cases across ds-v3 and ds-r1 disagreements.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Test-time scaling (sequential) follows an approximate power-law: 5x more reasoning tokens yields only ~5% accuracy gain with recall declining ~10% due to overthinking.",
    393       "evidence": "Figure 7 shows o3-mini-high (5000+ tokens) vs medium (1000+ tokens) accuracy difference <0.05; sequential r1-qn-14b recall declines sharply in the 2k–4k token range.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "LLMs excel at common fixed-pattern vulnerabilities (CWE-664, CWE-682, CWE-691, CWE-710, F1 ~0.7) but struggle with rare types (CWE-697, F1 ~0.4), revealing generalization limits.",
    398       "evidence": "Table 4 reports max F1: CWE-664=0.700, CWE-682=0.713 vs CWE-697=0.400, CWE-703=0.479; Figure 6 visualizes these differences across models.",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "Parallel test-time scaling outperforms sequential scaling by maintaining recall stability via majority voting, avoiding the overthinking-induced recall degradation.",
    403       "evidence": "Figure 7 shows r1-qn-14b parallel recall is stable or improving while sequential recall shows clear downward trend from 1k to 4k tokens.",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval"
    409   ],
    410   "key_findings": "The paper introduces CORRECT, showing that prior evaluations severely underestimated LLM vulnerability detection capability by omitting program context (callee functions, type declarations, slicing-extracted execution logic). With appropriate context, SOTA models achieve 67% accuracy and precision ~0.8 versus near-random performance in context-free settings. The dominant failure mode is not patch blindness but reasoning errors where LLMs correctly identify a patch exists but wrongly conclude it is insufficient. Both model-size and test-time scaling improve performance but with diminishing returns, and sequential scaling actively harms recall (~10% drop) due to overthinking in reasoning models.",
    411   "red_flags": [
    412     {
    413       "flag": "Anonymous code/data release",
    414       "detail": "Code and dataset are released only via a temporary anonymous peer-review link (anonymous.4open.science), which will not persist after publication, making reproduction contingent on a permanent release that has not yet occurred."
    415     },
    416     {
    417       "flag": "No statistical significance tests",
    418       "detail": "All comparative claims between models and conditions are made without confidence intervals, hypothesis tests, or variance estimates, despite comparing 13 models across multiple metrics and conditions."
    419     },
    420     {
    421       "flag": "Benchmark contamination unaddressed",
    422       "detail": "The dataset includes CVEs from 2012 onward that were publicly disclosed before training cutoffs of all evaluated models; potential contamination from LLMs having seen these vulnerabilities during training is never discussed."
    423     },
    424     {
    425       "flag": "C/C++ scope not stated as limitation",
    426       "detail": "All 2,000 program pairs are C/C++ code, but findings are framed as generalizable insights about 'LLM vulnerability detection' without explicitly bounding scope to C/C++ or function-level detection."
    427     },
    428     {
    429       "flag": "LLM-as-judge error propagation not accounted for",
    430       "detail": "GPT-4o as judge achieves 92% accuracy on 50 sampled cases (meaning ~8% error rate), but this error is not propagated through the main results or uncertainty estimates."
    431     },
    432     {
    433       "flag": "No API model version pinning",
    434       "detail": "DeepSeek-R1, DeepSeek-V3, and o3-mini are accessed via API without snapshot dates, making exact reproduction impossible as models may be updated silently."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "Vulnerability Detection with Code Language Models: How Far Are We? (PrimeVul)",
    440       "relevance": "Provides the primary baseline dataset and context-free evaluation methodology that CORRECT is designed to supersede; key source of Consensus #1 and #2 statistics."
    441     },
    442     {
    443       "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?): A Comprehensive Evaluation (SecLLMHolmes)",
    444       "relevance": "Establishes Consensus #1 with 13% (1,0) proportion for GPT-4; also pioneers rationale evaluation which CORRECT scales via LLM-as-a-judge."
    445     },
    446     {
    447       "title": "To Err is Machine: Vulnerability Detection Challenges LLM Reasoning (Steenhoek et al.)",
    448       "relevance": "Key source of Consensus #3 (plateaued performance); evaluated GPT-4-turbo through 7B models finding 0.5–0.55 balanced accuracy across scales."
    449     },
    450     {
    451       "title": "LLM4Vuln: A Unified Evaluation Framework for Decoupling and Enhancing LLMs' Vulnerability Reasoning",
    452       "relevance": "Prior context-augmentation work using caller-callee relationships; directly compared in Table 1 as partial solution to the context problem CORRECT addresses."
    453     },
    454     {
    455       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters",
    456       "relevance": "Provides the theoretical framework for test-time scaling (sequential vs parallel) that RQ3 directly tests in the vulnerability detection domain."
    457     },
    458     {
    459       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    460       "relevance": "Defines the SOTA reasoning LLM used as the primary benchmark model achieving 67% accuracy in CORRECT's evaluation."
    461     },
    462     {
    463       "title": "Top Score on the Wrong Exam: On Benchmarking in Machine Learning for Vulnerability Detection",
    464       "relevance": "Motivating work on evaluation flaws in vulnerability detection ML benchmarks, directly cited as conceptual foundation for CORRECT's context-building methodology."
    465     },
    466     {
    467       "title": "MoreFixes: A Large-Scale Dataset of CVE Fix Commits Mined Through Enhanced Repository Discovery",
    468       "relevance": "One of three source datasets for CORRECT's 2,000-pair benchmark; provides real-world CVE-based vulnerable/patched pairs."
    469     }
    470   ],
    471   "engagement_factors": {
    472     "practical_relevance": {
    473       "score": 3,
    474       "justification": "Security practitioners using LLMs for vulnerability detection can directly apply the finding that context-rich prompting substantially improves performance, with concrete F1/precision numbers across 13 models."
    475     },
    476     "surprise_contrarian": {
    477       "score": 3,
    478       "justification": "Explicitly overturns three widely cited consensus beliefs (LLMs are unreliable, insensitive to patches, and plateaued) held by the security research community, arguing they are measurement artifacts."
    479     },
    480     "fear_safety": {
    481       "score": 2,
    482       "justification": "Addresses real-world software security risk (20,000+ CVEs/year) and identifies failure modes in AI-based security tools, but does not raise catastrophic or systemic AI safety concerns."
    483     },
    484     "drama_conflict": {
    485       "score": 2,
    486       "justification": "Frames itself as a direct challenge to prior community consensus with strong language ('misconceptions,' 'artifacts of flawed evaluation'), creating a natural conflict narrative with named prior works."
    487     },
    488     "demo_ability": {
    489       "score": 1,
    490       "justification": "The CORRECT framework requires building code property graphs (Joern), extracting slices, and running multiple LLM calls with specialized prompts — not easily demo-able without the unreleased stable codebase."
    491     },
    492     "brand_recognition": {
    493       "score": 1,
    494       "justification": "Authors are from Nanjing University and Shandong University — credible academic institutions but not well-known AI lab brands; no industry affiliation."
    495     }
    496   },
    497   "hn_data": {
    498     "threads": [
    499       {
    500         "hn_id": "27146649",
    501         "title": "Geometric Deep Learning: Grids, Groups, Graphs, Geodesics, and Gauges",
    502         "points": 3,
    503         "comments": 1,
    504         "url": "https://news.ycombinator.com/item?id=27146649",
    505         "created_at": "2021-05-13T20:00:26Z"
    506       },
    507       {
    508         "hn_id": "45166677",
    509         "title": "Geometric Deep Learning Grids, Groups, Graphs, Geodesics, and Gauges [pdf]",
    510         "points": 3,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=45166677",
    513         "created_at": "2025-09-08T10:39:40Z"
    514       },
    515       {
    516         "hn_id": "42855137",
    517         "title": "Why a Race to Artificial Superintelligence Is Self-Defeating [pdf]",
    518         "points": 3,
    519         "comments": 0,
    520         "url": "https://news.ycombinator.com/item?id=42855137",
    521         "created_at": "2025-01-28T17:27:43Z"
    522       },
    523       {
    524         "hn_id": "43788230",
    525         "title": "Show HN: A new way to verify remote AI model execution (no TEEs, no ZK)",
    526         "points": 2,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=43788230",
    529         "created_at": "2025-04-24T22:31:33Z"
    530       },
    531       {
    532         "hn_id": "44796040",
    533         "title": "From Large to Super-Tiny: End-to-End Optimization for Cost-Efficient LLMs",
    534         "points": 2,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=44796040",
    537         "created_at": "2025-08-05T09:39:59Z"
    538       },
    539       {
    540         "hn_id": "44968425",
    541         "title": "Consumer Autonomy or Illusion? Rethinking Consumer Agency in Age of Algorithms",
    542         "points": 2,
    543         "comments": 1,
    544         "url": "https://news.ycombinator.com/item?id=44968425",
    545         "created_at": "2025-08-21T02:16:50Z"
    546       },
    547       {
    548         "hn_id": "45483510",
    549         "title": "A Convex Formulation of Compliant Contact Between Filaments and Rigid Bodies",
    550         "points": 2,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=45483510",
    553         "created_at": "2025-10-05T17:33:41Z"
    554       },
    555       {
    556         "hn_id": "42836005",
    557         "title": "Autonomy-of-Experts Models (ArXiv)",
    558         "points": 2,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=42836005",
    561         "created_at": "2025-01-27T00:43:16Z"
    562       },
    563       {
    564         "hn_id": "42008373",
    565         "title": "Geometric Deep Learning: Grids, Groups, Graphs, Geodesics, and Gauges",
    566         "points": 2,
    567         "comments": 0,
    568         "url": "https://news.ycombinator.com/item?id=42008373",
    569         "created_at": "2024-10-31T16:19:04Z"
    570       },
    571       {
    572         "hn_id": "30395596",
    573         "title": "Geometric Deep Learning: Grids, Groups, Graphs, Geodesics, and Gauges",
    574         "points": 2,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=30395596",
    577         "created_at": "2022-02-19T09:10:06Z"
    578       }
    579     ],
    580     "top_points": 3,
    581     "total_points": 23,
    582     "total_comments": 2
    583   }
    584 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs