scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28161B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM-based Vulnerability Detection at Project Scale: An Empirical Study",
      6     "authors": [
      7       "Fengjie Li",
      8       "Jiajun Jiang",
      9       "Dongchi Chen",
     10       "Yingfei Xiong"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2601.19239",
     15     "doi": "10.48550/arXiv.2601.19239"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims—low recall (21.09%/33.82%), high false discovery rates (best tool at 85.3% SFDR), and high computational costs (up to hundreds of millions of tokens, multi-day runtimes)—are directly supported by Tables IV–VII and quantitative results.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper claims specific root causes (shallow interprocedural reasoning, imprecise source/sink identification) account for 37.47% and 19.00% of FPs, but this taxonomy was built by 2 authors with no reported inter-rater reliability statistics (no Cohen's kappa or agreement rate), undermining the causal attribution.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Threats to validity section explicitly acknowledges evaluation covers only 5 tools, 2 languages (Java and C/C++), 8 CWE types, and notes results may not generalize to Rust, Go, or TypeScript systems.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper attributes failures primarily to tool design choices without considering alternative explanations such as potential tool misconfiguration (using default rather than tuned settings), benchmark difficulty relative to original evaluation settings, or API-level LLM behavior differences.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper clearly explains that SFDR is a sampled estimate (up to 10 warnings per tool per project), that recall is measured only on labeled vulnerable points, and that real-world projects have unknown ground truth requiring manual sampling rather than full measurement.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section VI.B 'Threats to Validity' provides a dedicated discussion of internal validity (expert judgment bias, labeling accuracy) and external validity (tool selection, language/CWE coverage, evolving field).",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats named include: personal experience influencing FP taxonomy coding, selection of only open-source/runnable tools, evaluation limited to Java and C/C++, and lack of generalizability to Rust/Go/TypeScript multi-language projects.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "External validity section explicitly states results may not represent 'all possible architectures, languages, or prompting paradigms' and that custom enterprise-scale deployments may behave differently.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding sources are mentioned anywhere in the paper; no acknowledgments section is present.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly stated: Tianjin University (Li, Jiang), International Joint Institute of Tianjin University/Fuzhou (Chen), and Peking University Key Lab (Xiong), with email addresses provided.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding disclosed; cannot assess funder independence.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement, patent disclosure, or financial interest declaration appears anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined precisely: source/sink, taint analysis, SFDR (sampled false discovery rate), recall formula provided, CWE types defined via MITRE classification, and each tool's workflow is explained.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Five explicit contributions are bulleted at the end of the introduction: first project-scale empirical comparison, systematic detection evaluation, FP taxonomy with 385 labeled reports, overhead measurement, and artifact release.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section II and III.C systematically compare prior work and explain how this study differs—prior work evaluates at function/hunk level with incomplete project context, whereas this study evaluates at full project scale with real-world settings.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Paper explicitly states: 'we release all experimental artifacts, including evaluation scripts, prompts, taxonomy labels, and detailed statistics, on our project homepage: https://github.com/Feng-Jay/LLM4Security.'",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "In-house benchmark built from three public datasets (ReposVul, CWE-Bench-Java, JLeaks); real-world projects at specific commits are identified in Table III; manual FP taxonomy labels are released on GitHub.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Server hardware is described (Intel Xeon 6388, 512GB RAM, NVIDIA A800, Ubuntu 20.04.6 LTS) but no formal dependency specifications (requirements.txt, Dockerfile) are provided by the authors.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Paper provides only a URL pointer to GitHub; no step-by-step reproduction instructions appear in the paper itself, and completeness of the GitHub documentation cannot be verified from the paper.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No confidence intervals or error bars are reported for any recall percentages or SFDR values; results are presented as point estimates only.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are used for comparative claims between tools (e.g., that RepoAudit outperforms CodeQL), despite multiple pairwise comparisons being made throughout the paper.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Percentage differences in recall and SFDR across tools are explicitly reported with absolute baselines (e.g., RepoAudit 55.00% recall vs. 0.00% for CodeQL/Semgrep/KNighter on CWE-401).",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No justification is provided for the choice of 222 vulnerabilities, 24 projects, or 'up to 10 warnings per tool per project'; no power analysis or discussion of statistical adequacy.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Overhead table (Table VII) reports min/max/avg but not standard deviation; recall results are single point estimates per CWE type with no measure of spread.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Two traditional static analysis tools (CodeQL and Semgrep) serve as explicit baselines against which all LLM-based methods are compared throughout Tables IV–VI.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "CodeQL and Semgrep are industry-standard, actively maintained tools widely used in practice; they represent the current state of traditional static analysis.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "Paper evaluates existing tools rather than proposing new components; no ablation is applicable.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Multiple metrics used: recall on in-house benchmark, SFDR on real-world projects, number of reports, files affected, input/output token consumption, and end-to-end runtime.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Two authors independently labeled 385 sampled warnings as true/false positives requiring 150+ human hours, with disagreements resolved through discussion.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Two distinct evaluation settings: an in-house benchmark with known ground truth and 24 separate real-world projects with no pre-labeled vulnerabilities, providing an out-of-distribution evaluation.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results broken down by CWE type (Tables IV–VI) and by individual project, with per-tool per-project SFDR values reported.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "RQ3 constructs a taxonomy of 8 FP cause categories with quantified distribution across tools (Figure 5), and Appendix A provides illustrated code-level examples for all 9 FP reason types.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The core findings are negative: low recall (21–34%), near-universal high FDR (85–100% SFDR), and severe scalability failures are the primary reported results.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "Table I lists 'Claude 3.5 Sonnet,' 'O3-mini,' and 'GPT-4' without specifying snapshot dates or version hashes for any of these models, which can vary significantly in capability.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Example prompts shown inline in Figures 8–9 illustrate actual LLM instructions; paper states all prompts are released on GitHub homepage.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Paper states it follows 'hyperparameters recommended in the original papers' but does not report actual values for temperature, top-p, call-chain exploration depth, or any other parameter.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Section III.B describes each tool's workflow architecture in detail: multi-agent path exploration (RepoAudit), LLM+CSA checker generation (KNighter), LLM+CodeQL source/sink inference (IRIS), iterative dataflow tracing (LLMDFA), and CFG-guided intent inference (INFERROI).",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "In-house dataset construction described in Section III.C including source filtering, compilation verification, and manual checking; real-world project selection criteria (active maintenance, historical vulnerability types, buildability) are documented.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "GitHub release explicitly includes taxonomy labels, detailed statistics, and evaluation scripts; raw detection results and manual annotations are stated to be publicly available.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section III.C describes the construction of the in-house dataset from three source benchmarks with filtering criteria, and real-world project selection process with 24 projects at specific commits.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants recruited; study uses code repositories and existing public benchmark datasets.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Full pipeline documented: source benchmark collection → filtering/verification → tool execution → sampling (up to 10 per tool per project) → independent labeling → disagreement resolution through discussion.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Training cutoffs for GPT-4, Claude 3.5 Sonnet, and O3-mini are not stated anywhere in the paper despite these models being central to the evaluated tools.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "While authors mention constructing the real-world dataset 'to avoid data leakage,' they do not discuss whether LLMs may have been trained on the public benchmark vulnerabilities (ReposVul, CWE-Bench-Java, JLeaks) or open-source projects used in evaluation.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "Public benchmark datasets (ReposVul published ICSE 2024, CWE-Bench-Java, JLeaks) predate some LLM training cutoffs and are publicly available; no analysis of whether benchmark CVEs appeared in LLM training data is provided.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human subjects study; the 2-author manual labeling is researcher coding, not a participant study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human subjects study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human subjects study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human subjects study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human subjects study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human subjects study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human subjects study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Table VII provides detailed per-tool min/max/avg input tokens, output tokens, and runtime; Appendix C Tables X–XI give per-project breakdowns; RepoAudit shown to use up to 225M input tokens and 38M output tokens for a single project.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": true,
    365           "justification": "Server specifications fully described (dual Intel Xeon 6388, 512GB RAM, 4x NVIDIA A800, Ubuntu 20.04.6 LTS) and total per-project token consumption and runtimes reported throughout Section V-D.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "LLM-based methods achieve low recall on known vulnerabilities: 21.09% average for C/C++ and 33.82% for Java on the in-house benchmark of 222 vulnerabilities.",
    374       "evidence": "Table IV showing per-CWE detection results across 5 LLM tools and 2 traditional tools; most tools achieve 0% on many CWE types.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "LLM-based tools uncover more unique vulnerabilities than traditional tools and are complementary to each other.",
    379       "evidence": "Figure 3 showing unique vulnerabilities per tool with up to 23 unique detections by INFERROI on CWE-772 vs. 0 for traditional tools.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Both LLM-based and traditional tools exhibit very high false discovery rates in real-world projects; even the best tool averages 85.3% SFDR.",
    384       "evidence": "Tables V and VI with SFDR measurements; RepoAudit averages 97.0% SFDR and IRIS averages 94.4% SFDR.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Shallow interprocedural reasoning (A1) and imprecise source/sink identification (B1) are the dominant FP causes, accounting for 37.47% and 19.00% of false positives respectively.",
    389       "evidence": "Figure 5 distribution of FP reasons across tools based on manual coding of 385 sampled reports by 2 authors; inductive open-coding procedure used.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Project-scale LLM-based detection is computationally prohibitive: up to 225M input tokens per project and up to 4,638 minutes (77 hours) for a single project.",
    394       "evidence": "Table VII overhead measurements and Appendix C Tables X–XI per-project details; LLMDFA averages 2,000 minutes per Java project.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Source/sink mismatch is the primary reason for missed detections in the in-house benchmark, accounting for 62/64, 64/64, and 64/64 missed vulnerabilities for CodeQL, Semgrep, and KNighter respectively.",
    399       "evidence": "Figure 1 showing cause distribution for undetected C/C++ vulnerabilities; Figure 2 illustrating how dev_alloc_skb is not recognized as an allocator.",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "observational",
    406     "case-study"
    407   ],
    408   "key_findings": "LLM-based vulnerability detectors achieve only 21–34% recall on known vulnerabilities and suffer from catastrophically high false discovery rates (85–100% SFDR) when applied to real-world projects, making them currently impractical for deployment. Manual labeling of 385 sampled warnings identified shallow interprocedural dataflow reasoning and imprecise source/sink identification as the dominant false-positive causes (accounting for over 56% of FPs combined), with LLM-specific failures including hallucinated control flow and prompt non-compliance. Project-scale LLM analysis is computationally prohibitive—up to 225 million input tokens and 77 hours per project—representing a fundamental scalability bottleneck. Despite these limitations, LLM-based tools consistently surface more unique vulnerabilities than traditional static analyzers, suggesting potential value in hybrid or targeted deployment settings.",
    409   "red_flags": [
    410     {
    411       "flag": "No inter-rater reliability statistics",
    412       "detail": "The entire FP taxonomy and its quantified distribution (Figure 5) rests on manual coding by 2 authors using inductive open-coding, but no Cohen's kappa or inter-rater agreement rate is reported. The central claim about dominant FP causes is therefore unverified."
    413     },
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "All comparative claims between tools (recall differences, SFDR differences, complementarity) are stated without statistical tests or confidence intervals despite the small sample sizes involved."
    417     },
    418     {
    419       "flag": "LLM model versions underspecified",
    420       "detail": "Table I lists 'GPT-4,' 'Claude 3.5 Sonnet,' and 'O3-mini' without snapshot dates or version hashes; GPT-4 alone encompasses several distinct dated snapshots with significant capability differences."
    421     },
    422     {
    423       "flag": "Benchmark contamination not addressed",
    424       "detail": "Public benchmark datasets (ReposVul, CWE-Bench-Java, JLeaks) and the open-source projects evaluated were publicly available before LLM training cutoffs; no analysis of whether models had seen these vulnerable code patterns during training."
    425     },
    426     {
    427       "flag": "SFDR estimation under-analyzed",
    428       "detail": "Sampling up to 10 warnings per tool per project produces SFDR estimates with unknown confidence intervals; some cells in Tables V–VI have as few as 1 sampled report, making SFDR claims for those configurations unreliable."
    429     },
    430     {
    431       "flag": "Tool configuration not independently optimized",
    432       "detail": "Using default hyperparameters 'recommended in original papers' may systematically disadvantage tools on new datasets; the study conflates tool capability with default-configuration performance."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "RepoAudit: An Autonomous LLM-Agent for Repository-Level Code Auditing",
    438       "relevance": "One of the 5 evaluated LLM-based vulnerability detectors; multi-agent framework central to the study's comparative analysis."
    439     },
    440     {
    441       "title": "LLMDFA: Analyzing Dataflow in Code with Large Language Models",
    442       "relevance": "One of the 5 evaluated tools; LLM-driven iterative dataflow-tracing approach evaluated at project scale."
    443     },
    444     {
    445       "title": "LLM-Assisted Static Analysis for Detecting Security Vulnerabilities (IRIS)",
    446       "relevance": "One of the 5 evaluated tools; uses LLM to infer sources/sinks and augment CodeQL; provides CWE-Bench-Java dataset used in evaluation."
    447     },
    448     {
    449       "title": "KNighter: Transforming Static Analysis with LLM-Synthesized Checkers",
    450       "relevance": "One of the 5 evaluated tools; generates Clang Static Analyzer checkers from vulnerability-fixing commits."
    451     },
    452     {
    453       "title": "Boosting Static Resource Leak Detection via LLM-Based Resource-Oriented Intention Inference (INFERROI)",
    454       "relevance": "One of the 5 evaluated tools; CFG-guided LLM approach specialized for CWE-772 resource leaks."
    455     },
    456     {
    457       "title": "VulEval: Towards Repository-Level Evaluation of Software Vulnerability Detection",
    458       "relevance": "Prior study on LLM vulnerability detection at repository level; directly compared and critiqued in related work for incomplete project-context evaluation."
    459     },
    460     {
    461       "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?): A Comprehensive Evaluation",
    462       "relevance": "Related empirical study on LLM vulnerability detection limitations; confirms and contextualizes this paper's findings at a different granularity."
    463     },
    464     {
    465       "title": "Top Score on the Wrong Exam: On Benchmarking in Machine Learning for Vulnerability Detection",
    466       "relevance": "Methodological critique of vulnerability detection benchmarks; cited as motivation for moving beyond function-level evaluation."
    467     },
    468     {
    469       "title": "ReposVul: A Repository-Level High-Quality Vulnerability Dataset",
    470       "relevance": "Source of C/C++ vulnerabilities for the in-house benchmark; provides ground-truth CVE entries used in RQ1."
    471     },
    472     {
    473       "title": "JLeaks: A Featured Resource Leak Repository Collected from Hundreds of Open-Source Java Projects",
    474       "relevance": "Source of 50 CWE-772 (resource leak) vulnerabilities for the in-house benchmark used in RQ1 evaluation."
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 3,
    480       "justification": "Directly actionable for security practitioners evaluating whether to adopt LLM-based scanning tools; provides concrete cost and accuracy benchmarks across 7 tools."
    481     },
    482     "surprise_contrarian": {
    483       "score": 2,
    484       "justification": "Challenges widespread optimism about LLM-based security tools with hard numbers showing 85–100% false discovery rates and recalls below 34%, contradicting tool papers' own reported results."
    485     },
    486     "fear_safety": {
    487       "score": 2,
    488       "justification": "Demonstrates that current LLM security tools are unreliable for real-world deployment, raising concern about organizations falsely believing they are protected by these tools."
    489     },
    490     "drama_conflict": {
    491       "score": 1,
    492       "justification": "Implicitly contradicts positive claims from the original tool papers (RepoAudit, IRIS, LLMDFA), but no explicit confrontation or strong controversy angle is developed."
    493     },
    494     "demo_ability": {
    495       "score": 1,
    496       "justification": "Evaluated tools are publicly available but require expensive LLM API access (up to hundreds of millions of tokens) and multi-day runtimes, making casual reproduction impractical."
    497     },
    498     "brand_recognition": {
    499       "score": 1,
    500       "justification": "Uses Claude 3.5 Sonnet, GPT-4, and O3-mini as backbones (Anthropic/OpenAI products), but authors are from Chinese universities without major brand recognition."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [
    505       {
    506         "hn_id": "46781038",
    507         "title": "An ultra-high-resolution map of (dark) matter",
    508         "points": 1,
    509         "comments": 0,
    510         "url": "https://news.ycombinator.com/item?id=46781038",
    511         "created_at": "2026-01-27T15:12:25Z"
    512       }
    513     ],
    514     "top_points": 1,
    515     "total_points": 1,
    516     "total_comments": 0
    517   }
    518 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs