scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22213B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "From Vulnerabilities to Remediation: A Systematic Literature Review of LLMs in Code Security",
      6     "authors": [
      7       "Enna Basic",
      8       "Alberto Giaretta"
      9     ],
     10     "year": 2024,
     11     "venue": "arXiv",
     12     "arxiv_id": "2412.15004",
     13     "doi": null
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract promises coverage of three areas (vulnerabilities introduced, detection/fixing capabilities, data poisoning), and the paper delivers dedicated sections (4, 5-7, 9) for each. All abstract claims are substantiated.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "The paper is a synthesis survey; it reports comparative findings from other studies rather than making original causal claims from its own experiments. Directional synthesis claims ('CoT outperforms zero-shot') are appropriately framed as literature summaries.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Broad claims like 'LLMs often generate insecure code' and 'LLMs generally outperform traditional SATs' are made without consistently bounding them to specific models, benchmark contexts, or programming languages. Different studies use different models and datasets that are not always distinguished in summary claims.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper notes conflicting results between studies (Tables 5-6) but does not systematically analyze why contradictions occur—differences in evaluation methodology, dataset construction, or prompt design are described but not analyzed as alternative explanations for the conflicting patterns.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper explicitly flags the benchmark-vs-reality gap, citing Ding et al.'s finding that a model achieving 68.26% F1 on BigVul scored only 3.09% on a more realistic dataset, and Section 5.2 explicitly states 'LLMs are not able to meet the requirements of effective vulnerability detection in real-world scenarios.'",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 10.3 'Threats to Validity' is a named dedicated section, satisfying the formal requirement.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Section 10.3 contains only two sentences: one about 'rapid evolution of LLMs' (pure boilerplate) and one noting 'varying prompts are used across studies addressed RQ1.' Neither identifies specific, bounded threats such as search completeness, inter-rater reliability in paper selection, or publication bias.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper does not explicitly state what the review's results do NOT show—e.g., that findings on controlled benchmark datasets may not generalize to production codebases, or that coverage is limited to English-language peer-reviewed and arXiv papers through a specific date. Generic disclaimers about LLM evolution do not constitute scope boundaries.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding disclosure appears in the paper text. One author is affiliated with Epiroc Rock Drills (an industrial company), but no grant, contract, or funding source is acknowledged.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors' institutional affiliations are clearly disclosed on the title page: Örebro University and Epiroc Rock Drills for Basic; Örebro University for Giaretta.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No funder is disclosed, making this criterion not assessable. The industrial affiliation (Epiroc Rock Drills) does not appear to create outcome-relevant conflict since the survey does not evaluate Epiroc products.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests or financial interests declaration appears anywhere in the paper.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "LLMs are minimally defined as 'Machine Learning models designed for natural language processing,' but core evaluative terms like 'secure code,' 'vulnerability detection effectiveness,' and what thresholds constitute 'high' or 'low' false positive rates are not defined. CWE numbers are used without defining the classification system for readers unfamiliar with it.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 1 explicitly states three aims (explore vulnerabilities in LLM-generated code, evaluate detection/fixing effectiveness and prompting strategies, investigate data poisoning impact) and Section 2 explicitly differentiates the contribution from prior surveys.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 2 provides substantive engagement with seven related surveys (Negri et al., Yao et al., Zhou et al., Chen et al., Xu et al., Hou et al., Zhang et al.) and explicitly articulates how this paper extends each one, rather than merely listing them.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "survey": {
    117       "search_and_selection": {
    118         "search_strategy_reproducible": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Section 3.2 provides three complete Boolean search strings verbatim, including all OR/AND operators and keyword variants, sufficient for independent replication.",
    122           "source": "haiku"
    123         },
    124         "inclusion_exclusion_explicit": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Table 1 lists five inclusion criteria (IN1-IN5) and five exclusion criteria (EX1-EX5) with clear logical rules (must satisfy IN1 AND IN2 AND (IN3 OR IN4 OR IN5)).",
    128           "source": "haiku"
    129         },
    130         "prisma_or_structured_protocol": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "The paper follows Petersen et al.'s SLR guidelines with three documented phases (Planning, Conducting, Reporting) and a methodology diagram (Figure 1), constituting a structured protocol even if not PRISMA.",
    134           "source": "haiku"
    135         },
    136         "search_terms_provided": {
    137           "applies": true,
    138           "answer": true,
    139           "justification": "All keyword sets (Sets 1-5) and all three full search strings are provided in Section 3.2.",
    140           "source": "haiku"
    141         },
    142         "databases_listed": {
    143           "applies": true,
    144           "answer": true,
    145           "justification": "Inclusion criterion IN2 explicitly lists: IEEE Xplore, ACM Digital Library, ScienceDirect, SpringerLink, and arXiv.",
    146           "source": "haiku"
    147         },
    148         "screening_process_documented": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "No paper counts are provided at any screening stage—no initial retrieval count, no count after title/abstract screening, no count after full-text review. The total number of included papers cannot be determined from the methodology section alone, which is a fundamental SLR reporting failure.",
    152           "source": "haiku"
    153         },
    154         "review_scope_justified": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The paper justifies imposing no date restrictions and including arXiv, but provides no rationale for choosing these four specific databases over alternatives (Scopus, Google Scholar, Web of Science), leaving the completeness of coverage unjustified.",
    158           "source": "haiku"
    159         }
    160       },
    161       "synthesis_quality": {
    162         "conflicting_findings_acknowledged": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Tables 5 and 6 explicitly partition studies into those where LLMs outperformed SATs vs. those where SATs outperformed LLMs, and Section 5.3 directly narrates the contradiction and its nuances.",
    166           "source": "haiku"
    167         },
    168         "quality_assessment_of_sources": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of individual included papers is performed. Studies using controlled lab settings are synthesized alongside user studies and benchmark evaluations without weighting or quality scoring.",
    172           "source": "haiku"
    173         },
    174         "publication_bias_discussed": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "The paper never discusses publication bias or acknowledges that published studies are more likely to report positive LLM capabilities, which is especially relevant given the rapidly evolving and commercially incentivized field.",
    178           "source": "haiku"
    179         },
    180         "quantitative_synthesis_present": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The paper performs vote counting across studies—Table 2 counts how many of 20 papers identify each vulnerability category, and Figure 2 presents a bar chart of these counts. This constitutes a minimal but present quantitative synthesis.",
    184           "source": "haiku"
    185         },
    186         "recommendations_supported_by_evidence": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "The RQ answers in Section 10.1 are directly derived from the reviewed literature and cite specific studies as evidence; conclusions such as 'CoT prompting appears to be more consistent in improving accuracy' are supported by the multiple studies summarized in Section 8.",
    190           "source": "haiku"
    191         }
    192       }
    193     }
    194   },
    195   "claims": [
    196     {
    197       "claim": "Injection vulnerabilities (SQL injection, XSS) are the most prevalent security issue in LLM-generated code, appearing in 16 of 20 reviewed studies.",
    198       "evidence": "Table 2 and Figure 2 explicitly count occurrences across all 20 papers; Section 4.1 provides specific study-level evidence including Tóth et al. on 2500 GPT-4-generated PHP sites.",
    199       "supported": "strong"
    200     },
    201     {
    202       "claim": "LLMs generally detect more vulnerabilities than traditional static analysis tools but exhibit significantly higher false positive rates (up to 97%).",
    203       "evidence": "Tables 5-6 partition 14+ studies; Çetin et al. found GPT-4 with 63% FP rate; Ozturk et al. found ChatGPT at 91% FP vs. best SAT at 82%.",
    204       "supported": "moderate"
    205     },
    206     {
    207       "claim": "CoT prompting consistently outperforms zero-shot and few-shot prompting for vulnerability detection and fixing tasks.",
    208       "evidence": "Section 8.3 synthesizes multiple studies; Zhang et al. showed +21.6% C/C++ accuracy with CoT; Nong et al.'s VSP achieved 97.65% vs. 65.88% baseline on SARD.",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "Fine-tuning LLMs on security-specific datasets dramatically improves vulnerability detection, with one study achieving F1 of 97% vs. ~51% for non-fine-tuned models.",
    213       "evidence": "Section 5.5; Guo et al. [38]: CodeLlama-7b-fine-tuned at 97% F1 vs. ~51% for general-purpose models on the same task.",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Poisoning less than 3% of training data can cause LLMs to generate vulnerable code that evades both SAT and LLM detection.",
    218       "evidence": "Cotroneo et al. [25] in Section 9.1: <3% poisoned data triggered insecure suggestions; Yan et al. CodeBreaker achieved 92% SAT evasion and ~75% LLM evasion.",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "No published research addresses how data poisoning specifically affects LLMs' ability to detect and fix vulnerabilities (as opposed to generate them).",
    223       "evidence": "Section 9.2 explicitly states this gap: 'there are currently no studies that examine its particular effects on vulnerability detection and fixing tasks.'",
    224       "supported": "strong"
    225     },
    226     {
    227       "claim": "ChatGPT-generated code contains 20% fewer vulnerabilities than equivalent Stack Overflow code snippets.",
    228       "evidence": "Hamer et al. [40] in Section 4.6: comparison study of ChatGPT-generated vs. Stack Overflow code across 216 snippets.",
    229       "supported": "weak"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "survey",
    234     "meta-analysis"
    235   ],
    236   "key_findings": "LLM-generated code introduces security vulnerabilities across 10 categories, with injection vulnerabilities most prevalent (16/20 studies); despite these risks, one study found that ChatGPT-generated code contains fewer vulnerabilities than human-written Stack Overflow code. LLMs generally outperform static analysis tools in vulnerability detection but suffer from unacceptably high false positive rates (up to 97%), limiting practical adoption. Chain-of-thought prompting consistently outperforms zero-shot and few-shot approaches across detection and fixing tasks, while fine-tuning on security-specific data yields dramatic F1 improvements. Data poisoning with under 3% contaminated training data can covertly compromise secure code generation; research on how poisoning affects detection and fixing capabilities is entirely absent from the literature.",
    237   "red_flags": [
    238     {
    239       "flag": "Missing screening counts",
    240       "detail": "No paper counts are reported at any stage of the selection process (initial retrieval, title/abstract screening, full-text review). The total number of included papers cannot be determined from the methods section, which is a fundamental SLR reporting failure that prevents verification of the review's comprehensiveness."
    241     },
    242     {
    243       "flag": "Minimal threats to validity",
    244       "detail": "Section 10.3 consists of approximately two sentences: one generic observation about LLMs evolving rapidly and one noting prompt variation across RQ1 studies. No coverage of search completeness, inter-rater reliability in study selection, or publication bias."
    245     },
    246     {
    247       "flag": "No quality assessment of sources",
    248       "detail": "Individual studies are synthesized without any quality scoring or risk-of-bias assessment. Controlled lab experiments, industry-funded studies, and preprints are treated with equal weight, inflating confidence in synthesis claims."
    249     },
    250     {
    251       "flag": "Publication bias unaddressed",
    252       "detail": "The paper does not acknowledge that positive LLM results are more likely to be published, which is particularly relevant given the commercial stakes of the vendors (OpenAI, GitHub, Google) behind the models under review."
    253     },
    254     {
    255       "flag": "Overgeneralized summary claims",
    256       "detail": "Summary statements like 'LLMs often produce code with security vulnerabilities' and 'CoT prompting outperforms zero-shot' do not specify which LLMs, which benchmarks, or which vulnerability types—obscuring the significant heterogeneity documented in the reviewed studies."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    262       "relevance": "Foundational empirical study on Copilot security; found ~40% of generated programs vulnerable, benchmarked across 89 high-risk cybersecurity scenarios"
    263     },
    264     {
    265       "title": "Do Users Write More Insecure Code with AI Assistants?",
    266       "relevance": "User study showing AI-assisted users produced more insecure code than control group, particularly in cryptography tasks"
    267     },
    268     {
    269       "title": "You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion",
    270       "relevance": "First demonstration that code completion models are vulnerable to data poisoning attacks; introduced targeted attack concept"
    271     },
    272     {
    273       "title": "TrojanPuzzle: Covertly Poisoning Code-Suggestion Models",
    274       "relevance": "Advanced poisoning attack hiding malicious code in docstrings/comments to evade static analysis detection"
    275     },
    276     {
    277       "title": "Vulnerability Detection with Code Language Models: How Far Are We?",
    278       "relevance": "Key study showing benchmark inflation: 68.26% F1 on BigVul vs 3.09% on realistic dataset, central to proxy-outcome gap discussion"
    279     },
    280     {
    281       "title": "Lost at C: A User Study on the Security Implications of Large Language Model Code Assistants",
    282       "relevance": "User study showing LLMs do not significantly increase severe security bug rates in C programming tasks"
    283     },
    284     {
    285       "title": "A Systematic Literature Review on the Impact of AI Models on the Security of Code Generation",
    286       "relevance": "Closest related SLR; this paper explicitly extends it by adding a vulnerability categorization taxonomy and data poisoning coverage"
    287     },
    288     {
    289       "title": "Vulnerabilities in AI Code Generators: Exploring Targeted Data Poisoning Attacks",
    290       "relevance": "Demonstrated <3% poisoned training data sufficient to compromise secure code generation across CodeBERT and CodeT5+"
    291     },
    292     {
    293       "title": "Poisoned ChatGPT Finds Work for Idle Hands: Exploring Developers' Coding Practices with Insecure Suggestions",
    294       "relevance": "In-lab study with 30 developers using poisoned CodeGen model; showed poisoning increases developers' likelihood of introducing insecure code"
    295     },
    296     {
    297       "title": "No Need to Lift a Finger Anymore? Assessing the Quality of Code Generation by ChatGPT",
    298       "relevance": "Systematic empirical assessment on 728 algorithmic problems; found 91.8% of memory vulnerabilities were null pointer issues; demonstrated multi-round prompting fixes 89% of vulnerabilities"
    299     }
    300   ],
    301   "engagement_factors": {
    302     "practical_relevance": {
    303       "score": 3,
    304       "justification": "Directly actionable for practitioners using LLMs for code generation or security review: categorizes vulnerability types, ranks prompting strategies, and documents poisoning risks with specific attack vectors."
    305     },
    306     "surprise_contrarian": {
    307       "score": 1,
    308       "justification": "Findings largely confirm existing intuitions (LLMs are imperfect, false positives are high); the TrojanPuzzle/CodeBreaker poisoning results are somewhat surprising but not the paper's primary contribution."
    309     },
    310     "fear_safety": {
    311       "score": 2,
    312       "justification": "Data poisoning with <3% contaminated data and attacks that evade 92% of SATs are legitimate AI supply-chain risk concerns with direct safety implications for software deployed in production."
    313     },
    314     "drama_conflict": {
    315       "score": 1,
    316       "justification": "The field has commercial tensions (GitHub Copilot, ChatGPT vendors vs. security community) but the paper takes a neutral academic tone without dramatizing the conflict."
    317     },
    318     "demo_ability": {
    319       "score": 2,
    320       "justification": "The specific prompting strategies (CoT, role-oriented, few-shot) can be immediately tried by readers in current LLM interfaces; the vulnerability scenarios described are replicable."
    321     },
    322     "brand_recognition": {
    323       "score": 1,
    324       "justification": "Authors are from Örebro University (not a top-tier AI lab) and Epiroc Rock Drills (industrial, low AI brand recognition); no famous model or product is being introduced."
    325     }
    326   },
    327   "hn_data": {
    328     "threads": [
    329       {
    330         "hn_id": "42476192",
    331         "title": "Compiling C to Safe Rust, Formalized",
    332         "points": 291,
    333         "comments": 157,
    334         "url": "https://news.ycombinator.com/item?id=42476192",
    335         "created_at": "2024-12-20T23:30:03Z"
    336       },
    337       {
    338         "hn_id": "41937780",
    339         "title": "Dynamic Models of Gentrification",
    340         "points": 43,
    341         "comments": 32,
    342         "url": "https://news.ycombinator.com/item?id=41937780",
    343         "created_at": "2024-10-24T17:49:30Z"
    344       },
    345       {
    346         "hn_id": "43442131",
    347         "title": "Quantitative Finance: Kronecker-Factored Approximate Curvature Deep Hedging",
    348         "points": 5,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=43442131",
    351         "created_at": "2025-03-22T00:24:21Z"
    352       },
    353       {
    354         "hn_id": "38831285",
    355         "title": "Towards Detecting Cascades of Biased Medical Claims on Twitter",
    356         "points": 3,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=38831285",
    359         "created_at": "2024-01-01T12:06:57Z"
    360       },
    361       {
    362         "hn_id": "42476328",
    363         "title": "Affirmative Resolution of Bourgain's Slicing Problem",
    364         "points": 2,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=42476328",
    367         "created_at": "2024-12-20T23:54:08Z"
    368       },
    369       {
    370         "hn_id": "42687845",
    371         "title": "A Framework for Training and Deploying Language Models at the Edge Computers",
    372         "points": 1,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=42687845",
    375         "created_at": "2025-01-13T19:38:18Z"
    376       }
    377     ],
    378     "top_points": 291,
    379     "total_points": 345,
    380     "total_comments": 189
    381   }
    382 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs