scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32905B)
      1 {
      2   "paper": {
      3     "title": "Secure Coding with AI – From Detection to Repair: Vulnerability Analysis and Fixing Using Large Language Models",
      4     "authors": [
      5       "Vladislav Belozerov",
      6       "Peter J Barclay",
      7       "Ashkan Sami"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2504.20814"
     12   },
     13   "scan_version": 3,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "methodology_tags": ["benchmark-eval", "observational"],
     16   "key_findings": "Analysis of 2,315 C/C++/C# code snippets from the DevGPT dataset identified 56 confirmed vulnerabilities in 48 files. LLM vulnerability detection improved substantially from October 2024 (GPT-4o at ~53-56%) to September 2025 (GPT-4.1, GPT-5, Claude Opus 4.1 at ~78-82%), though data contamination cannot be ruled out. User-provided code and GPT-generated code showed roughly equal vulnerability rates (25 vs 31 of 56 issues). All models exhibited false positives and sycophantic behavior, over-reporting issues even in safe code.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper provides a GitHub repository with software tools [17] (https://github.com/vlad2010/MS_DataEngineering_Public) and a results repository [34] (https://github.com/vlad2010/MSc_DataEngineering_Results)."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The DevGPT dataset is publicly available on Zenodo [14]. Results including LLM outputs and analysis are shared in the GitHub results repository [34]. Section 10.5 confirms data availability."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "While static scanner versions are listed in Table 5 (Snyk 1.1294.0, Semgrep 1.87.0, FlawFinder 2.0.19, CppCheck 2.14.0), there is no requirements.txt, Dockerfile, or detailed environment specification for reproducing the Python tooling or analysis pipeline."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology conceptually but does not include a README or scripts to replicate the main experiments end-to-end."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results are reported as point estimates (e.g., '82.14%', '78.57%') with no confidence intervals or error bars. Tables 12, 17, and 18 show only raw counts and percentages."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper claims models 'performed about the same, roughly 80%' and that there was 'substantial improvement' from 2024 to 2025, but no statistical significance tests are applied to any comparison."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper provides baseline context for improvements: 'from roughly 50% to around 75–80%' (abstract), and per-model breakdowns (e.g., GPT-4.1 detected 82.14% vs GPT-4o's ~56%), allowing readers to assess magnitude."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The sample of 56 confirmed vulnerabilities in 48 files is not justified by power analysis or any rationale. Section 7.2 acknowledges 'The sample size is relatively small' but offers no justification for why this size is adequate for the claims."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Single-run results are reported for each model. Section 7.3 acknowledges 'the inherent non-determinism of LLMs' outputs' but no variance, standard deviation, or repeat-run results are provided."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The October 2024 GPT-4o results serve as a temporal baseline for the 2025 experiment. Static scanners (Flawfinder, Snyk, Semgrep, Cppcheck) serve as reference detection tools in the detection matrices (Tables 10, 15, 16)."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The September 2025 experiment uses GPT-4.1, GPT-5, and Claude Opus 4.1 — current-generation models at the time of the experiment. Static scanners also used recent versions (Table 5)."
     77       },
     78       "ablation_study": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "The evaluation involves prompting LLMs with a single fixed prompt on code snippets. There is no multi-component system whose components could be ablated."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Two distinct metrics are reported for each model: detection rate (whether the LLM identified the vulnerability) and fixing rate (whether the LLM produced a correct fix). Tables 12, 17, and 18 present both."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "All results were manually reviewed. Section 5.3: 'results for all 114 files were manually reviewed by the first author and the findings were discussed and validated by the second and third authors.' LLM outputs were also manually verified."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "No tuning was performed on the evaluation data. The paper uses zero-shot prompting with a fixed prompt (Section 6.2). Temperatures were set based on general literature recommendations [3], not based on this evaluation data. The evaluation set functions as a pure test set."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down by programming language (Table 9), by CWE type (Table 8), by source of code (user vs GPT-generated, Tables 17-18), by model (Table 12), and by DevGPT snapshot (Tables 13-14)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Multiple detailed failure cases are discussed: false positive on fprintf (Listing 1, Finding 6), sycophantic over-correction of safe code (Listings 2-3, Finding 8), incorrect CWE classification (Listing 4, Finding 9), and silent fix without explanation (Finding 11)."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports GPT-4o's poor 2024 performance (~50% success, Finding 3), cases where all models failed to detect issues (e.g., CWE-22 path traversal in Tables 15-16), and sycophantic behavior where models reported non-existent vulnerabilities (Finding 6)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims about 56 vulnerabilities in 48 files, per-model detection/fixing rates (46/44/45 detected, 42/44/43 fixed), improvement from ~50% to ~75-80%, and equal vulnerability rates in user vs GPT code are all substantiated in Sections 5-6 and Tables 12-18."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "Finding 5 states 'vulnerability detection and fixing performance improved significantly' and 'This represents a substantial increase in capability within one year.' This implies genuine model improvement but the study design — different models, different time points, partially different datasets, public availability of the preprint — cannot support causal inference. Section 7.2 acknowledges contamination but the framing still treats it as genuine improvement."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title 'Secure Coding with AI – From Detection to Repair' and abstract language about 'current LLMs' are broader than the tested setting of 56 vulnerabilities in C/C++/C# from one dataset. Section 7.2 acknowledges limited generalizability but the title and framing overreach."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 7 discusses multiple alternative explanations: data contamination from the preprint being public (7.2), false negatives from scanners (7.1), selection bias in manual file review (7.1), and non-determinism of LLM outputs (7.3)."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper measures vulnerability detection rate and fixing rate against scanner-confirmed issues, and frames results accordingly. The measurements match the claim granularity — they claim to measure detection/fixing ability and measure exactly that. They acknowledge scanners may miss issues (Section 7.1), noting the proxy limitation."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Section 6.1 specifies GPT-5 (2025-08-07), GPT-4.1 (2025-04-14), and Claude Opus 4.1 (2025-08-05) with dates, but the first experiment uses 'GPT-4o, without specifying version and with the default temperature 1.0.' GPT-4o, a key baseline model, lacks a version or snapshot identifier."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The exact prompt text used in experiments is provided verbatim in Section 6.2, including the original misspelling. The paper states the same prompt was used in 2025 with the spelling corrected."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Sections 6.1 and 6.3 report temperatures: GPT-4o default 1.0, GPT-5 temperature 1.0, GPT-4.1 temperature 0.1, Claude Opus 4.1 temperature 0.1. Temperature selection rationale is referenced [3]."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No agentic scaffolding is used. The paper uses simple single-shot API prompts with no tools, retry logic, or multi-step reasoning."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Sections 4 and 5.2 describe the data pipeline in detail: DevGPT extraction, folder structure, file selection algorithm with explicit criteria (scanner detection required, one file per sharing, deduplication, snapshot exclusion). The pipeline stages are documented with counts (2,315 files → 114 selected → 48 with confirmed issues)."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 7 'Threats to Validity' contains three substantive subsections: Internal Validity (7.1), External Validity (7.2), and Construct Validity (7.3)."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 7 discusses study-specific threats: incomplete vulnerability detection by scanners (7.1), subjective bias in file selection by three authors (7.1), data contamination from the preprint being publicly available between experiments (7.2), and LLM non-determinism affecting reproducibility (7.3)."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 7.2 states: 'our analysis was focused on DevGPT and was limited to 56 security issues across 48 files, which may not be sufficient to generalise the findings.' They explicitly bound to C/C++/C# and the DevGPT dataset."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The DevGPT dataset is publicly available on Zenodo [14]. All LLM responses and analysis results are in the GitHub repository [34]. The original code snippets and model outputs can be independently verified."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 4 describes data extraction from DevGPT in detail: version used (v10), snapshots selected (20231012 and 20240514), folder structure, extraction tools, and file organization. Tables 2 and 3 provide dataset statistics."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants are involved. The data source is the publicly available DevGPT dataset, which is a curated research dataset of developer-AI interactions."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The full pipeline is documented: DevGPT extraction (Section 4), static scanning (Section 5.1), file selection algorithm with criteria (Section 5.2, from 2,315 files to 114 selected), manual validation (Section 5.3, 56 confirmed issues in 48 files), and LLM evaluation (Section 6). Figure 1 provides a dataflow diagram."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Section 10.1 explicitly states: 'The research received no external funding.'"
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "All three authors are listed as affiliated with Edinburgh Napier University, Edinburgh, Scotland. They are independent academic researchers evaluating third-party commercial LLMs."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The research received no external funding (Section 10.1), so the question of funder independence is not applicable."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Section 10.6 states: 'The authors have no competing interests to declare.'"
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No training data cutoff dates are stated for any of the models used (GPT-4o, GPT-4.1, GPT-5, Claude Opus 4.1). This is significant because the DevGPT dataset is public and could have been in training data."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Section 7.2 explicitly discusses this: 'since our paper and dataset were publicly available online between these experiments, the observed improvement may reflect data leakage (the LLMs being trained on our online paper and DevGPT being online) rather than genuine model performance improvements.'"
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "Section 7.2 addresses contamination as a major threat: 'This threatens the generalisability of conclusions about models' improvements over time, as the results may not reflect true capability gains but rather exposure to our specific evaluation data.' Also noted in Section 6.3."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved. The study analyzes code snippets from a public dataset and evaluates LLM outputs."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. Section 10.2 confirms: 'Not applicable' for ethical approval."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. The study evaluates code and LLM outputs, not human behavior."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants. File selection criteria for code snippets are documented in Section 5.2, but these are not participant recruitment criteria."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants and no experimental conditions requiring randomization."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants. The manual review was conducted by the authors who were aware of the study design."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No API costs, tokens consumed, or wall-clock time are reported for any of the LLM evaluations across the 114 files and 4 models."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No total computational budget, API spend, or processing time is stated for either the 2024 or 2025 experiments."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "All results are single-run per model. Section 7.3 acknowledges 'the model can produce different responses to the same prompt on different occasions' but no multi-seed or repeat-run results are reported."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of experimental runs is never explicitly stated. The results appear to be from a single evaluation per model, but this is implied rather than stated."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Temperature settings were selected based on general literature recommendations [3] rather than a systematic search. No search budget is reported, and the rationale for using 0.1 for some models and 1.0 for others is not thoroughly justified."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "Section 6.1 justifies temperature selection: 'Temperature was selected based on the observation that lower temperature is better for technical and code-generation tasks, and LLMs behave more predictably at lower temperatures [3].' While this doesn't explain why GPT-5 used 1.0, the general selection criterion is stated."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The paper makes multiple comparisons across 4 models, 2 time points, 2 code sources, and 3 languages without any statistical tests at all, let alone multiple comparison corrections."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "The authors evaluate third-party commercial LLMs (OpenAI, Anthropic), not their own system. Self-comparison bias is not applicable."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": false,
    328         "answer": false,
    329         "justification": "All models are accessed via commercial APIs with simple single-shot prompts. Compute differences between models are not under the authors' control and are not meaningfully comparable."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper does not discuss whether the 56 scanner-confirmed vulnerabilities constitute a valid benchmark for measuring LLM security capabilities. Issues like whether static-scanner-detectable vulnerabilities are representative of real-world security problems, or whether the selection methodology biases toward certain vulnerability types, are not addressed."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "No scaffolding is used. Simple single-shot API prompts are sent directly to each model."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Section 7.2 explicitly discusses temporal leakage: the DevGPT dataset and the preprint were publicly available between the 2024 and 2025 experiments, meaning models may have trained on the specific code and vulnerabilities being tested."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The paper does not discuss whether the prompt itself (which explicitly asks to find security issues) provides hints that would not be available in real-world usage, or whether the framing of the evaluation leaks information about expected findings."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "The paper does not discuss whether the code snippets from DevGPT share structural similarities (e.g., same developers, similar coding patterns, duplicate or near-duplicate code). Snippets from the same 'Sharing' are filtered but cross-sharing similarity is not addressed."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No concrete leakage detection method is applied. The paper only discusses contamination risk conceptually in Section 7.2 without using canary strings, membership inference, n-gram overlap, or other detection techniques."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "Out of 2,315 code snippets from DevGPT, 56 vulnerabilities were confirmed in 48 files after static scanning and manual review.",
    368       "evidence": "Section 5.3 and Table 12: static scanners detected potential issues in 114 files, manual review confirmed 56 vulnerabilities across 48 files. CWE distribution in Table 8.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "LLM vulnerability detection and fixing improved substantially from October 2024 (~53-56%) to September 2025 (~75-82%).",
    373       "evidence": "Section 6.2-6.3 and Table 12: GPT-4o detected 56%/fixed 53% in 2024; GPT-4.1 detected 82.1%/fixed 75%, GPT-5 detected and fixed 78.6%, Claude Opus 4.1 detected 80.4%/fixed 76.8% in 2025.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "All three 2025 LLMs performed comparably, at roughly 80% detection and fixing accuracy (75-82% across models).",
    378       "evidence": "Table 12 shows GPT-4.1 at 82.1%/75%, GPT-5 at 78.6%/78.6%, Claude Opus 4.1 at 80.4%/76.8%. Finding 4 summarizes this.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "LLM-generated code is about as likely to contain vulnerabilities as developer-written code.",
    383       "evidence": "Tables 17-18: 25 of 56 issues in user-provided code vs 31 in GPT-generated code. Finding 13 states 44.6% of issues were in user code.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "LLMs exhibit sycophantic behavior, reporting non-existent vulnerabilities and making unnecessary modifications to safe code.",
    388       "evidence": "Findings 6-8: Listing 1 shows a single fprintf line where all LLMs reported multiple non-existent CWEs. Listings 2-3 show unnecessary modification of already-safe memcpy code. Models 'are trying to satisfy the user who asked them to find security problems.'",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "C has the highest vulnerability detection density among the three languages tested.",
    393       "evidence": "Table 9: C had 10.4 lines per detected issue vs C++ at 83.8 and C# at 2,937.5. Finding 2.",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "Very small sample size",
    400       "detail": "Only 56 confirmed vulnerabilities across 48 files form the evaluation set. This is extremely small for claims about LLM security capabilities, especially given 23 different CWE types. Many CWE categories have only 1-2 instances, making per-category conclusions unreliable."
    401     },
    402     {
    403       "flag": "No statistical tests or uncertainty quantification",
    404       "detail": "All comparisons between models and between time periods are based on raw percentages from single runs with no confidence intervals, significance tests, or repeat experiments. With n=56, the difference between 78.6% and 82.1% could easily be noise."
    405     },
    406     {
    407       "flag": "Single-run non-deterministic evaluation",
    408       "detail": "Section 7.3 acknowledges LLM non-determinism but all results come from single runs per model. The paper itself states 'The model can produce different responses to the same prompt on different occasions' yet does not run multiple trials."
    409     },
    410     {
    411       "flag": "Data contamination between experiments",
    412       "detail": "The preprint and DevGPT dataset were publicly available between the October 2024 and September 2025 experiments. The observed improvement from ~50% to ~78% may reflect training on the specific evaluation data rather than genuine capability gains. The paper acknowledges this but still frames results as model improvement."
    413     },
    414     {
    415       "flag": "Inconsistent temperature settings across models",
    416       "detail": "GPT-5 used temperature 1.0 while GPT-4.1 and Claude Opus 4.1 used 0.1. Higher temperature increases output variability, potentially disadvantaging GPT-5 in the comparison. The paper references general literature for the 0.1 choice but does not explain why GPT-5 was not also set to 0.1."
    417     },
    418     {
    419       "flag": "Manual selection introduces bias",
    420       "detail": "The file selection process (Section 5.2) involves multiple subjective decisions: which files are 'most interesting,' manual deduplication, and single-file selection per 'Sharing.' Section 7.1 acknowledges this may introduce bias."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Using GPT as a static application security testing tool",
    426       "authors": ["A. Bakhshandeh", "A. Keramatfar", "A. Norouzi", "M. M. Chekidehkhoun"],
    427       "year": 2023,
    428       "arxiv_id": "2308.14434",
    429       "relevance": "Directly compares GPT-3.5 with static analysis tools (Bandit, Semgrep, SonarQube) for vulnerability detection in Python code."
    430     },
    431     {
    432       "title": "Evaluation of GPT Model for Vulnerability Detection",
    433       "authors": ["A. Cheshkov", "P. Zadorozhny", "R. Levichev"],
    434       "year": 2023,
    435       "arxiv_id": "2304.07232",
    436       "relevance": "Evaluates GPT-3 and GPT-3.5-turbo for vulnerability detection in Java, finding significant limitations and biases."
    437     },
    438     {
    439       "title": "GPT for Vulnerability Detection, Classification, and Repair: How Far Are We?",
    440       "authors": ["M. Fu", "C. K. Tantithamthavorn", "V. Nguyen", "T. Le"],
    441       "year": 2023,
    442       "doi": "10.1109/APSEC60848.2023.00085",
    443       "relevance": "Evaluates GPT on four vulnerability-related tasks using real-life datasets, finding it underperforms compared to specialized models like CodeBERT."
    444     },
    445     {
    446       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    447       "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"],
    448       "year": 2021,
    449       "arxiv_id": "2108.09293",
    450       "relevance": "Early assessment of Copilot code generation security, finding instances of insecure coding practices."
    451     },
    452     {
    453       "title": "Do Users Write More Insecure Code with AI Assistants?",
    454       "authors": ["N. Perry", "M. Srivastava", "D. Kumar", "D. Boneh"],
    455       "year": 2023,
    456       "doi": "10.1145/3576915.3623157",
    457       "relevance": "User study examining whether AI coding assistants lead to more insecure code, directly relevant to the impact of LLM-assisted development."
    458     },
    459     {
    460       "title": "How secure is code generated by GPT?",
    461       "authors": ["R. Khoury", "A. R. Avila", "J. Brunelle", "B. M. Camara"],
    462       "year": 2023,
    463       "doi": "10.1109/SMC53992.2023.10394237",
    464       "relevance": "Evaluates 21 GPT-generated programs across five languages for specific vulnerabilities including SQL injection and memory corruption."
    465     },
    466     {
    467       "title": "Exploring the limits of GPT in software security applications",
    468       "authors": ["F. Wu", "Q. Zhang", "A. P. Bajaj", "T. Bao", "N. Zhang", "R. Wang", "C. Xiao"],
    469       "year": 2023,
    470       "arxiv_id": "2312.05275",
    471       "relevance": "Compares GPT-3.5 and GPT-4 on seven security tasks including vulnerability detection, repair, debugging, and fuzzing."
    472     },
    473     {
    474       "title": "Fight Fire With Fire: How Much Can We Trust GPT on Source Code-Related Tasks?",
    475       "authors": ["X. Yu", "L. Liu", "X. Hu", "J. W. Keung", "J. Liu", "X. Xia"],
    476       "year": 2024,
    477       "doi": "10.1109/TSE.2024.3492204",
    478       "relevance": "Evaluates GPT's self-verification ability, finding it frequently misjudges its own code and shows self-contradictory behavior."
    479     },
    480     {
    481       "title": "Do LLMs Consider Security? An Empirical Study on Responses to Programming Questions",
    482       "authors": ["A. Sajadi", "B. Le", "A. Nguyen", "K. Damevski", "P. Chatterjee"],
    483       "year": 2025,
    484       "arxiv_id": "2502.14202",
    485       "relevance": "Tests Claude 3, GPT-4, and Llama 3 on Stack Overflow snippets with known vulnerabilities, finding models don't warn about security unless prompted."
    486     },
    487     {
    488       "title": "Security and Quality in LLM-Generated Code: A Multi-Language, Multi-Model Analysis",
    489       "authors": ["M. Kharma", "S. Choi", "M. AlKhanafseh", "D. Mohaisen"],
    490       "year": 2025,
    491       "arxiv_id": "2502.01853",
    492       "relevance": "Comprehensive study of modern LLMs (GPT-4, Claude-3.5, Gemini-1.5, Llama-3) on 200 coding tasks identifying hidden security risks."
    493     },
    494     {
    495       "title": "Just another copy and paste? Comparing the security vulnerabilities of GPT generated code and StackOverflow answers",
    496       "authors": ["S. Hamer", "M. d'Amorim", "L. Williams"],
    497       "year": 2024,
    498       "relevance": "Compares GPT-generated Java code with StackOverflow answers using CodeQL, finding GPT produced 20% fewer vulnerabilities but still 248 issues."
    499     },
    500     {
    501       "title": "Security Weaknesses of Copilot Generated Code in GitHub: An empirical study",
    502       "authors": ["Y. Fu", "P. Liang", "A. Tahir", "Z. Li", "M. Shahin", "J. Yu", "J. Chen"],
    503       "year": 2023,
    504       "relevance": "Empirical study of Copilot-generated code security weaknesses found in actual GitHub repositories."
    505     },
    506     {
    507       "title": "A Qualitative Study on Using GPT for Software Security: Perception vs. Practicality",
    508       "authors": ["M. M. Kholoosi", "M. A. Babar", "R. Croft"],
    509       "year": 2024,
    510       "doi": "10.1109/TPS-ISA62245.2024.00022",
    511       "relevance": "Qualitative study of security professionals' experiences using GPT-3.5 for vulnerability detection and penetration testing."
    512     }
    513   ],
    514   "engagement_factors": {
    515     "practical_relevance": {
    516       "score": 2,
    517       "justification": "Practitioners can use the findings to calibrate trust in LLM security analysis, though no new tool or technique is provided."
    518     },
    519     "surprise_contrarian": {
    520       "score": 1,
    521       "justification": "Results mostly confirm expected findings — LLMs are imperfect at security analysis — though the improvement in detection rates from ~53-56% to ~78-82% and the sycophancy observations add some nuance."
    522     },
    523     "fear_safety": {
    524       "score": 2,
    525       "justification": "Raises concerns about LLM-generated code security and the risk of confident false reassurance, relevant to developers relying on AI coding assistants."
    526     },
    527     "drama_conflict": {
    528       "score": 0,
    529       "justification": "No controversy, no conflict with vendors or other researchers. Academic evaluation with straightforward findings."
    530     },
    531     "demo_ability": {
    532       "score": 1,
    533       "justification": "Code tools and results are on GitHub but there is no interactive demo or pip-installable tool."
    534     },
    535     "brand_recognition": {
    536       "score": 2,
    537       "justification": "Evaluates GPT-5, GPT-4.1, and Claude Opus 4.1 — well-known models from leading AI labs — but the paper itself comes from a lesser-known university."
    538     }
    539   }
    540 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs