scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23844B)
      1 {
      2   "paper": {
      3     "title": "Security Degradation in Iterative AI Code Generation: A Systematic Analysis of the Paradox",
      4     "authors": ["Shivani Shukla", "Himanshu Joshi", "Romilla Syed"],
      5     "year": 2025,
      6     "venue": "IEEE-ISTAS 2025",
      7     "arxiv_id": "2506.11022"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Iterative LLM code refinement without human intervention is associated with a 37.6% increase in critical vulnerabilities after 5 iterations, with vulnerability counts accelerating in later iterations (avg 2.1 vulnerabilities per sample in early iterations vs 6.2 in late iterations). Different prompting strategies produce distinct vulnerability profiles: efficiency-focused prompts cause memory safety issues (42.7%), feature-focused prompts cause concurrency issues (30.4%), and even security-focused prompts introduce cryptographic errors (21.1%). Code complexity correlates with vulnerability count (r=0.64), and the repeated-measures ANOVA shows significant iteration effects (F(9,90)=14.32, p<0.001, η²=0.42).",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code archive, or data release mentioned anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The 400 generated code samples and vulnerability analysis data are not released. No dataset link provided."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specification beyond stating GPT-4o with temperature=0.7 and top_p=1.0. No static analysis tool versions, no OS, no dependency specifications."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No reproduction instructions provided. The appendix includes example prompts but no scripts or step-by-step guide."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "95% CIs reported for regression coefficients (Table V, e.g., complexity β=0.64, 95% CI [0.50, 0.78]) and for the complexity-vulnerability relationship (a 14.3% vulnerability increase per 10% complexity increase, 95% CI: 10.7%-17.9%). Standard deviations reported for iteration-level vulnerability counts."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Repeated measures ANOVA (F(9,90)=14.32, p<0.001), chi-square test (χ²(33)=172.4, p<0.001), multiple regression (F(5,394)=160.2, p<0.001), and post-hoc Tukey HSD tests are reported."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes reported: η²=0.42 for ANOVA, Cramer's V=0.38 for chi-square, R²=0.67 for regression, β coefficients with CIs. Also reports '14.3% increase per 10% complexity increase.'"
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification for why 10 baseline code samples were chosen. No power analysis. 10 samples is quite small for the claims being made."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Standard deviations reported for vulnerability counts per iteration (e.g., SD=0.9, SD=1.2, SD=1.8). Standard errors reported in regression table."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The 10 verified-secure baseline code samples serve as the starting point, and the four prompting strategies are compared against each other. The design inherently has baselines (iteration 0 = secure code)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Uses GPT-4o, which was current at the time of study. The comparison is across prompting strategies and iterations rather than across models, so the contemporaneity criterion is assessed on the model choice."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study. The four prompting strategies are compared but no components are systematically removed or varied (e.g., temperature settings, different prompt structures)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics: vulnerability count, severity distribution (Critical/High/Medium/Low), vulnerability type distribution, cyclomatic complexity, lines of code, functional correctness."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Manual security code review was performed alongside static analysis tools: 'After each iteration, we performed: Static analysis using multiple tools... Manual security code review... Categorization and severity assessment' (Section III-A)."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Not applicable — this is not a train/test split evaluation. All 400 generated samples are analyzed."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results broken down by prompting strategy (Table II), by vulnerability type per strategy (Table III), and by iteration group (early/middle/late)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Three detailed case studies (Section IV-E) trace specific vulnerability evolution patterns. Section IV-F discusses where security improvements did occur (27% of security-focused iterations)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Reports that 27% of security-focused iterations resulted in net security improvements (Section IV-F), acknowledging partial success. Also notes security-focused prompts introduced the fewest vulnerabilities overall."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims ('37.6% increase in critical vulnerabilities after five iterations', 'distinct vulnerability patterns') are supported by Tables II-III and the ANOVA results in Section IV."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper frequently uses causal-adjacent language (e.g., the assumption that 'iterative LLM refinement improves code security' is said to be 'challenged', and effects are described as 'isolated'). The controlled experiment manipulates prompting strategy, but with only 10 baseline samples the design is underpowered for causal claims. The paper acknowledges the scenario is artificial but still frames conclusions causally."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "Title says 'A Systematic Analysis of the Paradox' suggesting generality, but results are from one model (GPT-4o), two languages (C, Java), and 10 baseline samples. Limitations mention single model and language constraints but the title and abstract overreach."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No discussion of alternative explanations for why vulnerabilities increase. Plausible alternatives include temperature effects, code length growth as a confound (the authors show a correlation but do not control for it), and prompt drift. The complexity-vulnerability correlation is noted but not explored as a confound."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper measures static analysis tool findings and manual review results, then frames this as 'security degradation.' No discussion of whether static analysis findings map to actual exploitable vulnerabilities. The gap between tool-detected issues and real-world security risk is not acknowledged."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "States 'OpenAI's GPT-4o' without a snapshot date or API version. GPT-4o has multiple versions (e.g., gpt-4o-2024-05-13, gpt-4o-2024-08-06). No version specified."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix A provides 5 example prompts for each of the 4 strategies (20 total). These appear to be the actual prompts used, not just descriptions."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Temperature=0.7 and top_p=1.0 explicitly stated in Section III-C."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding used. The experiment is simple iterative prompting — submit code, receive output, submit output as next input."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section III-A describes the full pipeline: baseline selection with security verification, prompt definition, iterative feedback process, and security analysis procedure. Section III-B describes the 10 baseline categories."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section VI 'Limitations and Future Work' provides substantive discussion of study limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: single model (GPT-4o), two languages only (C, Java), artificial scenario without human intervention, model evolution over time. These are specific to this study."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section VI explicitly states what was not tested: other LLMs (Claude, Llama), other languages (Rust, Go), human-AI collaborative workflows. Also acknowledges the 'pure LLM interactions without human intervention' limitation."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The 400 code samples, static analysis outputs, and manual review data are not released. No supplementary materials available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section III details the experimental procedure: 10 baseline samples, 4 strategies, 10 iterations each, using GPT-4o with specified parameters. Static analysis tools named (Clang Static Analyzer, CodeQL, SpotBugs)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. The 'samples' are code snippets, and the data source is LLM-generated code from a controlled experiment."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section III-F documents the pipeline: generated code → static analysis (3 tools) → manual review → categorization and severity assessment → statistical analysis. Section III-D lists the 12 vulnerability categories and CVSS-based severity levels."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section present. No mention of grants or sponsorship."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations clearly stated: University of San Francisco, Vector Institute for AI, University of Massachusetts Boston. None affiliated with OpenAI."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding disclosed at all, so independence cannot be assessed. Absence of disclosure is not absence of conflict."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The study evaluates GPT-4o's code generation on 10 baseline code samples. No mention of GPT-4o's training data cutoff date."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the baseline code samples or similar patterns exist in GPT-4o's training data. The model may have seen similar security patterns."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The 10 baseline samples are described categorically but their provenance is unclear. No discussion of whether similar code exists in training data."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants. The experiment generates and analyzes code samples."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "400 GPT-4o API calls made but no cost, token count, or latency information reported."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No mention of total API cost, compute resources for static analysis, or time to complete the experiment."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Temperature=0.7 introduces stochasticity but no multi-seed or multi-run analysis is reported. Each iteration appears to be a single run."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No explicit statement of how many times each iteration was run. The 400 samples appear to be from single runs (10 samples × 4 strategies × 10 iterations)."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Temperature=0.7 chosen without justification. No search over temperature, top_p, or other settings."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": false,
    309         "answer": false,
    310         "justification": "No configuration selection — a single configuration was used throughout."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Multiple statistical tests performed (ANOVA, chi-square, regression, Tukey HSD) but no family-wise error correction discussed across the set of analyses."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "The paper does not propose a new system to compare against baselines. It evaluates GPT-4o's behavior."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "Not applicable — the paper does not compare methods at different compute budgets."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether the 10 hand-selected baseline code samples are representative of real-world security-critical code. No analysis of whether static analysis tool findings map to exploitable vulnerabilities."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding used. Simple iterative prompting with no agent framework."
    336       }
    337     }
    338   },
    339   "claims": [
    340     {
    341       "claim": "37.6% increase in critical vulnerabilities after just five iterations of LLM refinement",
    342       "evidence": "Stated in abstract and supported by Table II showing 87 total critical vulnerabilities across strategies. Iteration-level data shows early avg 2.1 (SD=0.9) vs late avg 6.2 (SD=1.8) vulnerabilities per sample.",
    343       "supported": "moderate"
    344     },
    345     {
    346       "claim": "Different prompting strategies produce distinct vulnerability patterns",
    347       "evidence": "Table III shows vulnerability type distributions by strategy, with chi-square test confirming significant differences (χ²(33)=172.4, p<0.001, Cramer's V=0.38). Efficiency-focused: 42.7% memory safety; Feature-focused: 30.4% concurrency.",
    348       "supported": "strong"
    349     },
    350     {
    351       "claim": "Security vulnerabilities accumulate non-linearly across iterations, accelerating in later iterations",
    352       "evidence": "Repeated measures ANOVA (F(9,90)=14.32, p<0.001, η²=0.42). Post-hoc Tukey HSD shows significant differences between iterations 1-3 and 8-10 (p<0.001).",
    353       "supported": "moderate"
    354     },
    355     {
    356       "claim": "Even security-focused prompts introduce new vulnerabilities, particularly cryptographic implementation errors (21.1%)",
    357       "evidence": "Table II shows 38 total vulnerabilities for security-focused prompts (fewest but non-zero). Table III shows 21.1% cryptographic errors. Section IV-C provides qualitative analysis of three patterns.",
    358       "supported": "moderate"
    359     },
    360     {
    361       "claim": "Positive correlation (r=0.64) between code complexity increases and security vulnerability counts",
    362       "evidence": "Section IV-D reports r=0.64 (p<0.001), β=0.64 in multiple regression (R²=0.67). 95% CI for complexity effect: 10.7%-17.9% vulnerability increase per 10% complexity increase.",
    363       "supported": "moderate"
    364     }
    365   ],
    366   "red_flags": [
    367     {
    368       "flag": "Very small baseline sample",
    369       "detail": "Only 10 baseline code samples used. This is extremely small for generalizing about 'security degradation in iterative AI code generation.' The statistical analyses (ANOVA with N=10) have limited power and the samples may not be representative."
    370     },
    371     {
    372       "flag": "Single model tested, broad claims",
    373       "detail": "Only GPT-4o tested, but conclusions are framed as being about LLMs generally. Title says 'Iterative AI Code Generation' not 'GPT-4o Code Generation.'"
    374     },
    375     {
    376       "flag": "No reproducibility artifacts",
    377       "detail": "No code, data, or baseline samples released. The 10 baseline code samples, 400 generated outputs, and analysis scripts are all unavailable. Results cannot be independently verified."
    378     },
    379     {
    380       "flag": "Temperature stochasticity not addressed",
    381       "detail": "Temperature=0.7 introduces randomness but each iteration appears to be a single run. Different random seeds could produce very different vulnerability trajectories. No multi-run analysis."
    382     },
    383     {
    384       "flag": "Proxy-outcome gap unacknowledged",
    385       "detail": "Static analysis tool findings are treated as equivalent to security vulnerabilities. No discussion of false positive rates of static analysis tools or whether findings represent exploitable vulnerabilities."
    386     },
    387     {
    388       "flag": "Manual review methodology unclear",
    389       "detail": "Manual security code review is listed as part of the analysis but inter-rater reliability is not discussed. How many reviewers? What was the agreement rate? Were they blinded to the prompting strategy?"
    390     },
    391     {
    392       "flag": "Broken section reference",
    393       "detail": "Section II-D references 'Section ??' — a broken LaTeX cross-reference, suggesting incomplete preparation."
    394     }
    395   ],
    396   "cited_papers": [
    397     {
    398       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    399       "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"],
    400       "year": 2022,
    401       "relevance": "Foundational empirical study of Copilot security finding 40% of generated programs contained vulnerabilities."
    402     },
    403     {
    404       "title": "Do users write more insecure code with AI assistants?",
    405       "authors": ["N. Perry", "M. Srivastava", "D. Kumar", "D. Boneh"],
    406       "year": 2023,
    407       "relevance": "User study showing AI-assisted developers write less secure code with a false sense of security."
    408     },
    409     {
    410       "title": "Artificial-Intelligence Generated Code Considered Harmful: A Road Map for Secure and High-Quality Code Generation",
    411       "authors": ["C. J. Chong", "Z. Yao", "I. Neamtiu"],
    412       "year": 2024,
    413       "arxiv_id": "2409.19182",
    414       "relevance": "Found LLM-generated code lacks defensive programming constructs; noted that prompting can introduce issues in previously clean files."
    415     },
    416     {
    417       "title": "A systematic literature review on the impact of AI models on the security of code generation",
    418       "authors": ["C. Negri-Ribalta", "R. Geraud-Stewart", "A. Sergeeva", "G. Lenzini"],
    419       "year": 2024,
    420       "relevance": "SLR synthesizing 19 studies confirming AI models do not produce safe code despite mitigations."
    421     },
    422     {
    423       "title": "Cybersecurity Risks of AI-Generated Code",
    424       "authors": ["J. Ji", "J. Jun", "M. Wu", "R. Gelles"],
    425       "year": 2024,
    426       "relevance": "CSET study finding ~50% of AI-generated code contained exploitable bugs."
    427     },
    428     {
    429       "title": "Refining ChatGPT-generated code: Characterizing and mitigating code quality issues",
    430       "authors": ["Y. Liu", "T. Le-Cong", "R. Widyasari", "D. Lo", "M. Tao", "S. Han"],
    431       "year": 2024,
    432       "relevance": "Analyzed ChatGPT code refinement process finding it can introduce new quality issues."
    433     },
    434     {
    435       "title": "LLM critics help catch LLM bugs",
    436       "authors": ["N. McAleese"],
    437       "year": 2024,
    438       "arxiv_id": "2407.00215",
    439       "relevance": "Critic-based model for automated feedback on generated code, showing structured prompting improves code quality."
    440     },
    441     {
    442       "title": "LLM4CVE: Enabling Iterative Automated Vulnerability Repair with Large Language Models",
    443       "authors": ["M. Fakih"],
    444       "year": 2025,
    445       "arxiv_id": "2501.03446",
    446       "relevance": "Explores iterative LLM-based vulnerability repair — complementary perspective to this paper's degradation findings."
    447     }
    448   ]
    449 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs