scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (35581B)
      1 {
      2   "paper": {
      3     "title": "How Safe Are AI-Generated Patches? A Large-scale Study on Security Risks in LLM and Agentic Automated Program Repair on SWE-bench",
      4     "authors": [
      5       "Amirali Sajadi",
      6       "Kostadin Damevski",
      7       "Preetha Chatterjee"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2507.02976"
     12   },
     13   "scan_version": 3,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "methodology_tags": ["observational", "benchmark-eval"],
     16   "key_findings": "Llama 3.3 introduces 11x more new vulnerabilities than developers when generating patches for 20,000+ SWE-bench issues (135 vs 12), with CWE types (eval injection, insecure deserialization) distinct from developer-introduced vulnerabilities. Among agentic frameworks, OpenHands produced disproportionately more vulnerabilities (44) than AutoCodeRover (3) or HoneyComb (2), linked to its greater autonomy and tendency toward excessive file rewrites. Vulnerable patches are associated with more modified files and file types and with issues lacking code snippets and reproduction steps, whereas project-level factors (size, complexity, contributor count) showed no significant correlation with vulnerability rates.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "A replication package is provided on figshare (reference [3]: https://figshare.com/s/174a976de48f28ae1482). The paper states 'Our data and research artifacts publicly accessible through our replication package to support further research and validation of our findings.'"
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "SWE-bench is a publicly available dataset. The replication package on figshare contains research artifacts. The paper also notes that developer patches were collected via the public GitHub API."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No environment specifications (requirements.txt, Dockerfile, library versions) are provided in the paper. Tools used (CodeQL, Semgrep, Bandit, Radon) are named but their versions are not specified."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper describes the methodology at a high level but does not include step-by-step reproduction instructions, commands to run, or a description of how to use the replication package to reproduce the results."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Results are reported as point estimates (counts, means, percentages) without confidence intervals or error bars. Table 1 reports vulnerability counts, Table 3 reports means and p-values but no confidence intervals."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Mann-Whitney U tests are used with p-values reported for code-level metrics (Table 3). Chi-squared tests are used for code snippet presence (Section 3.3.2). Spearman's rank correlation with p-values is used for project-level factors (Section 3.3.3)."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Cliff's delta (δ) is reported for all Mann-Whitney U comparisons in Table 3 (e.g., δ=0.200 for files modified). Phi coefficient (φ=0.017) is reported for code snippet presence. Spearman's ρ is reported for project-level correlations."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No power analysis or formal sample size justification is provided. The authors use the full SWE-bench dataset (21,294 instances) for Llama and acknowledge the limited sample for agentic frameworks in Section 4, but provide no formal justification for adequacy."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No variance, standard deviation, or spread measures are reported for the main results. Llama patch generation appears to be a single run. Means are reported in Tables 3-4 without standard deviations."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Developer-written patches serve as the baseline comparison throughout the study. Section 2.3 describes collection of developer patches from corresponding PRs. Table 1 compares vulnerability counts across developers, Llama, and all three agentic frameworks."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Developer patches come from the same SWE-bench issues being evaluated. The agentic frameworks (OpenHands, AutoCodeRover, HoneyComb) are described as 'top publicly available entries on the leaderboard at the time of our study' (Section 2.2.2). Llama 3.3 is described as comparable to GPT-4o."
     77       },
     78       "ablation_study": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "This is an empirical observational study analyzing security properties of existing systems, not proposing a multi-component system that could be ablated."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Multiple metrics are used: vulnerability counts per source, CWE type distributions, code-level metrics (files modified, unique file types, LOC, cyclomatic complexity), issue-level factors (type, information completeness, word count, code snippet presence), and project-level metrics (contributors, files, maintainability index)."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Step 3 of the vulnerability detection pipeline involves manual inspection: 'the first author inspected the patched file, and judged whether the tool-reported issue constituted a plausible security threat rather than a benign code smell' (Section 2.4). Issue type and bug type were manually annotated by two independent annotators with Cohen's kappa > 0.9."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No held-out validation split was used for developing or validating the vulnerability detection pipeline. The same multi-step process was applied uniformly to all instances. The SWE-bench train/test split is used but not for pipeline validation."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Extensive breakdowns are provided: by CWE type (Table 2), by patch source (Table 1), by code-level metrics for vulnerable vs. all instances (Tables 3-4), by bug type (Table 5), and by framework. Results are also broken down by SWE-bench split (train+test vs. test-only)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Multiple concrete failure examples are discussed: Llama introducing CWE-95 via eval() in a Qiskit project, Llama downgrading Argon2 to SHA-256/MD5 for password hashing, OpenHands introducing SQL injection (CWE-89) by bypassing Django's ORM (Section 3.1-3.2)."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Several hypothesized factors were found to be non-significant: sensitive file modifications (p=0.369), cyclomatic complexity of generated code (p=0.8796), word count of issues (p=0.5346), comments prior to PR (p=0.68), and all project-level factors (Section 3.3.3). These non-findings are reported transparently."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims are supported: 'Llama introduces many new vulnerabilities' (135 vs 12, Table 1), 'exhibiting unique patterns' (CWE-95/502 vs CWE-732/377, Table 2), 'agentic workflows also generate vulnerabilities, particularly when given more autonomy' (OH=44, ACR=3, HC=2, Table 1), 'associated with distinctive code characteristics' (Table 3, significant p-values)."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes causal-adjacent claims such as 'contextual factors play a critical role in the security of the generated patches' and 'conditions that result in vulnerability-inducing AI-generated patches.' The study design is observational (correlational analysis) without causal identification strategies. Confounds (e.g., oracle retrieval effects, prompt design bias) are not addressed."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title 'How Safe Are AI-Generated Patches?' and abstract language ('LLMs and their agentic frameworks') generalize beyond the tested setting of one standalone model (Llama 3.3), three agentic frameworks, Python-only code (SWE-bench), and three specific static analysis tools. While the threats section acknowledges some limitations, the framing is broader than the evidence supports."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The threats section discusses static analysis tool accuracy and scale limitations. However, alternative explanations for the main findings are not substantively discussed. For example: could oracle retrieval bias Llama toward different vulnerability patterns? Could the prompt design (requesting complete files vs. diffs) affect security? Could OH's higher count be due to the larger diff size triggering more static analysis alerts rather than genuine security differences?"
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper measures 'vulnerabilities flagged by static analysis tools after majority voting and manual validation' and claims to find 'security vulnerabilities.' The multi-step validation pipeline (Section 2.4) explicitly bridges the gap between automated detection (proxy) and actual security threats (outcome), with detailed criteria for manual inspection including 'whether the tool-reported issue constituted a plausible security threat rather than a benign code smell.'"
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Specific model versions are stated: 'Llama 3.3 Instruct (70B)' (Section 2.2.1), 'Claude-3.5-Sonnet-20241022' for both OpenHands and AutoCodeRover (Section 2.2.2). The Claude model includes a snapshot date."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper describes the prompt approach in natural language: 'We adopt the prompt structure used in SWE-bench, with slight modifications. Rather than generating diffs like SWE-bench, we explicitly instruct the model to produce complete files.' The actual prompt text is not provided in the paper."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "No generation hyperparameters are reported for Llama 3.3 (temperature, top-p, max tokens, etc.). No sampling parameters are mentioned for any model. These settings significantly affect output quality and security properties."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "The paper evaluates three third-party agentic frameworks (OpenHands, AutoCodeRover, HoneyComb) as existing systems using their publicly released patches. The authors did not build the scaffolding and cannot be expected to describe internal implementation details they have no access to."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The vulnerability detection pipeline is documented in detail (Section 2.4): static analysis with three tools, majority voting strategy, and manual inspection with specific inclusion/exclusion criteria. The patch collection process is described for each source (Sections 2.2-2.3), including how pre/post-patch file pairs were extracted."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 4 'Threats to Validity' provides a dedicated, substantive discussion of limitations spanning static analysis accuracy, model/framework selection, computational constraints, annotation scale, and inter-rater reliability."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Specific threats are discussed: 'our evaluation focused on one high-performing LLM and the top three publicly available agentic frameworks,' 'AutoCodeRover reports a cost of $0.70 per instance... meaning it would cost over $13,000 to generate patches for the full 19,000 instances,' and 'manually annotating all 20,000 SWE-bench issues is infeasible' (Section 4)."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The paper explicitly states: 'our RQ2 evaluation is not a direct comparison between agentic frameworks and Llama' (Section 2.2.2), 'we refrain from making strong statistical claims about the agentic systems' and 'we do not claim that bug-fixing issues are inherently more prone to vulnerable patches' (Section 4). They restrict quantitative conclusions to each method's evaluation data."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "A replication package is provided on figshare (reference [3]) containing research artifacts. SWE-bench is publicly available. The paper states data are 'publicly accessible through our replication package to support further research and validation of our findings.'"
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Data collection is described in detail: SWE-bench dataset (Section 2.1), Llama patch generation with oracle retrieval and prompt design (Section 2.2.1), agentic framework patch collection from public releases with modified SWE-bench harness (Section 2.2.2), developer patch collection via GitHub API (Section 2.3)."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. The data source is SWE-bench, a standard benchmark of GitHub issues. Manual annotators are part of the methodology, not study participants."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The full pipeline is documented with counts at each stage: for Llama, 1440 raw detections → 185 after majority vote → 135 after manual check. For developers, 1398 raw → 17 after majority vote → 12 after manual check. For OH, 195 after majority vote → 44 after manual check (Section 3.1-3.2, Table 1). Individual tool counts are also reported."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants, sponsors, or funding agencies."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly stated: Amirali Sajadi and Preetha Chatterjee at Drexel University, Kostadin Damevski at Virginia Commonwealth University. None are affiliated with the evaluated tools/models."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "Since funding is not disclosed, independence of the funder cannot be assessed. The authors are academic researchers not affiliated with any of the evaluated systems, but the absence of funding disclosure prevents verification."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The training data cutoff for Llama 3.3 is not stated. The training cutoff for Claude 3.5 Sonnet (used by the agentic frameworks) is also not stated. This is important because SWE-bench issues come from public GitHub repositories that are likely in the training data."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No discussion of potential train/test overlap. SWE-bench issues and their solutions are from public GitHub repositories that are very likely included in the training corpora of both Llama 3.3 and Claude 3.5 Sonnet. The paper does not address whether the models may have seen the ground-truth solutions."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "SWE-bench has been public since late 2023, and the underlying GitHub issues predate model training. No contamination analysis or mitigation is discussed. Even though the study focuses on security rather than functional correctness, contamination could affect how models generate code patterns."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in the study. Manual annotators are part of the research methodology, not study subjects."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. The study analyzes publicly available code from SWE-bench."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. The two annotators are described only as having 'at least three years of programming experience.'"
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants or experimental conditions requiring randomization."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants or experimental conditions requiring blinding."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in the study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "The paper mentions AutoCodeRover costs '$0.70 per instance' and estimates '$13,000' for the full training set, but these are cited from external sources. The paper's own generation cost for Llama 3.3 over 20,000+ instances is not quantified. Static analysis costs are also not reported."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "The paper states Llama generation 'took well over a week' but does not specify hardware (GPUs, number of nodes), total GPU hours, or other compute budget details. The computational demands of running three static analysis tools across 20,000+ instances are not quantified."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Results appear to be from single runs. No multiple random seeds or sensitivity analysis is mentioned for the Llama patch generation process."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of experimental runs (e.g., how many times Llama generated patches per issue) is never stated. It appears to be a single generation per issue with no repetition."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No hyperparameter search is reported. The prompt design and generation settings appear fixed without exploring alternatives or reporting a search budget."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The choice of Llama 3.3 is justified by benchmark performance comparisons, but the specific configuration (prompt design, generation parameters) is not justified against alternatives. The decision to generate complete files vs. diffs is justified only for reliability, not security implications."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Multiple statistical tests are performed across many metrics in RQ3 (Table 3, Section 3.3.2, Section 3.3.3) without any mention of correction for multiple comparisons (Bonferroni, Holm, etc.)."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors evaluate third-party systems rather than their own, which reduces self-comparison bias. However, they designed the vulnerability detection pipeline and manually validated results without discussing potential author bias in these subjective judgments."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The agentic frameworks differ substantially in compute usage (OH generates 500,000+ line diffs in some cases vs. smaller patches from ACR/HC). The relationship between compute budget and vulnerability counts is not analyzed."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper uses SWE-bench as its evaluation dataset but does not discuss whether SWE-bench adequately represents real-world APR scenarios for security analysis. SWE-bench is Python-only, from 12-37 popular repositories, which may not reflect the diversity of real-world vulnerability-inducing contexts."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "The paper explicitly states 'our RQ2 evaluation is not a direct comparison between agentic frameworks and Llama' (Section 2.2.2) and restricts 'all quantitative conclusions to the datasets on which each method is actually evaluated.' They avoid confounded cross-setting claims."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "SWE-bench issues are from public GitHub repositories (2016-2023 era) and both Llama 3.3 and Claude 3.5 Sonnet were trained on data likely including these repositories and their solutions. The paper does not discuss temporal leakage or its implications for the observed vulnerability patterns."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "Oracle retrieval provides Llama with the exact files modified in the developer fix, which leaks information about where the fix should be applied. While justified as isolating generation quality, the paper does not discuss whether this information leakage could affect security properties of generated patches."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Many SWE-bench instances come from the same repositories (e.g., multiple Django issues). The paper does not discuss whether instances from the same project share vulnerability patterns or whether non-independence inflates apparent correlations in RQ3."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No concrete leakage detection or prevention methods (canary strings, membership inference, n-gram overlap analysis, decontamination pipelines) are applied."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "Llama 3.3 introduces 11x more new vulnerabilities than developers: 135 in LLM-generated patches versus 12 in developer patches across the full SWE-bench dataset.",
    368       "evidence": "Table 1 shows vulnerability counts after majority voting and manual validation. Llama: 185 after majority vote → 135 after manual check. Developers: 17 → 12. (Section 3.1)",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "LLM-generated vulnerabilities exhibit distinct CWE patterns from developer vulnerabilities: eval injection (CWE-95, 39 instances) and insecure deserialization (CWE-502, 17) are far more prevalent in LLM-generated code than in developer-written patches.",
    373       "evidence": "Table 2 shows CWE type distributions. CWE-95 appears 39 times in Llama patches vs. 1 in developer patches. CWE-502 appears 17 times vs. 2. Developer patterns center on CWE-377 and CWE-732, which are absent from top LLM categories. (Section 3.1)",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "OpenHands produces disproportionately more vulnerabilities (44) than AutoCodeRover (3) or HoneyComb (2), linked to its greater autonomy over the codebase.",
    378       "evidence": "Table 1 shows vulnerability counts per framework. OH's 44 vulnerabilities came from only 11 issue instances, with 3 issues accounting for 35 of 44. OH modified 344.27 files on average for vulnerable instances vs. 11.38 for all instances. (Section 3.2)",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Vulnerable LLM patches consistently modify more files and file types than non-vulnerable patches.",
    383       "evidence": "Table 3: Llama vulnerable patches modify 1.72 files vs. 1.25 overall (p<0.001, δ=0.200). Table 4: OH vulnerable patches modify 344.27 vs. 11.38 files (p<0.001, δ=0.77). Unique file types: Llama 1.10 vs. 0.99 (p=0.007). (Section 3.3.1)",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Vulnerable Llama patches are associated with issues that lack code snippets and with issues in which expected behavior or reproduction steps are missing.",
    388       "evidence": "Only 47.4% of vulnerable issues include code snippets vs. 58.6% overall (χ²=6.44, p=0.011, φ=0.017). Only 39% of vulnerable bug issues provided expected behavior, and 64.7% had steps to reproduce. (Section 3.3.2)",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Project-level factors (repository size, cyclomatic complexity, maintainability, contributor count) do not reliably predict vulnerability rates in LLM-generated code.",
    393       "evidence": "Spearman correlations for Llama: file count ρ=0.136 (p=0.362), maintainability ρ=-0.006 (p=0.968), complexity ρ=-0.062 (p=0.679), contributors ρ=0.184 (p=0.217). Similar non-significant results across all frameworks. (Section 3.3.3)",
    394       "supported": "strong"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "Single annotator for critical validation step",
    400       "detail": "The manual vulnerability validation (Step 3, the most critical quality gate) was performed by 'the first author' alone. While issue-level annotations had two annotators with strong inter-rater agreement (κ>0.9), the security judgment of whether each detected vulnerability is a genuine threat had no independent second reviewer."
    401     },
    402     {
    403       "flag": "Extreme concentration of OH vulnerabilities",
    404       "detail": "35 of 44 confirmed OH vulnerabilities came from just 3 issues (django #14056, pytest #5787, django #14681). The OH result is effectively driven by a handful of pathological cases rather than a systematic vulnerability pattern, making quantitative claims about OH's vulnerability rate fragile."
    405     },
    406     {
    407       "flag": "No contamination or leakage analysis",
    408       "detail": "SWE-bench issues and their developer-written solutions are from public GitHub repositories almost certainly included in the training data of Llama 3.3 and Claude 3.5 Sonnet. The paper does not discuss whether models having seen ground-truth solutions could influence the types of vulnerabilities they introduce."
    409     },
    410     {
    411       "flag": "Oracle retrieval creates unrealistic advantage",
    412       "detail": "Llama receives the exact files modified in the developer fix (oracle retrieval), which is never available in realistic usage. This could systematically bias the types of vulnerabilities introduced (e.g., the model may write more code in security-sensitive files than it would with natural retrieval), making the findings less generalizable."
    413     },
    414     {
    415       "flag": "No multiple comparison correction",
    416       "detail": "Dozens of statistical tests are performed across code-level, issue-level, and project-level factors in RQ3 without any correction for multiple comparisons. Some significant p-values (e.g., unique file types p=0.007, code snippets p=0.011) may not survive Bonferroni or similar corrections."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Asleep at the keyboard? assessing the security of github copilot's code contributions",
    422       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    423       "year": 2025,
    424       "relevance": "Foundational assessment of security of LLM-generated code, specifically GitHub Copilot's code contributions."
    425     },
    426     {
    427       "title": "Do users write more insecure code with AI assistants?",
    428       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    429       "year": 2023,
    430       "relevance": "Studies whether AI coding assistants lead developers to write more insecure code through user experiments."
    431     },
    432     {
    433       "title": "Security weaknesses of copilot generated code in github",
    434       "authors": ["Yujia Fu", "Peng Liang", "Amjed Tahir", "Zengyang Li"],
    435       "year": 2023,
    436       "relevance": "Empirical analysis of security weaknesses in Copilot-generated code found in real GitHub projects."
    437     },
    438     {
    439       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    440       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    441       "year": 2024,
    442       "relevance": "The benchmark dataset used in this study — real-world GitHub issues paired with pull requests for evaluating LLM program repair."
    443     },
    444     {
    445       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    446       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    447       "year": 2024,
    448       "arxiv_id": "2407.16741",
    449       "relevance": "One of three agentic APR frameworks evaluated in the study, the top-performing entry on SWE-bench at the time."
    450     },
    451     {
    452       "title": "AutoCodeRover: Autonomous Program Improvement",
    453       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    454       "year": 2024,
    455       "doi": "10.1145/3650212.3680384",
    456       "relevance": "One of three agentic APR frameworks evaluated, featuring more constrained edit patterns than OpenHands."
    457     },
    458     {
    459       "title": "Red Teaming Program Repair Agents: When Correct Patches can Hide Vulnerabilities",
    460       "authors": ["Simin Chen", "Yixin He", "Suman Jana", "Baishakhi Ray"],
    461       "year": 2025,
    462       "arxiv_id": "2509.25894",
    463       "relevance": "Complementary red-teaming study showing APR agents can be compelled into generating vulnerable patches that still pass tests."
    464     },
    465     {
    466       "title": "Adversarial Bug Reports as a Security Risk in Language Model-Based Automated Program Repair",
    467       "authors": ["Piotr Przymus", "Andreas Happe", "Jürgen Cito"],
    468       "year": 2025,
    469       "arxiv_id": "2509.05372",
    470       "relevance": "Demonstrates that adversarial bug reports can mislead APR agents into producing insecure yet functionally correct patches."
    471     },
    472     {
    473       "title": "How secure is code generated by chatgpt?",
    474       "authors": ["Raphaël Khoury", "Anderson R Avila", "Jacob Brunelle", "Baba Mamadou Camara"],
    475       "year": 2023,
    476       "relevance": "Assessment of security properties of ChatGPT-generated code across security-sensitive tasks."
    477     },
    478     {
    479       "title": "Generate and pray: Using sallms to evaluate the security of llm generated code",
    480       "authors": ["Mohammed Latif Siddiq", "Joanna CS Santos"],
    481       "year": 2023,
    482       "relevance": "Framework for evaluating security of LLM-generated code using automated analysis tools."
    483     },
    484     {
    485       "title": "Is Your AI-Generated Code Really Safe? Evaluating Large Language Models on Secure Code Generation with CodeSecEval",
    486       "authors": ["Jiexin Wang", "Xitong Luo", "Liuwen Cao"],
    487       "year": 2024,
    488       "arxiv_id": "2407.02395",
    489       "relevance": "Evaluates LLMs on secure code generation using a dedicated security benchmark."
    490     },
    491     {
    492       "title": "Anomalicious: Automated detection of anomalous and potentially malicious commits on github",
    493       "authors": ["Danielle Gonzalez", "Thomas Zimmermann", "Patrice Godefroid", "Max Schäfer"],
    494       "year": 2021,
    495       "relevance": "Foundational work on detecting risky/anomalous code commits, which the paper's RQ3 analysis builds upon."
    496     }
    497   ],
    498   "engagement_factors": {
    499     "practical_relevance": {
    500       "score": 2,
    501       "justification": "Findings directly inform practitioners using LLM-based APR tools about when to trust outputs and when to require additional security checks."
    502     },
    503     "surprise_contrarian": {
    504       "score": 1,
    505       "justification": "That LLMs generate insecure code is known from prior work; the 11x magnitude and distinctive CWE patterns add incremental surprise but don't overturn conventional wisdom."
    506     },
    507     "fear_safety": {
    508       "score": 2,
    509       "justification": "Demonstrates concrete, quantified security risks of LLM-generated patches at scale in real-world repositories, raising concerns about automated CI/CD integration."
    510     },
    511     "drama_conflict": {
    512       "score": 1,
    513       "justification": "Shows OpenHands, a top-performing SWE-bench framework, producing far more vulnerabilities than alternatives, but presented analytically rather than provocatively."
    514     },
    515     "demo_ability": {
    516       "score": 1,
    517       "justification": "Replication package exists on figshare but no interactive tool or demo is available."
    518     },
    519     "brand_recognition": {
    520       "score": 1,
    521       "justification": "Uses well-known tools (SWE-bench, OpenHands, Llama, Claude) but authors are from academic institutions without major brand recognition."
    522     }
    523   }
    524 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs