ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (21096B)


      1 {
      2   "paper": {
      3     "title": "Security Weaknesses of Copilot-Generated Code in GitHub Projects: An Empirical Study",
      4     "authors": ["Yujia Fu", "Peng Liang", "Amjed Tahir", "Zengyang Li", "Mojtaba Shahin", "Jiaxin Yu", "Jinfu Chen"],
      5     "year": 2025,
      6     "venue": "ACM Transactions on Software Engineering and Methodology",
      7     "arxiv_id": "2310.02059",
      8     "doi": "10.1145/nnnnnnn.nnnnnnn"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No source code repository is provided. The paper releases a dataset via Zenodo (reference [21]) but no analysis scripts or tools."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "A curated dataset is made available on Zenodo (doi:10.5281/zenodo.10802054), referenced as [21], containing collected code snippets, analysis results, and filtered results."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided. The paper mentions tools (CodeQL, Bandit, ESLint) but not their versions or environment configuration."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided. The methodology is described in prose but there are no scripts or README with commands to replicate the analysis."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Results are reported as point estimates (e.g., '29.5% of Python snippets') with no confidence intervals or error bars."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes comparative observations (e.g., Python vs JavaScript rates, fix rates across prompts) but uses no statistical significance tests."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Only raw percentages are reported. No formal effect sizes (Cohen's d, odds ratios) are provided for any comparisons."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The sample of 733 snippets is the result of filtering but no power analysis or justification for sample size adequacy is provided."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Standard deviations are reported for descriptive statistics like LoC (std dev 390.83), stars, forks, and security weaknesses per snippet (std dev 3.91)."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares against prior work (Pearce et al. [55], Majdinasab et al. [43]) and uses the CWE Top-25 as a baseline for prevalence. Three prompt strategies are compared for RQ3."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Comparisons reference recent work (Pearce et al. 2022, Majdinasab et al. 2024) and use the 2023 CWE Top-25 list."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "RQ3 tests three progressively detailed prompts (/fix, basic prompt, enhanced prompt), functioning as an ablation of prompt information on fix effectiveness."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple metrics are reported: percentage of snippets with weaknesses, number of CWE types, fix rates, per-CWE fix rates, distribution across application domains."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Two authors manually filtered results, verified whether security weaknesses were genuine (not false positives), and whether they were from AI-generated code. Cohen's Kappa reported (0.82-0.85)."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is a mining/empirical study, not a model training study. There is no train/test split concept."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by language (Python vs JavaScript), by CWE type (43 categories in Table 5), by application domain (Table 3, Figures 14-15), and by prompt type (Table 7)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper discusses CWEs that Copilot Chat failed to fix (e.g., CWE-78 OS Command Injection at 0% fix rate, CWE-94 Code Injection below 20%) in Section 5.1 and Figure 17."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that Copilot Chat cannot fix certain CWEs (CWE-78, CWE-284 at 0% fix rate with enhanced prompt), and that /fix command only fixes 19.3%."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Abstract claims (29.5% Python, 24.2% JavaScript affected; 43 CWEs; 8 in Top-25; up to 55.5% fixed) are all supported by Tables 4, 5, and 7."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper makes implicit causal claims linking security weaknesses to code generation tools, but the study cannot establish causality — the code context and developer modifications are confounds. The paper speculates about causes (e.g., 'This may be because...') without adequate causal identification."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title says 'GitHub Projects' which is appropriate, but the paper's implications sections make broad recommendations for 'developers using Copilot' without sufficiently bounding to the specific sample (keyword-searchable, mostly low-popularity projects, Python/JavaScript only)."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The threats to validity section discusses methodological limitations (keyword search coverage, false positives) but does not substantively discuss alternative explanations for the observed weakness rates — e.g., whether the weaknesses reflect developer context rather than tool behavior, or whether low-popularity projects have systematically different code patterns."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper refers to 'GitHub Copilot', 'CodeWhisperer', and 'Codeium' without specifying versions. Copilot Chat is described as 'based on the GPT-4 model' without a version or snapshot date."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The three prompts used for RQ3 are provided verbatim in Section 3.4: the /fix command, basic prompt, and enhanced prompt with the template for warning messages."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No temperature, top-p, or other API parameters are reported for Copilot Chat interactions."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "The paper evaluates Copilot and Copilot Chat as black-box third-party tools; the authors cannot be expected to describe their internal scaffolding."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The filtering pipeline is documented in detail: 9,927 initial search results → 3,589 after dedup → manual filtering with inclusion/exclusion criteria → 733 code snippets. Filtering criteria are explicitly stated (Section 3.2.2)."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 6 'Threats to Validity' provides substantive discussion of construct, external, and reliability threats."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Specific threats are discussed: keyword-based search may miss code snippets, manual filtering introduces personal bias, static analysis tools cannot scan all CWEs, dataset skewed toward Game projects."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what the results do NOT show. The external validity section mentions limitations but does not clearly bound claims — e.g., it does not state 'our results do not show the security of Copilot-generated code in enterprise settings' or similar explicit scope boundaries."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The complete dataset including code snippets, full analysis results, and filtered results is provided via Zenodo (reference [21], doi:10.5281/zenodo.10802054)."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 3.2 describes data collection in detail: GitHub REST API search, keyword combinations, and language filters; the time period is only implicit, bounded by Copilot's launch (2021)."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants. Data source is public GitHub repositories mined via API search."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The full pipeline is documented with counts at each stage: 9,927 search results → 3,589 after dedup → 733 code snippets after manual filtering → 200 with security weaknesses → 628 security issues. Figure 3 provides an overview."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Acknowledgments section states funding from NSFC under Grant No. 62172311 and Hubei Province under Grant No. 2024BAA008."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All authors' affiliations are listed (Wuhan University, Massey University, Central China Normal University, RMIT University). None are affiliated with GitHub/Microsoft, Amazon, or Codeium."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Funding is from NSFC (Chinese government science foundation) and Hubei Province — neither has a financial interest in the outcome of evaluating Copilot's security."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "This study mines GitHub repositories for already-generated code snippets and analyzes them with static analysis tools. It does not evaluate a model's capability on a benchmark."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "Not a benchmark evaluation study. The study analyzes existing code with static analysis tools."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "Not a benchmark evaluation study."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants. This is a mining study of public GitHub repositories."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper uses Copilot Chat to fix code (RQ3) but does not report API costs, tokens consumed, or wall-clock time."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No mention of computational budget, hardware used, or total time for static analysis or Copilot Chat interactions."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "29.5% of Python and 24.2% of JavaScript code snippets generated by AI code generation tools contain security weaknesses.",
    287       "evidence": "Table 4 shows 124/419 Python and 76/314 JavaScript snippets with weaknesses, from static analysis with CodeQL + Bandit/ESLint and manual false positive filtering (Section 4.1).",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "Security weaknesses span 43 different CWE categories, with CWE-330 (Use of Insufficiently Random Values) being the most frequent at 18.15%.",
    292       "evidence": "Table 5 provides the full distribution of 628 weaknesses across 43 CWEs (Section 4.2).",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Eight of the identified CWEs belong to the 2023 CWE Top-25, covering 37.1% of all identified weaknesses.",
    297       "evidence": "Section 4.2 compares findings against the MITRE 2023 CWE Top-25 list, finding 233/628 weaknesses match Top-25 CWEs.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Enhanced prompts with static analysis warning messages allow Copilot Chat to fix 55.5% of security weaknesses, compared to 19.3% with the /fix command.",
    302       "evidence": "Table 7 shows fix rates: /fix 19.3%, basic prompt 31.8%, enhanced prompt 55.5%, tested on 295 security weaknesses in 90 code snippets (Section 4.3).",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Copilot Chat is unable to fix certain CWEs such as CWE-78 (OS Command Injection) and CWE-284 (Improper Access Control).",
    307       "evidence": "Figure 17 heatmap shows 0% fix rate for CWE-78 and CWE-284 across all three prompt types.",
    308       "supported": "strong"
    309     }
    310   ],
    311   "methodology_tags": ["observational", "case-study"],
    312   "key_findings": "An empirical analysis of 733 AI-generated code snippets from GitHub projects found that ~27% contain security weaknesses spanning 43 CWE types, with CWE-330 (Insufficiently Random Values), CWE-94 (Code Injection), and CWE-79 (XSS) being most prevalent. Eight CWEs overlap with the 2023 CWE Top-25. Copilot Chat can fix up to 55.5% of identified security weaknesses when provided with static analysis warning messages, but its effectiveness varies substantially across CWE types, with some (e.g., OS Command Injection) remaining unfixable.",
    313   "red_flags": [
    314     {
    315       "flag": "No statistical significance tests",
    316       "detail": "Comparisons between languages, application domains, and prompt strategies are made without any statistical tests, making it unclear whether observed differences are meaningful or due to chance."
    317     },
    318     {
    319       "flag": "Selection bias in data collection",
    320       "detail": "Code snippets are found via keyword search for comments mentioning Copilot. This selects for developers who explicitly annotate AI-generated code, which may not be representative of typical Copilot usage. Most projects have 0 stars/forks (median), suggesting predominantly personal/learning projects."
    321     },
    322     {
    323       "flag": "Static analysis limitations underreported",
    324       "detail": "The study relies entirely on static analysis tools which have known false positive and false negative rates. While false positives were manually filtered, false negatives (missed vulnerabilities) are not quantified, meaning the reported weakness rates are likely lower bounds."
    325     },
    326     {
    327       "flag": "Confounded attribution",
    328       "detail": "For Code label results, determining whether a security weakness is in AI-generated code relies on proximity to comments mentioning Copilot. The surrounding human-written code context could contribute to or mitigate vulnerabilities, making attribution to the AI tool imprecise."
    329     }
    330   ],
    331   "cited_papers": [
    332     {
    333       "title": "Asleep at the keyboard? assessing the security of github copilot's code contributions",
    334       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    335       "year": 2022,
    336       "relevance": "Foundational study evaluating Copilot's security using crafted scenarios, direct predecessor to this work."
    337     },
    338     {
    339       "title": "Do users write more insecure code with AI assistants?",
    340       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    341       "year": 2023,
    342       "relevance": "User study examining whether AI coding assistants increase security vulnerability rates in developer-written code."
    343     },
    344     {
    345       "title": "Is GitHub's Copilot as Bad as Humans at Introducing Vulnerabilities in Code?",
    346       "authors": ["Owura Asare", "Meiyappan Nagappan", "N. Asokan"],
    347       "year": 2023,
    348       "relevance": "Comparative study of Copilot vs human developers in introducing code vulnerabilities."
    349     },
    350     {
    351       "title": "Github copilot ai pair programmer: Asset or liability?",
    352       "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab", "Amin Nikanjam", "Foutse Khomh", "Michel C Desmarais", "Zhen Ming Jack Jiang"],
    353       "year": 2023,
    354       "relevance": "Evaluates Copilot's code quality including correctness and security aspects."
    355     },
    356     {
    357       "title": "Assessing the Security of GitHub Copilot Generated Code - A Targeted Replication Study",
    358       "authors": ["Vahid Majdinasab", "Michael Joshua Bishop", "Shawn Rasheed", "Arghavan Moradidakhel", "Amjed Tahir", "Foutse Khomh"],
    359       "year": 2024,
    360       "relevance": "Replication study of Pearce et al.'s Copilot security evaluation with updated methodology."
    361     },
    362     {
    363       "title": "Evaluating large language models trained on code",
    364       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    365       "year": 2021,
    366       "arxiv_id": "2107.03374",
    367       "relevance": "Introduces Codex and HumanEval benchmark, the foundation model behind Copilot."
    368     },
    369     {
    370       "title": "Refining chatgpt-generated code: Characterizing and mitigating code quality issues",
    371       "authors": ["Yue Liu", "Thanh Le-Cong", "Ratnadira Widyasari", "Chakkrit Tantithamthavorn", "Li Li", "Xuan-Bach D Le", "David Lo"],
    372       "year": 2024,
    373       "relevance": "Studies quality issues in LLM-generated code and mitigation strategies."
    374     },
    375     {
    376       "title": "AI Code Generators for Security: Friend or Foe?",
    377       "authors": ["Roberto Natella", "Pietro Liguori", "Cristina Improta", "Bojan Cukic", "Domenico Cotroneo"],
    378       "year": 2024,
    379       "relevance": "Evaluates security of code from multiple LLM-based code generators including Copilot and CodeWhisperer."
    380     },
    381     {
    382       "title": "How secure is code generated by chatgpt?",
    383       "authors": ["Raphaël Khoury", "Anderson R Avila", "Jacob Brunelle", "Baba Mamadou Camara"],
    384       "year": 2023,
    385       "relevance": "Evaluates security of ChatGPT-generated code, complementary to Copilot security studies."
    386     },
    387     {
    388       "title": "Instruction Tuning for Secure Code Generation",
    389       "authors": ["Jingxuan He", "Mark Vero", "Gabriela Krasnopolska", "Martin Vechev"],
    390       "year": 2024,
    391       "arxiv_id": "2402.09497",
    392       "relevance": "Proposes instruction tuning to jointly optimize security and utility in code generation."
    393     }
    394   ]
    395 }

Impressum · Datenschutz