ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27515B)


      1 {
      2   "paper": {
      3     "title": "Will It Survive? Deciphering the Fate of AI-Generated Code in Open Source",
      4     "authors": ["Musfiqur Rahman", "Emad Shihab"],
      5     "year": 2026,
      6     "venue": "EASE 2026",
      7     "arxiv_id": "2601.16809",
      8     "doi": null
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The paper provides a replication package URL in the Data Availability section: https://anonymous.4open.science/r/agentic-code-survival_replication_package-B5DB, which includes data and analysis scripts."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The replication package is stated to include data. Additionally, the study uses the publicly available AIDev dataset [30]. The Data Availability section states 'Our replication package, which includes data and analysis scripts, can be found here.'"
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No mention of environment specifications, requirements.txt, Dockerfile, or library versions anywhere in the paper. The tools used (scikit-learn, SMOTE, XGBoost, etc.) are mentioned by name but without version information."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "While a replication package URL is provided, the paper itself does not contain step-by-step reproduction instructions. The methodology is described but there are no specific commands or README instructions for reproducing results."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Confidence intervals are reported throughout: Cox regression 95% CI [0.833, 0.852] in Table 4, Kaplan-Meier curves with 95% confidence interval shading in Figure 1, and 95% CIs for all RQ3 metrics in Tables 8 and 9 (e.g., AUC-ROC 0.671 [0.663–0.679])."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Multiple significance tests are used: log-rank test for survival differences (p < 0.001), chi-square test of independence for modification intent distributions (chi-squared = 1739.17, p < 0.001), Cox regression p-values, and Scott-Knott ESD test for model comparison."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Effect sizes are reported with context: hazard ratio HR = 0.842 (16% lower hazard), Cramer's V = 0.116 for modification intent differences, percentage-point differences (e.g., 15.4 pp survival advantage), Cliff's delta used in Scott-Knott ESD test, and AUC-ROC improvements over baselines (e.g., 34.2% above random)."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No power analysis or explicit justification for the sample size of 201 repositories or 5,171 PRs. The dataset size is determined by filtering the AIDev dataset rather than by statistical power requirements."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The study uses 30 repetitions of 10-fold cross-validation (300 performance estimates) and reports 95% confidence intervals for all predictive metrics (Tables 8 and 9), which capture variance across runs."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Human-authored code serves as the baseline throughout for survival analysis (RQ1, RQ2). For RQ3, random baselines are explicitly stated: AUC-ROC = 0.5, AUC-PR = 0.83 (prevalence), F1 = 0.624 for RQ3a; Macro F1 = 0.250 (1/4 classes) for RQ3b."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The human-authored code baseline is drawn from the same repositories and time period as the agent-authored code, making it inherently contemporary. The model tournament includes contemporary classifiers (XGBoost, CatBoost, MLP) compared via Scott-Knott ESD."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No ablation study is performed. For RQ3a, the BoW feature engineering choices (max_features, min_df, max_df, SyntaxToken filtering) are described but not ablated. For RQ3b, AutoSpearman selects features but individual feature contributions are not ablated, only ranked by importance."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple metrics are used: for survival analysis, death rate, hazard ratio, and log-rank p-value; for RQ3a, AUC-ROC, AUC-PR, and F1; for RQ3b, Macro F1, Weighted F1, and AUC-ROC; for RQ2, chi-square, Cramer's V, and standardized residuals."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No human evaluation of any system output. The commit intent classification relies entirely on keyword matching. The paper mentions 'we validated a subset manually to ensure accuracy' in threats to validity (Section 7), but this manual check is not described in detail and no inter-rater reliability is reported."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The study uses Repeated Group K-Fold Cross-Validation (30 repetitions of 10-fold CV) with repository slug as the grouping variable, ensuring all observations from a repository appear exclusively in either training or test folds. This prevents data leakage."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by agent type (Table 5: line-level survival by agent; Table 7: corrective rate by agent), by granularity level (file vs. line in Tables 3-4), and by modification intent category (Table 6)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Sections 5.2.3-5.2.4 discuss false-positive cases in detail with a concrete example (Figure 2, bottom): the glaredb/glaredb false positive, where generic Rust tokens caused an incorrect prediction. The vocabulary ambiguity limitation is explicitly discussed as the 'primary failure mode.'"
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Several negative results are reported: file-level Cox regression is not significant (HR = 1.038, p = 0.052, Table 4); temporal prediction achieves only Macro F1 = 0.285, described as 'challenging'; the proportional hazards assumption is violated; line coverage does not distinguish correct from incorrect predictions (p = 0.23)."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Abstract claims match results: '15.8 percentage-point lower modification rate' corresponds to Table 3 (15.4 pp raw death-rate gap; the 15.8 figure follows from the Cox hazard ratio, 1 - 0.842 = 0.158); 'HR = 0.842, p < 0.001' matches Table 4; corrective rates '26.3% vs. 23.0%' match Table 6; 'Cramer's V = 0.116' matches Section 4.3.2; AUC-ROC 0.671 and Macro F1 0.285 match Tables 8-9."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper is careful about causal language. It uses 'is associated with' rather than 'causes' for the Cox regression. The survival advantage is described as a statistical finding, and the ownership hypothesis is explicitly labeled a 'hypothesis' that 'remains untested' (Section 9). The Cox model controls for PR churn, files changed, stars, and contributors. The language 'we hypothesize' is used for explanatory mechanisms."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 7 (Threats to Validity, External Validity) states the dataset 'may not generalize to closed-source enterprise environments.' The conclusion recommends 'replicating this analysis in closed-source enterprise environments.' The study is explicitly scoped to 201 open-source projects and five specific AI agents from 2024-2025."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper discusses multiple alternative explanations: the code ownership hypothesis ('developers are reluctant to modify code they did not author' - Section 6.1), that survival may not imply robustness ('Code may persist because developers tend not to edit code they did not write' - Section 3.4), and that Devin's higher death rate may be due to more 'experimental code requiring subsequent refinement' (Section 3.3.3)."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The AI agents are named (GitHub Copilot, OpenAI Codex, Cursor, Devin, Claude Code) but no specific version numbers or time periods of these tools are stated. For the ML models (XGBoost, Logistic Regression, etc.), no library versions are given."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "This is a mining/observational study that does not use prompting. The study analyzes existing code from repositories; no LLM prompting is involved in the methodology."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Key hyperparameters are reported: CountVectorizer configured with max_features=1000, min_df=5, max_df=0.90 (Section 5.2.2); AutoSpearman threshold |rho| > 0.7 and VIF > 5 (Section 5.3.2); SMOTE for class imbalance; Scott-Knott ESD alpha = 0.05; 30 repetitions of 10-fold CV."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used in this study. It is a mining/observational study analyzing existing code repositories."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The filtering pipeline is documented in detail in Section 2.1.2 with five stages: cohort identification, license filter, repository state filter, Q1 removal, and code ratio CI filter. The final cohort size (201 repos, 5,171 PRs) is stated. SyntaxToken filtering reducing tokens by 66.8% is described. AutoSpearman feature selection is documented."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 7 'Threats to Validity' is a dedicated section addressing construct, internal, and external validity threats across three substantive paragraphs."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Threats are specific to this study: keyword matching for commit classification has ~60% accuracy (citing prior work [29]); Cox proportional hazards assumption violated (Schoenfeld residuals p < 0.005); unobserved variables like developer experience could influence survival; 201 open-source projects may not generalize to enterprise; AI agents are rapidly evolving."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Multiple specific scope boundaries: results may not generalize to 'closed-source enterprise environments'; 'survival characteristics observed in 2024-2025 may not reflect future model iterations'; the ownership hypothesis 'remains untested'; Section 3.4 states 'survival does not imply robustness'; Section 6.4 explicitly cautions 'do not equate longevity with robustness.'"
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The replication package includes data (stated in Data Availability section). The source AIDev dataset [30] is publicly available. The anonymous repository URL is provided."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 2.1 describes the data source (AIDev dataset), filtering criteria (5 steps in Section 2.1.2), and final cohort statistics (Section 2.1.3). The survival operationalization is described in Section 2.2 with definitions for birth, death, and censoring events."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants. This is a mining study analyzing code from public open-source repositories. The data source is the AIDev dataset, which is a standard public dataset."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The pipeline is documented: AIDev dataset (932,791 PRs) → cohort identification → license filter → repository state filter → Q1 removal → code ratio CI filter → final cohort (201 repos, 5,171 PRs). Table 2 provides survival event counts (15,990 files, 210,184 lines). However, exact counts at each intermediate filtering stage are not reported."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding source or acknowledgments section is visible in the paper. There is no mention of grants, sponsors, or funding agencies."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Both authors are listed as affiliated with Concordia University, Montreal, Canada. Neither author appears to be affiliated with any of the AI agent companies evaluated (GitHub, OpenAI, Anthropic, Cognition)."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding information is disclosed, so independence of funder cannot be assessed. The absence of a funding disclosure does not mean the work is unfunded."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "This is a mining study that does not evaluate any pre-trained model's capability on a benchmark. The ML classifiers (XGBoost, Logistic Regression) are trained by the authors on their own dataset, not evaluated for pre-existing knowledge."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This is a mining study, not a benchmark evaluation of a pre-trained model. The authors' own classifiers use group k-fold CV to prevent leakage, which is an evaluation design concern rather than a contamination concern."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "This is a mining study that does not evaluate any pre-trained model on an existing benchmark. The classifiers are trained and tested on the authors' own dataset."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants. This is a mining software repositories study analyzing code from public open-source projects."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants. The study analyzes publicly available code repositories."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants. The study analyzes code units, not people."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants. Repository inclusion/exclusion criteria are discussed under data preprocessing."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants and no experimental treatment assignment. This is an observational mining study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants and no experimental conditions requiring blinding."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants. Right-censoring in the survival analysis is reported (Table 2) but is a statistical concept, not participant attrition."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No mention of computational cost for the survival analysis, classification experiments, or LIME explanations. The 300 cross-validation runs and LIME per-file explanations likely required non-trivial compute, but this is not quantified."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No mention of hardware used, wall-clock time, or total computational budget for any part of the analysis."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Agent-authored code survives significantly longer than human-authored code at the line level, with a 15.4 percentage-point lower death rate (53.9% vs 69.3%) and 16% lower hazard of modification (HR = 0.842, p < 0.001).",
    287       "evidence": "Table 3 shows death rates, Table 4 shows Cox regression HR = 0.842 [0.833, 0.852], and Figure 1 shows Kaplan-Meier curves with 95% CIs (Section 3.3).",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "The survival advantage varies substantially by tool type: copilot-style assistants show 20-30pp lower death rates than human code, while Devin shows slightly higher death rates (+2.4pp).",
    292       "evidence": "Table 5 provides per-agent death rates: Cursor 38.7%, Claude Code 41.0%, OpenAI Codex 48.5%, GitHub Copilot 48.6%, Human 69.3%, Devin 71.7% (Section 3.3.3).",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Agent-authored code shows modestly elevated corrective modification rates (26.3% vs 23.0%) with small effect size (Cramer's V = 0.116), and per-agent variation exceeds the agent-human gap.",
    297       "evidence": "Table 6 shows intent distributions; chi-square test confirms significance (chi-squared = 1739.17, p < 0.001) but Cramer's V = 0.116 indicates small effect. Table 7 shows per-agent corrective rates ranging from 13.8% (Cursor) to 44.4% (Claude Code) (Sections 4.3.1-4.3.4).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Textual features can identify modification-prone code (AUC-ROC = 0.671, 34.2% above random baseline), but predicting when modifications occur remains challenging (Macro F1 = 0.285, 14% above random).",
    302       "evidence": "Table 8 shows RQ3a performance with 95% CIs from 30x10 CV. Table 9 shows RQ3b performance. Both use group k-fold CV with repository as grouping variable (Sections 5.2-5.3).",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "The survival advantage may be partly explained by code ownership dynamics: developers are reluctant to modify code they did not author, and agent-generated code lacks a clear human owner.",
    307       "evidence": "This is presented as a hypothesis in Section 6.1, citing Bird et al. [10] and Greiler et al. [19] on code ownership effects. The paper explicitly acknowledges this 'remains untested' (Section 9).",
    308       "supported": "weak"
    309     },
    310     {
    311       "claim": "File-level analysis is confounded by mixed authorship, making line-level the more appropriate granularity. File-level Cox regression is not statistically significant (HR = 1.038, p = 0.052).",
    312       "evidence": "Table 4 shows file-level HR = 1.038, p = 0.052 vs line-level HR = 0.842, p < 0.001. Section 2.2.2 explains the mixed authorship confound (Sections 2.2.2, 3.3.2).",
    313       "supported": "strong"
    314     }
    315   ],
    316   "methodology_tags": ["observational", "benchmark-eval"],
    317   "key_findings": "Agent-authored code in open-source repositories survives significantly longer than human-authored code at the line level (HR = 0.842, p < 0.001), contradicting the 'disposable code' hypothesis. However, modification profiles differ: agent code shows slightly elevated corrective rates while human code shows higher adaptive rates, with small effect sizes (Cramer's V = 0.116) and per-agent variation exceeding the agent-human gap. Textual features can partially predict which code will be modified (AUC-ROC = 0.671), but predicting when modifications occur resists static analysis (Macro F1 = 0.285), suggesting code fate depends on organizational dynamics rather than inherent code quality.",
    318   "red_flags": [
    319     {
    320       "flag": "Keyword-based intent classification",
    321       "detail": "Modification intent is classified using keyword matching on commit messages, which the paper acknowledges has approximately 60% accuracy (citing [29]). While the large sample size (n=129,484) may mitigate random misclassification, systematic biases in keyword matching (e.g., AI-related commits using different vocabulary) could introduce differential misclassification between agent and human code."
    322     },
    323     {
    324       "flag": "Proportional hazards assumption violated",
    325       "detail": "Schoenfeld residuals show significant violations of the proportional hazards assumption for all covariates (p < 0.005) in the Cox regression. The authors acknowledge this and present Cox results as 'average effects,' but this means the hazard ratio may not be constant over time, potentially masking important temporal dynamics."
    326     },
    327     {
    328       "flag": "Anonymized replication package",
    329       "detail": "The replication package URL uses anonymous.4open.science, which is an anonymous review service. It is unclear whether this will be updated to a permanent, accessible URL upon publication. The reproducibility claim depends on this package being available and complete."
    330     },
    331     {
    332       "flag": "Survival may conflate neglect with quality",
    333       "detail": "The paper acknowledges but does not resolve that 'survival' (lack of modification) could indicate either robust code or neglected code that no one dares to touch. The ownership hypothesis is untested, leaving the causal mechanism behind the survival advantage ambiguous."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "Evaluating large language models trained on code",
    339       "authors": ["Mark Chen"],
    340       "year": 2021,
    341       "arxiv_id": "2107.03374",
    342       "relevance": "Established Pass@k as the standard metric for evaluating code generation, which this paper argues is insufficient for assessing long-term code viability."
    343     },
    344     {
    345       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    346       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    347       "year": 2023,
    348       "arxiv_id": "2310.06770",
    349       "relevance": "Major benchmark for evaluating LLM coding agents on real-world GitHub issues, directly relevant to agentic code generation evaluation."
    350     },
    351     {
    352       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    353       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    354       "year": 2025,
    355       "relevance": "Found that Copilot frequently introduces security vulnerabilities, directly relevant to AI code quality assessment."
    356     },
    357     {
    358       "title": "Where Do AI Coding Agents Fail? An Empirical Study of Failed Agentic Pull Requests in GitHub",
    359       "authors": ["Ramtin Ehsani", "Sakshi Pathak", "Shriya Rawal", "Abdullah Al Mujahid", "Mia Mohammad Imran", "Preetha Chatterjee"],
    360       "year": 2026,
    361       "arxiv_id": "2601.15195",
    362       "relevance": "Studies 33k agent-authored PRs on GitHub to understand failure modes of AI coding agents, complementary to this paper's post-merge analysis."
    363     },
    364     {
    365       "title": "The rise of AI teammates in software engineering (SE) 3.0: How autonomous coding agents are reshaping software engineering",
    366       "authors": ["Hao Li", "Haoxiang Zhang", "Ahmed E. Hassan"],
    367       "year": 2025,
    368       "relevance": "Source of the AIDev dataset used in this study, a large-scale collection of agent-authored PRs from real-world GitHub repositories."
    369     },
    370     {
    371       "title": "Evaluating the code quality of AI-assisted code generation tools: An empirical study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT",
    372       "authors": ["Burak Yetiştiren", "Işık Özsoy", "Miray Ayerdem", "Eray Tüzün"],
    373       "year": 2023,
    374       "arxiv_id": "2304.10778",
    375       "relevance": "Empirical comparison of AI code generation tool quality, relevant to AI coding tool evaluation methodology."
    376     },
    377     {
    378       "title": "Self-admitted GenAI usage in open-source software",
    379       "authors": ["Tao Xiao", "Youmei Fan", "Fabio Calefato", "Christoph Treude", "Raula Gaikovina Kula", "Hideaki Hata", "Sebastian Baltes"],
    380       "year": 2025,
    381       "relevance": "Studies self-reported AI usage in open-source projects and provides repository filtering criteria adapted for this study."
    382     },
    383     {
    384       "title": "Is GitHub's Copilot as bad as humans at introducing vulnerabilities in code?",
    385       "authors": ["Owura Asare", "Meiyappan Nagappan", "Nirmal Asokan"],
    386       "year": 2023,
    387       "relevance": "Compares vulnerability introduction rates between Copilot and human code, directly relevant to AI code quality assessment."
    388     },
    389     {
    390       "title": "Lost at C: A user study on the security implications of large language model code assistants",
    391       "authors": ["Gustavo Sandoval", "Hammond Pearce", "Teo Nys", "Ramesh Karri", "Siddharth Garg", "Brendan Dolan-Gavitt"],
    392       "year": 2023,
    393       "relevance": "User study examining security of LLM-generated code, relevant to understanding risks of AI coding tools."
    394     },
    395     {
    396       "title": "Beyond Synthetic Benchmarks: Evaluating LLM Performance on Real-World Class-Level Code Generation",
    397       "authors": ["Musfiqur Rahman", "SayedHassan Khatoonabadi", "Emad Shihab"],
    398       "year": 2025,
    399       "relevance": "Evaluates LLM code generation on real-world tasks beyond synthetic benchmarks, directly relevant to AI code generation evaluation."
    400     },
    401     {
    402       "title": "Coding on Copilot: 2023 Data Suggests Downward Pressure on Code Quality",
    403       "authors": ["GitClear"],
    404       "year": 2023,
    405       "relevance": "Industry analysis finding doubled code churn coinciding with AI adoption, which this paper's findings directly challenge."
    406     }
    407   ]
    408 }

Impressum · Datenschutz