scan.json (24254B)
1 { 2 "paper": { 3 "title": "Code Ownership in Open-Source AI Software Security", 4 "authors": ["Jiawen Wen", "Dong Yuan", "Lei Ma", "Huaming Chen"], 5 "year": 2023, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2312.10861" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "A GitHub repository is provided: https://github.com/jemjemzzZ/Code-Ownership (footnote 1, Section 2). The paper states they 'implemented a Python-based command-line application' and released it." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper describes collecting 39,323 records from NVD and GitHub repositories but does not provide a download link for the processed dataset. The raw data sources (NVD, GitHub) are public, but the curated vulnerability dataset is not released." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency versions are mentioned in the paper. The tool is described as Python-based but no setup details are provided." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology at a high level (Section 5) but does not provide commands, scripts, or a README for reproducing the experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Correlation coefficients are reported as point estimates (e.g., Spearman -0.62, Pearson 0.45) without confidence intervals or error bars in any of the results tables (Tables 2-5)." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper uses the Kolmogorov-Smirnov test with p-values (Section 6.1: 'the p-value substantially exceeds 0.05'), the Mantel test (Section 5.2, 6.1), and F-statistics for multiple linear regression models (Table 5). P-values from the Mantel test 'gravitate closely to 1' (Section 6.1)." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Correlation coefficients (Pearson, Spearman, Kendall) are reported throughout Tables 2-4 with actual values (e.g., Spearman of -0.62 for Ownership, 0.7 for Num of Contributor). Adjusted R-squared values are reported in Table 5 (e.g., 0.659, 0.553). These provide magnitude context." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The dataset contains 39,323 records across 5 projects, but there is no justification for why these 5 specific projects were chosen beyond being 'prominent deep learning projects.' No power analysis or sample size justification is discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or spread measures are reported for any results. Correlation values and regression coefficients are single-number point estimates without uncertainty ranges." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares its proposed code ownership metrics against Bird et al.'s original code ownership metrics and classic process metrics (code churn, churn rate, file size) — see Tables 3, 5, and the discussion of RQ2 in Section 6.3." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "The primary baseline is Bird et al. (2011), which is 12 years old at time of publication. 
While the paper justifies using it as the foundational work, no more recent code ownership or security metrics studies are used as baselines. Foucault et al. (2014) and Greiler et al. (2015) are discussed in related work but not directly compared against in the experiments." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper systematically examines the impact of individual metric components: different threshold definitions for minor contributors (5%, 10%, 20%, 50%) in Section 6.1, and multiple linear regression models testing different metric combinations in Table 5 (e.g., 'Per of Minor + Oss Stage Aged' showing 33% improvement in adjusted R-squared)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Three correlation methods are used (Pearson, Spearman, Kendall — Section 5.2), plus multiple similarity metrics (Min-Max Scaling, Exponential Decay, cosine similarity, K-S test) and regression metrics (adjusted R-squared, F-statistic, coefficient values) in Sections 5.2 and 6.1." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is a mining/repository study analyzing code ownership metrics against vulnerability databases. Human evaluation of outputs is not relevant to the claims." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a correlation/regression study, not a predictive modeling study. There is no train/test split or held-out evaluation applicable here." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by time stage (T1-T5), OSS stage, pre-release vs. post-release, CVE severity, and individual metrics across all five projects. Tables 2-5 provide detailed per-metric breakdowns." 
94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses cases where metrics do not show significant correlation: 'other metrics do not demonstrate any notable correlation with vulnerability' (Section 6.2), pre-/post-release classification has 'absolute values all falling below 0.1' (Section 6.2), and code ownership metrics show weak correlation with CVE severity (Table 4, values mostly below 0.12)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Several negative results are reported: code ownership metrics do not significantly correlate with CVE severity (Table 4, values below 0.12); pre-/post-release classification shows negligible correlation with code ownership (below 0.1); classic metrics lag behind ownership metrics; churn rate shows near-zero correlation with vulnerability (Table 2)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims a 'positive relationship between high-level ownership and a decrease in vulnerabilities,' which is supported by correlation results in Tables 2-3 (Ownership showing negative correlation with vulnerability, minor contributors showing positive correlation). The time metrics claim is supported by Tables 2-4." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper uses correlational language ('correlation,' 'relationship') appropriately in most places, but makes implicit causal claims: 'an increase in the number of minor contributors could lead to a heightened risk of vulnerabilities' (Section 3.2), and recommendations that managers should 'closely monitor projects' based on these correlations. The study design is purely observational with no causal identification strategy, making these causal implications unjustified." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title 'Code Ownership in Open-Source AI Software Security' implies broad applicability, but the study examines only 5 deep learning framework projects (TensorFlow, Caffe, OpenCV, Keras, PyTorch). The paper acknowledges in threats to validity that 'our findings might not represent all such software' (Section 7), but the title and abstract do not bound the claims to these specific projects." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper discusses confounding effects of file size and code churn (Section 5.2, Table 5), tests whether vulnerability occurrence rate is a distortion factor (Section 6.1), and discusses dependency modification as an alternative vulnerability vector (Section 7). The threats-to-validity section considers programming language effects and license types as potential alternative explanations." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": false, 131 "answer": false, 132 "justification": "This paper does not use any AI/ML models for inference or evaluation. It is a mining study analyzing code ownership metrics from GitHub repositories." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "No prompting or language models are used in this study." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "The minor contributor threshold is set at 10% and justified (Section 4.1, 6.1). Time stage boundaries (T1-T5) are explicitly defined in Table 1. The vulnerable-to-non-vulnerable ratio for correlation analysis is specified as 1:1 (Section 6.2). OSS stage definitions with their numerical criteria are fully specified in Table 1." 
143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used in this study." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 5.1 describes the data collection and processing pipeline: NVD and GitHub as sources, API usage for CVE details, GitHub REST APIs for commit/PR details, GitPython for commit history analysis. The dataset composition is quantified: 39,323 records total, 27,075 vulnerable, 12,248 non-vulnerable, 904 with CVE annotations. The non-vulnerable benchmark from latest TensorFlow is also described." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 7 'Threat to Validity' is a dedicated section discussing limitations across three categories: Project Attribute Limitations, Data Quality & Generalizability, and Metric Completeness." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The threats are specific to this study: programming language and license type influences may be overlooked; the 5 projects may not represent all open-source AI software; NVD CVE severity scores could affect metric validity; latest TensorFlow as non-vulnerable reference 'may not be the most suitable choice'; exclusion of complexity analysis in diverse languages (Section 7)." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "While the threats section mentions some limitations, the paper does not explicitly state what the results do NOT show or what claims the authors are NOT making. The conclusion and abstract make broad claims about 'open-source AI software' without explicitly bounding them to the 5 specific deep learning frameworks studied." 
170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "The raw vulnerability dataset (39,323 records) is not available for download. While the source data (NVD, GitHub) is public, the curated and processed dataset used in the analysis is not released, making independent verification difficult." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 5.1 describes the data collection procedure: NVD for CVE details, GitHub security advisories for commits and pull requests linked to security issues, GitHub REST APIs for extracting source files and timestamps, GitPython for complete commit history analysis. Five specific projects are named with their GitHub URLs." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants in this study. The data comes from public GitHub repositories and NVD, which are standard public sources." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline is documented in Section 5.1 and Figure 1: NVD crawler/filter → Pull Request/Commit Analysor → Time/Release Metrics + Code Ownership Metrics → Distilled Result Matrix → Correlation Matrix Heatmap. The dataset composition (39,323 total, 27,075 vulnerable, 12,248 non-vulnerable, 904 with CVE annotations) is quantified." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding disclosure or acknowledgments section mentioning grants or sponsors is present in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: University of Sydney (Wen, Yuan, Chen), University of Alberta (Ma), and University of Tokyo (Ma). 
These are academic institutions with no obvious conflict of interest with the projects studied." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": false, 207 "answer": false, 208 "justification": "No funding is disclosed. The authors are at academic institutions with no apparent financial stake in the outcome. Treating as NA/unfunded." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement or financial disclosure is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This is a mining/repository study analyzing code ownership metrics. No pre-trained AI model is evaluated on any benchmark." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "No pre-trained model is evaluated on any benchmark in this study." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "No pre-trained model is evaluated on any benchmark in this study." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study. It is a repository mining study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 
258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "This is a repository mining study. No inference cost or API calls are involved. The computational cost of running the analysis scripts is not a meaningful concern for reproducibility." 275 }, 276 "compute_budget_stated": { 277 "applies": false, 278 "answer": false, 279 "justification": "This is a repository mining and correlation study. The computational requirements are modest (GitHub API calls, correlation analysis) and do not warrant compute budget reporting." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "High-level ownership (limited minor contributors) correlates with decreased vulnerabilities in open-source AI software projects.", 286 "evidence": "Table 2 shows Ownership has negative correlation with vulnerability (Pearson -0.12, Spearman -0.1). Table 3 shows stronger negative correlation when classified by Time Stage Aged (Spearman -0.62). Minor contributor metrics show positive correlation with vulnerability (Num of Minor Spearman 0.64, Per of Minor Pearson 0.64 in Table 3).", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Time-based classification of vulnerabilities is more effective than pre-/post-release dichotomy for open-source AI projects.", 291 "evidence": "Table 3 shows Time Stage Aged classification yields correlation values up to 0.96 (Age, Spearman), while pre-/post-release classification shows values below 0.1 for code ownership metrics. 
Section 6.2 discusses this comparison explicitly.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Code ownership metrics outperform classic process metrics (code churn, file size, churn rate) in correlating with vulnerabilities.", 296 "evidence": "Table 3 shows code ownership metrics (Ownership -0.62, Num of Contributor 0.7 Spearman) vs. classic metrics (Code churn 0.49, File Size 0.25 Spearman) when classified by Time Stage Aged. Section 6.3 RQ2 discusses this finding.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Vulnerability severity increases with project lifespan and number of releases.", 301 "evidence": "Table 4 shows Days Difference has Pearson correlation of 0.45 with CVE Severity, and Release Amounts has 0.43. Post-release shows positive correlation (0.4), pre-release negative (-0.41). Section 6.2 discusses these findings for RQ3.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "The metric measurements are robust to distortion factors including vulnerability occurrence rate, minor contributor threshold, and software locality.", 306 "evidence": "Section 6.1: Mantel test p-values 'gravitate closely to 1' for vulnerability proportion changes; K-S test shows consistent distribution across thresholds (5%, 10%, 20%, 50%); Mantel test shows 0.822 correlation between file and group component matrices.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["observational"], 311 "key_findings": "This study finds that code ownership metrics, particularly the proportion of minor contributors, correlate with vulnerability presence in five open-source AI software projects (TensorFlow, Caffe, OpenCV, Keras, PyTorch). The correlation is strongest when vulnerabilities are classified by the proposed 'Time Stage Aged' metric rather than traditional pre-/post-release dichotomy. Code ownership metrics outperform classic process metrics (code churn, file size) in this correlation. 
The authors also find that vulnerability severity tends to increase with project lifespan and release count.", 312 "red_flags": [ 313 { 314 "flag": "Only 5 projects studied", 315 "detail": "The study examines only 5 deep learning framework projects (TensorFlow, Caffe, OpenCV, Keras, PyTorch) but makes claims about 'open-source AI software' broadly. These are all mature, large-scale frameworks, which may not represent the broader ecosystem of open-source AI projects." 316 }, 317 { 318 "flag": "Correlation treated as actionable evidence", 319 "detail": "The paper derives management recommendations ('post-release phases warrant enhanced scrutiny,' 'prolonged software components with a burgeoning number of minor contributors demand rigorous oversight') from purely correlational findings without establishing causation. Confounders like project maturity, code complexity, and disclosure practices could explain the observed correlations." 320 }, 321 { 322 "flag": "Non-vulnerable benchmark questionable", 323 "detail": "The non-vulnerable dataset uses only the latest TensorFlow version, assuming it 'boasts significant security measures.' This is a strong assumption — the latest version may contain undiscovered vulnerabilities, and using one project as the non-vulnerable benchmark for a study of five projects introduces systematic bias." 324 }, 325 { 326 "flag": "No confidence intervals on correlation coefficients", 327 "detail": "All correlation coefficients (Tables 2-4) are reported as point estimates without confidence intervals or standard errors, making it impossible to assess the precision of these estimates." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Don't touch my code! Examining the effects of ownership on software quality", 333 "authors": ["C. Bird", "N. Nagappan", "B. Murphy", "H. Gall", "P. Devanbu"], 334 "year": 2011, 335 "relevance": "Foundational work on code ownership metrics and their relationship to software quality, which this paper extends to open-source AI projects."
336 }, 337 { 338 "title": "Code ownership in open-source software", 339 "authors": ["M. Foucault", "J.-R. Falleri", "X. Blanc"], 340 "year": 2014, 341 "relevance": "Replication study of Bird et al. extending code ownership analysis to Java open-source projects, providing context for cross-domain ownership metric effectiveness." 342 }, 343 { 344 "title": "Code Ownership and Software Quality: A Replication Study", 345 "authors": ["M. Greiler", "K. Herzig", "J. Czerwonka"], 346 "year": 2015, 347 "relevance": "Replication study within Microsoft examining code ownership at file and directory levels, relevant to understanding ownership-quality relationships in large software projects." 348 }, 349 { 350 "title": "SoK: Taxonomy of attacks on open-source software supply chains", 351 "authors": ["P. Ladisa", "H. Plate", "M. Martinez", "O. Barais"], 352 "year": 2023, 353 "relevance": "Comprehensive taxonomy of supply chain attacks on open-source software, directly relevant to understanding security threats that code ownership metrics might help detect." 354 }, 355 { 356 "title": "Do developers update their library dependencies? - an empirical study on the impact of security advisories on library migration", 357 "authors": ["R. G. Kula", "D. M. German", "A. Ouni", "T. Ishio", "K. Inoue"], 358 "year": 2018, 359 "relevance": "Empirical study on dependency management and security in open-source software, relevant to understanding developer behavior and vulnerability patterns." 360 }, 361 { 362 "title": "Use of relative code churn measures to predict system defect density", 363 "authors": ["N. Nagappan", "T. Ball"], 364 "year": 2005, 365 "relevance": "Classic work on code churn metrics for defect prediction, which serves as a baseline metric category in this study's comparison."
366 }, 367 { 368 "title": "When a patch goes bad: Exploring the properties of vulnerability-contributing commits", 369 "authors": ["Andrew Meneely", "Harshavardhan Srinivasan", "Ayemi Musa", "Alberto Rodriguez-Tejeda", "Matthew Mokary", "Brian Spates"], 370 "year": 2013, 371 "relevance": "Empirical study examining commit properties associated with vulnerability introduction, relevant to understanding how developer contributions relate to security." 372 } 373 ] 374 }