scan.json (24280B)
1 { 2 "paper": { 3 "title": "Does Co-Development with AI Assistants Lead to More Maintainable Code? A Registered Report", 4 "authors": [ 5 "Markus Borg", 6 "Dave Hewett", 7 "Donald Graham", 8 "Noric Couderc", 9 "Emma Söderberg", 10 "Luke Church", 11 "Dave Farley" 12 ], 13 "year": 2024, 14 "venue": "arXiv", 15 "arxiv_id": "2408.10758", 16 "doi": null 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "A replication package is available on Zenodo (https://zenodo.org/records/12827815), referenced in Section VII and [17]. The code base and tasks are shared as PDF documents on Zenodo, though the actual code is not in a public git repository to prevent leakage to AI training data." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "This is a registered report describing a planned study. No experimental data has been collected yet, so no data can be released. The replication package contains the study materials (task descriptions, code base) but not experimental results data." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions Java/Spring Boot and references the snapcode.review platform, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": true, 38 "justification": "The paper describes the study execution plan in detail (Section VIII) and provides a Zenodo replication package [17] with task instructions and the code base. As a registered report, the instructions are for replicating the study protocol rather than computational results." 
39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": true, 45 "justification": "Section VIII states: 'We will perform standard t-tests and report effect sizes and confidence intervals in accordance with the empirical standard.' While this is a planned analysis (no results yet), the pre-registered commitment to report CIs is documented." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": true, 50 "justification": "Section V specifies t-tests at α = 0.05 for normally distributed variables and rank-sum tests for ordinal (Likert) data. Four null hypotheses (H01–H04) are formally stated." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section V states: 'We will complement the p-values by reporting effect sizes.' Section VIII reiterates: 'report effect sizes and confidence intervals.' The power analysis uses d=0.5 (medium effect size). Pre-registered commitment to report effect sizes." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": true, 60 "justification": "Section VI includes a formal power analysis: 'We used G*Power (v.3.1.9.7) to run a power analysis for a two-tailed t-test... Based on our expectation of medium effect sizes (d=0.5), α = 0.05, and a power of 0.80, the result indicates that we need at least 64 participants per group.' Total target: 256." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No results are reported in this registered report, and the pre-registration does not explicitly commit to reporting variance or standard deviations across observations, though this would be expected as part of the planned t-tests." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The RCT design includes a control group (!AI-devs) whose code is evolved alongside treatment code (AI-devs). 
This is a proper baseline comparison built into the experimental design (Section III, Figure 2)." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "The baseline is coding without AI assistants, compared against coding with AI assistants. Both conditions reflect current development practice, so the baseline is inherently contemporary." 78 }, 79 "ablation_study": { 80 "applies": false, 81 "answer": false, 82 "justification": "This is a controlled experiment comparing two conditions (AI-assisted vs. not), not a system with components to ablate." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Four dependent variables are measured: completion time, Code Health (CH), test coverage (TC), and perceived productivity (PP) using the SPACE framework (Section IV, Figure 2)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": true, 92 "justification": "Perceived productivity is measured via human self-assessment using a Likert-scale exit questionnaire (Table II, Q2-6) based on the SPACE framework. The entire study centers on human participants performing coding tasks." 93 }, 94 "held_out_test_set": { 95 "applies": false, 96 "answer": false, 97 "justification": "This is a controlled experiment with human participants, not a machine learning evaluation. The concept of held-out test sets does not apply." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": false, 102 "justification": "No results are reported yet (registered report). The pre-registration mentions secondary analysis controlling for confounders (Section VIII) but does not explicitly commit to per-category breakdowns (e.g., by developer experience level)." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "No results are reported yet. As a registered report, no failure cases can be discussed."
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": false, 112 "justification": "No results are reported yet. The commitment to report all results (positive or negative) is implicit in the registered report format but not explicitly stated." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract makes no empirical claims about results — it describes the planned method. All statements in the abstract accurately reflect the study design described in the paper. The abstract correctly states 'We will conduct a two-phased controlled experiment.'" 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper proposes to make causal claims via an RCT design. The causal graph is explicitly modeled using DAGitty (Section IV, Figure 3), confounding variables are identified (Table IV), and the Phase 2 RCT with randomized assignment provides adequate basis for causal inference." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section VII explicitly states: 'the conclusions drawn from studying this specific task cannot be generalized to all possible development scenarios' and acknowledges threats from 'our choice of a single programming language and the complexity of the task.'" 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": true, 134 "justification": "Section IX (Risk Analysis and Threats to Validity) discusses multiple alternative explanations: participant imbalance, different interpretations of 'AI assistant', non-compliance with instructions, developer experience as a confounder. The DAGitty causal graph (Figure 3) explicitly models confounders." 
135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "The paper does not evaluate any specific AI model. It studies the general phenomenon of AI-assisted development; participants choose their own AI assistant tool. This is a study of developer behavior, not model capability." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "The paper does not use prompting as part of its methodology. Participants use their own AI assistants with their own prompts. The study is about the effect on code maintainability, not about specific prompts." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "No LLM hyperparameters are part of the study design. Participants choose their own tools and settings. The statistical analysis parameters (significance level α = 0.05, assumed effect size d = 0.5) are reported, but these are not model hyperparameters." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. Participants use off-the-shelf AI assistants (e.g., GitHub Copilot) as black boxes within their IDEs." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section VIII (Data cleaning) specifies: exclude participants whose solutions fail acceptance tests, remove those who didn't adhere to instructions (per exit questionnaire), and remove outlier submissions indicating gaming. Pre-registered data cleaning criteria are documented." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section IX is titled 'Risk Analysis and Threats to Validity' and provides substantive discussion of threats to the study."
169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section IX discusses specific threats: failing to attract enough participants, imbalance between AI-devs and !AI-devs, participants interpreting 'AI assistant' differently, non-compliance with instructions in remote settings. Section VII also discusses external validity threats from single programming language and task complexity." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section VII explicitly states: 'the conclusions drawn from studying this specific task cannot be generalized to all possible development scenarios.' It further notes 'The main threats to external validity stem from our choice of a single programming language and the complexity of the task.'" 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No data has been collected yet (registered report). The replication package on Zenodo contains study materials but no experimental data." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section VIII describes data collection in detail: the snapcode.review platform administers tasks, timestamps are collected from repository access to submission, exit questionnaires gather self-reported data, and code quality is measured via CodeScene's Code Health metric." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section VI describes recruitment: 'We will recruit participants through i) social media advertisements on platforms such as YouTube, LinkedIn, and X and ii) using our personal networks.' Pre-screening questionnaire (Table I) is fully documented." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The full pipeline is documented: recruitment → pre-screening questionnaire → stratified random assignment → Phase 1 task → Phase 2 RCT → exit questionnaire → data cleaning (acceptance test filtering, compliance filtering, outlier removal) → Bayesian and frequentist analysis." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding source or acknowledgment section is present in the paper. The involvement of CodeScene (first author's affiliation) and Equal Experts is mentioned but formal funding disclosure is absent." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: Markus Borg at CodeScene and Lund University, Dave Hewett and Donald Graham at Equal Experts, others at Lund University, Dave Farley at Continuous Delivery." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding disclosure is provided. The first author is affiliated with CodeScene, whose Code Health metric is used as a dependent variable. Equal Experts provides the snapcode.review platform. The independence of these organizations from the study outcome is not discussed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present. The first author's dual affiliation with CodeScene (a commercial company whose metric is used in the study) represents a potential conflict that is not declared." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "This study does not evaluate a pre-trained model's capability on any benchmark. 
It evaluates the effect of AI assistants on human developer code quality." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Not a model benchmark evaluation. However, the paper does address a related contamination concern: Section VII notes the code base is not hosted publicly to prevent 'leakage to AI assistants' training data' [19]." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Not a model benchmark evaluation. The paper studies human developer behavior, not model capability." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": true, 245 "answer": true, 246 "justification": "This IS a registered report — the entire paper is a pre-registration of the study design submitted for peer review before data collection. This is the gold standard for pre-registration." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": true, 250 "answer": false, 251 "justification": "No IRB or ethics board approval is mentioned. Section VI states the study 'adheres to the essential attributes of the ACM SigSoft Empirical Standard Ethics (Studies with Human Participants)' but no institutional ethics review is referenced." 252 }, 253 "demographics_reported": { 254 "applies": true, 255 "answer": true, 256 "justification": "The pre-screening questionnaire (Table I) collects: gender (Q1-1), age (Q1-2), location (Q1-3), occupation type (Q1-4), programming experience (Q1-5), Java proficiency (Q1-6), and AI assistant experience (Q1-7, Q1-8). Demographics will be collected though no data exists yet." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": true, 260 "answer": true, 261 "justification": "Section VI describes criteria: AI-dev cohort requires affirmative Q1-7, agreement with Q1-8a, and positive mean on Q1-8b-i. Q1-6 is used to filter out Java novices. 
Data cleaning excludes participants failing acceptance tests or not adhering to instructions." 262 }, 263 "randomization_described": { 264 "applies": true, 265 "answer": true, 266 "justification": "Section III and VI describe randomization: 'random stratified sampling to split the participants into either Task 1 or Task 2.' Phase 2 uses random assignment without replacement to evolve Task 1 solutions. Stratification ensures equal AI-dev/!AI-dev distribution." 267 }, 268 "blinding_described": { 269 "applies": true, 270 "answer": false, 271 "justification": "Blinding is not discussed. Phase 2 participants evolve code from Phase 1 but it is not stated whether they know if the code was AI-assisted or not. This is a notable omission — knowing whether code was AI-generated could affect behavior." 272 }, 273 "attrition_reported": { 274 "applies": true, 275 "answer": false, 276 "justification": "No data has been collected yet, so attrition cannot be reported. The paper discusses potential dropout risks (Section IX) but does not commit to specific attrition reporting (e.g., CONSORT flow diagram)." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "This is a controlled experiment studying human developers, not a computational method. Inference cost is not applicable." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "This is a controlled experiment studying human developers. Computational budget is not applicable." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "AI assistants like GitHub Copilot are transforming software engineering with productivity improvements.", 295 "evidence": "Cited studies: Peng et al. found 55.8% faster completion with Copilot [3], Ziegler et al. report large positive impact on perceived productivity [4], JetBrains survey shows 77% use ChatGPT and 46% use Copilot [1]. 
Sections I and II.", 296 "supported": "moderate" 297 }, 298 { 299 "claim": "The study requires 256 participants (64 per group in Phase 2) for adequate statistical power.", 300 "evidence": "Section VI reports G*Power analysis: two-tailed t-test, medium effect size d=0.5, α=0.05, power=0.80 yields 64 per group. The total of 256 accounts for both phases.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "A two-phase RCT design can isolate the causal effect of AI-assisted code on maintainability.", 305 "evidence": "Sections III-IV describe the design with a DAGitty causal graph (Figure 3), stratified random assignment, control for confounders (Table IV), and separation of Phase 1 code creation from Phase 2 evolution.", 306 "supported": "strong" 307 }, 308 { 309 "claim": "CodeScene's Code Health metric is associated with defect counts and development effort.", 310 "evidence": "Section IV cites [15] Tornhill & Borg (2022) and [16] Borg et al. (2024) as supporting evidence for using CH as a maintainability measure.", 311 "supported": "moderate" 312 } 313 ], 314 "methodology_tags": [ 315 "rct" 316 ], 317 "key_findings": "This is a registered report presenting the design for a two-phase controlled experiment on AI assistant impact on code maintainability. No empirical findings are reported. The study proposes an RCT where professional developers evolve Java code previously written with or without AI assistants, measuring completion time, Code Health, test coverage, and perceived productivity. A power analysis targets 256 participants, and both Bayesian and frequentist analyses are planned.", 318 "red_flags": [ 319 { 320 "flag": "No empirical results", 321 "detail": "This is a registered report describing planned research. No data has been collected and no results are reported. The value lies entirely in the study design, not findings."
322 }, 323 { 324 "flag": "Potential conflict of interest with CodeScene", 325 "detail": "First author Markus Borg is affiliated with CodeScene, and CodeScene's proprietary Code Health metric is used as one of four dependent variables. No conflict of interest statement is provided, and no alternative code quality metrics are included for triangulation." 326 }, 327 { 328 "flag": "No IRB/ethics approval mentioned", 329 "detail": "The study involves human participants completing 2-4 hour tasks but no institutional ethics review or IRB approval is mentioned. The paper cites ACM SigSoft ethical standards but this is self-assessed compliance, not independent review." 330 }, 331 { 332 "flag": "Blinding not addressed", 333 "detail": "Phase 2 participants evolve Phase 1 code, but whether they are blinded to the AI-assistance condition is not discussed. Knowledge of whether code was AI-assisted could influence behavior and perceptions." 334 }, 335 { 336 "flag": "Ambitious recruitment target", 337 "detail": "The study targets 256 professional developer volunteers for a 2-4 hour unpaid coding task (with only raffle incentives). Section IX acknowledges this as the 'main risk' but the feasibility of reaching this number is uncertain." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 343 "authors": ["S. Peng", "E. Kalliamvakou", "P. Cihon", "M. Demirer"], 344 "year": 2023, 345 "relevance": "Seminal RCT showing 55.8% faster task completion with GitHub Copilot; key comparison study for AI-assisted development productivity." 346 }, 347 { 348 "title": "Measuring GitHub Copilot's Impact on Productivity", 349 "authors": ["A. Ziegler", "E. Kalliamvakou", "X. A. Li", "A. Rice", "D. Rifkin", "S. Simister", "G. Sittampalam", "E. Aftandilian"], 350 "year": 2024, 351 "relevance": "GitHub's survey-based study of Copilot's perceived productivity impact using the SPACE framework; directly adapted by this paper." 
352 }, 353 { 354 "title": "GitHub Copilot AI pair Programmer: Asset or Liability?", 355 "authors": ["A. Moradi Dakhel", "V. Majdinasab", "A. Nikanjam", "F. Khomh", "M. C. Desmarais", "Z. M. J. Jiang"], 356 "year": 2023, 357 "relevance": "Evaluation of Copilot's code generation quality showing variable quality and defects; relevant to AI code quality assessment." 358 }, 359 { 360 "title": "Do Users Write More Insecure Code with AI Assistants?", 361 "authors": ["N. Perry", "M. Srivastava", "D. Kumar", "D. Boneh"], 362 "year": 2023, 363 "relevance": "User study (n=47) finding AI-assisted participants produced less secure code; highlights quality risks of AI coding tools." 364 }, 365 { 366 "title": "Is GitHub's Copilot as Bad as Humans at Introducing Vulnerabilities in Code?", 367 "authors": ["O. Asare", "M. Nagappan", "N. Asokan"], 368 "year": 2023, 369 "relevance": "Empirical study comparing vulnerability introduction rates between Copilot and human developers." 370 }, 371 { 372 "title": "How Readable is Model-generated Code? Examining Readability and Visual Inspection of GitHub Copilot", 373 "authors": ["N. Al Madi"], 374 "year": 2023, 375 "relevance": "Controlled experiment (n=21) on code readability of AI-assisted code, finding comparable quality to human pair programming." 376 }, 377 { 378 "title": "The Recent Trends of Research on GitHub Copilot: A Systematic Review", 379 "authors": ["Z. C. Ani", "Z. A. Hamid", "N. N. Zhamri"], 380 "year": 2024, 381 "relevance": "Systematic literature review of GitHub Copilot research; maps the landscape of AI assistant studies." 382 }, 383 { 384 "title": "The SPACE of Developer Productivity: There's more to it than you think", 385 "authors": ["N. Forsgren", "M.-A. Storey", "C. Maddila", "T. Zimmermann", "B. Houck", "J. Butler"], 386 "year": 2021, 387 "relevance": "Foundational framework for measuring developer productivity used as basis for perceived productivity measurement in AI-assisted development studies." 
388 }, 389 { 390 "title": "Code Red: The Business Impact of Code Quality - A Quantitative Study of 39 Proprietary Production Codebases", 391 "authors": ["A. Tornhill", "M. Borg"], 392 "year": 2022, 393 "relevance": "Validates CodeScene's Code Health metric against defect counts and development effort; provides construct validity for maintainability measurement." 394 } 395 ] 396 }