scan.json (27536B)
1 { 2 "paper": { 3 "title": "An Evaluation of the Impact of Code Generation Tools on Software Development", 4 "authors": [ 5 "Luiz Fernando Mendes Osório", 6 "Pedro de A. dos Santos Neto", 7 "Guilherme Avelino", 8 "Werney Ayala Luz Lira" 9 ], 10 "year": 2025, 11 "venue": "SBSI 2025 (Brazilian Symposium on Information Systems)", 12 "doi": "10.5753/sbsi.2025.246605" 13 }, 14 "scan_version": 3, 15 "active_modules": [], 16 "methodology_tags": ["rct"], 17 "key_findings": "A randomized experiment with 49 student developers found that GitHub Copilot (chat-only mode) significantly reduced task completion time (median 16 vs 23.5 min, p=0.0029, Cliff's δ=-0.254) but had no statistically significant effect on code correctness measured by unit test failures (p=0.866, Cliff's δ=-0.014). The effect size for time reduction was small, and the study was limited to students working on Java/Spring Boot API tasks.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The experimental materials (two Java/Spring Boot APIs) are available on Google Drive (references [1], [3]) and the experimental data on Google Sheets (reference [2]), but no analysis code or statistical scripts are released." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper states 'Os dados completos estão disponíveis em [2]' pointing to a Google Sheets dataset. The two APIs used as experimental materials are also available on Google Drive (references [1], [3])." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": true, 33 "justification": "The experimental environment is specified with exact versions: Visual Studio Code 1.90.1, GitHub Copilot extension version 0.12, with autocomplete disabled. The APIs use Java/Spring Boot. However, the statistical analysis environment is not specified." 
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "While the methodology section describes the experimental procedure, there are no step-by-step reproduction instructions, no README, and no scripts to replicate the statistical analyses or experimental setup." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Table 3 reports means, medians, and standard deviations, but no confidence intervals or error bars are provided for any results." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": true, 50 "justification": "Mann-Whitney U test is used for both variables after Shapiro-Wilk confirmed non-normality: tempo gasto (time spent: U=3148.0, p=0.0029) and testes falhos (failed tests: U=4075.0, p=0.866). Test selection is appropriate and justified." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Cliff's delta is reported for both variables: δ=-0.254 (small) for time and δ=-0.014 (negligible) for test failures. The interpretation of effect size magnitude is provided." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No power analysis or sample size justification is provided. The sample of 49 students appears to be a convenience sample based on course enrollment, with no discussion of whether this is sufficient to detect meaningful effects." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": true, 65 "justification": "Standard deviations are reported in Table 3 for both groups and both variables (e.g., SD=25.03 for Copilot time, SD=23.05 for non-Copilot time)." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The 'without Copilot' condition serves as the baseline, with randomized assignment ensuring balanced comparison across all four problems." 
73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "The baseline (human developer without AI assistance) is the natural and appropriate comparator for evaluating the impact of an AI code generation tool." 78 }, 79 "ablation_study": { 80 "applies": false, 81 "answer": false, 82 "justification": "Copilot is evaluated as a single black-box tool. There are no components to ablate, though the authors did restrict the study to chat-only mode (disabling autocomplete)." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Two metrics are used: task completion time (efficiency) and number of failed unit tests (code correctness). Both are analyzed with descriptive and inferential statistics." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "Code quality was evaluated only through automated unit tests. No human experts reviewed the generated code for quality, readability, or maintainability, despite the paper discussing code quality indicators like maintainability and code smells in Section 5.1." 93 }, 94 "held_out_test_set": { 95 "applies": false, 96 "answer": false, 97 "justification": "This is a human subjects experiment, not a machine learning evaluation. There is no train/dev/test split concept applicable here." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": false, 102 "justification": "Table 2 shows participant distribution by education level, experience, and Spring usage, but the statistical analysis is only performed at the aggregate level. No per-problem, per-experience-level, or per-education-level breakdowns of results are provided." 
103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "There is a brief mention that participants introduced auxiliary methods causing inconsistencies, but no systematic error analysis, no qualitative examples of failures, and no examination of which tasks or participant profiles led to more failures." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper honestly reports that Copilot users had slightly higher mean test failures (1.65 vs 1.48) and that no statistically significant improvement in code correctness was found. This null finding is prominently discussed." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims Copilot 'can significantly reduce task completion time' (supported by Mann-Whitney U p=0.0029) and 'no statistically significant differences were observed in code correctness' (supported by p=0.866). Both claims match the results." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The study uses randomized assignment of tasks to with/without Copilot conditions, which supports causal inference. The abstract states Copilot 'can significantly reduce task completion time,' a causal claim justified by the randomized design. The paper also hedges in Section 4.3.2, noting that correlations should not be interpreted as causal, although that caution sits in tension with the randomized design." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims to evaluate 'Code Generation Tools' (plural) on 'Software Development' (general), but the study tests only Copilot chat (no autocomplete) with 49 CS students on two Java/Spring Boot APIs. 
While Section 6 acknowledges limited generalizability to professional contexts, the title and framing substantially exceed the tested scope." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": true, 134 "justification": "Section 6 (Threats to Validity) discusses specific alternatives: participant experience variability, controlled vs real-world environments, student vs professional populations. Section 5.5 discusses that code quality 'pode ter sido influenciada pela familiaridade dos participantes com a ferramenta e pela natureza das tarefas propostas.'" 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper measures task completion time and unit test failures but frames results as evaluating 'impact on software development' (title) and 'developer performance' (abstract). While Section 6 notes 'outras medidas poderiam ser consideradas,' the paper does not explicitly discuss how these two proxies fall short of capturing the broader constructs claimed." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The Copilot extension version (0.12) and VSCode version (1.90.1) are specified, but the underlying model version powering Copilot is not stated. Since Copilot's model can change server-side even within the same extension version, this is insufficient for reproducibility." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": false, 151 "justification": "Participants used Copilot chat to generate code, but neither the participant prompts nor any structured prompt templates are provided. The interactions with Copilot are not documented or released." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "Copilot is used as a black-box commercial tool. 
No hyperparameters (temperature, model settings) are reported, nor is the fact that these are uncontrollable acknowledged as a limitation." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "The paper evaluates GitHub Copilot as a third-party black-box tool. The authors cannot be expected to describe internal scaffolding they have no access to." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": false, 166 "justification": "Section 4.3.1 briefly mentions that timestamps were verified against participant videos, but the preprocessing is thin. Of 196 expected responses, only 184 were received — the 12 missing responses are noted ('nem todos os participantes conseguiram enviar a solução para os quatro problemas') but not explained in detail." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 6 'Ameaças à Validade' provides a dedicated threats-to-validity section covering internal, external, construct, and conclusion validity." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "The threats are specific to this study: student participants vs professional developers, variability in programming experience and Copilot familiarity, controlled environment vs real-world conditions, limited sample size affecting robustness, and limited indicator selection." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 6 explicitly states that results may not generalize to 'ambientes de trabalho profissionais' (professional settings) or to 'desenvolvedores mais experientes' (more experienced developers), and Section 7 proposes future work in industrial settings." 
184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "The complete dataset is available on Google Sheets (reference [2]): 'Os dados completos estão disponíveis em [2].' The experimental APIs are also available on Google Drive." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 4.2 describes data collection in detail: participants recorded start/end times manually, submitted code solutions, and recorded screen videos. The experimental procedure (recreating deleted API methods) and tools (VSCode, Swagger for validation) are described." 196 }, 197 "recruitment_methods_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 4.2.2 states: 'Todos os alunos das disciplinas Engenharia de Software e Programação Orientada a Objetos (Graduação) e Engenharia de Software (Pós-Graduação) foram convidados a participar, sem realização de sorteio.' All students in specific courses were invited." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "The pipeline from data collection to analysis is only partially documented. Of 196 expected responses (49 × 4), only 184 were received. The paper notes not all participants submitted all four problems but does not explain which responses are missing, why, or whether this attrition could bias results." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All authors' affiliations are clearly listed: Universidade Federal do Piauí and Instituto Federal do Piauí. 
They are university researchers evaluating a Microsoft/GitHub product with no apparent affiliation to the company." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": false, 221 "answer": false, 222 "justification": "No funding is disclosed. This appears to be unfunded university research by Brazilian federal institution researchers." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests statement or financial disclosure is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "The paper evaluates Copilot as a tool through a human subjects experiment, not a pre-trained model's capability on a benchmark. Contamination criteria do not apply." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Same as above — this is a human study evaluating a commercial tool, not a benchmark evaluation of a pre-trained model." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "Same as above — no benchmark evaluation of model knowledge is performed." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": true, 250 "answer": false, 251 "justification": "No mention of pre-registration on any platform (OSF, AsPredicted, or similar). The analysis plan was not committed before data collection." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": true, 255 "answer": false, 256 "justification": "The paper mentions informed consent was obtained ('Foi obtido o consentimento informado de cada participante'), but no IRB or ethics board approval is mentioned." 
257 }, 258 "demographics_reported": { 259 "applies": true, 260 "answer": true, 261 "justification": "Table 2 reports participant demographics: education level (34 undergraduate, 12 postgraduate), programming experience (>24 months vs ≤24 months, median=24 months), and prior Spring usage (4 yes, ~42 no per problem)." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": true, 265 "answer": true, 266 "justification": "Inclusion criteria are stated: students enrolled in Software Engineering or Object-Oriented Programming (undergraduate) and Software Engineering (postgraduate) courses. All enrolled students were invited. No explicit exclusion criteria are mentioned." 267 }, 268 "randomization_described": { 269 "applies": true, 270 "answer": false, 271 "justification": "The paper states problems were 'distribuídos aleatoriamente entre os participantes' (randomly distributed) to ensure balanced conditions, but the randomization mechanism (tool, method, stratification procedure) is not described." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "Blinding is not feasible in this study — participants necessarily know whether they are using Copilot or not, as the tool is visible in the IDE. Outcome evaluation was automated (unit tests), which is inherently blinded." 277 }, 278 "attrition_reported": { 279 "applies": true, 280 "answer": false, 281 "justification": "The paper notes 184 responses received vs 196 expected (49 × 4 problems), stating 'nem todos os participantes conseguiram enviar a solução para os quatro problemas.' However, no analysis of which participants dropped which tasks, reasons for missing data, or potential attrition bias is provided." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "This is a human subjects experiment evaluating a commercial tool (Copilot, free for students). 
Per-inference costs are not a meaningful concept for this study design." 289 }, 290 "compute_budget_stated": { 291 "applies": false, 292 "answer": false, 293 "justification": "The study involves human participants using a commercial tool. There is no significant compute budget associated with the research methodology." 294 } 295 } 296 }, 297 "claims": [ 298 { 299 "claim": "GitHub Copilot significantly reduces task completion time for programming tasks", 300 "evidence": "Mann-Whitney U=3148.0, p=0.0029; median 16 min with Copilot vs 23.5 min without; Cliff's delta=-0.254 (small effect). Based on 96 Copilot and 88 non-Copilot task responses from 49 student participants across 4 Java/Spring Boot tasks (Section 5.4.1).", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "GitHub Copilot does not significantly impact code correctness as measured by unit test failures", 305 "evidence": "Mann-Whitney U=4075.0, p=0.866; Cliff's delta=-0.014 (negligible); mean test failures 1.65 (Copilot) vs 1.48 (non-Copilot), median identical at 1.0 for both groups (Section 5.4.1, Table 3).", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "AI-assisted coding tools accelerate development but do not substitute human oversight for code quality", 310 "evidence": "Derived from the combination of the above two findings. Discussed in Section 5.5 and Section 7, framed as: efficiency gains exist but quality requires human review.", 311 "supported": "weak" 312 } 313 ], 314 "red_flags": [ 315 { 316 "flag": "Student-only sample with broad title claims", 317 "detail": "The title claims to evaluate 'Code Generation Tools on Software Development' but the sample consists entirely of 49 CS students (undergraduate and postgraduate). No professional developers were included, yet the conclusions are framed broadly." 318 }, 319 { 320 "flag": "Small sample size without power analysis", 321 "detail": "49 participants yielding 184 task responses with no power analysis. 
The small effect size (Cliff's delta=-0.254) combined with the sample size raises questions about whether the study was adequately powered to detect meaningful differences, particularly for the null finding on code correctness." 322 }, 323 { 324 "flag": "Ecological validity concerns", 325 "detail": "Copilot autocomplete was disabled, restricting participants to chat-only interaction. This is not how most developers use Copilot in practice, limiting the generalizability of findings to real-world usage patterns." 326 }, 327 { 328 "flag": "Manual time measurement", 329 "detail": "Participants self-reported start and end times, which were then cross-checked against video recordings. While the video verification mitigates some concern, the primary measurement relied on manual reporting, introducing potential measurement error." 330 }, 331 { 332 "flag": "No unit tests available during experiment", 333 "detail": "Participants could not run unit tests during the experiment (only Swagger API testing). The unit tests used for evaluation were run post-hoc by the researchers. This means participants had limited feedback on correctness during the task, which may have differentially affected Copilot and non-Copilot conditions." 334 }, 335 { 336 "flag": "Missing data not analyzed", 337 "detail": "12 of 196 expected responses (6.1%) are missing with only a vague explanation ('not all participants managed to submit'). No analysis of whether missing data is related to condition (Copilot vs not) or task difficulty, which could bias results." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "Evaluating Large Language Models Trained on Code", 343 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 344 "year": 2021, 345 "arxiv_id": "2107.03374", 346 "relevance": "Foundational Codex paper underlying GitHub Copilot; defines HumanEval benchmark for code generation evaluation." 
347 }, 348 { 349 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 350 "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"], 351 "year": 2023, 352 "arxiv_id": "2302.06590", 353 "relevance": "Large-scale study of Copilot's impact on developer productivity, directly comparable to this paper's research question." 354 }, 355 { 356 "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions", 357 "authors": ["Nhan Nguyen", "Sarah Nadi"], 358 "year": 2022, 359 "doi": "10.1145/3524842.3528470", 360 "relevance": "Early empirical evaluation of Copilot code suggestion quality and correctness." 361 }, 362 { 363 "title": "GitHub Copilot AI pair programmer: Asset or Liability?", 364 "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab", "Amin Nikanjam", "Foutse Khomh", "Michel C. Desmarais", "Zhen Ming (Jack) Jiang"], 365 "year": 2023, 366 "doi": "10.1016/j.jss.2023.111734", 367 "relevance": "Comprehensive evaluation of Copilot covering correctness, reproducibility, diversity, and code quality metrics." 368 }, 369 { 370 "title": "Evaluating the Code Quality of AI-Assisted Code Generation Tools: An Empirical Study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT", 371 "authors": ["Burak Yetiştiren", "Işık Özsoy", "Miray Ayerdem", "Eray Tüzün"], 372 "year": 2023, 373 "arxiv_id": "2304.10778", 374 "relevance": "Multi-tool comparison of AI code generation quality including correctness, code smells, and maintainability." 375 }, 376 { 377 "title": "Studying the effect of AI Code Generators on Supporting Novice Learners in Introductory Programming", 378 "authors": ["Majeed Kazemitabaar", "Justin Chow", "Carl Ka To Ma", "Barbara J. Ericson", "David Weintrop", "Tovi Grossman"], 379 "year": 2023, 380 "doi": "10.1145/3544548.3580919", 381 "relevance": "Studies AI code generators' impact on novice programming learners, directly relevant to this paper's student participants." 
382 }, 383 { 384 "title": "\"It's Weird That it Knows What I Want\": Usability and Interactions with Copilot for Novice Programmers", 385 "authors": ["James Prather", "Brent N. Reeves", "Paul Denny", "Brett A. Becker", "Juho Leinonen", "Andrew Luxton-Reilly"], 386 "year": 2023, 387 "doi": "10.1145/3617367", 388 "relevance": "Qualitative study of novice programmers' interactions with Copilot, complementary perspective to this paper's quantitative approach." 389 }, 390 { 391 "title": "Code on Demand: A Comparative Analysis of the Efficiency Understandability and Self-Correction Capability of Copilot ChatGPT and Gemini", 392 "authors": ["Samuel Silvestre Batista", "Bruno Branco", "Otávio Castro", "Guilherme Avelino"], 393 "year": 2024, 394 "doi": "10.1145/3701625.3701673", 395 "relevance": "Multi-tool comparison including self-correction capability, by co-author of this paper." 396 }, 397 { 398 "title": "Generating Java Methods: An Empirical Assessment of Four AI-Based Code Assistants", 399 "authors": ["Vincenzo Corso", "Leonardo Mariani", "Daniela Micucci", "Oliviero Riganelli"], 400 "year": 2024, 401 "doi": "10.1145/3643916.3644402", 402 "relevance": "Empirical evaluation of AI code assistants specifically for Java method generation, same language as this study." 403 }, 404 { 405 "title": "An Industry Case Study on Adoption of AI-based Programming Assistants", 406 "authors": ["Nicole Davila", "Igor Wiese", "Igor Steinmacher", "Lucas Lucio da Silva", "Andre Kawamoto", "Gilson Jose Peres Favaro", "Ingrid Nunes"], 407 "year": 2024, 408 "doi": "10.1145/3639477.3643648", 409 "relevance": "Industry perspective on AI programming assistant adoption in Brazilian companies, complementing this academic study." 410 }, 411 { 412 "title": "Refining ChatGPT-Generated Code: Characterizing and Mitigating Code Quality Issues", 413 "authors": ["Yue Liu", "Thanh Le-Cong", "Ratnadira Widyasari", "Chakkrit Tantithamthavorn", "Li Li", "Xuan-Bach D. 
Le", "David Lo"], 414 "year": 2024, 415 "doi": "10.1145/3643674", 416 "relevance": "Characterizes code quality issues in AI-generated code, relevant to this paper's finding of no quality improvement." 417 }, 418 { 419 "title": "Large language models for code completion: A systematic literature review", 420 "authors": ["Rasha Ahmad Husein", "Hala Aburajouh", "Cagatay Catal"], 421 "year": 2025, 422 "doi": "10.1016/j.csi.2024.103917", 423 "relevance": "Systematic review of LLMs for code completion, provides broader context for AI-assisted coding research." 424 } 425 ], 426 "engagement_factors": { 427 "practical_relevance": { 428 "score": 2, 429 "justification": "Practitioners evaluating whether to adopt Copilot will find the time savings vs no quality improvement trade-off directly useful, though the student-only sample limits applicability." 430 }, 431 "surprise_contrarian": { 432 "score": 1, 433 "justification": "The finding that Copilot speeds up coding but doesn't improve quality is mildly interesting but aligns with the emerging consensus in the literature." 434 }, 435 "fear_safety": { 436 "score": 0, 437 "justification": "No security, safety, or risk concerns are raised by this study." 438 }, 439 "drama_conflict": { 440 "score": 0, 441 "justification": "No controversy or conflict angle; straightforward empirical evaluation." 442 }, 443 "demo_ability": { 444 "score": 0, 445 "justification": "No tool, code, or demo is produced; this is a human study with results only." 446 }, 447 "brand_recognition": { 448 "score": 2, 449 "justification": "GitHub Copilot is a widely recognized AI coding tool, though the paper itself is from a regional venue." 450 } 451 } 452 }