scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (27511B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Evaluation of the Impact of Code Generation Tools on Software Development",
      6     "authors": [
      7       "Luiz Fernando Mendes Osório",
      8       "P. D. A. S. Neto",
      9       "Guilherme Avelino",
     10       "Werney Ayala Luz Lira"
     11     ],
     12     "year": 2025,
     13     "venue": "SBSI25 (Brazilian Symposium on Information Systems)",
     14     "arxiv_id": null,
     15     "doi": "10.5753/sbsi.2025.246605"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract claims Copilot 'can significantly reduce task completion time' (supported by Mann-Whitney U p=0.0029) and 'no statistically significant differences were observed in code correctness' (supported by p=0.866). Both claims match the results.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The study uses randomized assignment of tasks to the with-Copilot and without-Copilot conditions, which supports causal inference. The abstract states Copilot 'can significantly reduce task completion time,' a causal claim justified by the randomized design. In Section 4.3.2 the paper also cautions that correlations should not be interpreted as causal, a hedge that is in tension with the causal reading its own randomized design supports.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title claims to evaluate 'Code Generation Tools' (plural) on 'Software Development' (general), but the study tests only Copilot chat (no autocomplete) with 49 CS students on two Java/Spring Boot APIs. While Section 6 acknowledges limited generalizability to professional contexts, the title and framing substantially exceed the tested scope.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 6 (Threats to Validity) discusses specific alternatives: participant experience variability, controlled vs real-world environments, student vs professional populations. Section 5.5 notes that code quality 'pode ter sido influenciada pela familiaridade dos participantes com a ferramenta e pela natureza das tarefas propostas' (may have been influenced by the participants' familiarity with the tool and by the nature of the proposed tasks).",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper measures task completion time and unit test failures but frames results as evaluating 'impact on software development' (title) and 'developer performance' (abstract). While Section 6 notes 'outras medidas poderiam ser consideradas' (other measures could be considered), the paper does not explicitly discuss how these two proxies fall short of capturing the broader constructs claimed.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6 'Ameaças à Validade' (Threats to Validity) provides a dedicated threats-to-validity section covering internal, external, construct, and conclusion validity.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The threats are specific to this study: student participants vs professional developers, variability in programming experience and Copilot familiarity, controlled environment vs real-world conditions, limited sample size affecting robustness, and limited indicator selection.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 6 explicitly states that results may not generalize to 'ambientes de trabalho profissionais' (professional settings) or to 'desenvolvedores mais experientes' (more experienced developers), and Section 7 proposes future work in industrial settings.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors' affiliations are clearly listed: Universidade Federal do Piauí and Instituto Federal do Piauí. They are university researchers evaluating a Microsoft/GitHub product with no apparent affiliation to the company.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed. This appears to be unfunded university research by Brazilian federal institution researchers.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or financial disclosure is present in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "'Code correctness' is operationalized as passing unit tests, 'task efficiency' as completion time, and GitHub Copilot's architecture is described; Task-Technology Fit theory is named but not deeply elaborated.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper explicitly states its contribution: providing empirical evidence on Copilot's impact on task efficiency and code correctness for developers with varying experience levels via a controlled student study.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 3 discusses prior work and explicitly differentiates this study by its API-based tasks, participant pre-training, and closer-to-real-world scenario compared to simple coding problems used in prior work.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "The experimental materials (two Java/Spring Boot APIs) are available on Google Drive (references [1], [3]) and the experimental data on Google Sheets (reference [2]), but no analysis code or statistical scripts are released.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The paper states 'Os dados completos estão disponíveis em [2]' pointing to a Google Sheets dataset. The two APIs used as experimental materials are also available on Google Drive (references [1], [3]).",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "The experimental environment is specified with exact versions: Visual Studio Code 1.90.1, GitHub Copilot extension version 0.12, with autocomplete disabled. The APIs use Java/Spring Boot. However, the statistical analysis environment is not specified.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "While the methodology section describes the experimental procedure, there are no step-by-step reproduction instructions, no README, and no scripts to replicate the statistical analyses or experimental setup.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Table 3 reports means, medians, and standard deviations, but no confidence intervals or error bars are provided for any results.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Mann-Whitney U test is used for both variables after Shapiro-Wilk confirmed non-normality: time spent (U=3148.0, p=0.0029) and failed tests ('testes falhos'; U=4075.0, p=0.866). Test selection is appropriate and justified.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Cliff's delta is reported for both variables: δ=-0.254 (small) for time and δ=-0.014 (negligible) for test failures. The interpretation of effect size magnitude is provided.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No power analysis or sample size justification is provided. The sample of 49 students appears to be a convenience sample based on course enrollment, with no discussion of whether this is sufficient to detect meaningful effects.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Standard deviations are reported in Table 3 for both groups and both variables (e.g., SD=25.03 for Copilot time, SD=23.05 for non-Copilot time).",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The 'without Copilot' condition serves as the baseline, with randomized assignment ensuring balanced comparison across all four problems.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "The baseline (human developer without AI assistance) is the natural and appropriate comparator for evaluating the impact of an AI code generation tool.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "Copilot is evaluated as a single black-box tool. There are no components to ablate, though the authors did restrict the study to chat-only mode (disabling autocomplete).",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Two metrics are used: task completion time (efficiency) and number of failed unit tests (code correctness). Both are analyzed with descriptive and inferential statistics.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "Code quality was evaluated only through automated unit tests. No human experts reviewed the generated code for quality, readability, or maintainability, despite the paper discussing code quality indicators like maintainability and code smells in Section 5.1.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "This is a human subjects experiment, not a machine learning evaluation. There is no train/dev/test split concept applicable here.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": false,
    217           "justification": "Table 2 shows participant distribution by education level, experience, and Spring usage, but the statistical analysis is only performed at the aggregate level. No per-problem, per-experience-level, or per-education-level breakdowns of results are provided.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "There is a brief mention that participants introduced auxiliary methods causing inconsistencies, but no systematic error analysis, no qualitative examples of failures, and no examination of which tasks or participant profiles led to more failures.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper honestly reports that Copilot users had slightly higher mean test failures (1.65 vs 1.48) and that no statistically significant improvement in code correctness was found. This null finding is prominently discussed.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "The Copilot extension version (0.12) and VSCode version (1.90.1) are specified, but the underlying model version powering Copilot is not stated. Since Copilot's model can change server-side even within the same extension version, this is insufficient for reproducibility.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Participants used Copilot chat to generate code, but neither the participant prompts nor any structured prompt templates are provided. The interactions with Copilot are not documented or released.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Copilot is used as a black-box commercial tool. No hyperparameters (temperature, model settings) are reported, nor is the fact that these are uncontrollable acknowledged as a limitation.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "The paper evaluates GitHub Copilot as a third-party black-box tool. The authors cannot be expected to describe internal scaffolding they have no access to.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": false,
    261           "justification": "Section 4.3.1 briefly mentions that timestamps were verified against participant videos, but the description of preprocessing is thin. Of 196 expected responses, only 184 were received — the 12 missing responses are noted ('nem todos os participantes conseguiram enviar a solução para os quatro problemas', i.e. not all participants managed to submit a solution for all four problems) but not explained in detail.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "The complete dataset is available on Google Sheets (reference [2]): 'Os dados completos estão disponíveis em [2].' The experimental APIs are also available on Google Drive.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 4.2 describes data collection in detail: participants recorded start/end times manually, submitted code solutions, and recorded screen videos. The experimental procedure (recreating deleted API methods) and tools (VSCode, Swagger for validation) are described.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Section 4.2.2 states: 'Todos os alunos das disciplinas Engenharia de Software e Programação Orientada a Objetos (Graduação) e Engenharia de Software (Pós-Graduação) foram convidados a participar, sem realização de sorteio.' All students in specific courses were invited.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "The pipeline from data collection to analysis is only partially documented. Of 196 expected responses (49 × 4), only 184 were received. The paper notes not all participants submitted all four problems but does not explain which responses are missing, why, or whether this attrition could bias results.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "The paper evaluates Copilot as a tool through a human subjects experiment, not a pre-trained model's capability on a benchmark. Contamination criteria do not apply.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Same as above — this is a human study evaluating a commercial tool, not a benchmark evaluation of a pre-trained model.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "Same as above — no benchmark evaluation of model knowledge is performed.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No mention of pre-registration on any platform (OSF, AsPredicted, or similar). The analysis plan was not committed before data collection.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": true,
    320           "answer": false,
    321           "justification": "The paper mentions informed consent was obtained ('Foi obtido o consentimento informado de cada participante'), but no IRB or ethics board approval is mentioned.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": true,
    326           "answer": true,
    327           "justification": "Table 2 reports participant demographics: education level (34 undergraduate, 12 postgraduate), programming experience (>24 months vs ≤24 months, median=24 months), and prior Spring usage (4 yes, ~42 no per problem).",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": true,
    332           "answer": true,
    333           "justification": "Inclusion criteria are stated: students enrolled in Software Engineering or Object-Oriented Programming (undergraduate) and Software Engineering (postgraduate) courses. All enrolled students were invited. No explicit exclusion criteria are mentioned.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": true,
    338           "answer": false,
    339           "justification": "The paper states problems were 'distribuídos aleatoriamente entre os participantes' (randomly distributed) to ensure balanced conditions, but the randomization mechanism (tool, method, stratification procedure) is not described.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "Blinding is not feasible in this study — participants necessarily know whether they are using Copilot or not, as the tool is visible in the IDE. Outcome evaluation was automated (unit tests), which is inherently blinded.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": true,
    350           "answer": false,
    351           "justification": "The paper notes 184 responses received vs 196 expected (49 × 4 problems), stating 'nem todos os participantes conseguiram enviar a solução para os quatro problemas.' However, no analysis of which participants dropped which tasks, reasons for missing data, or potential attrition bias is provided.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "This is a human subjects experiment evaluating a commercial tool (Copilot, free for students). Per-inference costs are not a meaningful concept for this study design.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": false,
    364           "answer": false,
    365           "justification": "The study involves human participants using a commercial tool. There is no significant compute budget associated with the research methodology.",
    366           "source": "opus"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "GitHub Copilot significantly reduces task completion time for student developers",
    374       "evidence": "Mann-Whitney U: U=3148.0, p=0.0029; Cliff's delta=-0.254 (small effect); median time 16 min with Copilot vs 23.5 min without",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "GitHub Copilot does not significantly improve code correctness as measured by unit test failures",
    379       "evidence": "Mann-Whitney U: U=4075.0, p=0.866; Cliff's delta=-0.014 (negligible); median failure count 1.0 in both conditions",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "AI-assisted coding tools improve developer efficiency but require human oversight for quality",
    384       "evidence": "Derived from the combination of the time reduction finding and the null correctness finding; framed as a practical recommendation in the conclusion",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Copilot's code suggestions require careful review as they can introduce variability in code quality",
    389       "evidence": "Copilot group had slightly higher mean failure count (1.65 vs 1.48) and higher variance in failures (SD 2.44 vs 1.84), though neither difference is statistically significant",
    390       "supported": "weak"
    391     }
    392   ],
    393   "methodology_tags": [
    394     "rct",
    395     "observational"
    396   ],
    397   "key_findings": "In a controlled experiment with 49 CS students solving API method reconstruction tasks in Java/Spring Boot, GitHub Copilot significantly reduced task completion time (Mann-Whitney U p=0.0029, Cliff's delta=-0.254, small effect), cutting the median from 23.5 to 16 minutes. However, no statistically significant difference was found in code correctness measured by failed unit tests (p=0.866, δ=-0.014, negligible effect). The Copilot group showed slightly higher variance in failures (SD 2.44 vs 1.84), suggesting the tool may introduce inconsistency. The study confirms efficiency benefits while challenging the assumption that AI coding tools improve output correctness, reinforcing the need for human oversight.",
    398   "red_flags": [
    399     {
    400       "flag": "Student-only sample",
    401       "detail": "All 49 participants are CS students; the paper acknowledges limited generalizability to professional developers, but the title and abstract do not restrict their claims to a student population."
    402     },
    403     {
    404       "flag": "No power analysis",
    405       "detail": "The sample was a convenience sample determined by course enrollment; no power calculation was provided, and 49 participants may be underpowered for detecting small effects, especially in subgroup analyses."
    406     },
    407     {
    408       "flag": "Non-standard Copilot configuration",
    409       "detail": "Autocomplete was disabled and only chat mode was used — atypical of real-world Copilot usage — limiting ecological validity of the efficiency findings."
    410     },
    411     {
    412       "flag": "No IRB/ethics approval mentioned",
    413       "detail": "Study involves human participants; only informed consent is mentioned with no reference to institutional ethics review."
    414     },
    415     {
    416       "flag": "No pre-registration",
    417       "detail": "No pre-registration of hypotheses or analysis plan; outcome reporting flexibility is possible."
    418     },
    419     {
    420       "flag": "Manual time self-reporting",
    421       "detail": "Participants self-reported start/end times; video validation is a partial remedy but the primary measurement is still subjective and error-prone."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "GitHub Copilot AI pair programmer: Asset or Liability?",
    427       "relevance": "Key reference for Copilot evaluation methodology, code diversity, and maintainability metrics; used to frame the study's indicators in QP1"
    428     },
    429     {
    430       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    431       "relevance": "Peng et al. RCT study on Copilot productivity at Microsoft; foundational reference for the efficiency claim this study seeks to corroborate"
    432     },
    433     {
    434       "title": "Evaluating the Code Quality of AI-Assisted Code Generation Tools: An Empirical Study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT",
    435       "relevance": "Direct methodological precedent for code quality evaluation metrics including unit test correctness and code smells"
    436     },
    437     {
    438       "title": "Studying the effect of AI Code Generators on Supporting Novice Learners in Introductory Programming",
    439       "relevance": "Related work on novice programmers and AI tools; informs this study's focus on student participants and learning impact"
    440     },
    441     {
    442       "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions",
    443       "relevance": "Prior empirical evaluation of Copilot correctness; used as comparison baseline for the correctness findings"
    444     },
    445     {
    446       "title": "Generating Java Methods: An Empirical Assessment of Four AI-Based Code Assistants",
    447       "relevance": "Closely related evaluation of code assistants on Java method generation, the same language and task type used in this study"
    448     },
    449     {
    450       "title": "An Industry Case Study on Adoption of AI-based Programming Assistants",
    451       "relevance": "Real-world adoption study in a Brazilian company; contrasts with this student study and motivates the future work recommendation"
    452     },
    453     {
    454       "title": "'It's Weird That it Knows What I Want': Usability and Interactions with Copilot for Novice Programmers",
    455       "relevance": "Usability and interaction study of Copilot with novice developers; directly relevant to participant profile of this study"
    456     }
    457   ],
    458   "engagement_factors": {
    459     "practical_relevance": {
    460       "score": 2,
    461       "justification": "Practitioners evaluating whether to adopt Copilot will find the time savings vs no quality improvement trade-off directly useful, though the student-only sample limits applicability."
    462     },
    463     "surprise_contrarian": {
    464       "score": 1,
    465       "justification": "The finding that Copilot speeds up coding but doesn't improve quality is mildly interesting but aligns with the emerging consensus in the literature."
    466     },
    467     "fear_safety": {
    468       "score": 0,
    469       "justification": "No security, safety, or risk concerns are raised by this study."
    470     },
    471     "drama_conflict": {
    472       "score": 0,
    473       "justification": "No controversy or conflict angle; straightforward empirical evaluation."
    474     },
    475     "demo_ability": {
    476       "score": 0,
    477       "justification": "No tool, code, or demo is produced; this is a human study with results only."
    478     },
    479     "brand_recognition": {
    480       "score": 2,
    481       "justification": "GitHub Copilot is a widely recognized AI coding tool, though the paper itself is from a regional venue."
    482     }
    483   },
    484   "hn_data": {
    485     "threads": [],
    486     "top_points": 0,
    487     "total_points": 0,
    488     "total_comments": 0
    489   }
    490 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs