ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (26203B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Evaluation of the Impact of Code Generation Tools on Software Development",
      6     "authors": [
      7       "Luiz Fernando Mendes Osório",
      8       "P. D. A. S. Neto",
      9       "Guilherme Avelino",
     10       "Werney Ayala Luz Lira"
     11     ],
     12     "year": 2025,
     13     "venue": "SBSI25 (Brazilian Symposium on Information Systems)",
     14     "arxiv_id": null,
     15     "doi": "10.5753/sbsi.2025.246605"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract's two primary claims — that Copilot significantly reduces task completion time and shows no significant difference in code correctness — are directly supported by Mann-Whitney U results (p=0.0029 and p=0.866 respectively) reported in Section 5.4.1.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The abstract uses causal language ('Copilot can significantly reduce task completion time'), but Section 4.3.2 explicitly states the correlations cannot be interpreted as causal; the student-only between-subjects design with uncontrolled experience confounds does not support strong causal inference.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The conclusion states 'Copilot proved effective in reducing development time' without qualifying this to the student sample; while Section 6 acknowledges the limitation, the abstract and conclusions make unqualified claims that exceed the student-only evidence base.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 5.5 discusses that higher test failures with Copilot may result from participants introducing auxiliary methods causing inconsistencies, and that limited tool familiarity and task complexity variation may have influenced results.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Section 5.1 explicitly justifies using task completion time as a proxy for efficiency and failed unit tests as a proxy for code correctness, with citations to prior literature, and these operationalizations are consistently applied throughout the paper.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6 'Ameaças à Validade' (Threats to Validity) is a dedicated section organized into four categories: internal, external, construct, and conclusion validity.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats are named: variability in participant experience levels despite uniform training, student-only sample limiting generalization to industry, manual time recording susceptibility to errors, and limited sample size affecting robustness — these go beyond generic boilerplate.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 6 explicitly states that 'generalization to industrial development contexts or with more experienced developers is limited' since the study was conducted exclusively with students.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding acknowledgment or disclosure appears anywhere in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors' institutional affiliations (UFPI and IFPI — Brazilian public universities) are disclosed in the paper header, with no apparent ties to GitHub or Microsoft.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed; the study appears to be unfunded academic work at public universities with no industry ties to the evaluated product.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement appears anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are operationally defined: 'efficiency' is measured as task completion time, 'code correctness' as passing unit tests, and 'AI-assisted code generation tools' are described with historical context in Section 2.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper clearly states its contribution: empirical evidence on Copilot's impact on task completion time and code correctness among student developers using API-level Java tasks, filling a gap in prior literature that used simpler isolated problems.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 3 explicitly contrasts this work with prior studies, noting differences in task complexity (API-level vs. simple isolated problems), participant involvement (vs. researcher-only analysis), and inclusion of participant training before evaluation.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "The APIs used as task materials are shared via Google Drive links and the data via Google Sheets, but no analysis scripts for the statistical procedures are released.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Section 5.2 explicitly states 'Os dados completos estão disponíveis em [2]' pointing to a Google Sheets link with the full experimental dataset.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "VSCode 1.90.1 and GitHub Copilot extension 0.12 are specified, but no JDK version, Spring Boot version, Maven/Gradle version, or complete dependency specifications are provided.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step instructions for reproducing the experiment are provided; the methodology describes what was done but not how an independent researcher could replicate the setup.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No confidence intervals or error bars are reported; Table 3 presents only means, medians, and standard deviations without inferential interval estimates.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Mann-Whitney U test is applied to both outcomes after Shapiro-Wilk normality testing, with p-values reported: time p=0.0029, test failures p=0.866.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Cliff's delta is reported for both variables: δ=-0.254 (small effect) for task time and δ=-0.014 (negligible) for test failures, with effect size interpretation provided.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The study uses 49 participants with no power analysis or justification that this sample size is adequate to detect expected effect sizes; the limitations section acknowledges 'limited sample' but offers no quantification.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Table 3 reports standard deviations for both outcomes: time SD 23.05/25.03 and test failures SD 1.84/2.44 for without/with Copilot conditions.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The 'without Copilot' condition is the explicit baseline, with Table 1 showing approximately balanced responses per problem across conditions.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "The baseline (unassisted human development) is the appropriate comparison for evaluating AI tool impact on developer performance.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "Only one component (Copilot chat vs. no AI tool) is evaluated; ablation is not applicable to this single-tool comparison design.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Two metrics are used: task completion time (efficiency proxy) and number of failed unit tests (code correctness proxy), covering both process and output dimensions.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "Code quality is evaluated via automated unit tests only; no human raters assess code readability, maintainability, or other qualitative aspects of the generated code.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "This is a human-subject performance study, not a prediction task; held-out test sets are not applicable.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": false,
    217           "justification": "Table 2 shows participant distribution by education level, experience, and Spring usage, but no per-subgroup analysis of outcomes (time, test failures) by these demographic variables is presented in the results.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 5.5 discusses that Copilot users may have introduced auxiliary methods causing inconsistencies, explaining the slightly higher mean test failures in the Copilot condition.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The null result on code correctness (p=0.866, δ=-0.014) is prominently reported as a primary finding throughout the abstract, results, and conclusion sections.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "GitHub Copilot extension v0.12 is specified, but the underlying LLM snapshot used by Copilot at that time is not disclosed and is not accessible to users of the product.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Participants used Copilot's chat feature, but no example prompts, interaction guidelines, or participant instructions for how to query Copilot are documented.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": false,
    248           "answer": false,
    249           "justification": "Copilot is a black-box tool; users have no access to model hyperparameters such as temperature, making this criterion inapplicable.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "Participants used Copilot's native chat interface as a black-box tool; no custom agentic scaffolding was involved.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 4.3.1 describes that manually recorded times were cross-validated against participant screen recordings, and Shapiro-Wilk testing determined the appropriate non-parametric statistical approach.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Raw experimental data is available at a Google Sheets link [2] cited in Section 5.2, described as 'complete data.'",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 4.2.4 describes data collection procedures: manual time recording by participants, screen recording for validation, and code submission with unit test evaluation.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Section 4.2.2 states all students from three specific named courses were invited without random selection, clearly describing the convenience sample recruitment approach.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Figure 1 and Sections 4.3.1–4.3.2 document the full pipeline: collection (manual timing + screen recording) → preprocessing (video verification) → analysis (Shapiro-Wilk → Mann-Whitney U → Cliff's delta).",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "This study evaluates human developer performance using Copilot as a tool; model training cutoff is not relevant to interpreting the results.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Custom Java/Spring Boot APIs were used as task materials rather than standardized benchmarks; train/test overlap in model pretraining is not a relevant concern here.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "No standardized benchmark is used; the evaluation tasks are custom APIs created for this study, making benchmark contamination inapplicable.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No pre-registration is mentioned anywhere in the paper.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": true,
    320           "answer": false,
    321           "justification": "Section 4.2.2 mentions informed consent was obtained and participants were told of their right to withdraw, but no specific IRB or ethics committee approval number or institution is cited.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": true,
    326           "answer": true,
    327           "justification": "Table 2 reports participant distribution by education level (undergraduate vs. graduate), programming experience (>24 vs. ≤24 months), and prior Spring framework usage.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": true,
    332           "answer": false,
    333           "justification": "No explicit inclusion or exclusion criteria are stated; all students enrolled in the specified courses were invited with no selection filter beyond course enrollment.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": true,
    338           "answer": true,
    339           "justification": "Section 4.2.3 describes that problems were randomly assigned among participants to ensure balanced distribution of with/without Copilot conditions across all four tasks.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": true,
    344           "answer": false,
    345           "justification": "Participants necessarily knew whether they were using Copilot; no blinding was employed or discussed.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": true,
    350           "answer": true,
    351           "justification": "Section 5.2 explicitly notes that 184 responses were collected versus 196 expected (49 × 4), acknowledging that not all participants submitted solutions for all four tasks.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "Copilot was used free of charge through GitHub Education; this is a human-performance study where inference cost is not a relevant research dimension.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": false,
    364           "answer": false,
    365           "justification": "No significant compute budget was involved; participants used Copilot as a web-connected tool through standard IDE integration.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "GitHub Copilot significantly reduces task completion time for student developers",
    374       "evidence": "Mann-Whitney U=3148.0, p=0.0029; median time 16 min (with Copilot) vs. 23.5 min (without); Cliff's delta δ=-0.254 (small effect)",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "GitHub Copilot does not significantly improve code correctness as measured by unit test failures",
    379       "evidence": "Mann-Whitney U=4075.0, p=0.866; median failed tests identical (1.0 in both conditions); Cliff's delta δ=-0.014 (negligible effect)",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Human oversight remains essential when using Copilot to maintain code quality",
    384       "evidence": "Mean test failures slightly higher with Copilot (1.65 vs. 1.48); discussion in Section 5.5 notes Copilot 'requires the developer's critical analysis to maintain desired quality'",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "This study is more ecologically valid than prior work due to API-level task complexity and participant training",
    389       "evidence": "Section 3 argues prior studies use isolated simple problems without developer involvement; this study uses Java/Spring Boot API reconstruction tasks with 8 hours of pre-study training",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "Task completion time variance is similar between Copilot and non-Copilot conditions",
    394       "evidence": "Standard deviation 25.03 (with Copilot) vs. 23.05 (without), but test failures show higher variance with Copilot (2.44 vs. 1.84)",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "methodology_tags": [
    399     "rct",
    400     "case-study"
    401   ],
    402   "key_findings": "Among 49 student developers completing Java/Spring Boot API tasks, GitHub Copilot (chat-only mode) significantly reduced task completion time (median 16 vs. 23.5 minutes, Mann-Whitney p=0.0029, Cliff's delta -0.254 small effect) but showed no significant impact on code correctness measured by unit test failures (p=0.866, delta -0.014 negligible). The study found efficiency gains without corresponding quality improvements, with slightly higher mean test failures in the Copilot condition (1.65 vs. 1.48) attributed to participants incorrectly introducing auxiliary methods. Results are limited to students using Copilot's chat interface with autocomplete disabled in an academic Java/Spring Boot setting.",
    403   "red_flags": [
    404     {
    405       "flag": "Student-only sample with broad conclusions",
    406       "detail": "All 49 participants are students; conclusions state 'Copilot proved effective in reducing development time' without consistently qualifying this to the student population, despite the threats section acknowledging generalization limits."
    407     },
    408     {
    409       "flag": "Non-standard Copilot configuration",
    410       "detail": "Autocomplete was disabled so participants could only use Copilot chat — this differs substantially from typical Copilot usage in industry, where autocomplete is the primary interaction mode, limiting ecological validity."
    411     },
    412     {
    413       "flag": "No power analysis for small sample",
    414       "detail": "With 49 participants and small effect sizes (Cliff's delta -0.254), the study is likely underpowered to detect subgroup differences or interactions; no power analysis is presented or discussed."
    415     },
    416     {
    417       "flag": "Manual time recording",
    418       "detail": "Task times were self-reported manually by participants, introducing recall and reporting bias; video cross-validation mitigates but does not eliminate this concern."
    419     },
    420     {
    421       "flag": "Prompts not documented",
    422       "detail": "Participants used Copilot chat throughout the experiment but no example prompts, prompt strategies, or interaction guidelines are documented, making the human-AI interaction non-reproducible."
    423     },
    424     {
    425       "flag": "No pre-registration",
    426       "detail": "The study was not pre-registered; while both positive and null results were reported, the absence of pre-specification leaves open the possibility of outcome selection."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    432       "relevance": "Peng et al. 2023 — large controlled study measuring Copilot productivity impact in professional settings; the primary benchmark comparison for this student-focused replication"
    433     },
    434     {
    435       "title": "GitHub Copilot AI pair programmer: Asset or Liability?",
    436       "relevance": "Moradi Dakhel et al. 2023 — empirical assessment of Copilot code quality, correctness, and diversity; directly cited for evaluation indicators used in this study"
    437     },
    438     {
    439       "title": "Evaluating the Code Quality of AI-Assisted Code Generation Tools: An Empirical Study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT",
    440       "relevance": "Yetistiren et al. 2023 — multi-tool evaluation using unit tests and code smells; provides the code correctness measurement framework adopted here"
    441     },
    442     {
    443       "title": "Studying the effect of AI Code Generators on Supporting Novice Learners in Introductory Programming",
    444       "relevance": "Kazemitabaar et al. CHI 2023 — closely comparable study of Copilot with novice learners in educational settings"
    445     },
    446     {
    447       "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions",
    448       "relevance": "Nguyen & Nadi MSR 2022 — early evaluation of Copilot suggestion correctness; foundational related work on Copilot quality"
    449     },
    450     {
    451       "title": "Generating Java Methods: An Empirical Assessment of Four AI-Based Code Assistants",
    452       "relevance": "Corso et al. ICPC 2024 — Java-specific evaluation of AI code assistants comparable to this study's Java/Spring Boot focus"
    453     },
    454     {
    455       "title": "An Industry Case Study on Adoption of AI-based Programming Assistants",
    456       "relevance": "Davila et al. ICSE-SEIP 2024 — industry-context perspective on AI coding tool adoption in Brazilian companies, contrasting with this academic setting"
    457     }
    458   ],
    459   "engagement_factors": {
    460     "practical_relevance": {
    461       "score": 2,
    462       "justification": "Quantifies GitHub Copilot's time savings (about 32% median reduction, 16 vs. 23.5 minutes) and null quality impact with statistical rigor, directly informing practitioner decisions on AI tool adoption."
    463     },
    464     "surprise_contrarian": {
    465       "score": 1,
    466       "justification": "The mixed result (efficiency gains without quality improvement) partially challenges productivity-optimistic Copilot marketing, but this pattern is well-established in prior literature."
    467     },
    468     "fear_safety": {
    469       "score": 0,
    470       "justification": "No AI safety, security, or risk concerns are raised; the study focuses narrowly on developer performance metrics in an educational setting."
    471     },
    472     "drama_conflict": {
    473       "score": 1,
    474       "justification": "Slight tension with Copilot marketing around code quality claims, but the paper is balanced and not provocative in framing."
    475     },
    476     "demo_ability": {
    477       "score": 1,
    478       "justification": "GitHub Copilot is freely accessible to students through GitHub Education, but replicating the specific experiment requires Java/Spring Boot APIs and a student cohort."
    479     },
    480     "brand_recognition": {
    481       "score": 2,
    482       "justification": "GitHub Copilot is one of the most widely recognized AI coding tools, driving practitioner interest in any empirical performance evaluation."
    483     }
    484   },
    485   "hn_data": {
    486     "threads": [],
    487     "top_points": 0,
    488     "total_points": 0,
    489     "total_comments": 0
    490   }
    491 }

Impressum · Datenschutz