scan.json (26499B)
1 { 2 "paper": { 3 "title": "Code with Me or for Me? How Increasing AI Automation Transforms Developer Workflows", 4 "authors": ["Valerie Chen", "Ameet Talwalkar", "Robert Brennan", "Graham Neubig"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2507.08149" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper provides analysis scripts at https://github.com/valeriechen/copilot-agent-comparison (footnote 1, Section 3.6)." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper does not release participant interaction data, screen recordings, survey responses, or telemetry data. Only analysis scripts are provided, not the underlying data." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. The GitHub repository link is given but the paper itself does not describe environment setup for reproducing the analysis." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "While analysis scripts are released, there are no step-by-step reproduction instructions in the paper. The paper does not provide a README or instructions for replicating the analysis." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "Standard errors are reported for main results (e.g., task correctness: 'SE = 10%' and 'SE = 11%'; user effort: 'SE = 6.22' and 'SE = 2.8' in Section 4.1). Error bars are shown in Figure 4." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "Statistical significance tests are reported: p-values for task correctness (p = 0.02), user effort (p = 0.01), and Wilcoxon Signed-Rank Tests for Likert responses (e.g., C1: p = 0.0013, C2: p = 0.0006). Section 3.6 describes the analysis approach." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Effect sizes are reported with baseline context: '35% increase in task correctness' (from 25% to 60%), 'about 50% the user effort' (25.1 min vs 12.5 min) in Section 4.1. Mean and SE provided for both conditions." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No power analysis or sample size justification is provided. The paper recruited 20 participants without explaining why this number was chosen or acknowledging it may be insufficient for some analyses." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Standard errors are reported for task correctness and user effort metrics (Section 4.1), and error bars are shown in Figure 4. Breakdown analyses in Figures 8 and 9 also show error bars." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The study directly compares GitHub Copilot (copilot baseline) against OpenHands (agent). This is the core study design—a within-participant comparison of two tools (Section 3)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "GitHub Copilot is a current, widely-used tool, and OpenHands is described as a 'state-of-the-art, open-source coding agent' (Section 3.2). Both represent contemporary tools as of April-May 2025." 
69 }, 70 "ablation_study": { 71 "applies": false, 72 "answer": false, 73 "justification": "This is a controlled user study comparing two complete tools, not a system with separable components. An ablation study is not structurally applicable." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple metrics are used: task correctness rate, user effort (time), six Likert-scale user experience comparisons (C1-C6), and qualitative analysis of interaction trajectories (Section 3.5)." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "This is a human user study. Participants directly evaluated both tools through task performance and post-study surveys. Task correctness was evaluated based on submitted code (Section 3.5)." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a user study, not a machine learning evaluation. The concept of held-out test sets does not apply." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Appendix D provides breakdowns by task type (data analysis, fix bug, add feature), programming experience, copilot usage frequency, and ordering effect (Figures 8 and 9)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses where agents underperform: bug-fixing tasks are hardest (Section 4.1), users report poor understanding of agent outputs (C6), agents being overly proactive (Section 5.1), and multiple user experience limitations." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Several negative results are reported: agents were not preferred for output understanding (C6, 55% preferred copilot), no significant difference on satisfaction (C5, p = 0.26), flow (C4, p = 0.39), or speed (C3, p = 0.09). 
Participants would still prefer GitHub Copilot overall (F2, p = 0.04)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims agents 'can assist developers in ways that surpass copilots' (supported by 35% task correctness increase), 'reduce the effort required' (supported by ~50% reduction in user effort), and 'challenges remain' (supported by C5, C6, and discussion). All claims are supported in Sections 4.1-4.3." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The study uses a within-participant controlled design with randomized ordering (Section 3.4), which is adequate for causal claims about tool effects on productivity. Participants solve same-type tasks with both tools. Causal language like 'agents tend to enable users to complete more tasks' is justified by the controlled comparison." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 5.3 (Limitations) explicitly bounds generalization: results are for one copilot and one agent, student participants, novice agent users, Python tasks, and a specific point in time (April-May 2025). The paper notes findings 'may not generalize to developers who are working on lower-resource programming languages.'" 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 5.3 discusses alternative explanations: uncontrolled LLM choice in copilot phase, novelty effects from first-time agent use, student population bias, and task scope limitations. Appendix D analyzes ordering effects. Section 5.2 compares findings to prior work." 
126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper states OpenHands was 'powered by Claude Sonnet 3.7' and that Copilot users could choose from 'GPT-4o, Claude Sonnet 3.7, Gemini 2.5 Pro, and o3-mini' (Section 3.2). These are marketing names without snapshot dates or API version identifiers." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "The study evaluates third-party tools (GitHub Copilot and OpenHands) as black boxes. The authors did not design prompts—participants wrote their own messages to the tools. The system prompts of these tools are not under the authors' control." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for either GitHub Copilot or OpenHands. The OpenHands versions (0.35-0.38) are stated in Appendix A.2, but inference parameters are not." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "The paper evaluates third-party tools (GitHub Copilot and OpenHands) as black boxes. While OpenHands's tools are briefly listed (bash shell, Jupyter, browser, file tool in Section 3.2), the authors cannot be expected to fully describe internal scaffolding they do not control." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3.5 describes data collection methods. Section 3.6 explains the analysis approach: how user effort is computed from event streams, how correctness is evaluated, and how Likert responses are analyzed using Wilcoxon Signed-Rank Tests." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 5.3 is a dedicated Limitations section spanning approximately two paragraphs with substantive discussion of multiple specific limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 5.3 discusses specific threats: single instantiation of each tool type, uncontrolled LLM choice in copilot phase, student participant pool, limited number of tasks, 40-minute time constraint not representative of real workflows, task selection bias toward popular languages, and temporal limitation (April-May 2025)." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly states boundaries: 'our findings are primarily representative of those who are novice coding agent users,' 'findings on the effect of agents on productivity may not generalize to developers who are working on lower-resource programming languages,' and 'our study findings are only representative of the point in time when the study was completed' (Section 5.3)." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "Raw data (interaction logs, survey responses, screen recordings, code submissions) is not released. Only analysis scripts are provided at the GitHub repository." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 3.5 describes four data collection methods: user trajectories (screen recordings for copilot, event streams for agent), final code snapshots, Likert responses, and qualitative responses. Appendix C provides full study instructions." 
182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 3.1 states participants were 'recruited via university mailing lists' with inclusion criteria: 'must have access to and use GitHub Copilot regularly and have experience programming in Python.' Demographics are reported (experience levels, Python proficiency, AI tool usage)." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3.6 documents how user effort was computed from event streams (time between agent actions and user messages), how task correctness was evaluated, and how Likert data was analyzed. The pipeline from data collection to analysis is described." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "The Acknowledgements section thanks participants and colleagues but does not mention any funding source, grants, or sponsors." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: Carnegie Mellon University and All Hands AI. All Hands AI is the company behind OpenHands, which is one of the tools being evaluated. This affiliation is visible in the author list." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding is disclosed, so independence cannot be assessed. However, two authors (Valerie Chen and Graham Neubig) are affiliated with All Hands AI, the maker of OpenHands, which is one of the tools being evaluated. This creates a clear conflict of interest that is not explicitly acknowledged." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial disclosures statement is present in the paper. 
Given the All Hands AI affiliation of two authors evaluating OpenHands, this is a notable omission." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This is a user study evaluating developer interactions with tools, not a benchmark evaluation of model capabilities. Contamination is not the relevant concern here." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "This is a user study, not a benchmark evaluation of pre-trained model capabilities. Train/test overlap is not applicable." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "This is a user study, not a benchmark evaluation. While tasks were sourced from GAIA and SWE-Bench, the evaluation is of human-AI collaboration, not model capability on benchmark questions." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": true, 236 "answer": false, 237 "justification": "No pre-registration link or mention of pre-registration is found in the paper." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": true, 241 "answer": false, 242 "justification": "No mention of IRB or ethics board approval is found in the paper, despite collecting data from 20 human participants including screen recordings." 243 }, 244 "demographics_reported": { 245 "applies": true, 246 "answer": true, 247 "justification": "Section 3.1 reports: 60% had 3-5 years of professional programming experience, 25% had 0-2 years, 15% had 6-10 years. 95% rated themselves intermediate to advanced in Python. All used GitHub Copilot at least weekly." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": true, 251 "answer": true, 252 "justification": "Section 3.1 states inclusion criteria: 'must have access to and use GitHub Copilot regularly and have experience programming in Python.' 
Participants were students recruited via university mailing lists." 253 }, 254 "randomization_described": { 255 "applies": true, 256 "answer": true, 257 "justification": "Section 3.4 describes randomization: 'We randomize the order in which participants interact with either [tool]' and 'We use a between-subjects set-up to randomize task types.' Figure 3 illustrates the counterbalanced design." 258 }, 259 "blinding_described": { 260 "applies": true, 261 "answer": false, 262 "justification": "Blinding is not described. Participants knew which tool they were using (GitHub Copilot vs OpenHands), which is inherent to the study design. However, evaluator blinding for task correctness grading is not mentioned." 263 }, 264 "attrition_reported": { 265 "applies": true, 266 "answer": false, 267 "justification": "The paper states 20 participants were recruited but does not report whether all 20 completed both phases, nor any dropout or exclusion of participant data." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No API costs or token usage are reported for either GitHub Copilot or OpenHands, despite both making substantial LLM API calls during the study." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget is stated. The paper does not report API spend, hosting costs for OpenHands, or other compute requirements." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Agents improve task correctness rate by 35 percentage points compared to copilots (25% with copilot vs 60% with agent).", 286 "evidence": "Section 4.1, Figure 4: 'we observe a 35% increase in task correctness when users have access to agents as compared to copilots (μ = 25%, SE = 10% compared to μ = 60%, SE = 11% respectively). 
We find that these productivity improvements are significant (p = 0.02).'", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Agents reduce user effort to about 50% of copilot effort (25.1 min vs 12.5 min).", 291 "evidence": "Section 4.1: 'We find a significant difference in user effort between the time spent using copilots and agents (p = 0.01), where the former took 25.1 minutes of a user's time on average (SE = 6.22) and the latter only took 12.5 minutes of a user's time (SE = 2.8).'", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Agents significantly reduce cognitive load compared to copilots.", 296 "evidence": "Section 4.2, Table 2: C2 cognitive load comparison with p = 0.0006, 75% agree/strongly agree, median = 4.0, mean = 3.95.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Agents enable users to accomplish new tasks they could not with copilots.", 301 "evidence": "Section 4.2, Table 2: C1 with p = 0.0013, 70% agree/strongly agree, median = 5.0, mean = 4.25. 
'Some tasks, such as both of the data analysis problems, were only completed by participants when they had access to agents.'", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Users have better understanding of copilot outputs than agent outputs.", 306 "evidence": "Section 4.2, Table 2: C6 with 55% disagreeing (preferring copilot), median = 2.0, mean = 2.45, p = 0.07 (not significant at alpha = 0.05).", 307 "supported": "weak" 308 }, 309 { 310 "claim": "Despite higher productivity with agents, participants would still prefer to use GitHub Copilot over OpenHands.", 311 "evidence": "Section 5.1, Table 3: F2 'Moving forward, I would continue to use OpenHands over Github Copilot' with 60% disagreeing, p = 0.04.", 312 "supported": "moderate" 313 } 314 ], 315 "methodology_tags": ["rct", "qualitative"], 316 "key_findings": "In a within-participant controlled study with 20 student developers, coding agents (OpenHands) improved task correctness by 35 percentage points (from 25% to 60%) over copilots (GitHub Copilot) and reduced user effort by approximately 50%. However, agents were only significantly preferred on 2 of 6 user experience dimensions (cognitive load and ability to accomplish new tasks). Participants still preferred copilots overall for future use, citing better output understanding and control. The study identifies three design desiderata for coding agents: transparency, balanced proactivity, and effective leverage of human effort.", 317 "red_flags": [ 318 { 319 "flag": "Conflict of interest: authors affiliated with evaluated tool", 320 "detail": "Two authors (Valerie Chen and Graham Neubig) are affiliated with All Hands AI, the company behind OpenHands, which is one of the two tools being evaluated. This conflict is not explicitly acknowledged in the paper, and there is no competing interests statement." 321 }, 322 { 323 "flag": "Small sample size without justification", 324 "detail": "The study uses only 20 student participants with no power analysis or sample size justification. 
With N=20 and a within-participant design, subgroup analyses (by task type, experience, etc.) have very small cell sizes, limiting the reliability of breakdown analyses in Appendix D." 325 }, 326 { 327 "flag": "No IRB/ethics approval mentioned", 328 "detail": "The study collects screen recordings, interaction logs, and survey data from 20 human participants but does not mention IRB or ethics board approval." 329 }, 330 { 331 "flag": "Uncontrolled LLM choice in copilot condition", 332 "detail": "Participants could choose any LLM in the GitHub Copilot chat panel (GPT-4o, Claude Sonnet 3.7, Gemini 2.5 Pro, o3-mini), while OpenHands was fixed to Claude Sonnet 3.7. This introduces a confound—differences may be due to model choice rather than tool type." 333 }, 334 { 335 "flag": "User effort measured differently across conditions", 336 "detail": "User effort for copilots is total time (start to end), while for agents it is only the time spent writing instructions. This asymmetric measurement makes direct comparison problematic, as acknowledged in the paper (total agent time including autonomous work was 27.9 min, comparable to copilot's 25.1 min)." 337 }, 338 { 339 "flag": "Novice agent users vs experienced copilot users", 340 "detail": "All participants were experienced copilot users but had no prior agent experience. Novelty effects or learning curves could explain some results. The paper acknowledges this but the confound limits interpretation." 341 } 342 ], 343 "cited_papers": [ 344 { 345 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 346 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 347 "year": 2024, 348 "relevance": "Describes the coding agent platform evaluated in this study; relevant to agentic AI software engineering." 349 }, 350 { 351 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 352 "authors": ["Carlos E. 
Jimenez", "John Yang", "Alexander Wettig"], 353 "year": 2023, 354 "arxiv_id": "2310.06770", 355 "relevance": "The benchmark from which bug-fixing tasks were sourced; central to evaluating coding agents." 356 }, 357 { 358 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 359 "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"], 360 "year": 2023, 361 "arxiv_id": "2302.06590", 362 "relevance": "Prior controlled study of GitHub Copilot's productivity impact; key baseline for comparison." 363 }, 364 { 365 "title": "Reading Between the Lines: Modeling User Behavior and Costs in AI-Assisted Programming", 366 "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"], 367 "year": 2024, 368 "relevance": "Developed CUPS taxonomy for programmer-AI interactions; directly relevant to understanding developer workflows with AI tools." 369 }, 370 { 371 "title": "The RealHumanEval: Evaluating Large Language Models' Abilities to Support Programmers", 372 "authors": ["Hussein Mozannar", "Valerie Chen", "Mohammed Alsobay"], 373 "relevance": "Demonstrated that benchmark performance may not correlate with downstream user utility for coding assistants." 374 }, 375 { 376 "title": "GAIA: A Benchmark for General AI Assistants", 377 "authors": ["Grégoire Mialon", "Clémentine Fourrier", "Thomas Wolf"], 378 "year": 2023, 379 "relevance": "Source of data analysis tasks used in the study; evaluates AI agents as general-purpose assistants." 380 }, 381 { 382 "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models", 383 "authors": ["Shraddha Barke", "Michael B. James", "Nadia Polikarpova"], 384 "year": 2023, 385 "relevance": "Introduced acceleration vs exploration framework for understanding developer-AI interactions." 
386 }, 387 { 388 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 389 "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"], 390 "year": 2025, 391 "arxiv_id": "2507.09089", 392 "relevance": "Contemporary study measuring AI productivity impact on developers; directly comparable methodology." 393 }, 394 { 395 "title": "The Effects of Generative AI on High-Skilled Work: Evidence from Three Field Experiments with Software Developers", 396 "authors": ["Zheyuan Kevin Cui", "Mert Demirer", "Sonia Jaffe"], 397 "year": 2025, 398 "relevance": "Field experiments on AI's effect on developer productivity; relevant methodological comparison." 399 }, 400 { 401 "title": "Vibe Coding: Programming Through Conversation with Artificial Intelligence", 402 "authors": ["Advait Sarkar", "Ian Drosos"], 403 "year": 2025, 404 "arxiv_id": "2506.23253", 405 "relevance": "Defines the vibe-coding paradigm discussed in relation to agentic workflows." 406 }, 407 { 408 "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents are Reshaping Software Engineering", 409 "authors": ["Hao Li", "Haoxiang Zhang", "Ahmed E. Hassan"], 410 "year": 2025, 411 "arxiv_id": "2507.15003", 412 "relevance": "Tracks adoption of coding agents on GitHub; complementary to this study's controlled evaluation." 413 }, 414 { 415 "title": "Collaborative Gym: A Framework for Enabling and Evaluating Human-Agent Collaboration", 416 "authors": ["Yijia Shao", "Vinay Samuel", "Yucheng Jiang"], 417 "year": 2024, 418 "arxiv_id": "2412.15701", 419 "relevance": "Framework for evaluating human-agent collaboration, addressing the gap in interactive agent evaluation." 
420 }, 421 { 422 "title": "Interactive Agents to Overcome Ambiguity in Software Engineering", 423 "authors": ["Sanidhya Vijayvargiya", "Xuhui Zhou", "Akhila Yerukola"], 424 "year": 2025, 425 "arxiv_id": "2502.13069", 426 "relevance": "Addresses interactive agent evaluation with synthetic users for software engineering tasks." 427 } 428 ] 429 }