scan-v5.json (27266B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Are Coding Agents Generating Over-Mocked Tests? An Empirical Study", 6 "authors": [ 7 "Andre Hora", 8 "Romain Robbes" 9 ], 10 "year": 2026, 11 "venue": "MSR '26", 12 "arxiv_id": "2602.00409", 13 "doi": "10.1145/3793302.3793362" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "All quantitative claims in the abstract (60%, 23% vs 13%, 68%, 36% vs 26%, 95% mock type concentration) are directly backed by contingency tables and statistical tests in Section 3.", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": false, 25 "answer": false, 26 "justification": "The paper makes observational association claims ('more likely to') rather than causal claims; the study design is a mining study without intervention and the authors consistently use correlational language throughout.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": true, 32 "justification": "Section 5 explicitly states 'our findings cannot be directly generalized to repositories written in other languages or using other agents,' bounding scope to Python, JavaScript, and TypeScript in 2025.", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper does not discuss key alternative explanations for why agents mock more — e.g., selection effects (agent-adopting repos may have more complex code requiring more mocking) or developer-preference confounds; only the 'easier to generate automatically' hypothesis is briefly proposed without evaluation.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "The paper clearly distinguishes between what is measured (presence of mock identifiers in test commit diffs, validated at 94% 
precision) and what is claimed (mocking frequency tendencies of coding agents), and does not conflate commit counts with test quality.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": true, 52 "justification": "Section 5 'Threats to Validity' provides a dedicated limitations discussion covering detection precision, agent commit attribution, and generalization.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": true, 58 "justification": "Specific threats are quantified: 94% precision for mock detection (manually inspected 100 commits across 10 repositories), 100% precision for agent commit detection (500 manually inspected commits), and handling of Co-Authored-By variant casing.", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper explicitly bounds scope to three languages, three specific coding agents, commits from 2025, and repositories meeting stated criteria (≥100 commits, ≥5,000 non-blank LOC, not forks, recently active).", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": true, 72 "justification": "Acknowledgments disclose funding from CNPq grants (408817/2024-0 and 403304/2025-3), CAPES, FAPEMIG, INES.IA, and the French State/IdEx université de Bordeaux.", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Author affiliations are clearly stated: Hora at UFMG (Brazil) and Robbes at Univ. 
Bordeaux, CNRS, Bordeaux INP, LaBRI (France).", 79 "source": "haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": true, 83 "answer": true, 84 "justification": "Funding comes from government and academic agencies (CNPq, CAPES, French State) with no affiliation to the coding agent companies (Anthropic, GitHub, Cursor) whose products are studied.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is included anywhere in the paper.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "Key terms are explicitly defined: 'coding agents' (Section 2.1.1 — autonomous tools that invoke external tools, execute code, and author commits), 'test doubles/mocks' (Section 2.6 — Meszaros taxonomy: dummy, stub, spy, mock, fake), and 'agent commits', 'test commits', 'mock commits' are operationally defined.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "Contributions are explicitly stated: '(1) the first empirical study to analyze agent-generated tests in real-world software systems; and (2) multiple actionable implications for practitioners and researchers.'", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 6 engages substantively with prior work on coding agents (Becker et al., Kumar et al., Bouzenia & Pradel), LLM-generated test quality (Alshahwan et al., Ouédraogo et al.), and mocking practices (Spadini et al., Qin 2025), positioning this study as the first to examine mocking in agent-generated code at scale in the wild.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 
"code_released": { 119 "applies": true, 120 "answer": true, 121 "justification": "Section 2.7 explicitly states 'Our scripts and dataset are publicly available at: https://doi.org/10.5281/zenodo.17427638.'", 122 "source": "haiku" 123 }, 124 "data_released": { 125 "applies": true, 126 "answer": true, 127 "justification": "The dataset (commits and repository metadata) is publicly available on Zenodo at the stated DOI.", 128 "source": "haiku" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper mentions using PyDriller and GitEvo but provides no requirements.txt, Dockerfile, or specific version numbers for any dependency.", 134 "source": "haiku" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper provides a Zenodo link for scripts but includes no step-by-step reproduction instructions; reproducing the pipeline would require inferring the full workflow from the methodology description.", 140 "source": "haiku" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": true, 146 "answer": false, 147 "justification": "No confidence intervals or error bars are reported for any headline percentages (23%, 36%, etc.); only Chi-squared statistics, p-values, and Cliff's delta are provided.", 148 "source": "haiku" 149 }, 150 "significance_tests": { 151 "applies": true, 152 "answer": true, 153 "justification": "Chi-squared tests of independence are applied for commit-level analyses in RQ1 and RQ2; paired Wilcoxon tests (a non-parametric test, chosen after Shapiro-Wilk and D'Agostino normality checks) are used for repository-level comparisons.", 154 "source": "haiku" 155 }, 156 "effect_sizes_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Cliff's delta effect sizes are reported for both repository-level comparisons: negligible for lower agentic activity repositories and small (0.252) for higher agentic 
activity.", 160 "source": "haiku" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "No power analysis or principled sample size justification is provided; the sample of 2,168 repositories emerges from SEART selection criteria rather than any prospective sizing calculation.", 166 "source": "haiku" 167 }, 168 "variance_reported": { 169 "applies": true, 170 "answer": false, 171 "justification": "Table 10 reports medians for repository-level mock commit ratios but provides no standard deviations, interquartile ranges, or other spread measures for any result.", 172 "source": "haiku" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": true, 179 "justification": "Non-agent commits serve as the direct baseline throughout all three RQs, with explicit agent vs. non-agent proportions in every contingency table.", 180 "source": "haiku" 181 }, 182 "baselines_contemporary": { 183 "applies": true, 184 "answer": true, 185 "justification": "Non-agent commits are drawn from the same repositories and same time period (2025) as agent commits, making comparisons directly contemporary.", 186 "source": "haiku" 187 }, 188 "ablation_study": { 189 "applies": false, 190 "answer": false, 191 "justification": "This is an observational mining study, not a system design paper; ablation analysis is not applicable.", 192 "source": "haiku" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": true, 197 "justification": "The paper uses commit-level ratios, repository-level proportions, Chi-squared statistics with standardized residuals, Wilcoxon p-values, Cliff's delta, and mock type distribution across all five test double categories.", 198 "source": "haiku" 199 }, 200 "human_evaluation": { 201 "applies": true, 202 "answer": true, 203 "justification": "Authors manually inspected 500 agent commits to validate classifier precision (100%) and 100 randomly selected mock commits across 10 
repositories to validate mock detection precision (94%).", 204 "source": "haiku" 205 }, 206 "held_out_test_set": { 207 "applies": false, 208 "answer": false, 209 "justification": "This is an observational mining study, not a prediction task; held-out test sets are not applicable.", 210 "source": "haiku" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": true, 215 "justification": "Results are broken down by programming language (Python vs JS/TS) in Tables 5, 8, and 10, and by individual coding agent (Claude, Copilot, Cursor) in Tables 5 and 8.", 216 "source": "haiku" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": false, 221 "justification": "The browser-use example where agents added mocks despite explicit configuration to the contrary is a descriptive observation, not a systematic discussion of failure modes or when the methodology breaks down.", 222 "source": "haiku" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": true, 227 "justification": "The paper reports null results: no notable language difference in mock rates (Python 37% vs JS/TS 35%), and negligible Cliff's delta for lower-agentic-activity repositories despite a statistically significant Wilcoxon result.", 228 "source": "haiku" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": false, 234 "answer": false, 235 "justification": "This is a mining study that does not run LLM inference; the specific versions of Claude Code, Copilot, and Cursor active during studied commits cannot be determined from commit metadata and are not reported.", 236 "source": "haiku" 237 }, 238 "prompts_provided": { 239 "applies": false, 240 "answer": false, 241 "justification": "No LLMs are invoked in the authors' analysis pipeline; the study mines existing commit data rather than querying models.", 242 "source": "haiku" 243 }, 244 "hyperparameters_reported": { 245 "applies": false, 246 "answer": false, 247 
"justification": "No models are run by the authors; hyperparameters are not applicable to a repository mining study.", 248 "source": "haiku" 249 }, 250 "scaffolding_described": { 251 "applies": false, 252 "answer": false, 253 "justification": "The authors do not deploy agentic scaffolding; they analyze traces left by existing coding agents in real repositories.", 254 "source": "haiku" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": true, 259 "justification": "Preprocessing is thoroughly documented: SEART selection criteria (Section 2.2), agent file detection patterns (Table 1), commit author/co-author matching logic (Section 2.4), test file patterns (Table 2), mock identifier detection rules (Section 2.6.1), and mock commit classification (Section 2.6.2) are all fully specified.", 260 "source": "haiku" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": true, 267 "justification": "The dataset is publicly available at Zenodo (doi.org/10.5281/zenodo.17427638) as explicitly stated in Section 2.7.", 268 "source": "haiku" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "Data collection is thoroughly described: SEART tool and selection criteria, filtering from 114,098 to 2,168 repositories, cloning for agent file detection, and commit metadata parsing for all three classification steps.", 274 "source": "haiku" 275 }, 276 "recruitment_methods_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants; this is a repository mining study using automated collection from GitHub via the SEART tool.", 280 "source": "haiku" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 284 "answer": true, 285 "justification": "The full pipeline is documented across Sections 2.2–2.7: SEART selection → language/agent filter → agent commit detection → test commit detection → mock commit detection → RQ 
analysis, including the tools used (PyDriller, GitEvo).", 286 "source": "haiku" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": false, 292 "answer": false, 293 "justification": "This is not a benchmark evaluation of model capabilities; no models are evaluated on test sets.", 294 "source": "haiku" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": false, 298 "answer": false, 299 "justification": "Not applicable; this is a mining study, not a model capability evaluation.", 300 "source": "haiku" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": false, 304 "answer": false, 305 "justification": "Not applicable; no benchmarks are used for model evaluation.", 306 "source": "haiku" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": false, 312 "answer": false, 313 "justification": "No human participants.", 314 "source": "haiku" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants.", 320 "source": "haiku" 321 }, 322 "demographics_reported": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "haiku" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants; inclusion/exclusion criteria apply to repositories and are fully documented in Section 2.2.", 332 "source": "haiku" 333 }, 334 "randomization_described": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "haiku" 339 }, 340 "blinding_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants.", 344 "source": "haiku" 345 }, 346 "attrition_reported": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "haiku" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 
"applies": false, 356 "answer": false, 357 "justification": "No LLM inference is performed by the authors; this is a repository mining study.", 358 "source": "haiku" 359 }, 360 "compute_budget_stated": { 361 "applies": true, 362 "answer": false, 363 "justification": "The paper does not report the computational cost of cloning and analyzing 2,168 repositories and 1.2 million commits, which is non-trivial.", 364 "source": "haiku" 365 } 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "23% of commits made by coding agents add or modify test files, compared with 13% for non-agents", 372 "evidence": "Table 4: 11,035/48,563 agent commits are test commits vs 158,326/1,206,315 non-agent commits; Chi-squared = 3,683.06, p < 0.001, standardized residual = 55.35", 373 "supported": "strong" 374 }, 375 { 376 "claim": "60% of repositories with agent activity also contain agent test activity", 377 "evidence": "Table 4 and Section 3.1: 729 out of 1,219 repositories with agent commits also contain agent test commits", 378 "supported": "strong" 379 }, 380 { 381 "claim": "36% of test commits made by coding agents add mocks, compared with 26% for non-agents", 382 "evidence": "Table 7: 3,934/11,035 agent test commits are mock commits vs 40,966/158,326 non-agent test commits; Chi-squared = 505.5, p < 0.001", 383 "supported": "strong" 384 }, 385 { 386 "claim": "In repositories with higher agentic activity (≥50 agent commits), agents have a significantly higher mock ratio (36%) than non-agents (28%) with small effect size", 387 "evidence": "Table 10b: Wilcoxon p < 0.001, Cliff's delta = 0.252 across 179 repositories; lower-agentic-activity repos show negligible effect despite statistical significance", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Coding agents predominantly use the 'mock' type (95%) while non-agents use a wider variety — fake (57%), spy (51%), mock (91%)", 392 "evidence": "Figure 5: Distribution of mock types across 496 repositories with agent mock 
activity; agents are concentrated on the generic mock type while non-agents show broader distribution", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Repositories created in 2025 show a higher share of agent test commits (17%) and mock commits (19%) compared to the full dataset (7% and 9%)", 397 "evidence": "Tables 6 and 9: For 2025-created repos, 4,526/26,654 test commits are agent commits (17%) and 1,529/7,855 mock commits are agent commits (19%)", 398 "supported": "strong" 399 }, 400 { 401 "claim": "Mock-related instructions in agent configuration files are far less common than test instructions, suggesting a guidance gap", 402 "evidence": "Table 12: GitHub Code Search finds 13k CLAUDE.md files with 'mock' vs 102k with 'test' out of 112k total; causal link between guidance and behavior not established", 403 "supported": "weak" 404 } 405 ], 406 "methodology_tags": [ 407 "observational", 408 "case-study" 409 ], 410 "key_findings": "Coding agents are significantly more likely to modify test files (23% vs 13% of commits) and add mocks to those tests (36% vs 26%) than non-agent contributors, with both differences statistically significant (p < 0.001) and the mock difference confirmed in a paired within-repository analysis (Cliff's delta = 0.252, small). Agents show markedly less diversity in test double types, relying almost exclusively on the generic 'mock' type (95%) compared to non-agents who also commonly use 'fake' (57%) and 'spy' (51%). The proportion of agent-generated tests and mocks is growing rapidly, accounting for 17–19% of recently created repositories' test/mock commits vs 7–9% overall. 
The paper finds that mock guidance in agent configuration files (e.g., CLAUDE.md) is uncommon, and agents occasionally add mocks even in repositories that explicitly prohibit it, suggesting configuration-based guidance has limited enforcement.", 411 "red_flags": [ 412 { 413 "flag": "Title implies quality judgment not demonstrated", 414 "detail": "The paper establishes that agents mock more frequently but cannot demonstrate this constitutes 'over-mocking' — no assessment of mock appropriateness, test effectiveness, bug-detection rates, or maintenance cost is included; the normative claim in the title exceeds the observational evidence." 415 }, 416 { 417 "flag": "Selection confound not fully addressed", 418 "detail": "Repositories adopting coding agents may systematically differ in type (newer projects, higher complexity, specific domains) creating selection effects that independently explain higher mocking rates; the paired within-repository analysis partially mitigates this but developer-preference confounds remain (agent-adopting developers may already favor mocking)." 419 }, 420 { 421 "flag": "Agent versions not tracked", 422 "detail": "Specific versions of Claude Code, Copilot, and Cursor active during the studied commits are not identified; since model updates change agent behavior rapidly, findings may not reflect current or future agent behavior." 423 }, 424 { 425 "flag": "No confidence intervals on main estimates", 426 "detail": "All headline percentages (23%, 36%, 95%, etc.) are reported as point estimates without confidence intervals, making precision of the key comparative claims unassessable." 427 }, 428 { 429 "flag": "Unknown recall of mock detection method", 430 "detail": "The identifier-based mock detection is validated only for precision (94%) but not recall; unknown false-negative rate could systematically bias the agent vs. non-agent comparison if agents use different naming conventions than the patterns searched." 
431 } 432 ], 433 "cited_papers": [ 434 { 435 "title": "Promises, Perils, and (Timely) Heuristics for Mining Coding Agent Activity", 436 "relevance": "Foundational companion paper by the same authors establishing the methodology for detecting agent commits via co-authorship metadata in real repositories — directly enables this study" 437 }, 438 { 439 "title": "Agentic Much? Adoption of Coding Agents on GitHub", 440 "relevance": "Companion paper (under submission) measuring overall adoption rates of coding agents on GitHub, providing broader context for this study's scope and agent selection rationale" 441 }, 442 { 443 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 444 "relevance": "Becker et al. controlled experiment finding 19% task completion time increase despite 20% perceived productivity gain — key context for evaluating coding agent real-world effectiveness" 445 }, 446 { 447 "title": "To Mock or Not to Mock: Divergence in Mocking Practices Between LLM and Developers", 448 "relevance": "Direct predecessor: Qin 2025 compared GPT-4o mock decisions vs developers in a single system, finding LLMs generate more mocks; this paper scales that finding to real-world agent commits across thousands of repositories" 449 }, 450 { 451 "title": "Mock objects for testing Java systems: Why and how developers use them, and how they evolve", 452 "relevance": "Spadini et al. foundational empirical study of human mocking practices in Java; establishes baseline understanding for comparison with agent behavior" 453 }, 454 { 455 "title": "Use of test doubles in Android testing: An in-depth investigation", 456 "relevance": "Fazzini et al. 
study whose identifier-based mock detection methodology is directly adapted by this paper for detecting test doubles in commits" 457 }, 458 { 459 "title": "Understanding Software Engineering Agents: A Study of Thought-Action-Result Trajectories", 460 "relevance": "Bouzenia & Pradel study of agent interaction logs from SWE-bench; related characterization of coding agent action patterns in software engineering tasks" 461 }, 462 { 463 "title": "The Rise of AI Teammates in Software Engineering: How Autonomous Coding Agents Are Reshaping Software Engineering", 464 "relevance": "Li et al. survey providing context on the broader adoption and capabilities of coding agents used to motivate the scope of this study" 465 } 466 ], 467 "engagement_factors": { 468 "practical_relevance": { 469 "score": 3, 470 "justification": "Directly actionable for anyone using Claude Code, Copilot, or Cursor — the recommendation to add mock guidance to CLAUDE.md configuration files is immediately applicable and the finding applies to millions of developers." 471 }, 472 "surprise_contrarian": { 473 "score": 2, 474 "justification": "Finding that agents add mocks in 36% of their test commits (vs 26% for non-agents) and concentrate almost exclusively on the generic 'mock' type (95% for agents; non-agents spread across mock 91%, fake 57%, and spy 51%) is a concrete, counterintuitive result about agent behavior that challenges assumptions of quality parity." 475 }, 476 "fear_safety": { 477 "score": 1, 478 "justification": "Tests with excessive mocking may mask integration bugs and allow code to drift from mock contracts, with software reliability implications, but no direct safety or security concerns are raised." 479 }, 480 "drama_conflict": { 481 "score": 1, 482 "justification": "Mild controversy around AI-generated code quality; Kent Beck's LinkedIn quote adds human interest color but the paper is primarily technical without major conflict angles." 
483 }, 484 "demo_ability": { 485 "score": 1, 486 "justification": "Scripts and dataset are available on Zenodo, but reproducing requires cloning thousands of GitHub repositories and running analysis scripts — not a quick demo, though practitioners can immediately apply configuration file guidance." 487 }, 488 "brand_recognition": { 489 "score": 2, 490 "justification": "Directly studies Claude Code, GitHub Copilot, and Cursor with data from Microsoft/VS Code, home-assistant/core, and Apache repositories — high brand recognition among software engineering practitioners." 491 } 492 }, 493 "hn_data": { 494 "threads": [], 495 "top_points": 0, 496 "total_points": 0, 497 "total_comments": 0 498 } 499 }