ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (24490B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Are Coding Agents Generating Over-Mocked Tests? An Empirical Study",
      6     "authors": [
      7       "Andre Hora",
      8       "Romain Robbes"
      9     ],
     10     "year": 2026,
     11     "venue": "MSR '26",
     12     "arxiv_id": "2602.00409",
     13     "doi": "10.1145/3793302.3793362"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All five numbered claims in the abstract (60%, 23% vs 13%, 68%, 36% vs 26%, recent repos higher proportion) are directly supported by tables and statistical tests in the results section.",
     21         "source": "opus"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper uses language like 'coding agents are more likely to modify tests' and 'tend to rely more heavily on mocking' which suggest causal interpretations from observational data. The Chi-squared test shows association, not causation. The paper does not adequately discuss confounds (e.g., selection bias in which tasks agents are assigned to).",
     27         "source": "opus"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Section 5 (Threats to Validity) explicitly states: 'our findings — as usual in empirical software engineering — cannot be directly generalized to repositories written in other languages or using other agents.'",
     33         "source": "opus"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper does not discuss alternative explanations for why agents mock more. For example, agents may be assigned to tasks that inherently require more mocking, or developers may selectively use agents for test-heavy tasks. The discussion assumes the difference reflects agent behavior rather than task selection bias.",
     39         "source": "opus"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper measures specific commit-level metrics (test file modification rates, mock addition rates) and frames findings at that exact granularity: '23% of agent commits modify test files' and '36% of agent test commits add mocks.' The paper does not claim broader constructs like 'test quality' — it explicitly discusses that higher mocking may or may not indicate quality issues (Section 4.2), acknowledging the interpretive gap.",
     45         "source": "opus"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 5 'Threats to Validity' provides a dedicated discussion of limitations.",
     53         "source": "opus"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats discussed: detection precision for test/mock commits (with manual validation), agent commit detection methodology, co-authorship ambiguity, and language/agent generalization bounds (Section 5).",
     59         "source": "opus"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper explicitly bounds scope to three languages and notes that findings 'cannot be directly generalized to repositories written in other languages or using other agents' (Section 5). Also states benchmark evaluations are out of scope (Section 6.1).",
     65         "source": "opus"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Acknowledgments section lists funding from CNPq, CAPES, FAPEMIG, INES.IA, and the French State.",
     73         "source": "opus"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Authors affiliated with UFMG (Brazil) and Univ. Bordeaux/CNRS/LaBRI (France). No product affiliations with the evaluated coding agents.",
     79         "source": "opus"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Funding from Brazilian and French public research agencies. These funders have no stake in whether coding agents over-mock.",
     85         "source": "opus"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement found in the paper.",
     91         "source": "opus"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 2.1.1 defines 'coding agents', Section 2.6 defines 'mock' and the five test double types (dummy, stub, spy, mock, fake), and Sections 2.4–2.6 define 'agent commit', 'test commit', and 'mock commit' with precise detection criteria.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper explicitly states two contributions: the first empirical study of agent-generated mocks in real-world systems, and actionable implications for practitioners and researchers.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 6 engages substantively with prior work on coding agents (Becker et al., Kumar et al.), LLM test generation (Schäfer et al., Chen et al.), and mocking practices (Spadini et al., Fazzini et al.), situating this as the first agent-specific mocking study.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Scripts and dataset are publicly available at https://doi.org/10.5281/zenodo.17427638 (Section 2.7).",
    122           "source": "opus"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Dataset publicly available at the same Zenodo link (Section 2.7).",
    128           "source": "opus"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No mention of requirements.txt, Dockerfile, or dependency specifications. Tools mentioned (PyDriller, GitEvo) but no versions or environment setup.",
    134           "source": "opus"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No step-by-step reproduction instructions provided in the paper. The Zenodo archive may contain them, but the paper itself does not describe how to reproduce the analysis.",
    140           "source": "opus"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "No confidence intervals or error bars reported. Results are presented as point estimates (percentages, ratios) without uncertainty quantification.",
    148           "source": "opus"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "Chi-squared tests of independence used for RQ1 and RQ2 commit-level analysis. Paired Wilcoxon test used for RQ2 repository-level analysis with normality tests reported (Section 3.2.2).",
    154           "source": "opus"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Cliff's delta reported for repository-level analysis: negligible for lower agentic activity, small (0.252) for higher agentic activity (Section 3.2.2, Table 10).",
    160           "source": "opus"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "No justification for the sample size or power analysis. The selection criteria (≥100 commits, ≥5000 NLOC, etc.) are described but not justified in terms of statistical power.",
    166           "source": "opus"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "Results are reported as medians and ratios without standard deviations or interquartile ranges. Table 10 shows medians but no spread measures.",
    172           "source": "opus"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Non-agent commits serve as the baseline comparison throughout (agent vs non-agent commits for test and mock ratios).",
    180           "source": "opus"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The baseline is non-agent commits from the same repositories and time period (2025), which is the appropriate contemporary comparison.",
    186           "source": "opus"
    187         },
    188         "ablation_study": {
    189           "applies": false,
    190           "answer": false,
    191           "justification": "This is an observational mining study, not a system with components to ablate.",
    192           "source": "opus"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Multiple metrics used: commit-level ratios, repository-level ratios, Chi-squared statistics, standardized residuals, Cliff's delta effect sizes, and per-language/per-agent breakdowns.",
    198           "source": "opus"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Manual inspection of 500 agent commits (100% precision) and 100 mock commits (94% precision) to validate the automated detection approach (Sections 2.4 and 2.6.1).",
    204           "source": "opus"
    205         },
    206         "held_out_test_set": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "Not a prediction task; this is a descriptive mining study with no train/test split.",
    210           "source": "opus"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results broken down by programming language (Python vs JS/TS), by coding agent (Claude, Copilot, Cursor, Other), by repository age (2025 vs all), and by agentic activity level (Tables 5, 8, 10).",
    216           "source": "opus"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Mock detection precision reported at 94% (6% false positives). Threats to validity discuss detection limitations. The browser-use example shows agents violating mocking instructions (Section 4.2).",
    222           "source": "opus"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Repository-level analysis for lower agentic activity showed negligible effect size despite statistical significance (Section 3.2.2, Table 10a), which is a negative/nuanced result.",
    228           "source": "opus"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": false,
    234           "answer": false,
    235           "justification": "This study mines commit history; it does not invoke or evaluate any LLM directly.",
    236           "source": "opus"
    237         },
    238         "prompts_provided": {
    239           "applies": false,
    240           "answer": false,
    241           "justification": "No prompting is used; this is a repository mining study.",
    242           "source": "opus"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": false,
    246           "answer": false,
    247           "justification": "No LLM API calls or model training; this is a mining study.",
    248           "source": "opus"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "No agentic scaffolding is used by the authors. They observe third-party agents through commit traces.",
    254           "source": "opus"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "The full data pipeline is well documented: initial set (114,098 repos via SEART) → agent file detection (2,168) → agent commit detection (48,563 in 1,219 repos) → test commit detection (169,361 in 1,779) → mock commit detection (44,900 in 1,381). Selection criteria stated at each stage (Sections 2.2-2.6).",
    260           "source": "opus"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Scripts and dataset publicly available at Zenodo (https://doi.org/10.5281/zenodo.17427638, Section 2.7).",
    268           "source": "opus"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Data collection thoroughly described: SEART GitHub Search Engine used with explicit criteria (≥100 commits, ≥5000 NLOC, not forks, recent activity), yielding 114,098 initial repositories (Section 2.2).",
    274           "source": "opus"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants. Data source is public GitHub repositories selected via documented automated criteria.",
    280           "source": "opus"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "Full pipeline documented with counts at each stage: 114,098 initial repos → 2,168 with agent files → 1,219 with agent commits → test/mock detection. Table 3 provides comprehensive summary (Sections 2.2-2.7).",
    286           "source": "opus"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "This study does not evaluate any model's capabilities on a benchmark. It mines commit history.",
    294           "source": "opus"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": false,
    298           "answer": false,
    299           "justification": "Not applicable; no model benchmark evaluation.",
    300           "source": "opus"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": false,
    304           "answer": false,
    305           "justification": "Not applicable; no model benchmark evaluation.",
    306           "source": "opus"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants. This is a repository mining study.",
    314           "source": "opus"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants.",
    320           "source": "opus"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants.",
    326           "source": "opus"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "opus"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "opus"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "opus"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "opus"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "This is a mining study, not proposing a method with inference costs.",
    358           "source": "opus"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "No mention of compute resources used for cloning and analyzing 2,168 repositories and 1.2M commits.",
    364           "source": "opus"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Coding agents generate tests at nearly double the rate of non-agents: 23% of agent commits modify test files vs. 13% for non-agents.",
    372       "evidence": "Table 4 contingency table with Chi-squared test (χ²=3,683.06, p<0.001); standardized residual of 55.35 for the agent+test cell.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Coding agents add mocks to tests more frequently: 36% of agent test commits add mocks vs. 26% for non-agents.",
    377       "evidence": "Table 7 contingency table with Chi-squared test (χ²=505.5, p<0.001); repository-level confirmation in Table 10b (36% vs 28%, Cliff's delta=0.252).",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Coding agents use far less diverse test double types, concentrating on 'mock' (95%) while non-agents use fake (57%) and spy (51%) substantially.",
    382       "evidence": "Figure 5 shows distribution across 496 repositories with at least one agent mock commit.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Repositories created in 2025 show 2-3x higher proportions of agent-generated tests (17% vs 7%) and mocks (19% vs 9%) compared to the full dataset.",
    387       "evidence": "Tables 6 and 9 compare 2025-created repos vs. full dataset; sample sizes are adequate (247K commits for 2025 subset).",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "The title framing 'over-mocked' implies agent mocking is qualitatively problematic, not merely more frequent.",
    392       "evidence": "The paper only measures relative frequencies; no test quality, coverage, flakiness, or maintenance burden outcomes are measured to establish harm.",
    393       "supported": "weak"
    394     },
    395     {
    396       "claim": "Agent configuration files (CLAUDE.md etc.) can influence mock generation behavior, but instructions are not always followed.",
    397       "evidence": "browser-use/browser-use had explicit 'never mock' instructions yet had 2 non-compliant commits; anecdotal, from a single repository.",
    398       "supported": "weak"
    399     }
    400   ],
    401   "methodology_tags": [
    402     "observational",
    403     "case-study"
    404   ],
    405   "key_findings": "Mining 1.2M commits made during 2025 across 2,168 Python/JavaScript/TypeScript repositories, the study finds that coding agents (Claude, Copilot, Cursor) generate tests at nearly double the rate of non-agents (23% vs 13% of commits) and add mocks to those tests at a higher rate (36% vs 26%). The higher mocking rate is statistically significant and strengthens in repositories with more agent activity. Agents show dramatically less diversity in test double types, relying almost exclusively on generic mocks (95%) compared to non-agents who also use fakes (57%) and spies (51%). The agent share of test and mock commits is 2-3x higher in repositories created in 2025, suggesting growing agent influence on testing practices.",
    406   "red_flags": [
    407     {
    408       "flag": "Title overstates finding",
    409       "detail": "The paper is titled 'Over-Mocked Tests' but only demonstrates that agents mock more frequently than non-agents — it does not measure whether those mocks are actually harmful, excessive relative to context, or degrading test effectiveness."
    410     },
    411     {
    412       "flag": "Selection bias in repository sample",
    413       "detail": "Repositories with agent configuration files may self-select for development cultures (e.g., AI-first startups, tool-heavy projects) that have different testing norms independent of the agent tools themselves, confounding the agent vs. non-agent comparison."
    414     },
    415     {
    416       "flag": "No variance reported for main results",
    417       "detail": "The headline proportions (23%, 36%, etc.) are reported without confidence intervals or dispersion measures, making it difficult to assess the reliability of these estimates across the population of repositories."
    418     },
    419     {
    420       "flag": "Newer repos confounder unaddressed",
    421       "detail": "The finding that 2025-created repos have higher agent activity is presented as evidence of growing agent adoption, but these repos are also newer, smaller, and potentially more JS/TS-heavy — confounders not controlled for."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    427       "relevance": "RCT finding that Cursor increased task completion time by 19% despite developers perceiving 20% productivity gain — key adjacent empirical study on coding agent effects."
    428     },
    429     {
    430       "title": "Promises, Perils, and (Timely) Heuristics for Mining Coding Agent Activity",
    431       "relevance": "Companion MSR paper providing methodological foundation for detecting agent traces in repositories."
    432     },
    433     {
    434       "title": "Agentic Much? Adoption of Coding Agents on GitHub",
    435       "relevance": "Background paper on the scale and patterns of coding agent adoption that contextualizes the study."
    436     },
    437     {
    438       "title": "Why AI Agents Still Need You: Findings from Developer-Agent Collaborations in the Wild",
    439       "relevance": "Observational study of 19 developers using Cursor on 33 issues — characterizes collaboration strategies that may influence testing behavior."
    440     },
    441     {
    442       "title": "To Mock or Not to Mock: Divergence in Mocking Practices Between LLM and Developers",
    443       "relevance": "Direct precursor: controlled experiment showing GPT-4o generates more mocks than developers in Apache Dubbo — motivates this larger-scale study."
    444     },
    445     {
    446       "title": "Mock objects for testing Java systems: Why and how developers use them, and how they evolve",
    447       "relevance": "Foundational empirical study of mocking practices in Java, providing baseline for comparison."
    448     },
    449     {
    450       "title": "Use of test doubles in android testing: An in-depth investigation",
    451       "relevance": "Prior work on test double usage in mobile software, establishing methodology for detecting test doubles via code identifiers."
    452     },
    453     {
    454       "title": "Understanding Software Engineering Agents: A Study of Thought-Action-Result Trajectories",
    455       "relevance": "Analyzes agent interaction logs from SWE-bench for successful vs. failed tasks — complements the real-world repository analysis."
    456     }
    457   ],
    458   "engagement_factors": {
    459     "practical_relevance": {
    460       "score": 2,
    461       "justification": "Directly actionable advice for practitioners: add mocking guidance to CLAUDE.md/agent config files, and review agent-generated tests for mock overuse."
    462     },
    463     "surprise_contrarian": {
    464       "score": 2,
    465       "justification": "The finding that agent test commits add mocks 10pp more often than non-agent ones (36% vs 26%) and rely almost exclusively on the mock type (95%, versus a broader mix of test doubles for non-agents) puts concrete numbers on a previously vague suspicion."
    466     },
    467     "fear_safety": {
    468       "score": 0,
    469       "justification": "No safety, security, or risk angle — this is about test quality, not AI danger."
    470     },
    471     "drama_conflict": {
    472       "score": 1,
    473       "justification": "Mildly challenges the narrative that coding agents improve developer productivity by suggesting they may degrade test quality through heavier mocking (test quality itself is not measured), but stops short of naming specific tools as problematic."
    474     },
    475     "demo_ability": {
    476       "score": 1,
    477       "justification": "Dataset and scripts are publicly available on Zenodo, but reproducing the analysis requires cloning thousands of repos and running custom mining scripts."
    478     },
    479     "brand_recognition": {
    480       "score": 2,
    481       "justification": "Studies Claude Code, GitHub Copilot, and Cursor by name, and includes results from Microsoft, Home Assistant, and Apache repositories."
    482     }
    483   },
    484   "hn_data": {
    485     "threads": [],
    486     "top_points": 0,
    487     "total_points": 0,
    488     "total_comments": 0
    489   }
    490 }

Impressum · Datenschutz