ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (21912B)


      1 {
      2   "paper": {
      3     "title": "Are Coding Agents Generating Over-Mocked Tests? An Empirical Study",
      4     "authors": [
      5       "Andre Hora",
      6       "Romain Robbes"
      7     ],
      8     "year": 2026,
      9     "venue": "MSR '26",
     10     "arxiv_id": "2602.00409",
     11     "doi": "10.1145/3793302.3793362"
     12   },
     13   "scan_version": 3,
     14   "active_modules": [],
     15   "methodology_tags": [
     16     "observational"
     17   ],
     18   "key_findings": "Coding agents are more likely to modify test files (23% of agent commits vs 13% for non-agents) and to add mocks to tests (36% vs 26%). In repositories with higher agentic activity (≥50 agent commits), the mock ratio gap persists (36% vs 28%, small effect size). Coding agents predominantly use the mock type (95%), whereas non-agents employ a wider variety: mock (91%), fake (57%), and spy (51%).",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Scripts and dataset are publicly available at https://doi.org/10.5281/zenodo.17427638 (Section 2.7)."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Dataset publicly available at the same Zenodo link (Section 2.7)."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No mention of requirements.txt, Dockerfile, or dependency specifications. Tools mentioned (PyDriller, GitEvo) but no versions or environment setup."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions provided in the paper. The Zenodo archive may contain them, but the paper itself does not describe how to reproduce the analysis."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No confidence intervals or error bars reported. Results are presented as point estimates (percentages, ratios) without uncertainty quantification."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Chi-squared tests of independence used for RQ1 and RQ2 commit-level analysis. Paired Wilcoxon test used for RQ2 repository-level analysis with normality tests reported (Section 3.2.2)."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Cliff's delta reported for repository-level analysis: negligible for lower agentic activity, small (0.252) for higher agentic activity (Section 3.2.2, Table 10)."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification for the sample size or power analysis. The selection criteria (≥100 commits, ≥5000 NLOC, etc.) are described but not justified in terms of statistical power."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Results are reported as medians and ratios without standard deviations or interquartile ranges. Table 10 shows medians but no spread measures."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Non-agent commits serve as the baseline comparison throughout (agent vs non-agent commits for test and mock ratios)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The baseline is non-agent commits from the same repositories and time period (2025), which is the appropriate contemporary comparison."
     79       },
     80       "ablation_study": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is an observational mining study, not a system with components to ablate."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple metrics used: commit-level ratios, repository-level ratios, Chi-squared statistics, standardized residuals, Cliff's delta effect sizes, and per-language/per-agent breakdowns."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Manual inspection of 500 agent commits (100% precision) and 100 mock commits (94% precision) to validate the automated detection approach (Sections 2.4 and 2.6.1)."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "Not a prediction task; this is a descriptive mining study with no train/test split."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results broken down by programming language (Python vs JS/TS), by coding agent (Claude, Copilot, Cursor, Other), by repository age (2025 vs all), and by agentic activity level (Tables 5, 8, 10)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Mock detection precision reported at 94% (6% false positives). Threats to validity discuss detection limitations. The browser-use example shows agents violating mocking instructions (Section 4.2)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Repository-level analysis for lower agentic activity showed negligible effect size despite statistical significance (Section 3.2.2, Table 10a), which is a negative/nuanced result."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "All five numbered claims in the abstract (60%, 23% vs 13%, 68%, 36% vs 26%, recent repos higher proportion) are directly supported by tables and statistical tests in the results section."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper uses phrasing such as 'coding agents are more likely to modify tests' and 'tend to rely more heavily on mocking', which suggests a causal interpretation of observational data. The Chi-squared test shows association, not causation. The paper does not adequately discuss confounds (e.g., selection bias in which tasks agents are assigned to)."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 5 (Threats to Validity) explicitly states: 'our findings — as usual in empirical software engineering — cannot be directly generalized to repositories written in other languages or using other agents.'"
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not discuss alternative explanations for why agents mock more. For example, agents may be assigned to tasks that inherently require more mocking, or developers may selectively use agents for test-heavy tasks. The discussion assumes the difference reflects agent behavior rather than task selection bias."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures specific commit-level metrics (test file modification rates, mock addition rates) and frames findings at that exact granularity: '23% of agent commits modify test files' and '36% of agent test commits add mocks.' The paper does not claim broader constructs like 'test quality' — it explicitly discusses that higher mocking may or may not indicate quality issues (Section 4.2), acknowledging the interpretive gap."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "This study mines commit history; it does not invoke or evaluate any LLM directly."
    148       },
    149       "prompts_provided": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No prompting is used; this is a repository mining study."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No LLM API calls or model training; this is a mining study."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used by the authors. They observe third-party agents through commit traces."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The full data pipeline is well documented: initial set (114,098 repos via SEART) → agent file detection (2,168) → agent commit detection (48,563 in 1,219 repos) → test commit detection (169,361 in 1,779) → mock commit detection (44,900 in 1,381). Selection criteria stated at each stage (Sections 2.2-2.6)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5 'Threats to Validity' provides a dedicated discussion of limitations."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Specific threats discussed: detection precision for test/mock commits (with manual validation), agent commit detection methodology, co-authorship ambiguity, and language/agent generalization bounds (Section 5)."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper explicitly bounds scope to three languages and notes that findings 'cannot be directly generalized to repositories written in other languages or using other agents' (Section 5). Also states benchmark evaluations are out of scope (Section 6.1)."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Scripts and dataset publicly available at Zenodo (https://doi.org/10.5281/zenodo.17427638, Section 2.7)."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Data collection thoroughly described: SEART GitHub Search Engine used with explicit criteria (≥100 commits, ≥5000 NLOC, not forks, recent activity), yielding 114,098 initial repositories (Section 2.2)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data source is public GitHub repositories selected via documented automated criteria."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Full pipeline documented with counts at each stage: 114,098 initial repos → 2,168 with agent files → 1,219 with agent commits → test/mock detection. Table 3 provides comprehensive summary (Sections 2.2-2.7)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgments section lists funding from CNPq, CAPES, FAPEMIG, INES.IA, and the French State."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Authors affiliated with UFMG (Brazil) and Univ. Bordeaux/CNRS/LaBRI (France). No product affiliations with the evaluated coding agents."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Funding from Brazilian and French public research agencies. These funders have no stake in whether coding agents over-mock."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement found in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This study does not evaluate any model's capabilities on a benchmark. It mines commit history."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable; no model benchmark evaluation."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Not applicable; no model benchmark evaluation."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. This is a repository mining study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is a mining study, not proposing a method with inference costs."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No mention of compute resources used for cloning and analyzing 2,168 repositories and 1.2M commits."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "60% of repositories with agent activity also contain agent test activity, and 23% of agent commits modify test files vs 13% for non-agents.",
    301       "evidence": "Table 4 contingency table, Chi-squared test (χ²=3683.06, p<0.001), standardized residuals in Figure 3 (Section 3.1).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "68% of repositories with agent test activity contain agent mock activity, and 36% of agent test commits add mocks vs 26% for non-agents.",
    306       "evidence": "Table 7 contingency table, Chi-squared test (χ²=505.5, p<0.001), standardized residuals in Figure 4 (Section 3.2.1).",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "In repositories with higher agentic activity (≥50 agent commits), agents have a higher mock ratio (36%) than non-agents (28%) with small effect size.",
    311       "evidence": "Table 10b, paired Wilcoxon test p<0.001, Cliff's delta=0.252 (Section 3.2.2).",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "Coding agents predominantly use the mock type (95%), whereas non-agents employ wider variety: mock (91%), fake (57%), spy (51%).",
    316       "evidence": "Figure 5, distribution across 496 repositories with agent mock commits (Section 3.3).",
    317       "supported": "strong"
    318     },
    319     {
    320       "claim": "Higher mocking by agents may reflect overuse of isolation techniques, leading to tests easier to generate but less effective at validating real interactions.",
    321       "evidence": "Discussion in Section 4.2 citing Google Testing Blog [28]. No direct evidence that agent mocks are less effective — this is interpretive.",
    322       "supported": "weak"
    323     }
    324   ],
    325   "red_flags": [
    326     {
    327       "flag": "Task selection confound unaddressed",
    328       "detail": "Agents may be disproportionately assigned to tasks that inherently require more mocking (e.g., adding integrations, API wrappers). The paper compares agent vs non-agent commit mock rates without controlling for task type, which could explain the difference independently of agent behavior."
    329     },
    330     {
    331       "flag": "Title implies causation from observational data",
    332       "detail": "The title 'Are Coding Agents Generating Over-Mocked Tests?' frames the study as evaluating whether agents over-mock, but the observational design cannot distinguish agent tendencies from task selection bias or developer instruction effects."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    338       "authors": [
    339         "Joel Becker",
    340         "Nate Rush",
    341         "Elizabeth Barnes",
    342         "David Rein"
    343       ],
    344       "year": 2025,
    345       "arxiv_id": "2507.09089",
    346       "relevance": "RCT measuring AI coding agent impact on developer productivity, finding 19% slower task completion despite perceived 20% speedup."
    347     },
    348     {
    349       "title": "Why AI Agents Still Need You: Findings from Developer-Agent Collaborations in the Wild",
    350       "authors": [
    351         "Aayush Kumar",
    352         "Yasharth Bajpai",
    353         "Sumit Gulwani",
    354         "Gustavo Soares",
    355         "Emerson Murphy-Hill"
    356       ],
    357       "year": 2025,
    358       "relevance": "Observational study of developer-agent collaboration strategies with Cursor, relevant to understanding agent usage patterns."
    359     },
    360     {
    361       "title": "Understanding Software Engineering Agents: A Study of Thought-Action-Result Trajectories",
    362       "authors": [
    363         "Islem Bouzenia",
    364         "Michael Pradel"
    365       ],
    366       "year": 2025,
    367       "arxiv_id": "2506.18824",
    368       "relevance": "Analysis of agent interaction logs from SWE-bench, studying agent behavior patterns and anti-patterns."
    369     },
    370     {
    371       "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0",
    372       "authors": [
    373         "Hao Li",
    374         "Haoxiang Zhang",
    375         "Ahmed E Hassan"
    376       ],
    377       "year": 2025,
    378       "arxiv_id": "2507.15003",
    379       "relevance": "Survey on autonomous coding agents reshaping software engineering practices."
    380     },
    381     {
    382       "title": "Promises, Perils, and (Timely) Heuristics for Mining Coding Agent Activity",
    383       "authors": [
    384         "Romain Robbes",
    385         "Théo Matricon",
    386         "Thomas Degueule",
    387         "Andre Hora",
    388         "Stefano Zacchiroli"
    389       ],
    390       "year": 2026,
    391       "relevance": "Foundational work on detecting and mining coding agent traces in repositories, methodology used in this paper."
    392     },
    393     {
    394       "title": "Agentic Much? Adoption of Coding Agents on GitHub",
    395       "authors": [
    396         "Romain Robbes",
    397         "Théo Matricon",
    398         "Thomas Degueule",
    399         "Andre Hora",
    400         "Stefano Zacchiroli"
    401       ],
    402       "year": 2025,
    403       "relevance": "Study of coding agent adoption patterns on GitHub, companion work to this paper."
    404     },
    405     {
    406       "title": "ChatUniTest: A framework for LLM-based test generation",
    407       "authors": [
    408         "Yinghao Chen"
    409       ],
    410       "year": 2024,
    411       "relevance": "LLM-based test generation framework, relevant to understanding automated test quality."
    412     },
    413     {
    414       "title": "Automated unit test improvement using large language models at Meta",
    415       "authors": [
    416         "Nadia Alshahwan"
    417       ],
    418       "year": 2024,
    419       "relevance": "Industry study of LLM-generated test quality at Meta, including quality issues like flaky tests."
    420     },
    421     {
    422       "title": "An empirical evaluation of using large language models for automated unit test generation",
    423       "authors": [
    424         "Max Schäfer",
    425         "Sarah Nadi",
    426         "Aryaz Eghbali",
    427         "Frank Tip"
    428       ],
    429       "year": 2023,
    430       "relevance": "Evaluation of LLM capabilities for unit test generation."
    431     },
    432     {
    433       "title": "To Mock or Not to Mock: Divergence in Mocking Practices Between LLM and Developers",
    434       "authors": [
    435         "Hanbin Qin"
    436       ],
    437       "year": 2025,
    438       "relevance": "Direct predecessor studying LLM mocking decisions vs developer choices in a controlled setting."
    439     },
    440     {
    441       "title": "Self-admitted GenAI usage in open-source software",
    442       "authors": [
    443         "Tao Xiao"
    444       ],
    445       "year": 2025,
    446       "arxiv_id": "2507.10422",
    447       "relevance": "Study of how developers self-report LLM usage in open-source, complementary approach to agent trace mining."
    448     }
    449   ],
    450   "engagement_factors": {
    451     "practical_relevance": {
    452       "score": 2,
    453       "justification": "Directly actionable advice for practitioners: add mocking guidance to CLAUDE.md/agent config files, and review agent-generated tests for mock overuse."
    454     },
    455     "surprise_contrarian": {
    456       "score": 2,
    457       "justification": "The finding that agents add mocks 10 percentage points more often than non-agents (36% vs 26% of test commits) and use almost exclusively the mock type (95% vs broader variety) is a concrete, counterintuitive quantification of a vaguely suspected problem."
    458     },
    459     "fear_safety": {
    460       "score": 0,
    461       "justification": "No safety, security, or risk angle — this is about test quality, not AI danger."
    462     },
    463     "drama_conflict": {
    464       "score": 1,
    465       "justification": "Mildly challenges the narrative that coding agents improve developer productivity by showing they may degrade test quality through over-mocking, but stops short of naming specific tools as problematic."
    466     },
    467     "demo_ability": {
    468       "score": 1,
    469       "justification": "Dataset and scripts are publicly available on Zenodo, but reproducing the analysis requires cloning thousands of repos and running custom mining scripts."
    470     },
    471     "brand_recognition": {
    472       "score": 2,
    473       "justification": "Studies Claude Code, GitHub Copilot, and Cursor by name, and includes results from Microsoft, Home Assistant, and Apache repositories."
    474     }
    475   }
    476 }

Impressum · Datenschutz