ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23535B)


      1 {
      2   "paper": {
      3     "title": "Agentic Refactoring: An Empirical Study of AI Coding Agents",
      4     "authors": [
      5       "Kosei Horikawa",
      6       "Hao Li",
      7       "Yutaro Kashiwa",
      8       "Bram Adams",
      9       "Hajimu Iida",
     10       "Ahmed E. Hassan"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv preprint",
     14     "arxiv_id": "2511.04824"
     15   },
     16   "scan_version": 3,
     17   "active_modules": [],
     18   "methodology_tags": [
     19     "observational"
     20   ],
     21   "key_findings": "Large-scale mining study of 15,451 refactoring instances from AI coding agents in open-source Java projects. Refactoring appears in 26.1% of agentic commits, dominated by low-level edits (renaming, type changes) rather than high-level architectural changes. Agentic refactoring is primarily motivated by maintainability (52.5%) and readability (28.1%). Structural metrics show small but statistically significant improvements (e.g., Class LOC median Δ = -15.25), but design/implementation smell counts show no meaningful reduction.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Replication package provided at https://github.com/Mont9165/Agent_Refactoring_Analysis (Section 1, footnote 4)."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The study builds on the publicly available AIDev dataset [28] and provides its own replication package with derived data."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned. Tools are named (RefactoringMiner 3.0.11, DesigniteJava, GPT-4.1-mini) but no environment setup details are provided."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are described in the paper. A replication package is linked but no README or reproduction steps are mentioned in the paper itself."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Results report point estimates (percentages, medians, effect sizes) but no confidence intervals or error bars are provided."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Mann-Whitney U test (RQ1), Wilcoxon signed-rank test with Benjamini-Hochberg FDR correction (RQ4), and Kruskal-Wallis tests are used throughout (Sections 4.1, 4.4)."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Cliff's delta reported for RQ1 (d = 0.838, large), rank-biserial effect size for RQ4, and Cohen's d for smell analysis (d = -0.027, -0.026). Median Δ values provide concrete magnitude context (e.g., Class LOC Δ = -15.25)."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Neither a power analysis nor any other justification for why the dataset size is sufficient is provided. The sample is large (14,998 commits), but no formal justification is given."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Median values are reported but no standard deviations, IQR, or other numeric spread measures are stated in tables. Box plots are shown (Figures 3, 5) but no numeric spread measures accompany the main results."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Human refactoring patterns from prior work [22, 26] are used as baselines for comparison in RQ2 and RQ3."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Human refactoring baseline from Horikawa et al. [22] (2025) is contemporary. Kim et al. [26] (2014) is older but justified as the foundational source for refactoring motivation categories."
     82       },
     83       "ablation_study": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "This is a mining/observational study, not a system with components to ablate."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Multiple code quality metrics used: LOC, WMC, Fan-In, Fan-Out, Cyclomatic Complexity, Depth of Inheritance Tree, LCOM, plus design and implementation smell counts (Section 4.4)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Two human annotators independently labeled a stratified sample of refactoring purposes for RQ3, with Cohen's κ = 0.83 inter-rater agreement (Section 4.3.2, Table 6)."
     97       },
     98       "held_out_test_set": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "Not a prediction/classification study requiring train/test splits."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results broken down by refactoring abstraction level (high/medium/low), by refactoring type (Table 5), by purpose category (Figure 4), and by metric (Table 7)."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Finding #8 discusses that agentic refactoring fails to reduce smell counts. Finding #12 discusses that some refactoring types show no metric improvement. Section 5 discusses limitations of agents for high-level refactoring."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Key negative results reported: agents fail to reduce design/implementation smells (Finding #8), low-level edits may slightly increase cyclomatic complexity (Finding #10), and agents underperform humans in high-level refactoring."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims about 26.1% refactoring rate, dominance of low-level edits, maintainability/readability motivation, and small but significant metric improvements are all supported by results in Sections 4.1-4.4."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper is careful to use observational language ('suggests', 'indicates') and explicitly frames findings as descriptive rather than causal. The construct validity section (7.2) acknowledges the difficulty of isolating agent vs. human contributions."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "External validity section (7.3) explicitly states limitations to OSS projects, Java language only, and the specific agents in the AIDev dataset. Caution against generalizing to closed-source or other languages is stated."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section 7.2 discusses that commits labeled 'agentic' may include human modifications. Section 7.1 discusses tool limitations (RefactoringMiner false positives/negatives). The paper considers whether agents are doing 'code churn' vs. genuine improvement (Section 5.1)."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures specific code metrics (LOC, WMC, Cyclomatic Complexity, LCOM, smell counts) and frames findings at the same granularity: 'small but statistically significant improvements in structural metrics.' It explicitly acknowledges that these metrics are proxies: Section 7.2 discusses that RefactoringMiner/DesigniteJava 'may not fully capture all aspects' and Section 5 discusses whether metric improvements translate to actual quality gains."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "GPT-4.1-mini is specified for classification tasks (Sections 3.2.2, 4.3.2). RefactoringMiner 3.0.11 is specified with exact version (footnote 8)."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "GPT-4.1-mini is used for project classification and refactoring purpose classification, but the actual prompts used are not provided — only the categories are described."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No temperature, top-p, or other LLM hyperparameters are reported for the GPT-4.1-mini classification tasks."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "The study mines existing agent outputs rather than building an agentic system. The agents studied (Codex, Cursor, etc.) are third-party tools evaluated as black boxes."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Multi-stage filtering pipeline is thoroughly documented with counts at each stage: 1.3M commits → Java filter → project filtering (automated + manual) → 1,613 repos, 14,998 commits → RefactoringMiner → 5,789 refactoring commits → SAR patterns → 3,907 agentic refactoring commits (Sections 3.1-3.2)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Dedicated Section 7 'Threats to Validity' covering internal, construct, and external validity."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Specific threats discussed: RefactoringMiner/DesigniteJava tool accuracy (7.1), GPT-4.1-mini misclassification risk mitigated by κ = 0.77 (7.1), difficulty isolating human vs. agent contributions in commits (7.2), limitation to Java and OSS (7.3)."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 7.3 explicitly states scope is limited to OSS projects, Java files only, and the five agents in the AIDev dataset. Section 7.2 notes the study cannot determine exact human intervention levels."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Replication package at https://github.com/Mont9165/Agent_Refactoring_Analysis is provided, and the study builds on the publicly available AIDev dataset."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 3 thoroughly describes data mining from AIDev dataset, GitHub REST API commit collection, and multi-stage filtering. Source dataset, API endpoints, and filtering criteria are all specified."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants recruited. The study mines public GitHub repositories from an existing dataset."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Full pipeline documented in Section 3 and Figure 2: AIDev dataset → GitHub API mining → Java file filtering → project classification (GPT-4.1-mini + manual review) → fork removal → RefactoringMiner → SAR pattern matching, with counts at each stage."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Acknowledgments section lists JSPS KAKENHI grants, JST PRESTO, ASPIRE, AIP Accelerated Program, and NSERC support."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations clearly listed: NAIST (Japan) and Queen's University (Canada). No evaluated product affiliations."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Funders are government research agencies (JSPS, JST, NSERC) with no financial stake in the study's outcomes regarding any specific coding agent."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "This is a mining study analyzing existing agent-generated commits. It does not evaluate a pre-trained model's capability on any benchmark."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Not a benchmark evaluation study. The study mines and analyzes existing code contributions."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "Not a benchmark evaluation study."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. The study mines public repositories; human annotators validated LLM classifications but are not study subjects."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in the study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants. Annotators' experience levels are noted (7 and 17 years) but they are not study subjects."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "The study uses GPT-4.1-mini for classification of thousands of commits and projects but does not report the API cost or token consumption."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No mention of total compute budget for running RefactoringMiner, DesigniteJava, or GPT-4.1-mini across the dataset."
    298       }
    299     }
    300   },
    301   "claims": [
    302     {
    303       "claim": "Refactoring appears in 26.1% of agentic commits (3,907 out of 14,998), with 7,127 detected refactoring instances.",
    304       "evidence": "Table 3 and Section 4.1.3 (Finding #1). RefactoringMiner detection combined with SAR pattern matching on commit messages.",
    305       "supported": "strong"
    306     },
    307     {
    308       "claim": "When agents explicitly state refactoring intent, they perform significantly more refactoring instances per commit than in other commits (Cliff's delta = 0.838, large effect).",
    309       "evidence": "Section 4.1.3 (Finding #2), Figure 3. Mann-Whitney U test p ≤ 0.001.",
    310       "supported": "strong"
    311     },
    312     {
    313       "claim": "Agentic refactoring skews toward low-level edits relative to human refactoring (35.8% vs. 24.4%) and includes fewer high-level changes (43.0% vs. 54.9%).",
    314       "evidence": "Table 4 and Section 4.2.3 (Finding #3). Comparison with human baseline from Horikawa et al. [22].",
    315       "supported": "strong"
    316     },
    317     {
    318       "claim": "Maintainability (52.5%) and readability (28.1%) account for over 80% of agentic refactoring motivation.",
    319       "evidence": "Figure 4 and Section 4.3.3 (Finding #5). GPT-4.1-mini classification validated with human annotation (Cohen's κ = 0.77).",
    320       "supported": "strong"
    321     },
    322     {
    323       "claim": "Agentic refactoring yields statistically significant but negligible-effect-size reductions in design/implementation smell counts.",
    324       "evidence": "Section 4.4.3 (Finding #8). Wilcoxon signed-rank test p < 0.001, but Cohen's d = -0.027 and -0.026 (negligible). Median Δ = 0.00 for both smell types.",
    325       "supported": "strong"
    326     },
    327     {
    328       "claim": "Medium-level refactorings produce the most consistent structural quality improvements (Class LOC median Δ = -15.25, WMC median Δ = -2.07).",
    329       "evidence": "Table 7 and Section 4.4.3 (Finding #10). FDR-adjusted significance with Kruskal-Wallis tests across levels.",
    330       "supported": "strong"
    331     }
    332   ],
    333   "red_flags": [
    334     {
    335       "flag": "Heavily skewed agent distribution",
    336       "detail": "OpenAI Codex accounts for 89.3% of commits and 94.3% of PRs (Table 2). Findings may therefore primarily reflect Codex behavior rather than agentic tools generally, and the paper does not analyze per-agent differences."
    337     },
    338     {
    339       "flag": "Human baseline from different context",
    340       "detail": "Human refactoring baselines come from prior studies [22, 26] with different datasets and time periods, not from a matched comparison within the same repositories. Direct comparisons should be interpreted cautiously."
    341     },
    342     {
    343       "flag": "LLM prompts not disclosed",
    344       "detail": "GPT-4.1-mini is used for project classification (Section 3.2.2) and refactoring purpose classification (Section 4.3.2), but the actual prompts are not provided, limiting reproducibility of these classification steps."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents Are Reshaping Software Engineering",
    350       "authors": [
    351         "Hao Li",
    352         "Haoxiang Zhang",
    353         "Ahmed E. Hassan"
    354       ],
    355       "year": 2025,
    356       "arxiv_id": "2507.15003",
    357       "relevance": "Foundational AIDev dataset used in this study; empirical analysis of agentic coding contributions at scale."
    358     },
    359     {
    360       "title": "On the Use of Agentic Coding: An Empirical Study of Pull Requests on GitHub",
    361       "authors": [
    362         "Miku Watanabe",
    363         "Hao Li",
    364         "Yutaro Kashiwa",
    365         "Brittany Reid",
    366         "Hajimu Iida",
    367         "Ahmed E. Hassan"
    368       ],
    369       "year": 2025,
    370       "arxiv_id": "2509.14745",
    371       "relevance": "Study of agentic PRs finding 45.1% require post-review fixes; directly related to quality of AI-generated code."
    372     },
    373     {
    374       "title": "Agentic Software Engineering: Foundational Pillars and a Research Roadmap",
    375       "authors": [
    376         "Ahmed E. Hassan",
    377         "Hao Li",
    378         "Dayi Lin",
    379         "Bram Adams",
    380         "Tse-Hsun Chen",
    381         "Yutaro Kashiwa",
    382         "Dong Qiu"
    383       ],
    384       "year": 2025,
    385       "arxiv_id": "2509.06216",
    386       "relevance": "Defines agentic software engineering paradigm and research roadmap."
    387     },
    388     {
    389       "title": "Is GitHub's Copilot as bad as humans at introducing vulnerabilities in code?",
    390       "authors": [
    391         "Owura Asare",
    392         "Meiyappan Nagappan",
    393         "N. Asokan"
    394       ],
    395       "year": 2023,
    396       "relevance": "Empirical study of AI-generated code security; relevant to AI code quality assessment."
    397     },
    398     {
    399       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    400       "authors": [
    401         "Agnia Sergeyuk",
    402         "Yaroslav Golubev",
    403         "Timofey Bryksin",
    404         "Iftekhar Ahmed"
    405       ],
    406       "year": 2025,
    407       "relevance": "Developer perception study finding 21.9% avoid AI for refactoring due to trust concerns."
    408     },
    409     {
    410       "title": "GitHub Copilot AI pair programmer: Asset or Liability?",
    411       "authors": [
    412         "Arghavan Moradi Dakhel",
    413         "Vahid Majdinasab",
    414         "Amin Nikanjam",
    415         "Foutse Khomh",
    416         "Michel C. Desmarais",
    417         "Zhen Ming (Jack) Jiang"
    418       ],
    419       "year": 2023,
    420       "relevance": "Empirical evaluation of Copilot as AI pair programmer; relevant to AI coding assistant quality."
    421     },
    422     {
    423       "title": "An Empirical Study on the Code Refactoring Capability of Large Language Models",
    424       "authors": [
    425         "Jonathan Cordeiro",
    426         "Shayan Noei",
    427         "Ying Zou"
    428       ],
    429       "year": 2024,
    430       "arxiv_id": "2411.02320",
    431       "relevance": "Evaluates LLM refactoring quality under different prompt strategies; direct comparison to agentic refactoring."
    432     },
    433     {
    434       "title": "Vibe Coding vs. Agentic Coding: Fundamentals and Practical Implications of Agentic AI",
    435       "authors": [
    436         "Ranjan Sapkota",
    437         "Konstantinos I. Roumeliotis",
    438         "Manoj Karkee"
    439       ],
    440       "year": 2025,
    441       "arxiv_id": "2505.19443",
    442       "relevance": "Distinguishes vibe coding from agentic coding paradigms; relevant to understanding agent autonomy levels."
    443     },
    444     {
    445       "title": "Lost at C: A User Study on the Security Implications of Large Language Model Code Assistants",
    446       "authors": [
    447         "Gustavo Sandoval",
    448         "Hammond Pearce",
    449         "Teo Nys",
    450         "Ramesh Karri",
    451         "Siddharth Garg",
    452         "Brendan Dolan-Gavitt"
    453       ],
    454       "year": 2023,
    455       "relevance": "User study on security implications of LLM code assistants; relevant to AI-generated code quality."
    456     },
    457     {
    458       "title": "Exploring ChatGPT's code refactoring capabilities: An empirical study",
    459       "authors": [
    460         "Kayla Depalma",
    461         "Izabel Miminoshvili",
    462         "Chiara Henselder",
    463         "Kate Moss",
    464         "Eman Abdullah AlOmar"
    465       ],
    466       "year": 2024,
    467       "relevance": "Empirical study of ChatGPT refactoring showing inconsistency and unnecessary edits."
    468     }
    469   ],
    470   "engagement_factors": {
    471     "practical_relevance": {
    472       "score": 2,
    473       "justification": "Directly informs developers on what to delegate to AI agents (low-level cleanup) vs. handle themselves (architectural refactoring)."
    474     },
    475     "surprise_contrarian": {
    476       "score": 1,
    477       "justification": "The finding that agents fail to reduce code smells despite refactoring is mildly surprising, but the dominance of low-level edits is largely expected."
    478     },
    479     "fear_safety": {
    480       "score": 0,
    481       "justification": "No safety, security, or risk angle is present in this study."
    482     },
    483     "drama_conflict": {
    484       "score": 1,
    485       "justification": "Mildly questions the value proposition of AI coding agents by showing they produce negligible quality improvements and mostly do cosmetic cleanup."
    486     },
    487     "demo_ability": {
    488       "score": 1,
    489       "justification": "Replication package exists on GitHub but requires RefactoringMiner, DesigniteJava, and significant setup to reproduce."
    490     },
    491     "brand_recognition": {
    492       "score": 2,
    493       "justification": "Study directly analyzes OpenAI Codex, Claude Code, Cursor, and Devin — well-known products in the developer tools space."
    494     }
    495   }
    496 }

Impressum · Datenschutz