calibration.json (18384B)
1 { 2 "calibration_metadata": { 3 "paper_slug": "agentic-refactoring-empirical-2025", 4 "calibration_date": "2026-02-28", 5 "sonnet_scan_date": "2026-02-28", 6 "calibrator": "opus", 7 "agreement_rate": 0.98, 8 "total_questions": 50, 9 "agreements": 49, 10 "disagreements": 1 11 }, 12 "opus_checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "Replication package provided at https://github.com/Mont9165/Agent_Refactoring_Analysis (footnote 4, Section 1). This is a working URL to a public repository." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The replication package includes data used in the study. The AIDev dataset [28] used as the primary source is also publicly available." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "The paper specifies tool versions (RefactoringMiner 3.0.11, GPT-4.1-mini, DesigniteJava) but provides no requirements.txt, Dockerfile, or detailed environment setup listing library versions and dependencies." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "The paper references a replication package but does not include step-by-step reproduction instructions in the paper text. No commands, scripts, or 'Reproducing Results' section is present." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper reports point estimates (medians, percentages, p-values, effect sizes) but no confidence intervals or error bars on main results." 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": true, 44 "justification": "Mann-Whitney U test (RQ1, Section 4.1.2), Wilcoxon signed-rank tests with Benjamini-Hochberg FDR correction (RQ4, Section 4.4.2), and Kruskal-Wallis tests for cross-group comparisons are used with reported p-values." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "Cliff's delta (d = 0.838, large) for RQ1 (Section 4.1.2), Cohen's d (-0.027, -0.026, negligible) for smell counts in RQ4, and rank-biserial effect sizes for metric comparisons. Thresholds are explicitly defined (Equation 1)." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "No power analysis or justification for why the sample size is sufficient. The sample is determined by the availability of data in the AIDev dataset, not by any statistical consideration." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "Median delta values are reported (Table 7) but without IQR, standard deviation, or other spread measures. Figures 3 and 5 show distributions qualitatively but spread measures are not quantified in tables." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "Human refactoring patterns from Horikawa et al. 2025 (RQ2 type distributions) and Kim et al. 2014 (RQ3 purpose classification) serve as baselines for comparison." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "Horikawa et al. 2025 (ICSME'25) is contemporary for type distributions. Kim et al. 2014 is older for purpose comparison but is the canonical study on developer refactoring motivations and the paper explicitly adjusts its categorization scheme." 72 }, 73 "ablation_study": { 74 "applies": false, 75 "answer": false, 76 "justification": "This is a descriptive mining study that does not propose a system or method with components. There is nothing to ablate. The breakdowns by abstraction level and purpose are analytical perspectives, not ablation experiments." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "RQ4 uses multiple metrics: Lines of Code, Cyclomatic Complexity, WMC, Fan-in, Fan-out, LCOM, DIT, design smell counts, and implementation smell counts (Section 4.4.2, Table 7)." 82 }, 83 "human_evaluation": { 84 "applies": true, 85 "answer": true, 86 "justification": "Two human annotators (7 years experience each) independently labeled a stratified sample of refactoring commits for purpose classification (RQ3). Cohen's kappa = 0.83 between humans. Disagreements resolved by a third annotator (17 years experience). Section 4.3.2." 87 }, 88 "held_out_test_set": { 89 "applies": false, 90 "answer": false, 91 "justification": "This is a mining study analyzing the full dataset descriptively. There is no train/test split or predictive model evaluation." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results broken down by abstraction level (high/medium/low) in Tables 4, 5, and 7; by individual refactoring type in Table 5; by purpose category in Figure 4." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "Finding #8 reports negligible smell count reduction (median delta = 0.00). Finding #10 notes low-level edits slightly increasing cyclomatic complexity. Section 5 discusses limitations of current agents." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "Finding #8 explicitly reports agents 'fail to consistently reduce the overall count of known design and implementation smells' (Section 4.4.3). Cohen's d = -0.027 (negligible)." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "All abstract claims are supported: 26.1% refactoring rate (Table 3), low-level type dominance (Tables 4-5), maintainability 52.5% and readability 28.1% (Figure 4), Class LOC median delta = -15.25 (Table 7)." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": false, 118 "justification": "The paper claims agentic refactoring 'yields improvements' and 'reduces' metrics (e.g., abstract, Finding #9) using before-after Wilcoxon tests. This is an observational design without control commits; commits may contain non-refactoring changes that confound the metric deltas. The tangled commits issue is acknowledged in Section 4.1 but not controlled for in RQ4." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": true, 123 "justification": "Section 7.3 explicitly bounds findings to OSS Java projects from the AIDev dataset and warns about generalization to industrial settings and other languages." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": false, 128 "justification": "The paper does not discuss alternative explanations for the observed patterns. The dominance of OpenAI Codex (89.3% of commits) means results may reflect Codex-specific behavior rather than agentic coding generally, but this is not discussed. The threats section focuses on tool accuracy and construct definitions, not alternative interpretations of findings." 129 } 130 }, 131 "setup_transparency": { 132 "model_versions_specified": { 133 "applies": true, 134 "answer": true, 135 "justification": "GPT-4.1-mini is specified (Section 3.2.2, Figure 2). RefactoringMiner 3.0.11 is specified (footnote 8). GPT-4.1-mini is a specific API model identifier." 136 }, 137 "prompts_provided": { 138 "applies": true, 139 "answer": false, 140 "justification": "GPT-4.1-mini is used to classify repository types and refactoring purposes but the actual prompts are not provided. The paper describes inputs (README content, commit messages, PR titles) and output categories but not the prompt text." 141 }, 142 "hyperparameters_reported": { 143 "applies": true, 144 "answer": false, 145 "justification": "No hyperparameters (temperature, top-p, max_tokens) are reported for the GPT-4.1-mini API calls used for classification." 146 }, 147 "scaffolding_described": { 148 "applies": false, 149 "answer": false, 150 "justification": "This study analyzes existing agent-generated commits from the AIDev dataset. It evaluates third-party agents (OpenAI Codex, Devin, Cursor, Claude Code) as black boxes and does not build or deploy any agentic scaffolding." 151 }, 152 "data_preprocessing_documented": { 153 "applies": true, 154 "answer": true, 155 "justification": "Section 3.2 describes the multi-stage filtering pipeline with specific counts at each stage: 1,311,057 commits -> 14,998 Java non-merge commits -> 5,789 refactoring commits -> 3,907 agentic refactoring commits. Filtering criteria (Java file modification, merge exclusion, README classification, fork removal) are stated at each step." 156 } 157 }, 158 "limitations_and_scope": { 159 "limitations_section_present": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 7 'Threats to Validity' contains three dedicated subsections: Internal Validity (7.1), Construct Validity (7.2), and External Validity (7.3) with substantive discussion." 163 }, 164 "threats_to_validity_specific": { 165 "applies": true, 166 "answer": true, 167 "justification": "Specific threats include: RefactoringMiner false positives/negatives (7.1), GPT-4.1-mini misclassification risk mitigated by kappa = 0.77 (7.1), difficulty determining precise human intervention level in 'agentic' commits (7.2), and OSS-only Java dataset limiting generalizability (7.3)." 168 }, 169 "scope_boundaries_stated": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 7.3 explicitly states: 'Our study is based on the AIDev dataset, which consists exclusively of open-source software (OSS) projects' and 'our analysis was limited to commits that involved changes to Java files.' Warns about generalization to industrial settings and other languages." 173 } 174 }, 175 "data_integrity": { 176 "raw_data_available": { 177 "applies": true, 178 "answer": true, 179 "justification": "Replication package at https://github.com/Mont9165/Agent_Refactoring_Analysis includes data. The AIDev dataset [28] is also publicly available." 180 }, 181 "data_collection_described": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 3.1 describes using the GitHub REST API to mine commits from repositories in the AIDev dataset, collecting 1,311,057 agentic commits. The AIDev dataset composition (932,791 PRs from 61,000+ repositories across five agents) is described." 185 }, 186 "recruitment_methods_described": { 187 "applies": false, 188 "answer": false, 189 "justification": "No human participants are recruited. Data sources are public GitHub repositories via the AIDev dataset, a standard public dataset. This is a repository mining study." 190 }, 191 "data_pipeline_documented": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 3.2 documents each filtering step with counts: 1.3M commits -> 14,998 (Java non-merge) -> 5,789 (refactoring by RefactoringMiner) -> 3,907 (agentic refactoring by keyword matching). Repository filtering criteria (toy removal, fork removal) with counts are also stated." 195 } 196 }, 197 "conflicts_of_interest": { 198 "funding_disclosed": { 199 "applies": true, 200 "answer": true, 201 "justification": "Acknowledgments section lists JSPS KAKENHI grants (JP24K02921, JP25K21359), JST PRESTO (JPMJPR22P3), ASPIRE (JPMJAP2415), AIP Accelerated Program (JPMJCR25U7), and NSERC Canada." 202 }, 203 "affiliations_disclosed": { 204 "applies": true, 205 "answer": true, 206 "justification": "Author affiliations listed: Nara Institute of Science and Technology (Japan) and Queen's University (Canada). No authors are affiliated with the evaluated agent companies (OpenAI, Anthropic, etc.)." 207 }, 208 "funder_independent_of_outcome": { 209 "applies": true, 210 "answer": true, 211 "justification": "Funding from government research agencies (JSPS, JST, NSERC) with no financial stake in agentic refactoring outcomes." 212 }, 213 "financial_interests_declared": { 214 "applies": true, 215 "answer": false, 216 "justification": "No competing interests or financial interests statement appears in the paper. Per the schema, absence of disclosure counts as NO." 217 } 218 }, 219 "contamination": { 220 "training_cutoff_stated": { 221 "applies": false, 222 "answer": false, 223 "justification": "This is a mining/repository study. GPT-4.1-mini is used as a classification tool, not evaluated on a benchmark for model capability." 224 }, 225 "train_test_overlap_discussed": { 226 "applies": false, 227 "answer": false, 228 "justification": "Same rationale: the study mines agent-generated code in real-world repositories, not benchmark evaluation of model capability." 229 }, 230 "benchmark_contamination_addressed": { 231 "applies": false, 232 "answer": false, 233 "justification": "The study does not use a standard benchmark to evaluate model knowledge or capabilities. It mines real-world commit history." 234 } 235 }, 236 "human_studies": { 237 "pre_registered": { 238 "applies": false, 239 "answer": false, 240 "justification": "No human participants in the study. Mining public GitHub repositories is not a human subjects study." 241 }, 242 "irb_or_ethics_approval": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants. The study mines public open-source repositories using automated tools." 246 }, 247 "demographics_reported": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants as study subjects. Annotators for validation are described only by experience level." 251 }, 252 "inclusion_exclusion_criteria": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participant recruitment. Inclusion/exclusion criteria apply to repositories and commits, not human subjects." 256 }, 257 "randomization_described": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants. Not an experimental study with treatment/control conditions." 261 }, 262 "blinding_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants as study subjects. Not an experimental study requiring blinding." 266 }, 267 "attrition_reported": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants. Data exclusions are documented as filtering steps, not participant attrition." 271 } 272 }, 273 "cost_and_practicality": { 274 "inference_cost_reported": { 275 "applies": true, 276 "answer": false, 277 "justification": "GPT-4.1-mini is used to classify 3,907+ commits and 3,232+ repositories. No API costs, token counts, or processing times are reported." 278 }, 279 "compute_budget_stated": { 280 "applies": true, 281 "answer": false, 282 "justification": "No total computational budget reported for RefactoringMiner on 14,998 commits, DesigniteJava on pre/post snapshots, or GPT-4.1-mini API calls. No hardware specification or processing time mentioned." 283 } 284 } 285 }, 286 "disagreements": [ 287 { 288 "question": "evaluation_design.ablation_study", 289 "category": "evaluation_design", 290 "sonnet_applies": true, 291 "sonnet_answer": false, 292 "opus_applies": false, 293 "opus_answer": false, 294 "direction": "applies_boundary", 295 "sonnet_justification": "No ablation study is performed. The study is descriptive/analytical rather than proposing a system with components to ablate. Analysis across abstraction levels and purposes are breakdowns, not ablations.", 296 "opus_justification": "This is a descriptive mining study that does not propose a system or method with components. There is nothing to ablate. The breakdowns by abstraction level and purpose are analytical perspectives, not ablation experiments.", 297 "resolution_note": "Sonnet set applies=true despite recognizing the study has no system to ablate, then answered false. Opus set applies=false because the criterion is structurally inapplicable to a purely descriptive mining study with no proposed system or method. The schema states 'NA if the system has only one component' -- this study has no system at all, which is a stronger case for applies=false. Both agree the paper does not contain an ablation study; the disagreement is on whether ablation is even applicable to this paper type." 298 } 299 ], 300 "summary": { 301 "agreement_rate": 0.98, 302 "total_disagreements": 1, 303 "by_direction": { 304 "sonnet_generous": 0, 305 "opus_generous": 0, 306 "applies_boundary": 1, 307 "interpretive": 0 308 }, 309 "notes": "Exceptionally high agreement (98%) on this paper. The only disagreement is an applies-boundary issue on ablation_study. Sonnet correctly identified that no ablation was performed but set applies=true for a study with no system to ablate. Opus considers this structurally inapplicable (applies=false) since the study is purely descriptive/observational with no proposed method or system. This disagreement has no impact on the effective score since both agree the paper does not contain an ablation study. The paper is a well-executed mining study with strong statistical methodology (appropriate tests, effect sizes, FDR correction) and good transparency (replication package, detailed pipeline documentation). Key methodological gaps include: no prompts for the GPT-4.1-mini classifier, no hyperparameters, no cost reporting, causal language not matched by study design, and no discussion of alternative explanations (especially the Codex dominance issue)." 310 } 311 }