calibration.json (19166B)
1 { 2 "calibration": { 3 "paper_slug": "agentspawn-adaptive-multiagent-2026", 4 "calibration_date": "2026-02-28", 5 "sonnet_scan_date": "2026-02-28", 6 "model": "opus", 7 "agreement_rate": 0.94, 8 "total_questions": 50, 9 "agreements": 47, 10 "disagreements": 3, 11 "disagreement_details": [ 12 { 13 "category": "evaluation_design", 14 "question": "multiple_metrics", 15 "sonnet": {"applies": true, "answer": true}, 16 "opus": {"applies": true, "answer": false}, 17 "direction": "sonnet_generous", 18 "reasoning": "Sonnet credits the paper for proposing multiple evaluation metrics in Section 4.1 (task completion rate, memory overhead, spawn count, coherence violations, cost per success). However, the conclusion explicitly lists empirical validation as future work, meaning no experiments were conducted and no metrics were actually measured. Proposing metrics in a design document is not the same as using multiple metrics in an evaluation. The schema asks 'Are multiple evaluation metrics used?' — they were not used because no evaluation occurred." 19 }, 20 { 21 "category": "evaluation_design", 22 "question": "per_category_breakdown", 23 "sonnet": {"applies": true, "answer": true}, 24 "opus": {"applies": true, "answer": false}, 25 "direction": "sonnet_generous", 26 "reasoning": "Sonnet credits Tables 4 (breakdown by difficulty: Easy/Medium/Hard) and 5 (breakdown by task type: Simple refactor/Multi-file fix/Complex feature) as per-category breakdowns. While these tables visually contain breakdowns, the conclusion confirms no experiments were conducted — these numbers are fabricated. Crediting presentation of invented data as satisfying a methodological criterion is incorrect. The criterion evaluates whether actual results are broken down, not whether hypothetical numbers are formatted in a table." 
27 }, 28 { 29 "category": "cost_and_practicality", 30 "question": "inference_cost_reported", 31 "sonnet": {"applies": true, "answer": true}, 32 "opus": {"applies": true, "answer": false}, 33 "direction": "sonnet_generous", 34 "reasoning": "Sonnet credits Table 6 (cost-benefit analysis: $2.10 vs. $2.30 cost per success, 23 vs. 12 API calls, 26 vs. 18 minutes average time) and Section 5.3's O(n·ℓ) cost scaling analysis. However, Table 6's numbers are not from actual experiments — the conclusion lists 'empirical validation' as future work. These are projected or hypothetical costs, not measured inference costs. The schema asks 'Is inference cost or latency reported?' — reporting invented projections of what costs might be does not satisfy this criterion." 35 } 36 ], 37 "disagreement_summary": { 38 "sonnet_generous": 3, 39 "opus_generous": 0, 40 "applies_boundary": 0, 41 "interpretive": 0 42 }, 43 "notes": "All three disagreements follow the same pattern: Sonnet credits the paper for presenting tables and numbers that appear to satisfy the criterion on their surface, even though the conclusion explicitly states that no experiments were conducted (empirical validation is listed as future work item #2). This is a case of Sonnet evaluating the form of the paper (tables exist, numbers are present, categories are shown) rather than the substance (the numbers are fabricated or hypothetical). This paper is unusual in that it presents extensive fake experimental results as if they were real, which creates a trap for checklist evaluation: a literal reading of 'are multiple metrics used' would credit the paper because multiple metrics appear in the tables, but a substantive reading recognizes that metrics from non-existent experiments don't count as 'used.' Opus consistently applies the substantive reading." 
44 }, 45 "opus_checklist": { 46 "artifacts": { 47 "code_released": { 48 "applies": true, 49 "answer": false, 50 "justification": "No code, repository, or archive is provided. The conclusion (Section 6) explicitly states 'implementing the AgentSpawn prototype using Claude 3.5 Sonnet or GPT-4' as future work, confirming no implementation exists." 51 }, 52 "data_released": { 53 "applies": true, 54 "answer": false, 55 "justification": "No datasets are released. The custom refactoring task set (100 tasks) mentioned in Section 4.1 is not available. SWE-bench and Defects4J are referenced as benchmarks but no data artifacts are provided." 56 }, 57 "environment_specified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No environment specification (requirements.txt, Dockerfile, dependency list) is provided. The paper has no implementation, so there is no environment to specify." 61 }, 62 "reproduction_instructions": { 63 "applies": true, 64 "answer": false, 65 "justification": "No reproduction instructions exist. The conclusion lists 'empirical validation on SWE-bench and Defects4J with proper baseline configurations' as future work item (2), confirming experiments were never conducted." 66 } 67 }, 68 "statistical_methodology": { 69 "confidence_intervals_or_error_bars": { 70 "applies": true, 71 "answer": false, 72 "justification": "No confidence intervals or error bars appear anywhere. All tables (2-6) report single point estimates with no uncertainty quantification." 73 }, 74 "significance_tests": { 75 "applies": true, 76 "answer": false, 77 "justification": "No statistical significance tests are reported. Comparative claims (e.g., '+97% over GPT-4 Single' in Table 2) lack any statistical testing." 
78 }, 79 "effect_sizes_reported": { 80 "applies": true, 81 "answer": false, 82 "justification": "Table 2 reports percentage improvements over baselines (e.g., +97%, +85%, +74%) but the GPT-4 Single baseline absolute values are not stated — only 'baseline' appears. Without knowing the absolute rates, effect sizes are uninterpretable. Furthermore, the results are from non-existent experiments." 83 }, 84 "sample_size_justified": { 85 "applies": true, 86 "answer": false, 87 "justification": "The paper states 300 SWE-bench tasks, 200 Defects4J tasks, and 100 custom tasks (Section 4.1) with no justification for these sizes and no power analysis. The experiments were never conducted." 88 }, 89 "variance_reported": { 90 "applies": true, 91 "answer": false, 92 "justification": "No variance, standard deviation, or spread measures appear for any result. All numbers are single-point estimates." 93 } 94 }, 95 "evaluation_design": { 96 "baselines_included": { 97 "applies": true, 98 "answer": false, 99 "justification": "Table 2 lists baselines (GPT-4 Single, AutoGen, CrewAI, AFLOW) but the conclusion states 'empirical validation on SWE-bench and Defects4J with proper baseline configurations' is future work, meaning baseline comparisons were never actually conducted." 100 }, 101 "baselines_contemporary": { 102 "applies": true, 103 "answer": false, 104 "justification": "The baselines listed (AutoGen, CrewAI, AFLOW) are contemporary systems, but since no experiments were conducted (per the conclusion's future work list), no actual baseline comparison was performed." 105 }, 106 "ablation_study": { 107 "applies": true, 108 "answer": false, 109 "justification": "Table 3 presents an ablation study removing memory slicing, skill inheritance, adaptive spawning, and coherence manager. However, the conclusion confirms experiments were not conducted, so these ablation numbers are hypothetical." 
110 }, 111 "multiple_metrics": { 112 "applies": true, 113 "answer": false, 114 "justification": "Section 4.1 proposes five metrics (task completion rate, memory overhead, spawn count, coherence violations, cost per success) but no experiments were conducted. Proposing metrics in a design is not the same as using them in an evaluation. No metrics were actually measured." 115 }, 116 "human_evaluation": { 117 "applies": false, 118 "answer": false, 119 "justification": "The paper evaluates automated code generation via benchmark test suites. Human evaluation is not applicable and not claimed." 120 }, 121 "held_out_test_set": { 122 "applies": true, 123 "answer": false, 124 "justification": "No train/test split methodology is described. The paper references existing benchmarks as test sets but no experiments were conducted and no splitting procedure is documented." 125 }, 126 "per_category_breakdown": { 127 "applies": true, 128 "answer": false, 129 "justification": "Tables 4 and 5 present breakdowns by task difficulty (Easy/Medium/Hard) and task type (Simple refactor/Multi-file fix/Complex feature). However, since the conclusion confirms no experiments were run, these numbers are fabricated. Presenting invented data in categorized tables does not constitute providing per-category breakdowns of actual results." 130 }, 131 "failure_cases_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No actual failure cases are shown or analyzed. Section 5.4 lists design limitations (hyperparameter sensitivity, semantic merge complexity) but presents no empirical failure examples." 135 }, 136 "negative_results_reported": { 137 "applies": true, 138 "answer": false, 139 "justification": "Every result presented shows AgentSpawn outperforming all baselines on all benchmarks. No negative results, failed approaches, or configurations that didn't work are reported." 
140 } 141 }, 142 "claims_and_evidence": { 143 "abstract_claims_supported": { 144 "applies": true, 145 "answer": false, 146 "justification": "The abstract claims '34% higher completion rates than static baselines on benchmarks like SWE-bench' and '42% memory overhead reduction.' The conclusion contradicts these by listing empirical validation as future work, meaning the abstract's claims are unsupported by actual experiments." 147 }, 148 "causal_claims_justified": { 149 "applies": true, 150 "answer": false, 151 "justification": "The paper makes causal claims: 'memory slicing reduces context overflow failures by 42%' (Section 4.2), and the ablation in Table 3 implies causal component contributions. No experiments were conducted, so no causal claims can be justified." 152 }, 153 "generalization_bounded": { 154 "applies": true, 155 "answer": false, 156 "justification": "The abstract claims results 'on benchmarks like SWE-bench' without clarifying that experiments were never performed. Section 5.6 generalizes to 'document authoring, data analysis, system administration' without any evidence. The title ('Long-Horizon Code Generation') is broad given no empirical validation exists." 157 }, 158 "alternative_explanations_discussed": { 159 "applies": true, 160 "answer": false, 161 "justification": "No alternative explanations for the presented results are discussed. Section 5.4 mentions design limitations but does not consider alternative explanations for why the system might appear to perform well (e.g., benchmark selection, baseline configuration)." 162 } 163 }, 164 "setup_transparency": { 165 "model_versions_specified": { 166 "applies": true, 167 "answer": false, 168 "justification": "No model version is specified. The conclusion mentions 'Claude 3.5 Sonnet or GPT-4' as future implementation candidates. No model was actually used." 169 }, 170 "prompts_provided": { 171 "applies": true, 172 "answer": false, 173 "justification": "No actual prompts are provided. 
Section 3.3 describes skills as 'parameterized prompts s = (p, θs)' with abstract examples like 'Write unit tests for {function}' — these are templates, not actual prompts. No experiments were conducted." 174 }, 175 "hyperparameters_reported": { 176 "applies": true, 177 "answer": true, 178 "justification": "Table 1 provides hyperparameter specifications: spawn threshold δ=0.7, memory threshold θ=0.5, metric weights w1-w5, max spawn depth=3, concurrent spawn limit=4, timeout=600s, relevance weights α=0.3, β=0.3, γ=0.2, δ=0.2. These are proposed design values, but they are explicitly stated." 179 }, 180 "scaffolding_described": { 181 "applies": true, 182 "answer": true, 183 "justification": "The paper extensively describes agentic scaffolding: five components (Section 3.1), three formal algorithms (Algorithms 1-3), data structures for spawn/resume packages (Appendix A), memory tiers (Section 3.2), complexity metrics (Section 3.4), and coherence protocols (Section 3.6)." 184 }, 185 "data_preprocessing_documented": { 186 "applies": true, 187 "answer": false, 188 "justification": "No data preprocessing is described. Section 4.1 states benchmark sizes (300 SWE-bench, 200 Defects4J, 100 custom) without describing how tasks were selected, filtered, or prepared." 189 } 190 }, 191 "limitations_and_scope": { 192 "limitations_section_present": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 5.4 'Limitations' provides a dedicated subsection discussing four specific limitations: hyperparameter sensitivity, semantic merge complexity (the reported 73% success rate varies), domain generalization, and spawn depth scaling." 196 }, 197 "threats_to_validity_specific": { 198 "applies": true, 199 "answer": false, 200 "justification": "Section 5.4 discusses design limitations but not threats to validity of the experimental results. 
Critically, it does not acknowledge the most significant validity threat: that the presented results are from experiments that were never conducted." 201 }, 202 "scope_boundaries_stated": { 203 "applies": true, 204 "answer": false, 205 "justification": "The paper does not clearly state what the results do NOT show. The abstract and body present fabricated results as real without bounding what was actually demonstrated (architectural design only) versus what was not (any empirical validation)." 206 } 207 }, 208 "data_integrity": { 209 "raw_data_available": { 210 "applies": true, 211 "answer": false, 212 "justification": "No raw data is available. The experimental results in Tables 2-6 are not from actual experiments — the conclusion lists empirical validation as future work. No underlying data exists to verify." 213 }, 214 "data_collection_described": { 215 "applies": true, 216 "answer": false, 217 "justification": "No data collection procedure is described. The paper references existing benchmarks and a 'custom refactoring task set' without describing how any data was gathered or prepared." 218 }, 219 "recruitment_methods_described": { 220 "applies": false, 221 "answer": false, 222 "justification": "No human participants are involved. The paper evaluates automated code generation benchmarks." 223 }, 224 "data_pipeline_documented": { 225 "applies": true, 226 "answer": false, 227 "justification": "No data pipeline is documented. There is no description of how raw data flows to final results, because no experiments were conducted." 228 } 229 }, 230 "conflicts_of_interest": { 231 "funding_disclosed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No funding source is disclosed. There is no acknowledgments section, no grant information, and no mention of any institutional or corporate support." 
235 }, 236 "affiliations_disclosed": { 237 "applies": true, 238 "answer": true, 239 "justification": "The author lists 'AutoHand Evolve' as affiliation with email igor@autohand.ai on the title page." 240 }, 241 "funder_independent_of_outcome": { 242 "applies": true, 243 "answer": false, 244 "justification": "No funding is disclosed, so independence cannot be assessed. The author is affiliated with AutoHand Evolve, which presumably has commercial interest in the proposed architecture's success." 245 }, 246 "financial_interests_declared": { 247 "applies": true, 248 "answer": false, 249 "justification": "No competing interests statement, patent disclosures, or financial interests declaration appears anywhere in the paper." 250 } 251 }, 252 "contamination": { 253 "training_cutoff_stated": { 254 "applies": true, 255 "answer": false, 256 "justification": "No training data cutoff date is stated for any model. The conclusion mentions 'Claude 3.5 Sonnet or GPT-4' as future implementation options, but no model was actually used in experiments." 257 }, 258 "train_test_overlap_discussed": { 259 "applies": true, 260 "answer": false, 261 "justification": "No discussion of potential train/test overlap. SWE-bench tasks are publicly available and likely in LLM training data, but this is not addressed." 262 }, 263 "benchmark_contamination_addressed": { 264 "applies": true, 265 "answer": false, 266 "justification": "SWE-bench (published 2023) and Defects4J are public benchmarks likely seen during training of contemporary LLMs. The paper does not address contamination risk." 267 } 268 }, 269 "human_studies": { 270 "pre_registered": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants. The study evaluates automated code generation benchmarks." 274 }, 275 "irb_or_ethics_approval": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants. IRB approval is not applicable." 
279 }, 280 "demographics_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants." 284 }, 285 "inclusion_exclusion_criteria": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants." 289 }, 290 "randomization_described": { 291 "applies": false, 292 "answer": false, 293 "justification": "No human participants or randomized experimental design." 294 }, 295 "blinding_described": { 296 "applies": false, 297 "answer": false, 298 "justification": "No human participants." 299 }, 300 "attrition_reported": { 301 "applies": false, 302 "answer": false, 303 "justification": "No human participants." 304 } 305 }, 306 "cost_and_practicality": { 307 "inference_cost_reported": { 308 "applies": true, 309 "answer": false, 310 "justification": "Table 6 presents cost figures ($2.10 vs. $2.30 per success, API call counts, timing) and Section 5.3 discusses O(n·ℓ) cost scaling. However, these are hypothetical projections from experiments that were never conducted (per the conclusion's future work list), not measured inference costs." 311 }, 312 "compute_budget_stated": { 313 "applies": true, 314 "answer": false, 315 "justification": "No total computational budget is stated. Since no experiments were conducted, there is no actual compute expenditure to report." 316 } 317 } 318 } 319 }