calibration.json (21348B)
1 { 2 "paper_slug": "agent-contracts-formal-2026", 3 "calibration_model": "opus", 4 "scan_model": "sonnet", 5 "timestamp": "2026-02-28", 6 "total_questions": 50, 7 "agreement_count": 47, 8 "disagreement_count": 3, 9 "agreement_rate": 0.94, 10 "disagreements": [ 11 { 12 "category": "claims_and_evidence", 13 "question": "generalization_bounded", 14 "sonnet": {"applies": true, "answer": true}, 15 "opus": {"applies": true, "answer": false}, 16 "direction": "sonnet_generous", 17 "explanation": "The title 'Agent Contracts: A Formal Framework for Resource-Bounded Autonomous AI Systems' claims general applicability to 'Autonomous AI Systems,' but empirical validation is limited to Gemini 2.5 Flash/Flash-Lite with Google ADK/LiteLLM on two benchmarks (LiveCodeBench, OpenR1). The schema guidance states: 'broad titles like LLM-based Software Engineering when results are on a single benchmark in a single language is NO.' While Section 7 discusses enforcement limitations, the title and conclusion ('formal governance becomes essential... Agent Contracts provide one such foundation') imply general applicability beyond the tested setting. Sonnet credited the Section 7 limitations as adequate bounding, but the title and conclusion's generalizations are not bounded to the tested models and benchmarks." 18 }, 19 { 20 "category": "claims_and_evidence", 21 "question": "alternative_explanations_discussed", 22 "sonnet": {"applies": true, "answer": true}, 23 "opus": {"applies": true, "answer": false}, 24 "direction": "sonnet_generous", 25 "explanation": "Sonnet credited Section 7's discussion of enforcement limitations and the Section 8.3 outlier analysis as discussion of alternative explanations. However, Section 7 addresses what contracts cannot enforce (single-call budget overruns), which is a limitation, not an alternative explanation for the observed results. 
The core alternative explanation — that simple budget-aware prompting or iteration limits alone (without the full formal contract framework) could produce equivalent token reduction and variance reduction — is not discussed. The outlier analysis in Section 8.3 is a sensitivity analysis, not a consideration of confounding factors. Per the schema, 'a threats-to-validity section counts only if it discusses specific alternative explanations for the observed results, not just generic methodological limitations.'" 26 }, 27 { 28 "category": "cost_and_practicality", 29 "question": "inference_cost_reported", 30 "sonnet": {"applies": true, "answer": false}, 31 "opus": {"applies": true, "answer": true}, 32 "direction": "opus_generous", 33 "explanation": "The schema says 'Look for: API costs, tokens consumed, wall-clock time, cost per example.' The paper reports tokens consumed per task (3,461 vs 34,606 in Code Review), wall-clock time per task (6.9s, 12.5s, 16.9s in Strategy Modes), and iteration/LLM call counts. These are explicitly listed in the schema's 'Look for' guidance as sufficient indicators. Sonnet required dollar-cost conversion, but the schema does not require this — 'tokens consumed' and 'wall-clock time' are listed as standalone acceptable evidence." 34 } 35 ], 36 "opus_checklist": { 37 "artifacts": { 38 "code_released": { 39 "applies": true, 40 "answer": true, 41 "justification": "GitHub repository URL provided in footnote 3: 'Implementation available at https://github.com/flyersworder/agent-contracts.' The authors state they provide 'experiment code and data for reproducibility.' This is a working URL to a public repository." 42 }, 43 "data_released": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper uses publicly available benchmarks: LiveCodeBench (arXiv:2403.07974) and OpenR1 logic puzzles. The authors also state they provide experiment data via the GitHub repository." 
47 }, 48 "environment_specified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper mentions Google ADK and LiteLLM as frameworks and Gemini 2.5 Flash/Flash-Lite as models, but provides no requirements.txt, Dockerfile, conda environment file, or specific library versions. Mentioning framework names without version numbers is insufficient to recreate the environment." 52 }, 53 "reproduction_instructions": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper links to a GitHub repository but provides no step-by-step reproduction instructions in the paper itself. Experiment designs are described conceptually but not with specific commands, scripts, or a 'Reproducing Results' section." 57 } 58 }, 59 "statistical_methodology": { 60 "confidence_intervals_or_error_bars": { 61 "applies": true, 62 "answer": true, 63 "justification": "Section 8 states 'bootstrap confidence intervals (10,000 resamples) with BCa correction' for statistical analysis. P-values and significance tests are reported throughout the results tables." 64 }, 65 "significance_tests": { 66 "applies": true, 67 "answer": true, 68 "justification": "Multiple significance tests reported: paired t-test for token usage (p=0.0007), iterations (p<0.0001), LLM calls (p<0.0001), crisis communication token reduction (p=0.005), quality equivalence (p=0.32), success rate difference (p=0.13 NS), and strategy modes (p~0.05)." 69 }, 70 "effect_sizes_reported": { 71 "applies": true, 72 "answer": true, 73 "justification": "Effect sizes reported with baseline context: '90% token reduction' (from 34,606 to 3,461), '525x lower variance' (5.29B vs 10.1M), '7.1 percentage points' (60.0% to 52.9%), '23% token reduction,' '70%->86% success rate,' and '26.7x lower variance' (sigma 1.75 vs 9.07). Absolute and relative differences provided." 
74 }, 75 "sample_size_justified": { 76 "applies": true, 77 "answer": false, 78 "justification": "Sample sizes are stated (n=70, n=50, n=50, n=24) but never justified. No power analysis is provided. No acknowledgment of whether sample sizes are sufficient for the claims, particularly the n=24 Crisis Communication experiment." 79 }, 80 "variance_reported": { 81 "applies": true, 82 "answer": true, 83 "justification": "Variance is a central result metric: '525x lower variance (5.29B vs 10.1M)' for Code Review and '26.7x lower variance (sigma: 1.75 vs 9.07)' for Research Pipeline. Standard deviation is explicitly reported." 84 } 85 }, 86 "evaluation_design": { 87 "baselines_included": { 88 "applies": true, 89 "answer": true, 90 "justification": "Each experiment includes baseline comparisons: CONTRACTED vs UNCONTRACTED in Code Review, Research Pipeline, and Crisis Communication; three contract modes (URGENT, ECONOMICAL, BALANCED) in Strategy Modes." 91 }, 92 "baselines_contemporary": { 93 "applies": true, 94 "answer": true, 95 "justification": "The UNCONTRACTED baseline (no contract enforcement) is the natural and appropriate baseline for evaluating a first-of-kind governance framework. Current models (Gemini 2.5 Flash) and frameworks (Google ADK) are used. No prior comparable framework exists to serve as a competitive baseline." 96 }, 97 "ablation_study": { 98 "applies": true, 99 "answer": false, 100 "justification": "No ablation study isolates individual components of the Agent Contract framework. The four experiments test different scenarios but do not systematically remove single components (e.g., resource constraints without success criteria, or budget-aware prompting without formal lifecycle management)." 
101 }, 102 "multiple_metrics": { 103 "applies": true, 104 "answer": true, 105 "justification": "Multiple metrics reported across experiments: token usage, variance, iterations, LLM calls, success rate, timeout rate, reasoning tokens, average time, and quality scores. Code Review alone reports 5 distinct metrics." 106 }, 107 "human_evaluation": { 108 "applies": true, 109 "answer": false, 110 "justification": "The Research Pipeline experiment uses 'multi-judge LLM evaluation' (Section 8.3), not human evaluation. No human evaluation of system outputs is performed in any of the four experiments." 111 }, 112 "held_out_test_set": { 113 "applies": true, 114 "answer": true, 115 "justification": "LiveCodeBench problems 'released post-February 2025, after model cutoff' and OpenR1 problems 'released February 2025, after model cutoff' serve as held-out test sets that the model could not have seen during training." 116 }, 117 "per_category_breakdown": { 118 "applies": true, 119 "answer": true, 120 "justification": "Code Review provides breakdown by difficulty: 'medium-difficulty problems show 92% token savings versus 76% for easy' (Section 8.2). Research Pipeline uses five categories (technology, science, business, health, society). Strategy Modes breaks down by contract mode." 121 }, 122 "failure_cases_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Multiple failure cases discussed: one UNCONTRACTED agent 'failed entirely—stuck in an evaluation loop without submitting output' (Section 8.2); a runaway agent 'exceeded its 40K token budget (56K consumed)' detected and halted (Section 8.3)." 126 }, 127 "negative_results_reported": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper reports that contracted execution has a 7.1 percentage point lower success rate (52.9% vs 60.0%, p=0.13 NS) — an honest cost of governance. Section 7 discusses fundamental limitations of what contracts cannot guarantee." 
131 } 132 }, 133 "claims_and_evidence": { 134 "abstract_claims_supported": { 135 "applies": true, 136 "answer": true, 137 "justification": "All abstract claims are supported: '90% token reduction' (Section 8.2, 34,606 to 3,461), '525x lower variance' (Section 8.2, 5.29B vs 10.1M), 'zero conservation violations' (Section 8.3, 0/50 trials), 'measurable quality-resource tradeoffs' (Section 8.4, 70%->86% across modes)." 138 }, 139 "causal_claims_justified": { 140 "applies": true, 141 "answer": true, 142 "justification": "The Code Review experiment uses a within-subjects design (same 70 problems run in both CONTRACTED and UNCONTRACTED conditions), enabling controlled causal inference about the effect of contracts on resource consumption. The manipulation is the contract treatment (including budget-aware prompting), and the within-subjects design controls for problem difficulty." 143 }, 144 "generalization_bounded": { 145 "applies": true, 146 "answer": false, 147 "justification": "The title 'Agent Contracts: A Formal Framework for Resource-Bounded Autonomous AI Systems' claims general applicability. Empirical results are limited to Gemini 2.5 Flash/Flash-Lite on LiveCodeBench and OpenR1 with Google ADK/LiteLLM. The conclusion states 'formal governance becomes essential... Agent Contracts provide one such foundation' without bounding this to the tested models. Per schema guidance, a broad title paired with results from a single model family and two benchmarks is a NO." 148 }, 149 "alternative_explanations_discussed": { 150 "applies": true, 151 "answer": false, 152 "justification": "Section 7 discusses enforcement limitations (single-call budget overruns cannot be prevented) and Section 8.3 provides a sensitivity analysis for the outlier. However, the core alternative explanation — that simple budget-aware prompting or iteration limits alone could produce equivalent results without the formal contract framework — is not discussed. 
The paper does not consider whether the observed token reduction and variance reduction are attributable to the formal framework versus the simpler mechanisms it bundles together." 153 } 154 }, 155 "setup_transparency": { 156 "model_versions_specified": { 157 "applies": true, 158 "answer": false, 159 "justification": "The paper states 'Gemini 2.5 Flash and Flash-Lite (knowledge cutoff: January 2025)' but provides no API version identifier or snapshot date. Per schema guidance, marketing names like 'Gemini 2.5' without a snapshot date or API version do not count as specified versions." 160 }, 161 "prompts_provided": { 162 "applies": true, 163 "answer": false, 164 "justification": "The paper describes budget-aware prompting and shows 'Budget: {used}/{total}' as an example template, but does not provide the full prompt text used in any experiment. Prompt templates with placeholders without actual fill values do not satisfy the criterion." 165 }, 166 "hyperparameters_reported": { 167 "applies": true, 168 "answer": false, 169 "justification": "No LLM inference hyperparameters (temperature, top-p, max tokens, sampling settings) are reported for any experiment. The paper reports resource constraint parameters (token budgets, iteration limits) but not the model's generation settings." 170 }, 171 "scaffolding_described": { 172 "applies": true, 173 "answer": true, 174 "justification": "The agentic scaffolding is described in detail: Google ADK with DelegatingAdkAgent for multi-agent delegation, Coder-Reviewer pipeline mechanics, tool descriptions (test_code, web_search), budget-aware prompting mechanism, runtime monitoring, conservation law enforcement, and contract lifecycle management (Sections 5, 6, and 8)." 
175 }, 176 "data_preprocessing_documented": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper states which benchmarks were used and their general characteristics (difficulty levels, post-cutoff dates) but does not document how specific problems were selected or filtered. The selection of 31 easy and 39 medium LiveCodeBench problems, 50 medium OpenR1 problems, 50 research topics, and 24 crisis scenarios lacks documented filtering criteria." 180 } 181 }, 182 "limitations_and_scope": { 183 "limitations_section_present": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 7 'Fundamental Limitations and Practical Enforcement' is a dedicated limitations section covering single-call enforcement constraints, enforcement capabilities, and future infrastructure requirements." 187 }, 188 "threats_to_validity_specific": { 189 "applies": true, 190 "answer": true, 191 "justification": "The paper identifies specific threats: token consumption is unknowable during a call (Section 7.1), contracts provide best-effort not hard guarantees, the outlier sensitivity analysis in Section 8.3 examines whether one catastrophic failure drives variance results, and the Strategy Modes experiment acknowledges the reasoning_effort parameter is the direct mechanism." 192 }, 193 "scope_boundaries_stated": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 7.1 states 'contracts cannot prevent a single expensive call from exceeding budget.' Section 7.3 states 'contracts remain most valuable for multi-call and multi-agent scenarios.' These are specific statements about what the framework does NOT provide." 197 } 198 }, 199 "data_integrity": { 200 "raw_data_available": { 201 "applies": true, 202 "answer": true, 203 "justification": "The paper states 'we provide experiment code and data for reproducibility' via the GitHub repository (footnote 3). 
The benchmarks (LiveCodeBench, OpenR1) are publicly available for independent verification." 204 }, 205 "data_collection_described": { 206 "applies": true, 207 "answer": true, 208 "justification": "Benchmark sources are identified: LiveCodeBench (arXiv:2403.07974, post-February 2025 problems), OpenR1 (released February 2025), research topics across five categories, and 24 crisis communication scenarios. Sources, temporal windows, and difficulty levels are specified." 209 }, 210 "recruitment_methods_described": { 211 "applies": false, 212 "answer": false, 213 "justification": "No human participants. All experiments use LLM agents on benchmark problems and synthetically generated research topics." 214 }, 215 "data_pipeline_documented": { 216 "applies": true, 217 "answer": false, 218 "justification": "The selection criteria for specific problems within LiveCodeBench (31 easy, 39 medium from how many available?) and OpenR1 (50 medium-difficulty from how many?) are not documented. How research topics were generated and how crisis scenarios were defined is not explained." 219 } 220 }, 221 "conflicts_of_interest": { 222 "funding_disclosed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No funding is disclosed anywhere in the paper. While both authors are independent researchers with personal email addresses suggesting unfunded work, the experiments required real API costs (Gemini API for ~244 runs) and there is no acknowledgment of how these were funded. No acknowledgments section exists." 226 }, 227 "affiliations_disclosed": { 228 "applies": true, 229 "answer": true, 230 "justification": "Authors are listed as 'Independent Researcher' with personal emails (yeqi519@gmail.com, jtan@live.de). No institutional affiliation with Google (whose Gemini model is used) or other evaluated systems." 231 }, 232 "funder_independent_of_outcome": { 233 "applies": false, 234 "answer": false, 235 "justification": "No funding is disclosed. 
The authors appear to be self-funded independent researchers. NA per schema guidance for unfunded work." 236 }, 237 "financial_interests_declared": { 238 "applies": true, 239 "answer": false, 240 "justification": "No competing interests statement appears in the paper. The authors do not declare whether they hold patents, equity, or other financial interests related to the Agent Contracts framework. Absence of disclosure counts as NO per the schema." 241 } 242 }, 243 "contamination": { 244 "training_cutoff_stated": { 245 "applies": true, 246 "answer": true, 247 "justification": "Section 8 explicitly states 'Gemini 2.5 Flash and Flash-Lite (knowledge cutoff: January 2025).' This is used to justify benchmark selection." 248 }, 249 "train_test_overlap_discussed": { 250 "applies": true, 251 "answer": true, 252 "justification": "The paper directly addresses contamination by using benchmarks released after the model's training cutoff: LiveCodeBench 'released post-February 2025, after model cutoff' and OpenR1 'released February 2025, after model cutoff.' Temporal separation is the explicit mitigation strategy." 253 }, 254 "benchmark_contamination_addressed": { 255 "applies": true, 256 "answer": true, 257 "justification": "Benchmarks were specifically chosen to be released after the January 2025 model training cutoff (LiveCodeBench post-February 2025, OpenR1 February 2025). This is an explicit and appropriate contamination mitigation strategy." 258 } 259 }, 260 "human_studies": { 261 "pre_registered": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants. All experiments use LLM agents on benchmark problems." 265 }, 266 "irb_or_ethics_approval": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants. All experiments use LLM agents on benchmark problems." 270 }, 271 "demographics_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants. 
All experiments use LLM agents on benchmark problems." 275 }, 276 "inclusion_exclusion_criteria": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants. All experiments use LLM agents on benchmark problems." 280 }, 281 "randomization_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants. All experiments use LLM agents on benchmark problems." 285 }, 286 "blinding_described": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants. All experiments use LLM agents on benchmark problems." 290 }, 291 "attrition_reported": { 292 "applies": false, 293 "answer": false, 294 "justification": "No human participants. All experiments use LLM agents on benchmark problems." 295 } 296 }, 297 "cost_and_practicality": { 298 "inference_cost_reported": { 299 "applies": true, 300 "answer": true, 301 "justification": "The paper reports tokens consumed per task (34,606 vs 3,461 in Code Review, Section 8.2), wall-clock time per task (6.9s, 12.5s, 16.9s in Strategy Modes, Section 8.4), reasoning tokens (0, 718, 1519), and LLM call counts. The schema lists 'tokens consumed' and 'wall-clock time' as sufficient indicators of inference cost/latency." 302 }, 303 "compute_budget_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "No total computational budget is stated. The paper reports per-task metrics but not total API spend, total tokens across all ~244 experimental runs, or hardware specifications." 307 } 308 } 309 } 310 }