calibration.json (18351B)
1 { 2 "paper_slug": "agentic-ai-assessment-framework-2025", 3 "calibration_date": "2026-02-28", 4 "total_questions": 50, 5 "agreement_count": 47, 6 "disagreement_count": 3, 7 "agreement_rate": 0.94, 8 "disagreements": [ 9 { 10 "category": "statistical_methodology", 11 "question": "significance_tests", 12 "sonnet": { "applies": false, "answer": false }, 13 "opus": { "applies": true, "answer": false }, 14 "direction": "applies_boundary", 15 "note": "Sonnet says no comparative claims requiring significance tests. Opus judges RQ1 ('How effective is the proposed framework compared to baseline evaluations?') as a comparative claim — the paper compares two evaluation approaches on the same data and makes effectiveness claims without statistical tests. The comparison is about measurement scope rather than quantitative differences, but the framing uses comparative language." 16 }, 17 { 18 "category": "evaluation_design", 19 "question": "ablation_study", 20 "sonnet": { "applies": true, "answer": true }, 21 "opus": { "applies": true, "answer": false }, 22 "direction": "sonnet_generous", 23 "note": "Sonnet credits Table 7 ('Pillar ablation: Average failures per pillar') as an ablation study. Opus disagrees: the table shows a per-category breakdown of failures by pillar, not a proper ablation where components are removed to measure their individual contribution. No pillar is removed to show the framework degrades without it. The paper labels it 'ablation' but the methodology is a breakdown, not an ablation." 24 }, 25 { 26 "category": "evaluation_design", 27 "question": "human_evaluation", 28 "sonnet": { "applies": false, "answer": false }, 29 "opus": { "applies": true, "answer": false }, 30 "direction": "applies_boundary", 31 "note": "Sonnet sets applies=false, reasoning that the paper uses only automated LLM evaluation. Opus sets applies=true because human evaluation of the framework's outputs (e.g., expert review of whether identified failures are genuine) would be relevant and valuable — the schema says applies=false only when 'human evaluation is clearly irrelevant to the claims.' For a framework claiming to detect behavioral failures, human validation of those detected failures is relevant." 32 } 33 ], 34 "opus_checklist": { 35 "artifacts": { 36 "code_released": { 37 "applies": true, 38 "answer": true, 39 "justification": "GitHub URL (https://github.com/sa4s-serc/asf) provided in Section 1 and reiterated in Section 4.2: 'Our experiments and results are available on GitHub.'" 40 }, 41 "data_released": { 42 "applies": true, 43 "answer": false, 44 "justification": "Synthetic CloudOps data (instances, logs, policies) was generated using Sonnet 4.5 but no dataset download link or standalone release is mentioned. The GitHub repo contains experiment trajectories, not the underlying synthetic dataset." 45 }, 46 "environment_specified": { 47 "applies": true, 48 "answer": false, 49 "justification": "Paper mentions GPT-4o with temperature 0.7, ChromaDB, text-embedding-3-small, and Mem0 with 'default settings', but no requirements.txt, Dockerfile, or library version listing is provided. Insufficient to recreate the environment." 50 }, 51 "reproduction_instructions": { 52 "applies": true, 53 "answer": false, 54 "justification": "A GitHub link is provided but the paper itself contains no step-by-step reproduction instructions, commands, or 'Reproducing Results' section." 55 } 56 }, 57 "statistical_methodology": { 58 "confidence_intervals_or_error_bars": { 59 "applies": true, 60 "answer": false, 61 "justification": "Tables 3-7 report only point estimates (e.g., '33.7% precision', '37.9% recall'). Figure 3 shows visual distributions across three runs but no CIs or numerical error bars are reported." 62 }, 63 "significance_tests": { 64 "applies": true, 65 "answer": false, 66 "justification": "RQ1 frames a comparison between baseline and framework evaluation approaches, but no statistical significance tests are used to support the claim that the framework is more effective. The comparison relies on showing the framework reveals additional metrics, without formal hypothesis testing." 67 }, 68 "effect_sizes_reported": { 69 "applies": false, 70 "answer": false, 71 "justification": "The paper compares measurement scope (baseline metrics vs. framework metrics) rather than two competing methods on the same metrics. There is no meaningful effect size to compute between approaches that measure different things." 72 }, 73 "sample_size_justified": { 74 "applies": true, 75 "answer": false, 76 "justification": "Only three scenarios (S1, S2, S3), each run three times. No justification for why three scenarios are sufficient and no power analysis is discussed." 77 }, 78 "variance_reported": { 79 "applies": true, 80 "answer": false, 81 "justification": "Each scenario was run three times and 'averaged to ensure reliability,' but tables report only averages with no standard deviation or range. Figure 3 shows visual distributions but no numerical variance statistics." 82 } 83 }, 84 "evaluation_design": { 85 "baselines_included": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 3 compares a 'Baseline (B)' evaluation (task completion and tool usage ratio only) against the proposed 'Framework (F)' metrics across all three scenarios." 89 }, 90 "baselines_contemporary": { 91 "applies": true, 92 "answer": true, 93 "justification": "The baseline is the current dominant practice of task-completion and tool-usage-ratio metrics. This is the appropriate comparison for the paper's goal of showing what standard metrics miss." 94 }, 95 "ablation_study": { 96 "applies": true, 97 "answer": false, 98 "justification": "Table 7 is labeled 'Pillar ablation' but shows a per-category breakdown of failure counts across pillars, not a proper ablation study. No pillar is removed from the framework to measure its contribution — the table simply counts how many failures each category detects. A true ablation would remove a pillar and show the framework's effectiveness degrades." 99 }, 100 "multiple_metrics": { 101 "applies": true, 102 "answer": true, 103 "justification": "The framework uses many metrics: task completion, tool usage ratio, tool sequence correctness, expected calls, policy adherence, dependency inquiry, memory P/R/F1, and judge scores across completion/safety/memory/reasoning/overall dimensions." 104 }, 105 "human_evaluation": { 106 "applies": true, 107 "answer": false, 108 "justification": "All evaluation is automated (LLM-as-Judge and Agent-as-Judge). No human experts validate whether the framework's detected failures are genuine. For a framework claiming to identify behavioral failures, human validation of those findings would be relevant." 109 }, 110 "held_out_test_set": { 111 "applies": true, 112 "answer": false, 113 "justification": "The three CloudOps scenarios are used directly for framework validation with no separate held-out test set or train/test split." 114 }, 115 "per_category_breakdown": { 116 "applies": true, 117 "answer": true, 118 "justification": "Results broken down per pillar (Table 7), per scenario (Tables 3-6), and memory retrieval by type (Table 4: single-hop, multi-hop, temporal)." 119 }, 120 "failure_cases_discussed": { 121 "applies": true, 122 "answer": true, 123 "justification": "Section 4.2 and Table 7 describe concrete failures: 'Skipped policy validation before instance termination', 'Missed diagnostic or verification steps before applying remediation', 'Did not recall previous role mappings'. Section 5 discusses failure patterns." 124 }, 125 "negative_results_reported": { 126 "applies": true, 127 "answer": true, 128 "justification": "Substantial negative results reported: S1 had 0% task completion and 33% policy adherence, S2 had 13.1% memory recall, S3 had 7.67 average tool failures. The paper transparently presents these to demonstrate baseline evaluation gaps." 129 } 130 }, 131 "claims_and_evidence": { 132 "abstract_claims_supported": { 133 "applies": true, 134 "answer": true, 135 "justification": "The abstract claims the framework 'reveal[s] behavioral deviations overlooked by conventional metrics.' Tables 3-7 support this: S2 shows 100% task completion masking 13.1% memory recall and 33% tool sequence correctness." 136 }, 137 "causal_claims_justified": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper makes implicit causal claims (e.g., 'tool sequencing and memory management need particular attention in multi-agent scenarios') derived from observational differences across three scenarios of different complexity. No controlled single-variable manipulation isolates the causes of observed failures." 141 }, 142 "generalization_bounded": { 143 "applies": true, 144 "answer": false, 145 "justification": "The title 'An Assessment Framework for Evaluating Agentic AI Systems' implies broad applicability, but validation is only on CloudOps scenarios with a single LLM (GPT-4o). Section 5.1 acknowledges the CloudOps limitation but the title and framing claim generality beyond what was tested." 146 }, 147 "alternative_explanations_discussed": { 148 "applies": true, 149 "answer": false, 150 "justification": "Section 5.1 identifies some threats (uniform temperature, single model, ground truth contracts, judge biases) but these are methodological limitations, not specific alternative explanations for observed results. No consideration of whether failures stem from synthetic data design, prompt engineering choices, or temperature settings rather than genuine system weaknesses." 151 } 152 }, 153 "setup_transparency": { 154 "model_versions_specified": { 155 "applies": true, 156 "answer": false, 157 "justification": "The paper uses 'GPT-4o' without a version or snapshot date, 'Sonnet 4.5' for data generation, and 'text-embedding-3-small' for embeddings. Per schema criteria, marketing names without API snapshot dates do not count as specified versions." 158 }, 159 "prompts_provided": { 160 "applies": true, 161 "answer": false, 162 "justification": "No actual prompt text is provided anywhere in the paper or appendix. The LLM-as-Judge uses 'structured prompts' (Section 3.3) but the prompt text is not shown. Agent system prompts are not disclosed." 163 }, 164 "hyperparameters_reported": { 165 "applies": true, 166 "answer": false, 167 "justification": "Temperature (0.7) is reported for GPT-4o and Mem0 uses 'default settings,' but top-p, max tokens, and other API settings are not specified. The default settings of Mem0 are not enumerated. Partial reporting is insufficient." 168 }, 169 "scaffolding_described": { 170 "applies": true, 171 "answer": true, 172 "justification": "Detailed description of MOYA framework architecture, memory using Mem0 with ChromaDB, tool invocation protocols, test case generation, static/dynamic/judge evaluation layers, Agent Card mechanism, and Agent-as-Judge auditor. Figures 1 and 2 provide architectural diagrams." 173 }, 174 "data_preprocessing_documented": { 175 "applies": true, 176 "answer": false, 177 "justification": "Synthetic data was 'generated using Sonnet 4.5' based on production issues at MontyCloud, but the generation process, prompts used, volume, filtering criteria, and verification procedures are not documented." 178 } 179 }, 180 "limitations_and_scope": { 181 "limitations_section_present": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 5.1 'Threats to Validity' addresses external, internal, and construct validity with substantive multi-paragraph discussion." 185 }, 186 "threats_to_validity_specific": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 5.1 identifies study-specific threats: evaluation limited to CloudOps domain (external), uniform temperature 0.7 and single model GPT-4o (internal), metrics may not capture all reliability aspects and memory assessment excludes storage efficiency (construct)." 190 }, 191 "scope_boundaries_stated": { 192 "applies": true, 193 "answer": false, 194 "justification": "Section 5.1 notes CloudOps limitation but does not explicitly enumerate what the results do NOT show or what claims the authors are NOT making. There is no equivalent of METR's 'What the evidence does not show' — the paper states its limitations but not its explicit non-claims." 195 } 196 }, 197 "data_integrity": { 198 "raw_data_available": { 199 "applies": true, 200 "answer": false, 201 "justification": "GitHub repo contains 'experiments and results' (execution trajectories) but the underlying synthetic CloudOps data (instances, logs, policies) is not described as publicly available for independent verification." 202 }, 203 "data_collection_described": { 204 "applies": true, 205 "answer": false, 206 "justification": "Synthetic data generated 'using Sonnet 4.5 due to its ranking on LMArena' based on 'production issues we encountered,' but prompts, criteria, volume, and verification procedures for the synthetic data are not described." 207 }, 208 "recruitment_methods_described": { 209 "applies": false, 210 "answer": false, 211 "justification": "No human participants. The study uses synthetic CloudOps scenarios, not data collected from human subjects." 212 }, 213 "data_pipeline_documented": { 214 "applies": true, 215 "answer": false, 216 "justification": "The pipeline from synthetic data generation through evaluation metrics is not fully documented. The paper describes framework components but does not trace the full data pipeline with intermediate steps, data volumes, and filtering criteria." 217 } 218 }, 219 "conflicts_of_interest": { 220 "funding_disclosed": { 221 "applies": true, 222 "answer": false, 223 "justification": "No acknowledgments section or funding disclosure is present in the paper. The industry collaboration with MontyCloud is described but no explicit funding statement is provided." 224 }, 225 "affiliations_disclosed": { 226 "applies": true, 227 "answer": true, 228 "justification": "Author affiliations clearly listed on the title page: four authors at IIIT-Hyderabad SERC, two at MontyCloud Inc. The industry affiliation is prominently disclosed." 229 }, 230 "funder_independent_of_outcome": { 231 "applies": true, 232 "answer": false, 233 "justification": "Two MontyCloud Inc. employees are co-authors and the framework is validated exclusively on MontyCloud's production system (MOYA). MontyCloud has a financial interest in the framework appearing effective." 234 }, 235 "financial_interests_declared": { 236 "applies": true, 237 "answer": false, 238 "justification": "No competing interests statement or financial interest declaration is present. The MontyCloud affiliation creates a potential undisclosed conflict." 239 } 240 }, 241 "contamination": { 242 "training_cutoff_stated": { 243 "applies": true, 244 "answer": false, 245 "justification": "GPT-4o is used as the agent backbone but no training data cutoff date is stated. The paper does not address whether the model's training data could overlap with the evaluation scenarios." 246 }, 247 "train_test_overlap_discussed": { 248 "applies": true, 249 "answer": false, 250 "justification": "No discussion of whether GPT-4o's training data overlaps with the CloudOps scenarios, AWS documentation patterns, or tool descriptions used in evaluation." 251 }, 252 "benchmark_contamination_addressed": { 253 "applies": false, 254 "answer": false, 255 "justification": "The evaluation uses custom-designed CloudOps workflows, not published public benchmarks. Standard benchmark contamination does not apply to custom scenarios." 256 } 257 }, 258 "human_studies": { 259 "pre_registered": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. The evaluation is entirely automated using LLM agents and synthetic CloudOps scenarios." 263 }, 264 "irb_or_ethics_approval": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants; IRB approval is not applicable." 268 }, 269 "demographics_reported": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants; demographics are not applicable." 273 }, 274 "inclusion_exclusion_criteria": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants; inclusion/exclusion criteria are not applicable." 278 }, 279 "randomization_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants or controlled experiment with human assignment; not applicable." 283 }, 284 "blinding_described": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants or human evaluators; blinding is not applicable." 288 }, 289 "attrition_reported": { 290 "applies": false, 291 "answer": false, 292 "justification": "No human participants; attrition is not applicable." 293 } 294 }, 295 "cost_and_practicality": { 296 "inference_cost_reported": { 297 "applies": true, 298 "answer": true, 299 "justification": "Section 4.2.3 reports inference costs: average $0.0621 per scenario run, LLM-as-Judge $0.0593 total across all scenarios, Agent-as-Judge $0.9572 total. Figure 4 shows cost breakdowns." 300 }, 301 "compute_budget_stated": { 302 "applies": true, 303 "answer": true, 304 "justification": "Section 4.2.3 reports average execution time (183.5s per run), token consumption (19,644 input / 1,301 output per run), and per-scenario costs ($0.0405-$0.0818)." 305 } 306 } 307 } 308 }