calibration.json (19245B)
1 { 2 "paper_slug": "agent-error-taxonomy-2025", 3 "calibration_date": "2026-02-28", 4 "calibration_model": "opus", 5 "total_questions": 50, 6 "agreement_count": 49, 7 "disagreement_count": 1, 8 "agreement_rate": 0.98, 9 "disagreements": [ 10 { 11 "question": "statistical_methodology.effect_sizes_reported", 12 "sonnet_applies": true, 13 "sonnet_answer": false, 14 "opus_applies": true, 15 "opus_answer": true, 16 "direction": "opus_generous", 17 "explanation": "Sonnet says the paper 'does not report formal effect sizes such as Cohen's d or odds ratios.' Opus notes that the schema explicitly lists 'percentage improvement with baseline context' as sufficient for YES, giving the example 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper consistently reports percentage improvements with baseline numbers (e.g., '24.3% vs. 0.3%', '45.0% vs. 28.0%', 'from 21 to 55'). These provide the magnitude context the schema requires. On reflection, this is a borderline case where Opus may be slightly generous — while the paper provides raw comparison numbers, it does not report standardized effect sizes (Cohen's d, odds ratios). The schema's example shows raw percentage with context is sufficient, but the paper's comparisons are not truly 'effect sizes' in the formal statistical sense. Opus leans toward YES based on the schema's explicit example." 18 } 19 ], 20 "opus_checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "The abstract states 'The code and data will be available at https://github.com/ulab-uiuc/AgentDebug', which is a promise of future release. Per schema criteria, a promise of future release counts as NO." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper promises data availability at the same GitHub URL, but this is a future release promise ('will be available'). AgentErrorBench is not currently downloadable." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment specification is provided. The paper specifies model names and temperature=0 for GPT-4.1 but nothing sufficient to recreate the software environment." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "Algorithm 1 provides pseudocode but no step-by-step reproduction instructions (commands, scripts, README). A competent researcher could not replicate the experiments from the paper alone." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "Table 1 and Figures 5-7 report only point estimates (percentage accuracy, success rates). No confidence intervals, error bars, or uncertainty quantification is provided anywhere in the paper." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "No statistical significance tests are reported. The paper claims AgentDebug 'consistently surpasses baselines' (Section 4.1) based solely on comparing point estimates without p-values, t-tests, or any other statistical tests." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "The paper reports percentage improvements with baseline context throughout: '24.3% vs. 0.3% all-correct' (Table 1), '45.0% vs. 28.0% step accuracy' (Table 1), 'success from 21 to 55' (Section 4.2). Per the schema, 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper consistently provides this form of context." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The benchmark contains 200 annotated trajectories (100 ALFWorld, 50 WebShop, 50 GAIA) but no power analysis or justification for why this sample size is sufficient for the claims made." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "All results are reported as single-run point estimates. No standard deviation, variance across repeated experimental runs, or any spread measure is provided. Figure 7a shows cumulative success across attempts but these are cumulative counts, not variance." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Section 4.1 compares against Direct Prompting, Brute Force, and Binary Search for error detection. Section 4.2 compares against Self-Refine, Vanilla Debugger, Tree-of-Thought, and Best-of-N for downstream task success." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The baselines are contemporary: Self-Refine, Tree-of-Thought, and Best-of-N are recent methods. The paper explicitly controls for compute budget to ensure fair comparison (Section 4.2: 'max number of attempts of all baselines is matched to AgentDebug by total token usage')." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 5.1 presents ablation studies on three factors: (1) max number of re-rollout attempts (Figure 7a), (2) different base models for the AgentDebug detector (Figure 7b), and (3) different rollout strategies (Figure 7c)." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Table 1 reports three evaluation metrics for error detection: Step accuracy (S), Step+Module accuracy (S+M), and All Correct (Step+Module+Error Type). Task success rate is reported separately in Section 4.2." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "Human experts annotated the benchmark (10 graduate students), but this was for dataset construction, not for evaluating AgentDebug's outputs. All evaluation of the system is automated — comparing predictions to ground-truth labels." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "AgentErrorBench serves as the test set for error detection evaluation (Section 4.1). Downstream evaluation uses standard external benchmarks (ALFWorld, GAIA, WebShop). The paper does not describe using the test data for any tuning decisions." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Table 1 breaks down results per benchmark (ALFWorld, WebShop, GAIA). Figures 11-13 in Appendix A.4 provide per-module and per-error-type failure distributions for each benchmark." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 5.2 analyzes error propagation patterns in detail with Figure 8 illustrating cascading failures. Appendix A.3 provides a concrete qualitative failure case comparison (ALFWorld saltshaker example, Figure 10)." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Figure 7b shows that alternative base models (Llama-3.3-70B, GPT-4o-mini, Qwen3-Next-80B) perform substantially worse as detectors. Table 1 shows Brute Force performing worse than Direct Prompting in some metrics. Figure 7c shows Act Only achieving only 0.10 success rate." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims '24% higher all-correct accuracy' — confirmed by Table 1 (24.3% vs. 0.3%). Claims '17% higher step accuracy' — confirmed (45.0% vs. 28.0%). Claims 'up to 26% relative improvements' — shown in Figures 5-6." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper makes causal claims such as 'correcting a single root-cause mistake can often flip an otherwise failing trajectory' and 'focusing on root-cause errors...is key to efficient debugging.' The main comparison (AgentDebug vs. baselines) involves multiple simultaneous differences (taxonomy-guided analysis, critical error detection, targeted feedback), making causal attribution to any single mechanism difficult. While the ablations vary individual factors within AgentDebug, they don't isolate the core mechanism." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title 'Where LLM Agents Fail' implies broad scope, and the conclusion frames AgentDebug as 'a foundation for agents that can continuously learn and evolve,' which significantly overgeneralizes from 3 specific benchmarks (ALFWorld, WebShop, GAIA) with a single detector model (GPT-4.1)." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The limitations section (Appendix A.1) discusses scale and annotation cost constraints but does not consider alternative explanations for the observed results — for example, whether gains are driven by GPT-4.1's specific analytical strengths rather than the framework design, or whether the benchmark construction favored the proposed taxonomy." 137 } 138 }, 139 "setup_transparency": { 140 "model_versions_specified": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper uses 'GPT-4.1', 'GPT-4o-mini', 'Qwen3-8B', and 'Qwen3-Next-80B' without API version numbers, snapshot dates, or specific model identifiers. Per the schema, marketing names without a snapshot date or API version do not count as specified versions." 144 }, 145 "prompts_provided": { 146 "applies": true, 147 "answer": true, 148 "justification": "Appendix A.5 provides verbatim prompt content: Detector Prompt (Figure 14), AgentDebug Prompt (Figure 15), Baseline Prompts (Figure 16), and Environment Rollout Prompts for ALFWorld (Figure 17), WebShop (Figure 18), and GAIA (Figure 19). The templates include placeholders that are filled deterministically from the trajectory data and taxonomy." 149 }, 150 "hyperparameters_reported": { 151 "applies": true, 152 "answer": false, 153 "justification": "Section 4.1 states 'temperature set to 0 for deterministic outputs' for GPT-4.1. But no other hyperparameters are reported: temperature/top-p/max tokens for the backbone agents (GPT-4o-mini, Qwen3-8B, Qwen3-Next-80B) in Section 4.2 are not specified." 154 }, 155 "scaffolding_described": { 156 "applies": true, 157 "answer": true, 158 "justification": "Sections 3.1-3.2 and Algorithm 1 describe AgentDebug's three-stage scaffolding in detail: Stage 1 (fine-grained analysis with taxonomy), Stage 2 (critical error detection), Stage 3 (iterative debugging with re-rollouts). The modular rollout design with memory, reflection, planning, and action modules is described in Section 3.1." 159 }, 160 "data_preprocessing_documented": { 161 "applies": true, 162 "answer": false, 163 "justification": "Section 2.2 states 200 trajectories were 'curated' from 500+ failed trajectories but does not document the selection criteria. The annotation procedure is described (decision-step level, three rounds of pilot annotation, double-annotation), but the filtering from 500+ to 200 is an undocumented step." 164 } 165 }, 166 "limitations_and_scope": { 167 "limitations_section_present": { 168 "applies": true, 169 "answer": true, 170 "justification": "Appendix A.1 is titled 'LIMITATION' and provides substantive discussion of limitations including scale/domain constraints and annotation cost." 171 }, 172 "threats_to_validity_specific": { 173 "applies": true, 174 "answer": true, 175 "justification": "Appendix A.1 identifies specific limitations: (1) the benchmark 'remains limited in scale and domain diversity,' naming multimodal environments and safety-critical applications as untested domains, and (2) annotation costs preventing a trained debugging model. These are specific to this study rather than generic disclaimers." 176 }, 177 "scope_boundaries_stated": { 178 "applies": true, 179 "answer": false, 180 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the specific models tested, acknowledge that results may not apply to multi-agent settings, or state specific things it did NOT test. The limitations are framed as future directions rather than explicit scope boundaries." 181 } 182 }, 183 "data_integrity": { 184 "raw_data_available": { 185 "applies": true, 186 "answer": false, 187 "justification": "AgentErrorBench (200 annotated trajectories) and the 500+ failed trajectories used for taxonomy development are promised for future release at GitHub but not currently available for independent verification." 188 }, 189 "data_collection_described": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 2.2 describes collecting 200 trajectories: 100 from ALFWorld, 50 from WebShop, 50 from GAIA. Ten graduate student annotators labeled each trajectory at the decision-step level. Annotation included training, double-annotation, and disagreement adjudication. Inter-annotator agreement reported (Cohen's kappa = 0.55)." 193 }, 194 "recruitment_methods_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 2.2 describes annotators as 'ten expert annotators—graduate students with prior experience in NLP and LLMs agent research.' The qualification criteria (graduate-level, relevant domain experience) are stated." 198 }, 199 "data_pipeline_documented": { 200 "applies": true, 201 "answer": false, 202 "justification": "The paper collected 500+ failed trajectories but used only 200 in the benchmark. The selection criteria for which 200 were chosen from the 500+ are not documented. This represents an unexplained filtering step in the data pipeline." 203 } 204 }, 205 "conflicts_of_interest": { 206 "funding_disclosed": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding sources are disclosed. The acknowledgment section thanks the OpenManus team for 'discussion and providing some resources' but does not mention grants, corporate sponsors, or funding agencies." 210 }, 211 "affiliations_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Author affiliations are clearly listed: University of Illinois Urbana-Champaign, Stanford University, AMD, OpenManus, University of Toronto, Likelihood Lab. The paper does not evaluate any of these institutions' products directly." 215 }, 216 "funder_independent_of_outcome": { 217 "applies": false, 218 "answer": false, 219 "justification": "No funding is disclosed. Without an identified funder, independence cannot be assessed. The schema specifies NA if unfunded." 220 }, 221 "financial_interests_declared": { 222 "applies": true, 223 "answer": false, 224 "justification": "No competing interests statement or financial interests declaration is present in the paper. Per the schema, 'absence of disclosure is not the same as absence of conflict.'" 225 } 226 }, 227 "contamination": { 228 "training_cutoff_stated": { 229 "applies": true, 230 "answer": false, 231 "justification": "The paper evaluates GPT-4.1, GPT-4o-mini, Qwen3-8B, and Qwen3-Next-80B on agent benchmarks but does not state training data cutoff dates for any model." 232 }, 233 "train_test_overlap_discussed": { 234 "applies": true, 235 "answer": false, 236 "justification": "No analysis of whether ALFWorld, GAIA, or WebShop benchmarks appeared in the training data of the evaluated models. These public benchmarks from 2020-2023 are plausibly in training data for 2025 models." 237 }, 238 "benchmark_contamination_addressed": { 239 "applies": true, 240 "answer": false, 241 "justification": "ALFWorld (2020), WebShop (2022), and GAIA (2023) are public benchmarks that likely appeared in training data for GPT-4.1 and Qwen3 series models. The paper does not discuss this contamination risk." 242 } 243 }, 244 "human_studies": { 245 "pre_registered": { 246 "applies": false, 247 "answer": false, 248 "justification": "This paper does not involve human participants in an experimental sense. Human annotators constructed the benchmark but are not study subjects. Pre-registration is not applicable." 249 }, 250 "irb_or_ethics_approval": { 251 "applies": false, 252 "answer": false, 253 "justification": "The ethical statement explicitly states 'Our study does not involve human subjects.' Graduate student annotators are performing a service, not being studied. IRB approval is not applicable." 254 }, 255 "demographics_reported": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in the experimental sense. Annotators are characterized as 'graduate students with prior experience in NLP and LLMs agent research,' which is adequate for their role. Human studies demographic requirements do not apply." 259 }, 260 "inclusion_exclusion_criteria": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in the experimental sense. Annotation is a data construction process, not a human study." 264 }, 265 "randomization_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No experimental conditions involving human subjects requiring randomization." 269 }, 270 "blinding_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human subjects experiment requiring blinding." 274 }, 275 "attrition_reported": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in the experimental sense." 279 } 280 }, 281 "cost_and_practicality": { 282 "inference_cost_reported": { 283 "applies": true, 284 "answer": false, 285 "justification": "AgentDebug involves per-step LLM calls for error analysis plus critical error detection per trajectory, with up to 5 re-rollouts. The paper does not report API costs, tokens consumed, wall-clock time, or cost per example." 286 }, 287 "compute_budget_stated": { 288 "applies": true, 289 "answer": false, 290 "justification": "The paper mentions controlling for total token usage between methods (Section 4.2: 'the max number of attempts of all baselines is matched to AgentDebug by total token usage') but does not state the absolute compute budget in terms of total tokens, GPU hours, or API spend." 291 } 292 } 293 } 294 }