calibration.json (17791B)
1 { 2 "paper_slug": "agentask-multiagent-systems-2025", 3 "calibration_date": "2026-02-28", 4 "model": "opus", 5 "total_questions": 50, 6 "agreement_count": 49, 7 "disagreement_count": 1, 8 "agreement_rate": 0.98, 9 "disagreements": [ 10 { 11 "category": "conflicts_of_interest", 12 "question": "funder_independent_of_outcome", 13 "sonnet": {"applies": false, "answer": false}, 14 "opus": {"applies": true, "answer": false}, 15 "direction": "applies_boundary", 16 "explanation": "Sonnet set applies=false reasoning that no funding is disclosed and hence the question is NA (unfunded). Opus set applies=true because authors are affiliated with major research institutions (USTC, Shanghai AI Lab, NUS) where it is implausible the work was entirely unfunded. The schema says 'NA if unfunded' but absent explicit statement that the work was unfunded, the more conservative reading is that funding was simply not disclosed — making funder independence unassessable (applies=true, answer=false) rather than inapplicable." 17 } 18 ], 19 "opus_checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The word 'Code' appears at the top of the abstract without any URL, repository link, or archive. No working code release is provided anywhere in the paper." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The 824 annotated execution logs used for the taxonomy and the SFT training dataset are not released. The evaluation benchmarks (GSM8K, MATH, MMLU, HumanEval, MBPP) are public but the paper's own collected data is not available." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, conda file, or environment specification is provided. Model names (Qwen-3-4B, Llama-3.2-3B, GPT-4o-mini-0718) are listed but no dependency or software environment details." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "Algorithm 1 provides training pseudocode but no step-by-step reproduction instructions, README, or runnable scripts are provided." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": true, 46 "justification": "Table 1 reports standard deviations for +GPT-5 and +AgentAsk conditions (e.g., '94.52 ± 1.27'). Origin baselines are single point estimates without uncertainty, but the main experimental conditions include error bars." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper claims AgentAsk improves accuracy by +3.29 to +3.45 points but reports no statistical significance tests — no p-values, t-tests, Mann-Whitney U, or bootstrap tests. All comparative claims are based solely on numeric differences." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Improvements are reported as percentage-point differences (e.g., +3.29 to +3.45) but no standardized effect sizes (Cohen's d, odds ratios) are provided. While baseline context is given, this does not meet the bar for formal effect size reporting." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification for the sample size of 824 execution logs used for taxonomy annotation. No power analysis for benchmark evaluations. The number of training examples N is referenced symbolically but never stated." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": true, 66 "justification": "Standard deviations are reported in Table 1 for +GPT-5 and +AgentAsk conditions across all five benchmarks and four frameworks. Origin baselines are single-run numbers, but the main experimental conditions report spread." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Multiple baselines included: single-model prompting (IO, CoT, Self-Refine) and four multi-agent frameworks (GPTSwarm, AFlow, MaAS, MasRouter). Also compares +GPT-5 as an upper-bound clarifier." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "All four MAS baselines (GPTSwarm, AFlow, MaAS, MasRouter) are from 2024-2025 and represent state-of-the-art multi-agent systems." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 5.4 and Table 3 ablate reward components (rpar, reff, rfmt) with full 'R only' comparison. Table 9 compares E-GRPO vs PPO and GRPO. Table 8 provides extended ablation across three frameworks." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Accuracy (GSM8K, MATH, MMLU), Pass@1 (HumanEval, MBPP), plus latency and extra cost as efficiency metrics. Four distinct metrics used." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "The paper evaluates on automated benchmarks with objective ground-truth answers (math solutions, code correctness, multiple choice). Human evaluation of system outputs is not relevant to the claims made." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "The paper does not explicitly describe whether standard test splits were used for the benchmarks or whether any tuning decisions were made using the evaluation data. The relationship between training data (execution logs) and benchmark evaluation splits is not clarified." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results broken down per benchmark (5 benchmarks) in Tables 1, 4-7; per framework (4 frameworks); and per error type (Data Gap, Signal Corruption, Referential Drift, Capability Gap) with Resolved@Edge rates in Section 5.5 and Figure 4." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 5.5 discusses where AgentAsk is less effective: Referential Drift (58.3% resolved) and Capability Gap (49.5% resolved). The Limitations section acknowledges inability to handle hallucinations. Case studies in Appendix D illustrate specific error patterns." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Table 4 shows AgentAsk decreasing GSM8K accuracy by -0.20 on MaAS. Tables 5-6 show slight decreases in some SFT configurations (e.g., -0.11 MBPP on Llama-3.2-3B SFT, -0.08 MMLU on MaAS Qwen-3-4B SFT). These are reported transparently." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims 'improving accuracy by up to 4.69%' confirmed in Table 1 (MaAS +GPT-5 average +4.69). 'Latency and extra costs below 10%' supported by Tables 2 and 5-7 showing normalized extra costs of 3.3-9.7 for AgentAsk." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Ablation studies in Section 5.4 provide controlled single-variable manipulation removing individual reward components. The experimental design holds orchestration fixed and varies only the clarifier, isolating its causal effect. Comparison with +GPT-4o-mini and +GPT-5 further supports the causal design." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title 'Multi-Agent Systems Need to Ask' and claims like 'improving the reliability of multi-agent systems' are broad. Evaluation is limited to 5 benchmarks (math reasoning, QA, code), one executor model (GPT-4o-mini-0718), and four specific MAS frameworks. Claims are not bounded to these specific settings." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper does not substantively discuss alternative explanations for the observed gains. Could improvements stem from simply adding more LLM calls regardless of clarification content? From increased token budget? The +GPT-4o-mini baseline partially controls for model-call overhead but is not framed as testing an alternative explanation." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions provided: 'GPT-4o-mini-0718' (with date stamp), 'Qwen-3-4B', 'Llama-3.2-3B'. GPT-5 referenced with OpenAI URL accessed 2025-08-07. These are sufficiently specific identifiers." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Appendix F, Figure 7 provides the complete edge-level clarifier prompt including all four error types (DG, SC, RD, CG), decision criteria for when/what/whom to ask, and the NONE condition. This is the full prompt text." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Algorithm 1 lists hyperparameters as required inputs (lr eta, lambda_ask, lambda_R, epsilon, beta, weights wt, baseline b) but their actual values are never reported. Temperature=0.3 is mentioned only for robustness experiments in Appendix C.1. No hyperparameter table is provided." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "Sections 4.1-4.3 describe the edge-local state (Eq. 9), action space (Eq. 10), factorized policy (Eq. 11), two-stage training (SFT then E-GRPO RL), and reward shaping. Algorithm 1 provides full pseudocode. Appendix A details modeling and SFT." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The training corpus is described as built 'from a set of logged multi-agent executions' with labels from 'a teacher model' (Appendix A.2), but the paper does not document how execution logs were collected, filtered, or preprocessed. The number of training examples N is referenced but never stated." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "A dedicated 'Limitations' section follows the Conclusion, discussing that AgentAsk cannot eliminate errors from internal model flaws like hallucinations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "The Limitations section is generic: AgentAsk 'cannot fully eliminate all errors' due to 'well-known limitations in large models, such as hallucinations and inconsistencies.' No specific threats to experimental validity are discussed (e.g., benchmark contamination, selection of favorable frameworks, confounds in cost measurements)." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit statement of what the results do NOT show. No discussion of excluded domains, task types, or model families. No explicit disclaimers about what claims are NOT being made. The paper presents results on math/QA/code without bounding generalization." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "The 824 annotated execution logs forming the taxonomy basis and the SFT training data are not released. The error type distribution claims (Data Gap 29.1%, Referential Drift 27.3%, Signal Corruption 36.8%, Capability Gap 6.8%) cannot be independently verified." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": false, 191 "justification": "The paper states '824 execution logs' were audited but does not describe which frameworks generated them, which tasks they covered, over what time period, or what inclusion/exclusion criteria were applied." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants in the research sense. Annotators are described as 'multiple professional annotators with expertise in MAS' performing taxonomy classification — this is annotation work, not a human subjects study." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from execution logs to taxonomy annotations to SFT training data is not fully documented. Filtering criteria, number of training examples, intermediate processing steps, and how many examples were removed at each stage are not described." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No acknowledgments section and no funding sources disclosed. Authors are affiliated with USTC, Shanghai AI Lab, Xi'an Jiaotong University, and NUS — institutions that typically fund research — but no grants or sponsors are mentioned." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations clearly listed on the title page: USTC, Shanghai AI Laboratory, Xi'an Jiaotong University, NUS. No author is affiliated with OpenAI or DeepSeek whose models are evaluated." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed despite authors being affiliated with major research institutions (USTC, Shanghai AI Lab, NUS) where unfunded research is implausible. Since funding is not disclosed, funder independence cannot be assessed. The schema says 'NA if unfunded' but the absence of a funding disclosure from well-resourced institutions is not the same as confirmed unfunded status." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement appears anywhere in the paper. Absence of disclosure is not the same as absence of conflict." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "GPT-4o-mini-0718 and GPT-5 are used to evaluate on benchmarks but no training data cutoff dates are stated for these models." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "Public benchmarks (HumanEval 2021, MMLU 2021, GSM8K 2021, MATH 2021, MBPP 2021) used with models trained well after these were published. No discussion of potential train/test overlap." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "All five benchmarks were published in 2021, well before the training data cutoff of any model used. The paper does not discuss contamination risk despite this being a significant concern for interpreting benchmark results." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human subjects study. Annotators classify agent execution logs but are not research participants." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the research sense. Professional annotators performing taxonomy classification does not constitute a human subjects study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. Annotators are described as professionals with MAS expertise but are not research subjects." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants whose inclusion/exclusion criteria need specification." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human subjects experiment with conditions requiring randomization." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human subjects experiment requiring blinding." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants who could drop out of a study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Tables 2, 5-7 report 'Extra Cost' as a normalized metric relative to origin=0. AgentAsk achieves extra costs of 3.3-9.7 while +GPT-5 incurs 24-44 units. These are relative rather than absolute costs but do quantify the overhead." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No GPU hours, hardware specifications, total API spend, or training time reported. The SFT and E-GRPO training processes are described algorithmically but computational costs are not quantified." 290 } 291 } 292 } 293 }