calibration.json (17916B)
1 { 2 "paper_slug": "adafuse-adaptive-ensemble-2026", 3 "total_questions": 50, 4 "agreement_count": 48, 5 "disagreement_count": 2, 6 "agreement_rate": 0.96, 7 "disagreements": [ 8 { 9 "category": "evaluation_design", 10 "question": "human_evaluation", 11 "sonnet_applies": false, 12 "sonnet_answer": false, 13 "opus_applies": true, 14 "opus_answer": false, 15 "opus_justification": "The paper includes machine translation evaluation (FLORES En-De, De-En) where BLEU scores are known to correlate imperfectly with human translation quality judgments. Human evaluation is relevant for assessing translation output quality and is commonly used in MT research. The paper did not include any human evaluation. For the QA and math tasks, automated metrics (exact match, accuracy) are sufficient, but the inclusion of translation tasks makes human evaluation applicable.", 16 "sonnet_justification": "The paper evaluates an automated decoding algorithm on NLP benchmarks using automated metrics (exact match, BLEU). Human evaluation is not relevant since ground-truth answers exist for all benchmarks used.", 17 "direction": "applies_boundary" 18 }, 19 { 20 "category": "claims_and_evidence", 21 "question": "generalization_bounded", 22 "sonnet_applies": true, 23 "sonnet_answer": true, 24 "opus_applies": true, 25 "opus_answer": false, 26 "opus_justification": "The paper's title 'AdaFuse: Adaptive Ensemble Decoding with Test-Time Scaling for LLMs' suggests general applicability to LLMs. The abstract claims ADAFUSE 'consistently outperforms strong ensemble baselines' without bounding this to the tested setting. Results cover only 4 open-source models of ~7-8B parameters on 6 benchmarks across 3 task types. While the Limitations section notes the method requires open-source models with token-level likelihoods, it does not explicitly bound generalization claims to the tested model sizes (~8B), tested task types (QA, math, translation), or the number of languages tested. The conclusion speaks broadly of 'diverse tasks and evolving generation contexts.' Per the schema, a broad title with narrow evaluation warrants answer=false.", 27 "sonnet_justification": "The Limitations section explicitly states that ADAFUSE 'is currently limited to open or fully accessible models' that provide token-level likelihoods. The paper evaluates on 6 benchmarks across 3 task types with 4 specific open-source models, and the claims are bounded to this setting.", 28 "direction": "sonnet_generous" 29 } 30 ], 31 "opus_checklist": { 32 "artifacts": { 33 "code_released": { 34 "applies": true, 35 "answer": true, 36 "justification": "The abstract states 'The code is available at https://github.com/CCM0111/AdaFuse.' A working GitHub URL is provided directly in the paper." 37 }, 38 "data_released": { 39 "applies": true, 40 "answer": true, 41 "justification": "All benchmarks used (NaturalQuestions, SQuAD, TriviaQA, GSM8K, FLORES) are publicly available standard datasets. Appendix B confirms they use standard datasets 'without modifying their contents.'" 42 }, 43 "environment_specified": { 44 "applies": true, 45 "answer": true, 46 "justification": "Appendix D.4 specifies the full software environment: PyTorch v2.4.1 (CUDA 12.1), HuggingFace Transformers v4.51.3, Tokenizers v0.21.0, Accelerate v1.6.0, and Datasets v3.0.2. This is sufficient to recreate the environment." 47 }, 48 "reproduction_instructions": { 49 "applies": true, 50 "answer": false, 51 "justification": "No step-by-step reproduction instructions are provided in the paper or appendix. Appendix B.5 describes the repository as having 'modular components' with 'inline comments' but does not provide specific commands or steps to reproduce experiments." 52 } 53 }, 54 "statistical_methodology": { 55 "confidence_intervals_or_error_bars": { 56 "applies": true, 57 "answer": false, 58 "justification": "No confidence intervals or error bars are reported for any results. Table 1 reports only point estimates. Appendix D.3 states 'we report results from a deterministic decoding setting,' indicating single-run point estimates." 59 }, 60 "significance_tests": { 61 "applies": true, 62 "answer": false, 63 "justification": "No statistical significance tests are used. Claims of outperformance (e.g., 'consistently outperforms') are based solely on comparing point estimates without any p-values, t-tests, or other hypothesis tests." 64 }, 65 "effect_sizes_reported": { 66 "applies": true, 67 "answer": true, 68 "justification": "Table 1 reports both absolute scores and relative percentage improvements (e.g., '+10.01%' on NQ, '+4.12%' on SQuAD) with baseline scores provided, enabling effect size interpretation." 69 }, 70 "sample_size_justified": { 71 "applies": true, 72 "answer": false, 73 "justification": "Appendix C lists dataset sizes (e.g., 3610 NQ, 2500 SQuAD, 1319 GSM8K) but no justification is given for these sizes. No power analysis or discussion of statistical adequacy." 74 }, 75 "variance_reported": { 76 "applies": true, 77 "answer": false, 78 "justification": "Appendix D.3 explicitly states results are from a 'deterministic decoding setting.' No standard deviation, variance, or spread across multiple runs is reported." 79 } 80 }, 81 "evaluation_design": { 82 "baselines_included": { 83 "applies": true, 84 "answer": true, 85 "justification": "Four ensemble baselines are compared: LLM-BLENDER (sample-level), DEEPEN (token-level), SWEETSPAN (span-level), and UNITE (token-level), covering all major ensemble granularities." 86 }, 87 "baselines_contemporary": { 88 "applies": true, 89 "answer": true, 90 "justification": "All baselines are recent: LLM-BLENDER (2023), DEEPEN (2024), SWEETSPAN (2024), UNITE (2024). These represent the current state of the art in LLM ensemble methods." 91 }, 92 "ablation_study": { 93 "applies": true, 94 "answer": true, 95 "justification": "Section 4.3 ablates adaptive vs. fixed-length word commitment (lengths 1, 2, 3). Section 4.4 ablates the diversity-aware ensemble scaling component. Appendix F compares diversity-aware scaling vs. beam-search scaling." 96 }, 97 "multiple_metrics": { 98 "applies": true, 99 "answer": true, 100 "justification": "Exact match accuracy for QA tasks, answer accuracy for GSM8K, spBLEU for machine translation, and BERTScore as an additional semantic similarity metric (Appendix D.4)." 101 }, 102 "human_evaluation": { 103 "applies": true, 104 "answer": false, 105 "justification": "The paper includes machine translation evaluation where human evaluation is relevant (BLEU correlates imperfectly with human quality judgments). No human evaluation was conducted; all evaluation is automated." 106 }, 107 "held_out_test_set": { 108 "applies": true, 109 "answer": true, 110 "justification": "Standard held-out test splits of established benchmarks are used. Hyperparameter sensitivity analysis (τΔ) is conducted on NQ separately, with the chosen value applied uniformly to all datasets." 111 }, 112 "per_category_breakdown": { 113 "applies": true, 114 "answer": true, 115 "justification": "Table 1 reports per-benchmark scores across six benchmarks covering three task categories (knowledge-intensive QA, arithmetic reasoning, machine translation). Section 4.3 analyzes per-task word commitment distributions." 116 }, 117 "failure_cases_discussed": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 4.2 discusses GSM8K underperformance (79.15 vs. LLaMA's 81.05), attributing it to base model disparity. Table 2 shows adding Qwen3-8B as a 4th model causes a performance drop. The Limitations section acknowledges open-source model restriction." 121 }, 122 "negative_results_reported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Adding a 4th model (Qwen3-8B) leads to a performance drop on NQ (Table 2). GSM8K performance below single-model LLaMA. Appendix F shows beam-search scaling degrades performance." 126 } 127 }, 128 "claims_and_evidence": { 129 "abstract_claims_supported": { 130 "applies": true, 131 "answer": true, 132 "justification": "The abstract claims '6.88% average relative improvement' over strong ensemble baselines, which matches Table 1's improvement row. Claims about QA, arithmetic reasoning, and translation are all evaluated." 133 }, 134 "causal_claims_justified": { 135 "applies": true, 136 "answer": true, 137 "justification": "The ablation studies (Sections 4.3-4.4) use controlled single-variable manipulation: replacing adaptive commitment with fixed-length alternatives while keeping other components constant, and toggling diversity-aware scaling. This supports the causal claims about component contributions." 138 }, 139 "generalization_bounded": { 140 "applies": true, 141 "answer": false, 142 "justification": "The title 'AdaFuse: Adaptive Ensemble Decoding with Test-Time Scaling for LLMs' suggests general LLM applicability. The abstract claims ADAFUSE 'consistently outperforms strong ensemble baselines' without qualification. Results are limited to 4 open-source ~8B parameter models on 6 benchmarks across 3 task types. While the Limitations section notes the open-source restriction, it does not bound claims to the tested model sizes, task types, or number of models." 143 }, 144 "alternative_explanations_discussed": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper attributes GSM8K underperformance to 'large performance disparity between base models' but does not consider alternative explanations (e.g., tokenization differences, prompt sensitivity). No systematic discussion of confounds or alternative explanations for the observed improvements." 148 } 149 }, 150 "setup_transparency": { 151 "model_versions_specified": { 152 "applies": true, 153 "answer": true, 154 "justification": "Specific model versions are stated: LLaMA-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Qwen3-8B, InternLM3-8B-Instruct. These are specific enough to identify exact model checkpoints." 155 }, 156 "prompts_provided": { 157 "applies": true, 158 "answer": false, 159 "justification": "Only shot counts are mentioned: 5-shot for NQ/SQuAD/TriviaQA, 4-shot CoT for GSM8K, 0-shot for FLORES. No actual prompt text or few-shot examples are provided in the paper or appendix." 160 }, 161 "hyperparameters_reported": { 162 "applies": true, 163 "answer": true, 164 "justification": "Key hyperparameters reported: confidence threshold τΔ=0.7, max committed words M=3, branching factor B, context length limit up to 512 tokens. Sensitivity analysis for τΔ in Figure 6 and Appendix D.2." 165 }, 166 "scaffolding_described": { 167 "applies": false, 168 "answer": false, 169 "justification": "ADAFUSE is a decoding algorithm, not an agentic scaffolding system. No tools, retry logic, memory management, or agentic workflows are used." 170 }, 171 "data_preprocessing_documented": { 172 "applies": true, 173 "answer": false, 174 "justification": "The paper uses standard benchmark splits but does not document preprocessing steps. Appendix C lists dataset sizes (e.g., 'SQuAD 2500') without explaining how the 2500 subset was selected from the full SQuAD dataset." 175 } 176 }, 177 "limitations_and_scope": { 178 "limitations_section_present": { 179 "applies": true, 180 "answer": true, 181 "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing the requirement for token-level likelihoods and restriction to open-source models." 182 }, 183 "threats_to_validity_specific": { 184 "applies": true, 185 "answer": false, 186 "justification": "The Limitations section contains only one specific point: incompatibility with closed-source APIs. No discussion of threats specific to the evaluation methodology, such as benchmark contamination risk, model size generalization, sensitivity to model pairing, or representativeness of chosen benchmarks." 187 }, 188 "scope_boundaries_stated": { 189 "applies": true, 190 "answer": false, 191 "justification": "The paper does not explicitly state what its results do NOT show. The Limitations section notes the open-source requirement but does not specify that results may not generalize to larger models, other task types, other languages, or different model pairings." 192 } 193 }, 194 "data_integrity": { 195 "raw_data_available": { 196 "applies": true, 197 "answer": true, 198 "justification": "All benchmarks (NaturalQuestions, SQuAD, TriviaQA, GSM8K, FLORES) are publicly available for download. Independent researchers can verify results using these standard datasets." 199 }, 200 "data_collection_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "The paper uses publicly available benchmarks and cites their original papers. Appendix C provides descriptive statistics (number of entries, average lengths) for each dataset." 204 }, 205 "recruitment_methods_described": { 206 "applies": false, 207 "answer": false, 208 "justification": "No human participants are involved. The paper uses pre-existing standard benchmark datasets. Recruitment methods do not apply." 209 }, 210 "data_pipeline_documented": { 211 "applies": true, 212 "answer": false, 213 "justification": "The paper does not document how benchmark subsets were selected. For example, 'SQuAD 2500' implies a subset of the full SQuAD dataset, but no selection criteria or filtering process is described." 214 } 215 }, 216 "conflicts_of_interest": { 217 "funding_disclosed": { 218 "applies": true, 219 "answer": false, 220 "justification": "No acknowledgments section or funding disclosure is present in the paper. The work used approximately 500 A100 GPU-hours but the funding source is not mentioned." 221 }, 222 "affiliations_disclosed": { 223 "applies": true, 224 "answer": true, 225 "justification": "All authors are affiliated with the University of Illinois Urbana-Champaign, clearly stated on the title page. The paper does not evaluate any product made by the authors' institution." 226 }, 227 "funder_independent_of_outcome": { 228 "applies": false, 229 "answer": false, 230 "justification": "No funding source is disclosed, so funder independence cannot be assessed. Since no funder is identified, this question is structurally unanswerable." 231 }, 232 "financial_interests_declared": { 233 "applies": true, 234 "answer": false, 235 "justification": "No competing interests or financial interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict." 236 } 237 }, 238 "contamination": { 239 "training_cutoff_stated": { 240 "applies": true, 241 "answer": false, 242 "justification": "The paper uses LLaMA-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Qwen3-8B, and InternLM3-8B-Instruct to evaluate on standard benchmarks but does not state the training data cutoff dates for any model." 243 }, 244 "train_test_overlap_discussed": { 245 "applies": true, 246 "answer": false, 247 "justification": "No discussion of potential train/test overlap. The benchmarks (NQ 2019, SQuAD 2016, TriviaQA 2017, GSM8K 2021) predate the training cutoffs of all tested models, making contamination a real risk that is not addressed." 248 }, 249 "benchmark_contamination_addressed": { 250 "applies": true, 251 "answer": false, 252 "justification": "All QA and math benchmarks predate the likely training cutoffs of the tested models. No contamination analysis, canary strings, or temporal splits are discussed." 253 } 254 }, 255 "human_studies": { 256 "pre_registered": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants are involved in this study." 260 }, 261 "irb_or_ethics_approval": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants are involved." 265 }, 266 "demographics_reported": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants are involved." 270 }, 271 "inclusion_exclusion_criteria": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants are involved." 275 }, 276 "randomization_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants are involved." 280 }, 281 "blinding_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants are involved." 285 }, 286 "attrition_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants are involved." 290 } 291 }, 292 "cost_and_practicality": { 293 "inference_cost_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Section 4.5 reports wall-clock inference time for ADAFUSE vs. baselines on 4x NVIDIA A100 80GB GPUs (Figure 5). Appendix A discusses GPU energy consumption tradeoffs." 297 }, 298 "compute_budget_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Appendix D.1 states 'The total compute budget for all experiments is approximately 500 A100 GPU-hours.' Hardware (NVIDIA A100 80GB) and model memory footprints (~30GB in 16-bit) are specified." 302 } 303 } 304 } 305 }