scan.json (21655B)
1 { 2 "paper": { 3 "title": "The Illusion of Diminishing Returns: Measuring Long Horizon Execution in LLMs", 4 "authors": ["Akshit Sinha", "Arvindh Arun", "Shashwat Goel", "Steffen Staab", "Jonas Geiping"], 5 "year": 2025, 6 "venue": "Preprint (arXiv)", 7 "arxiv_id": "2509.09677" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper header indicates 'Code' and 'Dataset' links. The paper states the benchmark can generate new examples programmatically." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "A 'Dataset' link is indicated at the top of the paper, and examples are generated programmatically from the described procedure." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. Appendix E.5 mentions compute details but not software dependencies." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "While the experimental setup is described in detail (Section 3 and Appendix E), the paper does not provide step-by-step reproduction instructions or scripts to replicate the main experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "Figure 4 shows 'mean ± one standard deviation over 100 samples' as shaded regions. Error bars/uncertainty bands appear in multiple figures." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims (e.g., larger models execute longer, thinking fixes self-conditioning) but reports no statistical significance tests. Comparisons are based on visual inspection of curves."
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Effect sizes are reported with baseline context, e.g., 'GPT-5 (codenamed Horizon) with 2176 steps' vs 'Claude-4 Sonnet (432 steps)', and horizon lengths are quantified for different model sizes (Figure 4b)." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper uses 100 samples per condition but does not justify this sample size or discuss whether it provides sufficient statistical power." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Standard deviation is reported across 100 samples as shaded regions in Figure 4 and other plots." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple model families and sizes are compared (Qwen3 4B-32B, Gemma3 4B-27B, frontier models). A hypothetical constant step-accuracy baseline (0.99) is also included as a dotted line in Figure 4." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include contemporary frontier models: GPT-5, Claude-4 Sonnet, Grok 4, Gemini 2.5 Pro, Kimi K2, DeepSeek-R1/V3." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Multiple ablations are performed: with/without CoT, thinking vs non-thinking, context window sizes (Appendix A.2), majority voting (Appendix B), self-verification prompting (Appendix A.1), and error rate manipulation (Section 3.2)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple metrics are defined and used: step accuracy, turn accuracy, task accuracy, and horizon length (Section 2)." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "The paper evaluates LLM execution on a synthetic arithmetic task with deterministic ground truth. 
Human evaluation is not relevant to these claims." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "Examples are generated programmatically on the fly; there is no training/tuning split to hold out from." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by model size, model family, turn complexity K, number of turns, with/without thinking, and induced error rates." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Failure modes are extensively discussed: self-conditioning effect (Section 3.2), format-following failures (Appendix F), self-verification failures (Appendix A.1), and errors deconstructed into retrieval vs composition (Appendix D)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Several negative results: majority voting does not match thinking (Appendix B), self-verification prompting does not fix self-conditioning (Appendix A.1), scaling model size does not mitigate self-conditioning (Result 4)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims about compounding returns, self-conditioning, and thinking fixing self-conditioning are all supported by corresponding experimental results (Sections 2.1, 3.2, and 3.3)." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper uses controlled counterfactual experiments to support causal claims. The self-conditioning claim is tested by manipulating error rates in chat history (Section 3.2), isolating the causal mechanism through controlled variable manipulation." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The Limitations section explicitly states the task 'does not reflect complexities and sources of error arising in real agentic tasks,' that results are about 'pretrained LLMs, and not inherent properties of transformers,' and that 'Improvement on our task is necessary, but not sufficient for long-horizon execution on real-world tasks.'" 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper explicitly tests alternative explanations: long-context degradation vs self-conditioning (Section 3.2), and discusses redundancy of internal circuits as an alternative explanation for scaling benefits (Section 3.1, citing Lindsey et al., 2025)." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Models are referred to by marketing names (GPT-5, Claude-4 Sonnet, Grok 4, Gemini 2.5 Pro, Qwen3-32B) without API version strings or snapshot dates. Appendix E.4 lists model names but not specific version identifiers." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Appendices E.2 and E.3 provide the exact prompting setup including system prompt, few-shot examples, and format instructions. The paper states 'the exact prompt' is provided in Appendix E." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix E mentions temperature settings and Appendix Figure 13 shows results are robust to temperature. The binary search procedure for horizon length uses an 80% accuracy threshold." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. The experiments are direct model queries with controlled chat history."
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "The data generation procedure is fully documented: dictionary of 5-letter English words mapping to integers in [-99, 99], keys sampled per turn, few-shot examples provided (Section 3, Appendix E.1)." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "A dedicated 'Limitations' paragraph appears at the end of Section 5 (Conclusion), discussing multiple specific limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Specific threats discussed: synthetic task does not reflect real agentic complexity, results may change with task-specific finetuning, task accuracy metric does not account for self-correction, and results are about pretrained LLMs not inherent transformer properties." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly states: 'Improvement on our task is necessary, but not sufficient for long-horizon execution on real-world tasks.' It also notes the task does not cover diverse actions, only repeated ones." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "While a dataset link is indicated, the raw model outputs and per-sample results are not described as being released. Only aggregate results in figures are shown." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The data generation procedure is fully specified: dictionary construction, key sampling, integer ranges, number of samples (100), and evaluation methodology (Sections 2-3, Appendix E)." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. 
Data is programmatically generated from a synthetic task." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The full pipeline from task generation to evaluation is documented: dictionary creation, key sampling, model querying, answer extraction, and accuracy computation." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgements section lists CHIPS JU grant No. 101140087 (SMARTY), BMBF funding number 16MEE0444, IMPRS-IS, ELLIS PhD programs, and AISA cluster compute." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: University of Cambridge, University of Stuttgart, Max Planck Institute, ELLIS Institute Tübingen, University of Southampton, Tübingen AI Center." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funders are academic/government bodies (EU CHIPS JU, German BMBF, IMPRS-IS) with no financial stake in the performance of any particular LLM." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "Training cutoff dates are not stated for any of the evaluated models. However, the paper notes the benchmark is contamination-free since examples are generated programmatically." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "The paper explicitly addresses this: 'An advantage of our benchmark is that it is contamination-free, as new examples can be generated programmatically' (Section 3.3)." 
226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper argues contamination is structurally impossible because examples are generated fresh each time: 'contamination-free, as new examples can be generated programmatically' (Section 3.3)." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No API costs, token counts, or wall-clock times are reported for the experiments, despite querying multiple frontier API models extensively." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "Appendix E.5 mentions the AISA cluster but does not quantify GPU hours, total API spend, or overall compute budget." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Diminishing gains in single-step accuracy compound into exponential improvements in horizon length for long-horizon tasks.", 286 "evidence": "Proposition 1 derives H_s(p) = ln(s)/ln(p), showing hyperbolic growth. Figure 2 illustrates this relationship. METR's empirical data on doubling horizon length every 7 months is consistent with this formula (Section 2.1).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Long-horizon execution is challenging even when models have the required knowledge and plan.", 291 "evidence": "Figure 4(a) shows near-perfect first-step accuracy for most models, yet task accuracy falls below 50% within 15 turns even for the best model (Section 3.1, Result 1).", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Scaling model size yields non-diminishing improvements in horizon length.", 296 "evidence": "Figure 4(b-c) shows larger models sustain higher task accuracy for more turns across Qwen3 and Gemma3 families. Only 4 sizes per family are tested (Section 3.1, Result 2).", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Models self-condition on their own errors, causing per-step accuracy to degrade beyond what long-context degradation alone explains.", 301 "evidence": "Counterfactual experiment manipulating error rates in chat history (Section 3.2). 
Figure 5 shows accuracy at turn 100 consistently degrades as induced error rate increases, even with error-free history showing some long-context degradation.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Thinking (RL-trained reasoning) fixes the self-conditioning effect.", 306 "evidence": "Figure 6 shows Qwen3 thinking models maintain stable accuracy at turn 100 regardless of induced error rate in context (Section 3.2, Result 5).", 307 "supported": "strong" 308 }, 309 { 310 "claim": "GPT-5 can execute over 2100 steps in a single turn, far ahead of Claude-4 Sonnet (432) and Grok 4 (384).", 311 "evidence": "Figure 7(b) benchmarks frontier thinking models on single-turn execution length with binary search for K at ≥80% accuracy (Section 3.3, Result 7).", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "Parallel test-time compute (majority voting) cannot match sequential compute (thinking) for long-horizon execution.", 316 "evidence": "Appendix B, Figure 10 shows majority voting with N=100 provides only marginal improvement over the base model and does not match CoT performance.", 317 "supported": "moderate" 318 } 319 ], 320 "methodology_tags": ["benchmark-eval"], 321 "key_findings": "The paper demonstrates that diminishing returns in single-step LLM accuracy compound into exponential gains in the length of tasks models can complete, following a hyperbolic relationship. It identifies a 'self-conditioning' effect where models become more error-prone when their context contains prior mistakes, distinct from long-context degradation. Thinking (RL-trained reasoning) eliminates self-conditioning and dramatically extends single-turn execution length, with GPT-5 executing 2176 steps vs 432 for Claude-4 Sonnet. 
Sequential test-time compute significantly outperforms parallel scaling (majority voting) for long-horizon execution.", 322 "red_flags": [ 323 { 324 "flag": "Synthetic task generalizability", 325 "detail": "The key-value dictionary addition task is extremely simple and repetitive. The paper acknowledges this limitation but the title and framing ('The Illusion of Diminishing Returns') make broad claims about LLM scaling value that extend well beyond what this synthetic task demonstrates." 326 }, 327 { 328 "flag": "No statistical significance tests", 329 "detail": "Despite making multiple comparative claims about model scaling and self-conditioning effects, no statistical significance tests are reported. Conclusions are drawn from visual inspection of curves." 330 }, 331 { 332 "flag": "Model versions unspecified", 333 "detail": "Frontier models (GPT-5, Claude-4 Sonnet, Grok 4, etc.) are referenced by marketing names without API versions or snapshot dates, making results non-reproducible as these models update." 334 } 335 ], 336 "cited_papers": [ 337 { 338 "title": "Measuring AI ability to complete long tasks", 339 "authors": ["Thomas Kwa", "Ben West", "Joel Becker"], 340 "year": 2025, 341 "arxiv_id": "2503.14499", 342 "relevance": "METR benchmark for long-horizon agentic task completion, directly motivates this paper's framing of horizon length." 343 }, 344 { 345 "title": "The illusion of thinking: Understanding the strengths and limitations of reasoning models via the lens of problem complexity", 346 "authors": ["Parshin Shojaee"], 347 "year": 2025, 348 "arxiv_id": "2506.06941", 349 "relevance": "Claims thinking models give an 'illusion of thinking'; this paper directly responds by arguing failures are execution, not reasoning." 
350 }, 351 { 352 "title": "Do large language model benchmarks test reliability?", 353 "authors": ["Joshua Vendrow"], 354 "year": 2025, 355 "arxiv_id": "2502.03461", 356 "relevance": "Evaluates LLM reliability on benchmarks, complementary to this paper's focus on long-horizon reliability." 357 }, 358 { 359 "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters", 360 "authors": ["Charlie Snell"], 361 "year": 2024, 362 "arxiv_id": "2408.03314", 363 "relevance": "Studies test-time compute scaling; this paper contrasts parallel vs sequential compute for execution tasks." 364 }, 365 { 366 "title": "ReAct: Synergizing reasoning and acting in language models", 367 "authors": ["Shunyu Yao"], 368 "year": 2023, 369 "relevance": "Foundational work on reasoning-before-acting paradigm for LLM agents, cited to motivate thinking benefits." 370 }, 371 { 372 "title": "Navigating the jagged technological frontier: Field experimental evidence of the effects of AI on knowledge worker productivity and quality", 373 "authors": ["Fabrizio Dell'Acqua"], 374 "year": 2023, 375 "relevance": "RCT on AI productivity impact; coined 'jagged frontier' concept referenced in this paper." 376 }, 377 { 378 "title": "tau-bench: A benchmark for tool-agent-user interaction in real-world domains", 379 "authors": ["Shunyu Yao"], 380 "year": 2024, 381 "arxiv_id": "2406.12045", 382 "relevance": "Benchmark for evaluating LLM agent reliability in tool-use scenarios." 383 }, 384 { 385 "title": "AI Scientists Fail Without Strong Implementation Capability", 386 "authors": ["Minjun Zhu"], 387 "year": 2025, 388 "arxiv_id": "2506.01372", 389 "relevance": "Shows LLM execution failures in scientific coding tasks, complementary evidence for execution as bottleneck." 
390 }, 391 { 392 "title": "Scaling laws for neural language models", 393 "authors": ["Jared Kaplan"], 394 "year": 2020, 395 "arxiv_id": "2001.08361", 396 "relevance": "Foundational scaling laws paper; this work extends the discussion to long-horizon task completion." 397 }, 398 { 399 "title": "Training compute-optimal large language models", 400 "authors": ["Jordan Hoffmann"], 401 "year": 2022, 402 "arxiv_id": "2203.15556", 403 "relevance": "Chinchilla scaling laws; this paper argues step-level diminishing returns mask horizon-level gains." 404 } 405 ] 406 }