scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23899B)
      1 {
      2   "paper": {
      3     "title": "Thinking Isn't an Illusion: Overcoming the Limitations of Reasoning Models via Tool Augmentations",
      4     "authors": ["Zhao Song", "Song Yue", "Jiahao Zhang"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2507.17699",
      8     "doi": "10.48550/arXiv.2507.17699"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Tool-augmented LRMs consistently outperform their non-reasoning LLM counterparts on Apple's thinking-illusion benchmark, challenging the narrative that reasoning in LRMs is illusory. Program-of-Thought (PoT) with external Python interpreters yields the largest gains, enabling perfect accuracy on Hanoi Tower and ~80% on River Crossing/Blocks World for DeepSeek-R1. However, Checker Jumping remains unsolved for N≥3 across all models and tools, and weaker base models (Qwen 3) benefit less from tool augmentation.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub repository URL provided in abstract: https://github.com/magiclinux/thinking_is_not_an_illusion"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The evaluation uses Apple's publicly available thinking-illusion benchmark puzzles. The paper reuses their problem descriptions (Section 3.1)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Only 'Python version 3.11.13' is mentioned for the PoT interpreter (Section 4.1). No requirements.txt, dependency list, or environment specification is provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README or reproduction guide is described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results are reported as success counts out of 5 trials (e.g., '4/5') with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims LRMs outperform LLMs but provides no statistical significance tests. Comparisons are made by eyeballing success counts."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No effect sizes are reported. Results are raw success counts (e.g., 4/5 vs 0/5) without any formal effect size measure."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Each experiment is repeated 5 times with no justification for why 5 runs is sufficient. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Results are reported as k/5 success counts. No standard deviation, variance, or spread measure is provided across the 5 runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Direct prompting (no tool use) serves as the baseline, and LLM counterparts (DeepSeek-V3 vs R1, Qwen 3 vs Qwen 3 Thinking) are compared (Tables 2-3)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "DeepSeek-V3/R1 and Qwen 3 are 2024-2025 models. Apple's thinking-illusion benchmark is from 2025. All are contemporary."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Three tool-use frameworks (PoT, Think-and-Execute, Scratchpad) are compared against direct prompting, effectively ablating the tool component. Hyperparameter studies in Section 4.3 examine scratchpad chain length and token consumption."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Accuracy (successes out of 5 trials) is the only performance metric reported. Token consumption is analyzed separately for Qwen 3 Thinking (Figures 5-8), but no solution-optimality or partial-credit metrics are given."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "The puzzles have automatically verifiable solutions (Section 3.1). Human evaluation is not relevant."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This uses fixed puzzle instances with controlled complexity parameter N, not a train/test split paradigm."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by all 4 puzzle types (Hanoi, Checker, River, Block) and 6 complexity levels (N=3 to N=13) in Tables 2-3."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Observation 2 (Section 4.2): 'Checker Jumping remains unsolved for N≥3 across all models and tool-use methods.' Observation 3 discusses that weaker models benefit less from tools."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Checker Jumping fails across all conditions. Think-and-Execute provides minimal gains. Qwen 3 shows limited benefit from tools. These are explicitly reported."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The claim in the abstract that 'with proper tool use, LRMs consistently outperform their non-reasoning counterparts across all levels of task complexity' is supported by Tables 2-3, though 'consistently' is generous given the Checker Jumping failures."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims ('tool use can unlock the reasoning potential of LRMs') and the study design — controlled comparisons of LRM vs LLM with and without tools on fixed puzzles — provides adequate evidence for these component-level claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title and abstract claim to show that 'thinking isn't an illusion' and that tool augmentation overcomes 'limitations of reasoning models' broadly, but results are limited to 4 specific puzzle types from one benchmark with 2 model families. The paper does not bound these generalizations."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations. For example, PoT success on Hanoi could simply mean the models know the recursive algorithm (a memorization argument), not that they 'reason' better. The paper discusses the output-length limitation hypothesis but considers no other explanations."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures success on 4 algorithmic puzzles and frames this as evidence about 'reasoning capabilities' broadly. No discussion of whether puzzle-solving is a valid proxy for general reasoning ability."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are listed as 'DeepSeek-V3', 'DeepSeek-R1', 'Qwen 3', 'Qwen 3 Thinking' (Table 1) without API snapshot dates or version identifiers. 'deepseek-reasoner' model name is mentioned but no version date."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper states they use 'the same prompts as in [SMA+25]' and adapt templates from [CKK+24] but does not include the actual prompt text. Scratchpad description D and in-context examples Em are described conceptually but not provided."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Scratchpad T=5 and m=3 in-context examples are reported. Python 3.11.13 is stated. But no temperature, top-p, max_tokens, or other API parameters are reported for model calls. Only a timeout=1200 recommendation is mentioned."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The scratchpad framework is described in detail (Section 3.3) with key components, step-wise prompting, early stopping mechanism, and final answer concatenation. PoT and Think-and-Execute are also described (Section 3.2, Figure 2)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The puzzle generation follows Apple's benchmark with complexity parameter N. Section 3.1 describes the four puzzle types and how they map to the original benchmark's problem descriptions."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section. The conclusion mentions 'future directions' but does not discuss limitations of the current study."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed anywhere in the paper."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries. The paper does not state what the results do NOT show or what settings were not tested."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Only aggregate success counts (k/5) are reported. Individual run outputs, model responses, and generated code are not available."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 describes the experimental setup: models used, API interaction method, prompt structure, and repetition count."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data is from standardized benchmark puzzles."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from model output to success/failure determination is not documented. How outputs are parsed and verified against puzzle solutions is not described beyond mentioning 'simple verifiers'."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is provided anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: UC Berkeley and Northeastern University. One author has no institutional affiliation listed."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the evaluated models."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the models may have seen these puzzle types or solutions during training."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Apple's thinking-illusion benchmark and classic puzzles like Tower of Hanoi are widely known. The paper does not discuss whether models have been trained on these puzzle solutions."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "Token consumption is analyzed for Qwen 3 Thinking (Figures 5-8) but no actual monetary costs or wall-clock times are reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total compute budget, API costs, or total token expenditure is reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Results are averaged over 5 runs but no seed sensitivity analysis or variance across seeds is shown."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4.1: 'All experiments in this paper are repeated five times, and we report the average results across runs.'"
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is described. Settings like T=5 and m=3 appear chosen without justification or search."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The choice of T=5 scratchpad steps and m=3 in-context examples is not justified. No sensitivity analysis for these choices beyond the scratchpad chain length study."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across the many model×tool×puzzle×N conditions."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement the tool-augmentation frameworks themselves and compare against Apple's results without acknowledging potential implementation bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Token consumption analysis (Figures 5-8) examines the relationship between tool use and token cost, showing that tool use does not necessarily increase token consumption."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the 4 algorithmic puzzles actually measure 'reasoning capability' as claimed. The paper accepts Apple's benchmark at face value without questioning construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "When comparing LRMs vs LLMs with tools, different models may interact with tools differently. The paper does not address whether the observed LRM advantage comes from better reasoning or better tool use. The scaffold is identical, but the confound between reasoning ability and code generation ability is not discussed."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Tower of Hanoi is a classic algorithm taught in CS courses and widely present in training data. The paper does not discuss whether models have memorized solutions."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the tool-use prompts or in-context examples leak solution strategies."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The puzzle instances at different N values share the same structure. No discussion of whether performance on N=3 informs N=5, etc."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "With proper tool use, LRMs consistently outperform their non-reasoning counterparts across all levels of task complexity.",
    365       "evidence": "Tables 2-3 show DeepSeek-R1 outperforming DeepSeek-V3 with PoT on River Crossing (4/5 vs 0/5 across all N) and Blocks World (5/5 vs 1/5). However, on Hanoi with PoT both achieve 5/5, and on Checker both achieve 0/5.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "PoT enables major improvements for LRMs on previously unsolvable problems like River Crossing and Blocks World.",
    370       "evidence": "Table 3: DeepSeek-R1 with PoT achieves 4/5 on River Crossing across all N values (vs 0-1/5 with direct prompting) and 5/5 on Blocks World across all N values (vs 0-5/5 with direct prompting).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Some hard problems remain unsolved even with tool use (Checker Jumping for N≥3).",
    375       "evidence": "Table 2: All models score 0/5 on Checker Jumping for N≥3 across all tool-use conditions.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Tool use does not necessarily increase token consumption for LRMs.",
    380       "evidence": "Figures 5-8 show token consumption for Qwen 3 Thinking, where Scratchpad and Think-and-Execute sometimes use fewer tokens than direct prompting on Checker Jumping and Blocks World.",
    381       "supported": "moderate"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "Very small sample size",
    387       "detail": "Only 5 repetitions per condition with no statistical tests. Many comparisons are based on differences like 1/5 vs 0/5, which are not statistically distinguishable."
    388     },
    389     {
    390       "flag": "Overclaiming from limited evidence",
    391       "detail": "The title claims 'Thinking Isn't an Illusion' based on 4 algorithmic puzzles with 2 model families. On Checker Jumping (25% of tasks), no model benefits from tools. On Hanoi with PoT, LRMs and LLMs perform identically (all 5/5). The LRM advantage is mainly visible on 2 of 4 tasks with 1 of 3 tools."
    392     },
    393     {
    394       "flag": "Memorization confound unaddressed",
    395       "detail": "Tower of Hanoi is a canonical CS exercise. When PoT achieves 5/5 for all models, this likely reflects memorized algorithms rather than reasoning. The paper does not distinguish code generation from reasoning ability."
    396     },
    397     {
    398       "flag": "No statistical testing",
    399       "detail": "Claims of 'consistently outperform' are made without any statistical tests. With n=5 binary trials, many observed differences are within chance variation."
    400     },
    401     {
    402       "flag": "Missing limitations section",
    403       "detail": "No limitations, threats to validity, or scope boundaries are discussed despite substantial overclaiming."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "The illusion of thinking: Understanding the strengths and limitations of reasoning models via the lens of problem complexity",
    409       "authors": ["Parshin Shojaee", "Iman Mirzadeh", "Keivan Alizadeh", "Maxwell Horton", "Samy Bengio", "Mehrdad Farajtabar"],
    410       "year": 2025,
    411       "arxiv_id": "2506.06941",
    412       "relevance": "The primary benchmark and finding this paper responds to — claims LRMs do not outperform LLMs under controlled complexity."
    413     },
    414     {
    415       "title": "GSM-symbolic: Understanding the limitations of mathematical reasoning in large language models",
    416       "authors": ["Seyed Iman Mirzadeh", "Keivan Alizadeh", "Hooman Shahrokhi"],
    417       "year": 2025,
    418       "relevance": "Key prior work questioning LRM reasoning, finding models rely on pattern matching rather than generalizable reasoning."
    419     },
    420     {
    421       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    422       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    423       "year": 2025,
    424       "arxiv_id": "2501.12948",
    425       "relevance": "DeepSeek-R1 is one of the primary LRMs evaluated in this study."
    426     },
    427     {
    428       "title": "Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks",
    429       "authors": ["Wenhu Chen", "Xueguang Ma", "Xinyi Wang", "William W. Cohen"],
    430       "year": 2023,
    431       "relevance": "PoT is a core tool-augmentation method evaluated in this paper."
    432     },
    433     {
    434       "title": "Language models as compilers: Simulating pseudocode execution improves algorithmic reasoning in language models",
    435       "authors": ["Hyungjoo Chae", "Yeonghyeon Kim"],
    436       "year": 2024,
    437       "relevance": "Think-and-Execute framework evaluated as one of the tool-use methods."
    438     },
    439     {
    440       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    441       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    442       "year": 2022,
    443       "relevance": "Foundational work on CoT reasoning that LRMs build upon."
    444     },
    445     {
    446       "title": "Scaling LLM test-time compute optimally can be more effective than scaling parameters for reasoning",
    447       "authors": ["Charlie Victor Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    448       "year": 2025,
    449       "relevance": "Directly relevant to test-time compute scaling which is the mechanism behind LRMs."
    450     },
    451     {
    452       "title": "Stop overthinking: A survey on efficient reasoning for large language models",
    453       "authors": ["Yang Sui", "Yu-Neng Chuang", "Guanchu Wang"],
    454       "year": 2025,
    455       "arxiv_id": "2503.16419",
    456       "relevance": "Survey on LRM efficiency and overthinking, directly relevant to claims about reasoning model limitations."
    457     },
    458     {
    459       "title": "Toolformer: Language models can teach themselves to use tools",
    460       "authors": ["Timo Schick", "Jane Dwivedi-Yu"],
    461       "year": 2023,
    462       "relevance": "Foundational work on LLM tool use, relevant to the tool augmentation framework."
    463     },
    464     {
    465       "title": "Qwen3 technical report",
    466       "authors": ["An Yang", "Anfeng Li", "Baosong Yang"],
    467       "year": 2025,
    468       "arxiv_id": "2505.09388",
    469       "relevance": "Technical report for Qwen 3 models evaluated in this study."
    470     }
    471   ]
    472 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs