scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24761B)
      1 {
      2   "paper": {
      3     "title": "Thinking Longer, Not Always Smarter: Evaluating LLM Capabilities in Hierarchical Legal Reasoning",
      4     "authors": ["Li Zhang", "Matthias Grabmair", "Morgan Gray", "Kevin Ashley"],
      5     "year": 2025,
      6     "venue": "CSLAW '26",
      7     "arxiv_id": "2510.08710",
      8     "doi": "10.1145/3788646.3789522"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "A project page URL is provided (https://thinking-longer-thinking-smarter.vercel.app) but no explicit code repository (GitHub, Zenodo) is linked in the paper. A project page alone is not a confirmed code release."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No dataset download link is provided. The scenarios are generated by their scenario generator but neither the generated instances nor the generator code are explicitly released."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. The evaluation pipeline is described at a high level (Section 5.1) but no runnable scripts or commands are given."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Figure 4 shows 'Accuracy with 95% CI (n=253)' — confidence intervals are reported for main accuracy results."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No statistical significance tests are reported. Model comparisons are made by comparing point estimates (e.g., '64.82% to 92.09%') without any formal tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are reported with baseline context: '2.6-fold improvement' (Section 6.2), '45% increase in computational effort' (Section 8), '3.0-fold increase in computational effort' (Section 8), percentage accuracy with task-level baselines."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "253 test instances per task are used (Section 5.2) but no justification for this number is provided and no power analysis is discussed."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "95% confidence intervals are shown in Figure 4, implying variance across instances is captured. Token usage is reported as averages (Tables 1-2)."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Six models are compared including 5 reasoning LLMs and 1 non-reasoning LLM (qwen3-non-thinking), providing a thinking vs. non-thinking baseline comparison."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Models evaluated include gpt-5-2025-08-07, gemini-2.5-pro-preview-06-05, qwen3-235b-a22b-thinking-2507 — all 2025 frontier models."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "No multi-component system is proposed. The paper evaluates off-the-shelf LLMs on a benchmark; there are no system components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Two metrics are used: accuracy (exact set match) and reasoning token count (Tables 1-2, Figure 4)."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Ground truth is computed by a deterministic symbolic solver (Section 5.1). Human evaluation is irrelevant since correctness is formally verifiable."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Test instances are synthetically generated scenarios (253 per task). No model tuning is performed on any subset — models are evaluated zero/one-shot on all generated instances."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by Task 1, 2, and 3 (Table 1, Figure 4). Table 2 further breaks down token usage by correct vs. incorrect responses."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 7 provides detailed case study analysis of reasoning failures for qwen3-thinking (verbose/repetitive) and gpt-oss-120b (concise but error-prone), with specific excerpts from thinking traces."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The core finding is negative: models collapse on Task 3 (11.46%-33.99% accuracy). The 'thinking longer doesn't mean thinking smarter' finding is itself a negative result. Section 8 discusses the inefficiency pattern."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims (100% Task 1, 64.82%-92.09% Task 2, 11.46%-33.99% Task 3, models spend more tokens on incorrect answers) are all supported by Table 1 and Table 2."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The main causal claim — that RL-based post-training reasoning improves accuracy — is supported by comparing thinking vs. non-thinking variants of the same model family (qwen3-thinking 78.66% vs qwen3-non-thinking 30.04% on Task 2). This controlled comparison within one model family is adequate."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 10 explicitly states: 'Our evaluation is currently limited to the domain of trade secret law within the U.S. legal system, as represented by the CATO framework.' Specific scope limitations are acknowledged."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 10 discusses domain shift from math/logic training to legal reasoning as an alternative explanation for poor performance. Section 9 discusses that models may appear to reason effectively by 'retrieving and adapting existing solutions, masking the underlying deficit.'"
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper measures exact set match accuracy on formally defined legal reasoning tasks. Claims are about 'LLM capabilities in hierarchical legal reasoning' which directly matches the measurement. The formal framework ensures measurements are well-defined."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Specific model identifiers with dates/versions are provided: 'gpt-5-2025-08-07', 'qwen3-235b-a22b-thinking-2507', 'gpt-oss-120b', 'gemini-2.5-pro-preview-06-05', 'gemini-2.5-flash-preview-05-20' (Section 5.3)."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt templates for all three tasks are provided in Appendix C (Boxes C.1, C.2, C.3), including step-by-step instructions, definitions, and output format specifications."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 5.3 reports temperature (0.3), top_p (0.95), max_tokens (65,536) for each model family, with footnote explaining rationale and noting gpt-5 uses default parameters."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. Models are prompted directly with structured inputs and produce JSON outputs."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 5.1-5.2 documents the evaluation pipeline: scenario generation from CATO framework, 253 instances per task, cases with 4 factors each, hierarchy in Mermaid format. The ground truth creation process is described."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 10 'Limitations and Future Work' provides substantive discussion of multiple limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 10 discusses specific threats: limited to trade secret law domain, CATO hierarchy may not capture full legal reasoning complexity, RL training on math/logic tasks may not transfer to legal reasoning, and the fixed case complexity (4 factors)."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 10: 'Our evaluation is currently limited to the domain of trade secret law within the U.S. legal system.' Also acknowledges hierarchical structures 'may vary across different areas of law.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw data (generated scenarios, model outputs, thinking traces beyond excerpts) is made available for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 5.1 describes the scenario generation process: sampling factors from CATO framework, creating case pairs with specific properties (blocking, downplaying opportunities), and generating ground truth via deterministic solver."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data consists of synthetically generated legal reasoning scenarios from the CATO framework."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Figure 3 and Section 5.1 document the full pipeline: scenario generation → ground truth creation → LLM inference → evaluation. Each stage is described with its purpose."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding acknowledgment or grant information is mentioned anywhere in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations (University of Pittsburgh, Technical University of Munich) are clearly listed. Authors are academic researchers with no apparent affiliation to the model providers being evaluated."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not absence of conflict."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement or financial disclosure is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates are stated for any of the evaluated models, even though the training data of every evaluated model — both proprietary (gpt-5, gemini-2.5) and open-weight (gpt-oss-120b, qwen3) — is undisclosed."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The CATO framework (factors, hierarchy, case representations) has been published since the 1990s and could be in model training data. This is not discussed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "While scenarios are newly generated, they use the well-known CATO factor hierarchy which models may have seen. The paper does not discuss whether models' familiarity with CATO concepts could inflate performance. Task 1's 100% accuracy across all models is consistent with contamination on the framework structure."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study. It evaluates LLMs on synthetically generated scenarios."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Token usage (thinking tokens) is reported per model per task in Table 1, and broken down by correct/incorrect responses in Table 2. This is a direct measure of inference cost."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total computational budget (API costs, total tokens, wall-clock time for all experiments) is stated."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No mention of multiple random seeds or runs. Results appear to be from single runs per model per instance."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of runs per instance is not stated. It appears to be a single run per instance given the lack of any variance-across-runs reporting."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Footnote 1 mentions temperature was set 'based on the models' performance on other reasoning benchmarks' but no search budget (configurations tried, compute spent) is reported."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Section 5.3 and footnote 1 explain the rationale for parameter choices: temperature=0.3 chosen 'based on the models' performance on other reasoning benchmarks to maintain sufficient consistency,' avoiding temperature=0 to 'prevent overly deterministic responses.'"
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Multiple model comparisons are made across three tasks with no statistical tests and therefore no multiple comparison correction."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors designed the benchmark and evaluation framework. They do not discuss potential bias from evaluating third-party models on their own benchmark design, nor do they have independent evaluation."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Section 6.3 and Figure 4 (right panel) explicitly analyze performance vs. token usage. Section 8 and Figure 5 compare computational effort between correct and incorrect responses. This is a central finding of the paper."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "The paper grounds the benchmark in the established CATO framework (Section 3), discusses the relationship between formal tasks and real legal reasoning (Sections 1, 9), and acknowledges in Section 10 that CATO may not capture full legal reasoning complexity."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding is used. Models are prompted directly."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The CATO framework (1990s publications) and its factor hierarchy are well-documented in AI & Law literature. Models trained on this literature could have internalized the reasoning patterns. This is not discussed."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "The prompts provide full task definitions, factor hierarchies, and one-shot examples. Whether this evaluation setup leaks information beyond what would be available in realistic usage is not discussed."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "All 253 scenarios per task are generated from the same CATO factor set and hierarchy. Structural similarities between instances are not discussed."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No leakage detection or prevention method is applied."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "All models achieve 100% accuracy on Task 1 (identifying surface-level distinctions) but performance collapses on Task 3 (integrated analysis: 11.46%-33.99%)",
    363       "evidence": "Table 1 and Figure 4 show 100% accuracy for all 6 models on Task 1, degradation on Task 2 (64.82%-92.09%), and collapse on Task 3 (11.46%-33.99%).",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Models consistently expend more computational resources on incorrect responses than correct ones",
    368       "evidence": "Table 2 and Figure 5 show gpt-5 uses 4,456 tokens on incorrect Task 2 responses vs 3,081 for correct ones (45% increase). Pattern is consistent across most models/tasks.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Reasoning capabilities from RL-based post-training provide a 2.6-fold improvement on hierarchical reasoning",
    373       "evidence": "Section 6.2: qwen3-thinking achieves 78.66% on Task 2 vs 30.04% for qwen3-non-thinking. On Task 3, thinking model achieves 33.99% while non-thinking achieves 0.00%.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "More reasoning tokens do not correlate with better performance across models",
    378       "evidence": "Section 6.3: qwen3-thinking uses 9,596 tokens on Task 2 for 78.66% accuracy, while gpt-5 uses 3,189 tokens for 92.09% accuracy — 3.0x more tokens for worse performance.",
    379       "supported": "strong"
    380     }
    381   ],
    382   "methodology_tags": ["benchmark-eval"],
    383   "key_findings": "The paper reveals a performance degradation paradox in LLM hierarchical legal reasoning: while all models achieve 100% accuracy on surface-level distinction identification, performance collapses to 11-34% on integrated multi-step analysis. Models consistently spend more tokens on incorrect answers than correct ones, suggesting 'thinking longer' does not mean 'thinking smarter.' RL-based reasoning training helps (2.6x improvement for thinking vs non-thinking models) but is insufficient for complex integrated reasoning.",
    384   "red_flags": [
    385     {
    386       "flag": "Single-run results",
    387       "detail": "Results appear to be from single runs per model per instance. With temperature=0.3, outputs are not fully deterministic. No variance across runs is reported, making it impossible to assess result stability."
    388     },
    389     {
    390       "flag": "Potential contamination of CATO framework",
    391       "detail": "The CATO factor hierarchy has been published since the 1990s. All evaluated models could have seen CATO-related content during training. Task 1's universal 100% accuracy is consistent with models having internalized the framework's structure, yet contamination is never discussed."
    392     },
    393     {
    394       "flag": "No statistical tests for model comparisons",
    395       "detail": "All claims about model differences (e.g., gpt-5 vs qwen3-thinking, thinking vs non-thinking) are based on comparing point estimates. With n=253, confidence intervals overlap for some comparisons but no formal tests are conducted."
    396     }
    397   ],
    398   "cited_papers": [
    399     {
    400       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    401       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    402       "year": 2022,
    403       "relevance": "Foundational work on chain-of-thought prompting for LLM reasoning, directly relevant to the paper's evaluation of reasoning capabilities."
    404     },
    405     {
    406       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    407       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    408       "year": 2023,
    409       "arxiv_id": "2305.10601",
    410       "relevance": "Extended reasoning approaches to multi-path exploration; relevant to hierarchical reasoning evaluation."
    411     },
    412     {
    413       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    414       "authors": ["Daya Guo", "Dejian Yang"],
    415       "year": 2025,
    416       "arxiv_id": "2501.12948",
    417       "relevance": "RL-based post-training for reasoning, the training paradigm whose effectiveness this paper evaluates."
    418     },
    419     {
    420       "title": "Legalbench: A collaboratively built benchmark for measuring legal reasoning in large language models",
    421       "authors": ["Neel Guha", "Julian Nyarko", "Daniel Ho"],
    422       "year": 2023,
    423       "relevance": "Major legal reasoning benchmark for LLMs; this paper positions itself against it by testing integrated reasoning chains vs isolated skills."
    424     },
    425     {
    426       "title": "GPT-5 System Card",
    427       "authors": ["OpenAI"],
    428       "year": 2025,
    429       "relevance": "System card for one of the evaluated models (gpt-5-2025-08-07), a frontier reasoning model."
    430     },
    431     {
    432       "title": "Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities",
    433       "authors": ["Gheorghe Comanici"],
    434       "year": 2025,
    435       "arxiv_id": "2507.06261",
    436       "relevance": "Technical report for two of the evaluated models (gemini-pro and gemini-flash)."
    437     },
    438     {
    439       "title": "Qwen3 technical report",
    440       "authors": ["An Yang", "Anfeng Li"],
    441       "year": 2025,
    442       "arxiv_id": "2505.09388",
    443       "relevance": "Technical report for two evaluated models (qwen3-thinking and qwen3-non-thinking)."
    444     },
    445     {
    446       "title": "Sleeper agents: Training deceptive llms that persist through safety training",
    447       "authors": ["Evan Hubinger", "Carson Denison"],
    448       "year": 2024,
    449       "arxiv_id": "2401.05566",
    450       "relevance": "AI safety work on deceptive LLM behavior, relevant to trustworthy AI evaluation."
    451     },
    452     {
    453       "title": "Augmented language models: a survey",
    454       "authors": ["Grégoire Mialon"],
    455       "year": 2023,
    456       "arxiv_id": "2302.07842",
    457       "relevance": "Survey of augmented LM approaches including tool use and reasoning; relevant to understanding LLM reasoning capabilities."
    458     },
    459     {
    460       "title": "A survey of frontiers in llm reasoning: Inference scaling, learning to reason, and agentic systems",
    461       "authors": ["Zixuan Ke", "Fangkai Jiao"],
    462       "year": 2025,
    463       "arxiv_id": "2504.09037",
    464       "relevance": "Comprehensive survey of LLM reasoning approaches including inference scaling, directly relevant to the 'thinking longer' finding."
    465     }
    466   ]
    467 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs