scan-v5.json (26218B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "The Geometry of Thought: How Scale Restructures Reasoning In Large Language Models", 6 "authors": [ 7 "S. Anderson" 8 ], 9 "year": 2026, 10 "venue": "arXiv.org", 11 "arxiv_id": "2601.13358", 12 "doi": "10.48550/arXiv.2601.13358" 13 }, 14 "checklist": { 15 "claims_and_evidence": { 16 "abstract_claims_supported": { 17 "applies": true, 18 "answer": true, 19 "justification": "All specific quantitative claims in the abstract (45% dimensional collapse, 31% alignment increase, 10× manifold untangling, coherence ≈−0.4, 63.6% operator accuracy) are backed by results sections with matching numbers.", 20 "source": "haiku" 21 }, 22 "causal_claims_justified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper uses causal language throughout ('scale triggers,' 'scale induces') but compares only two static pre-trained checkpoints; the limitations section itself admits 'Correlation, not causation.'", 26 "source": "haiku" 27 }, 28 "generalization_bounded": { 29 "applies": true, 30 "answer": false, 31 "justification": "The conclusion states 'scale does not improve reasoning—it reshapes it' as a universal claim; the limitations section does restrict this to Llama and English data, but the abstract and main text repeatedly generalize beyond the tested setting.", 32 "source": "haiku" 33 }, 34 "alternative_explanations_discussed": { 35 "applies": true, 36 "answer": true, 37 "justification": "Section 4.1 explicitly presents two competing interpretations ('Expertise Interpretation' vs. 'Compression Interpretation') and acknowledges the data cannot fully disambiguate them.", 38 "source": "haiku" 39 }, 40 "proxy_outcome_distinction": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper conflates geometric changes in hidden-state dimensionality with qualitative changes in 'understanding,' stating the model 'reasons differently' based on dimensional collapse without demonstrating that lower dimensionality tracks task accuracy.", 44 "source": "haiku" 45 } 46 }, 47 "limitations_and_scope": { 48 "limitations_section_present": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 4.7 is a dedicated limitations section covering single model family, English-only data, only two scale points, dataset confounds, and lack of causal identification.", 52 "source": "haiku" 53 }, 54 "threats_to_validity_specific": { 55 "applies": true, 56 "answer": true, 57 "justification": "Specific named threats include: single model family (Llama-3-Instruct), English-only benchmarks, only two scale comparison points (8B and 70B), domain-dataset confounds, and absence of causal intervention experiments.", 58 "source": "haiku" 59 }, 60 "scope_boundaries_stated": { 61 "applies": true, 62 "answer": true, 63 "justification": "Limitations explicitly state results may not generalize to other architectures, non-English legal systems, or intermediate/larger scales, with causal interpretation requiring further intervention experiments.", 64 "source": "haiku" 65 } 66 }, 67 "conflicts_of_interest": { 68 "funding_disclosed": { 69 "applies": true, 70 "answer": false, 71 "justification": "No funding disclosure or acknowledgments section is present anywhere in the paper.", 72 "source": "haiku" 73 }, 74 "affiliations_disclosed": { 75 "applies": true, 76 "answer": true, 77 "justification": "Author affiliation with Scrivly.AI is disclosed on the title page (sam@scrivly.ai).", 78 "source": "haiku" 79 }, 80 "funder_independent_of_outcome": { 81 "applies": true, 82 "answer": false, 83 "justification": "No funding is disclosed; the author is from Scrivly.AI, a commercial entity whose potential interest in the research outcomes is unknown and undisclosed.", 84 "source": "haiku" 85 }, 86 "financial_interests_declared": { 87 "applies": true, 88 "answer": false, 89 "justification": "No competing interests statement, patent disclosure, or financial interests declaration appears anywhere in the paper.", 90 "source": "haiku" 91 } 92 }, 93 "scope_and_framing": { 94 "key_terms_defined": { 95 "applies": true, 96 "answer": true, 97 "justification": "Key terms are formally defined: 'reasoning trajectory' (sequence of final-layer hidden states, Eq. 1), d95, dmle, alignment, coherence, and all three phase labels (Crystalline/Liquid/Lattice) are operationalized in Sections 1.2 and 3.4.", 98 "source": "haiku" 99 }, 100 "intended_contribution_clear": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 1.5 'Contributions' lists five numbered contributions explicitly: domain-dependent geometric scaling laws, a three-phase taxonomy, the universal oscillatory signature, geometry-aware operator learning, and a reproducible measurement framework.", 104 "source": "haiku" 105 }, 106 "engagement_with_prior_work": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 2 contains 10 subsections and Table 1 explicitly positioning contributions relative to scaling laws, CoT, mechanistic interpretability, manifold hypothesis, phase transitions, inference acceleration, and operator learning literature.", 110 "source": "haiku" 111 } 112 } 113 }, 114 "type_checklist": { 115 "empirical": { 116 "artifacts": { 117 "code_released": { 118 "applies": true, 119 "answer": false, 120 "justification": "Code release is promised 'upon acceptance'; until then 'available to qualified researchers upon request'—both are NO under the criterion.", 121 "source": "haiku" 122 }, 123 "data_released": { 124 "applies": true, 125 "answer": true, 126 "justification": "All five datasets (GSM8K, GPQA, HumanEval, CaseHOLD, LexGLUE–SCOTUS) are publicly available standard benchmarks used unmodified.", 127 "source": "haiku" 128 }, 129 "environment_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Hardware (8× NVIDIA B200 GPUs, bfloat16 weights) is mentioned but no requirements.txt, Dockerfile, or Python package versions are provided.", 133 "source": "haiku" 134 }, 135 "reproduction_instructions": { 136 "applies": true, 137 "answer": false, 138 "justification": "The methodology describes the conceptual protocol but no step-by-step reproduction instructions are provided, and code is not released.", 139 "source": "haiku" 140 } 141 }, 142 "statistical_methodology": { 143 "confidence_intervals_or_error_bars": { 144 "applies": true, 145 "answer": true, 146 "justification": "Section 5.4 ('Statistical Robustness') reports bootstrapped 95% confidence intervals for alignment changes across all four domains.", 147 "source": "haiku" 148 }, 149 "significance_tests": { 150 "applies": true, 151 "answer": true, 152 "justification": "Bootstrap CIs are used to assess statistical significance—Math CI spans zero (confirming scale invariance), Law CI [0.14, 0.30] excludes zero (confirming significant effect).", 153 "source": "haiku" 154 }, 155 "effect_sizes_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Absolute and percentage changes are reported throughout: 45% dimensional collapse, Δ=+0.22 alignment, 213% clustering increase, 10× G/L reduction.", 159 "source": "haiku" 160 }, 161 "sample_size_justified": { 162 "applies": true, 163 "answer": false, 164 "justification": "Sample sizes are determined by benchmark availability (N=7473 GSM8K, N=500 GPQA, N=164 HumanEval, N=5000 legal subsamples) with no power analysis or justification for adequacy.", 165 "source": "haiku" 166 }, 167 "variance_reported": { 168 "applies": true, 169 "answer": false, 170 "justification": "Variance is reported for only one metric (σ=0.027 for legal alignment); d95, dmle, coherence, G/L ratio, and silhouette score are all reported as single point estimates without spread.", 171 "source": "haiku" 172 } 173 }, 174 "evaluation_design": { 175 "baselines_included": { 176 "applies": true, 177 "answer": true, 178 "justification": "Neural Reasoning Operator evaluation uses two baselines: identity predictor (ĥ_T = h_0) and mean predictor (ĥ_T = E[h_T]).", 179 "source": "haiku" 180 }, 181 "baselines_contemporary": { 182 "applies": true, 183 "answer": true, 184 "justification": "Baselines are appropriate for the operator learning task; four operator architectures (Linear, MLP, DeepONet, Spectral KAN) are also compared against each other.", 185 "source": "haiku" 186 }, 187 "ablation_study": { 188 "applies": true, 189 "answer": true, 190 "justification": "Four operator architectures (Linear, MLP, DeepONet, Spectral KAN vs. Turbo with velocity conditioning) are systematically compared, effectively ablating design choices.", 191 "source": "haiku" 192 }, 193 "multiple_metrics": { 194 "applies": true, 195 "answer": true, 196 "justification": "Six geometric metrics are reported (d95, dmle, alignment, coherence, silhouette, G/L ratio) plus operator test MSE and probe decoding accuracy.", 197 "source": "haiku" 198 }, 199 "human_evaluation": { 200 "applies": false, 201 "answer": false, 202 "justification": "Human evaluation is not relevant for this geometric analysis of LLM hidden-state trajectories.", 203 "source": "haiku" 204 }, 205 "held_out_test_set": { 206 "applies": true, 207 "answer": true, 208 "justification": "Operator training uses a fixed 70/15/15 train/val/test split (seed 42); probe decoding accuracy is reported on the held-out test set.", 209 "source": "haiku" 210 }, 211 "per_category_breakdown": { 212 "applies": true, 213 "answer": true, 214 "justification": "All results are broken down by domain (Law, Science, Code, Math) and scale (8B, 70B), yielding 48 measurements across 6 metrics × 4 domains × 2 scales.", 215 "source": "haiku" 216 }, 217 "failure_cases_discussed": { 218 "applies": true, 219 "answer": true, 220 "justification": "LogicBench extraction failure is reported and the domain excluded; the paper also explains why Liquid domains (Science, Math) resist operator amortization due to unfavorable geometry.", 221 "source": "haiku" 222 }, 223 "negative_results_reported": { 224 "applies": true, 225 "answer": true, 226 "justification": "Spectral KAN underperforms the Turbo operator (reported as an informative negative result); scale invariance in Science/Math is explicitly framed as a null result of equal theoretical significance to Crystallization.", 227 "source": "haiku" 228 } 229 }, 230 "setup_transparency": { 231 "model_versions_specified": { 232 "applies": true, 233 "answer": true, 234 "justification": "Exact HuggingFace model IDs are specified: 'meta-llama/Meta-Llama-3-8B-Instruct' and 'meta-llama/Llama-3.1-70B-Instruct' with hidden dimensions noted.", 235 "source": "haiku" 236 }, 237 "prompts_provided": { 238 "applies": true, 239 "answer": false, 240 "justification": "Prompt format is described (chat template with delimiters like 'Final:', 'Answer:', 'Verdict:') but actual prompt text is not provided; it will be released with the code upon acceptance.", 241 "source": "haiku" 242 }, 243 "hyperparameters_reported": { 244 "applies": true, 245 "answer": true, 246 "justification": "Greedy decoding (do_sample=False), max_new_tokens=512, AdamW lr=1e-4, cosine annealing, batch size 64, 50 epochs, k=10 for MLE estimator, and fixed seed 42 are all reported.", 247 "source": "haiku" 248 }, 249 "scaffolding_described": { 250 "applies": false, 251 "answer": false, 252 "justification": "No agentic scaffolding is used; this is a trajectory extraction and geometric analysis study on instruction-tuned models.", 253 "source": "haiku" 254 }, 255 "data_preprocessing_documented": { 256 "applies": true, 257 "answer": true, 258 "justification": "Section 3.3 documents subsampling (N=5000 for legal datasets), empty generation filtering, two-pass generate-then-extract protocol, delimiter localization, and float16 storage with float32 computation.", 259 "source": "haiku" 260 } 261 }, 262 "data_integrity": { 263 "raw_data_available": { 264 "applies": true, 265 "answer": false, 266 "justification": "Hidden-state trajectory arrays are stored as memory-mapped files but not released; 'available to qualified researchers upon request' does not constitute public availability.", 267 "source": "haiku" 268 }, 269 "data_collection_described": { 270 "applies": true, 271 "answer": true, 272 "justification": "Section 3.3 details the two-pass generate-then-extract protocol including teacher-forced forward passes, trajectory indexing formulas, delimiter localization, and filtering steps.", 273 "source": "haiku" 274 }, 275 "recruitment_methods_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "Standard public benchmarks are used; no participant recruitment is involved.", 279 "source": "haiku" 280 }, 281 "data_pipeline_documented": { 282 "applies": true, 283 "answer": true, 284 "justification": "Full pipeline from benchmark loading → tokenization → generation → teacher-forced extraction → NumPy memmap storage → geometric analysis is documented across Sections 3.2–3.4.", 285 "source": "haiku" 286 } 287 }, 288 "contamination": { 289 "training_cutoff_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "Training data cutoffs for Llama-3-8B-Instruct and Llama-3.1-70B-Instruct are not stated, despite reporting operator accuracy on benchmarks that may overlap with training data.", 293 "source": "haiku" 294 }, 295 "train_test_overlap_discussed": { 296 "applies": true, 297 "answer": false, 298 "justification": "No discussion of potential training data overlap with any of the five benchmarks; GSM8K and HumanEval were published well before Llama-3 training.", 299 "source": "haiku" 300 }, 301 "benchmark_contamination_addressed": { 302 "applies": true, 303 "answer": false, 304 "justification": "GSM8K, HumanEval, and CaseHOLD were all publicly available before Llama-3 training cutoffs; potential contamination affecting geometric and accuracy results is not addressed.", 305 "source": "haiku" 306 } 307 }, 308 "human_studies": { 309 "pre_registered": { 310 "applies": false, 311 "answer": false, 312 "justification": "No human participants involved.", 313 "source": "haiku" 314 }, 315 "irb_or_ethics_approval": { 316 "applies": false, 317 "answer": false, 318 "justification": "No human participants involved.", 319 "source": "haiku" 320 }, 321 "demographics_reported": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants involved.", 325 "source": "haiku" 326 }, 327 "inclusion_exclusion_criteria": { 328 "applies": false, 329 "answer": false, 330 "justification": "No human participants involved.", 331 "source": "haiku" 332 }, 333 "randomization_described": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants involved.", 337 "source": "haiku" 338 }, 339 "blinding_described": { 340 "applies": false, 341 "answer": false, 342 "justification": "No human participants involved.", 343 "source": "haiku" 344 }, 345 "attrition_reported": { 346 "applies": false, 347 "answer": false, 348 "justification": "No human participants involved.", 349 "source": "haiku" 350 } 351 }, 352 "cost_and_practicality": { 353 "inference_cost_reported": { 354 "applies": true, 355 "answer": false, 356 "justification": "Hardware (8× NVIDIA B200 GPUs, 180GB VRAM each) is mentioned but no inference latency, throughput, or cost per trajectory is reported.", 357 "source": "haiku" 358 }, 359 "compute_budget_stated": { 360 "applies": true, 361 "answer": false, 362 "justification": "Compute node specification is given but total GPU-hours or cost to extract 25,000+ trajectories across two model scales is not stated.", 363 "source": "haiku" 364 } 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "Scale triggers domain-specific geometric phase transitions: legal reasoning undergoes 'Crystallization' (45% d95 collapse from 501→274, 31% alignment increase, 10× manifold untangling), while Science and Math remain geometrically invariant despite 9× parameter increase.", 371 "evidence": "PCA-based d95, displacement alignment, and G/L ratio measured on 25,000+ chain-of-thought trajectories at 8B and 70B across four domains; bootstrapped CIs reported for alignment changes.", 372 "supported": "moderate" 373 }, 374 { 375 "claim": "A universal oscillatory constant (step-to-step coherence ≈−0.4) persists across all four domains and both model scales, indicating an architectural invariant of transformer dynamics.", 376 "evidence": "Cosine similarity of consecutive velocity vectors reported as −0.40 to −0.42 uniformly across all eight experimental conditions; visualized in Figure 8.", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "Neural Reasoning Operators achieve 63.6% probe decoding accuracy on held-out legal classification tasks, exceeding identity and mean baselines by 10 percentage points.", 381 "evidence": "Adapter/probe decoding on 70/15/15 held-out test split; trained on 8B legal trajectories using AdamW with cosine annealing.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "Code reasoning at 70B forms a 'Lattice' of 5 discrete strategic modes, with silhouette score increasing 213% (0.133→0.417) compared to 8B.", 386 "evidence": "K-means clustering on PCA-projected (50-dim) start states with silhouette score optimization over k values.", 387 "supported": "weak" 388 }, 389 { 390 "claim": "Intrinsic dimensionality (dmle ≈20–25) is invariant across all domains and scales, decoupled from global dimensionality variation.", 391 "evidence": "Levina-Bickel MLE with k=10 nearest neighbors estimated on random subsamples of start states {h0} per condition.", 392 "supported": "weak" 393 } 394 ], 395 "methodology_tags": [ 396 "observational", 397 "benchmark-eval" 398 ], 399 "key_findings": "Analyzing 25,000+ chain-of-thought trajectories from Llama-3-8B and 70B across four domains, this paper finds scaling laws are domain-dependent at the geometric level: legal reasoning undergoes 'Crystallization' (45% dimensional collapse, 10× manifold untangling, 31% alignment increase), while scientific and mathematical reasoning remain geometrically invariant ('Liquid' phase), and code organizes into discrete strategic clusters ('Lattice' phase). A universal oscillatory signature (step-to-step coherence ≈−0.4) persists across all conditions regardless of domain or scale, interpreted as an architectural invariant of transformer dynamics. Neural Reasoning Operators trained on crystalline legal trajectories achieve 63.6% probe decoding accuracy on held-out classification, supporting the hypothesis that favorable manifold geometry enables amortized inference.", 400 "red_flags": [ 401 { 402 "flag": "Single model family", 403 "detail": "All experiments use only Llama-3-Instruct variants; 'universal' and 'architectural invariant' properties are demonstrated solely within one architecture family, making cross-architecture generalization unverified." 404 }, 405 { 406 "flag": "Only two scale points", 407 "detail": "Comparing 8B and 70B cannot distinguish a sharp phase transition from smooth continuous change; the 'phase transition' framing is a metaphor, not an empirically established discontinuity." 408 }, 409 { 410 "flag": "Code not released", 411 "detail": "Extraction and analysis code promised 'upon acceptance' or 'available upon request'; independent replication is currently impossible." 412 }, 413 { 414 "flag": "Causal language without causal design", 415 "detail": "Paper claims scale 'triggers,' 'induces,' and 'restructures' reasoning throughout, but the observational comparison of two static pre-trained checkpoints cannot support causal inference; the limitations section itself acknowledges this." 416 }, 417 { 418 "flag": "No funding or COI disclosure", 419 "detail": "Single author from commercial entity Scrivly.AI; no funding source, competing interests statement, or acknowledgments section is present." 420 }, 421 { 422 "flag": "Proxy-outcome conflation", 423 "detail": "Geometric changes in hidden-state dimensionality are interpreted as evidence that models 'reason differently' without demonstrating that dimensional collapse tracks improved task performance or different behavioral outputs." 424 }, 425 { 426 "flag": "LogicBench silently excluded", 427 "detail": "LogicBench extraction 'did not complete successfully' and was excluded without investigation; this silent failure potentially biases the domain coverage underlying the phase taxonomy." 428 }, 429 { 430 "flag": "Operator validated on 8B only", 431 "detail": "The 63.6% accuracy result is for operators trained and evaluated on 8B trajectories; 70B operator validation is deferred to 'future work' despite the paper's central focus on the 70B Crystallization event." 432 } 433 ], 434 "cited_papers": [ 435 { 436 "title": "Scaling Laws for Neural Language Models", 437 "relevance": "Kaplan et al. foundational paper this work claims to extend from behavioral metrics to representational geometry." 438 }, 439 { 440 "title": "Training Compute-Optimal Large Language Models (Chinchilla)", 441 "relevance": "Hoffmann et al. behavioral-only scaling analysis that the paper argues misses geometric heterogeneity across domains." 442 }, 443 { 444 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 445 "relevance": "Wei et al. foundational CoT paper; this work characterizes CoT trajectories geometrically across domains and scales." 446 }, 447 { 448 "title": "Emergent Abilities of Large Language Models", 449 "relevance": "Wei et al. phase transition framing of emergent abilities; this paper proposes a complementary geometric phase taxonomy." 450 }, 451 { 452 "title": "REMA: A Unified Reasoning Manifold Framework for Interpreting Large Language Models", 453 "relevance": "Li et al. (2025) most closely related prior work formalizing the reasoning manifold concept and using geometric deviation for failure diagnosis." 454 }, 455 { 456 "title": "A Statistical Physics of Language Model Reasoning", 457 "relevance": "Carson & Reisizadeh (2025) drift-diffusion framing of sentence-level trajectories; directly compared prior work in the same space." 458 }, 459 { 460 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 461 "relevance": "Schaeffer et al. challenges phase transition framing of LLM capabilities; directly relevant counterpoint to this paper's claims." 462 }, 463 { 464 "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark", 465 "relevance": "Primary scientific reasoning dataset used for trajectory extraction in this study." 466 }, 467 { 468 "title": "LexGLUE: A Benchmark Dataset for Legal Language Understanding in English", 469 "relevance": "Primary legal reasoning benchmark (SCOTUS subset) for the main cross-scale Crystallization analysis." 470 }, 471 { 472 "title": "Fast Inference from Transformers via Speculative Decoding", 473 "relevance": "Key inference acceleration baseline that endpoint prediction is proposed to improve upon by bypassing sequential trajectory traversal." 474 } 475 ], 476 "engagement_factors": { 477 "practical_relevance": { 478 "score": 2, 479 "justification": "Findings have direct implications for domain-specific model compression and inference acceleration, but code is not released making immediate application impossible." 480 }, 481 "surprise_contrarian": { 482 "score": 3, 483 "justification": "Directly and empirically challenges the dominant 'scale uniformly improves reasoning' narrative with domain-specific null results (Science/Math invariance) alongside the Crystallization finding." 484 }, 485 "fear_safety": { 486 "score": 1, 487 "justification": "Brief discussion of domain-dependent failure modes and interpretability implications for AI safety, but not a primary focus of the paper." 488 }, 489 "drama_conflict": { 490 "score": 1, 491 "justification": "Interesting physics-metaphor framing with phase transitions, but no direct conflict with named researchers or particularly controversial claims." 492 }, 493 "demo_ability": { 494 "score": 1, 495 "justification": "Uses open-source Llama models and public benchmarks so replication is theoretically possible, but code is not released." 496 }, 497 "brand_recognition": { 498 "score": 0, 499 "justification": "Single author from unknown commercial entity Scrivly.AI; no major lab or university affiliation." 500 } 501 }, 502 "hn_data": { 503 "threads": [], 504 "top_points": 0, 505 "total_points": 0, 506 "total_comments": 0 507 } 508 }