ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (26218B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Geometry of Thought: How Scale Restructures Reasoning In Large Language Models",
      6     "authors": [
      7       "S. Anderson"
      8     ],
      9     "year": 2026,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2601.13358",
     12     "doi": "10.48550/arXiv.2601.13358"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All specific quantitative claims in the abstract (45% dimensional collapse, 31% alignment increase, 10× manifold untangling, coherence ≈−0.4, 63.6% operator accuracy) are backed by results sections with matching numbers.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper uses causal language throughout ('scale triggers,' 'scale induces') but compares only two static pre-trained checkpoints; the limitations section itself admits 'Correlation, not causation.'",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The conclusion states 'scale does not improve reasoning—it reshapes it' as a universal claim; the limitations section does restrict this to Llama and English data, but the abstract and main text repeatedly generalize beyond the tested setting.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 4.1 explicitly presents two competing interpretations ('Expertise Interpretation' vs. 'Compression Interpretation') and acknowledges the data cannot fully disambiguate them.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper conflates geometric changes in hidden-state dimensionality with qualitative changes in 'understanding,' stating the model 'reasons differently' based on dimensional collapse without demonstrating that lower dimensionality tracks task accuracy.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 4.7 is a dedicated limitations section covering single model family, English-only data, only two scale points, dataset confounds, and lack of causal identification.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Specific named threats include: single model family (Llama-3-Instruct), English-only benchmarks, only two scale comparison points (8B and 70B), domain-dataset confounds, and absence of causal intervention experiments.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Limitations explicitly state results may not generalize to other architectures, non-English legal systems, or intermediate/larger scales, with causal interpretation requiring further intervention experiments.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding disclosure or acknowledgments section is present anywhere in the paper.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliation with Scrivly.AI is disclosed on the title page (sam@scrivly.ai).",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No funding is disclosed; the author is from Scrivly.AI, a commercial entity whose potential interest in the research outcomes is unknown and undisclosed.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement, patent disclosure, or financial interests declaration appears anywhere in the paper.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key terms are formally defined: 'reasoning trajectory' (sequence of final-layer hidden states, Eq. 1), d95, dmle, alignment, coherence, and all three phase labels (Crystalline/Liquid/Lattice) are operationalized in Sections 1.2 and 3.4.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 1.5 'Contributions' lists five numbered contributions explicitly: domain-dependent geometric scaling laws, a three-phase taxonomy, the universal oscillatory signature, geometry-aware operator learning, and a reproducible measurement framework.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 2 contains 10 subsections and Table 1 explicitly positioning contributions relative to scaling laws, CoT, mechanistic interpretability, manifold hypothesis, phase transitions, inference acceleration, and operator learning literature.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "Code release is promised 'upon acceptance'; until then 'available to qualified researchers upon request'—both are NO under the criterion.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "All five datasets (GSM8K, GPQA, HumanEval, CaseHOLD, LexGLUE–SCOTUS) are publicly available standard benchmarks used unmodified.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Hardware (8× NVIDIA B200 GPUs, bfloat16 weights) is mentioned but no requirements.txt, Dockerfile, or Python package versions are provided.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The methodology describes the conceptual protocol but no step-by-step reproduction instructions are provided, and code is not released.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Section 5.4 ('Statistical Robustness') reports bootstrapped 95% confidence intervals for alignment changes across all four domains.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": true,
    152         "justification": "Bootstrap CIs are used to assess statistical significance—Math CI spans zero (no detectable change, consistent with scale invariance), Law CI [0.14, 0.30] excludes zero (confirming a significant effect).",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Absolute and percentage changes are reported throughout: 45% dimensional collapse, Δ=+0.22 alignment, 213% clustering increase, 10× G/L reduction.",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "Sample sizes are determined by benchmark availability (N=7473 GSM8K, N=500 GPQA, N=164 HumanEval, N=5000 legal subsamples) with no power analysis or justification for adequacy.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Variance is reported for only one metric (σ=0.027 for legal alignment); d95, dmle, coherence, G/L ratio, and silhouette score are all reported as single point estimates without spread.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Neural Reasoning Operator evaluation uses two baselines: identity predictor (ĥ_T = h_0) and mean predictor (ĥ_T = E[h_T]).",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Baselines are appropriate for the operator learning task; four operator architectures (Linear, MLP, DeepONet, Spectral KAN) are also compared against each other.",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Four operator architectures (Linear, MLP, DeepONet, Spectral KAN) are systematically compared against the Turbo operator with velocity conditioning, effectively ablating design choices.",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Six geometric metrics are reported (d95, dmle, alignment, coherence, silhouette, G/L ratio) plus operator test MSE and probe decoding accuracy.",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": false,
    201           "answer": false,
    202           "justification": "Human evaluation is not relevant for this geometric analysis of LLM hidden-state trajectories.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Operator training uses a fixed 70/15/15 train/val/test split (seed 42); probe decoding accuracy is reported on the held-out test set.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "All results are broken down by domain (Law, Science, Code, Math) and scale (8B, 70B), yielding 48 measurements across 6 metrics × 4 domains × 2 scales.",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "LogicBench extraction failure is reported and the domain excluded; the paper also explains why Liquid domains (Science, Math) resist operator amortization due to unfavorable geometry.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Spectral KAN underperforms the Turbo operator (reported as an informative negative result); scale invariance in Science/Math is explicitly framed as a null result of equal theoretical significance to Crystallization.",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Exact HuggingFace model IDs are specified: 'meta-llama/Meta-Llama-3-8B-Instruct' and 'meta-llama/Llama-3.1-70B-Instruct' with hidden dimensions noted.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "Prompt format is described (chat template with delimiters like 'Final:', 'Answer:', 'Verdict:') but actual prompt text is not provided; it will be released with the code upon acceptance.",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Greedy decoding (do_sample=False), max_new_tokens=512, AdamW lr=1e-4, cosine annealing, batch size 64, 50 epochs, k=10 for MLE estimator, and fixed seed 42 are all reported.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": false,
    251           "answer": false,
    252           "justification": "No agentic scaffolding is used; this is a trajectory extraction and geometric analysis study on instruction-tuned models.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Section 3.3 documents subsampling (N=5000 for legal datasets), empty generation filtering, two-pass generate-then-extract protocol, delimiter localization, and float16 storage with float32 computation.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "Hidden-state trajectory arrays are stored as memory-mapped files but not released; 'available to qualified researchers upon request' does not constitute public availability.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Section 3.3 details the two-pass generate-then-extract protocol including teacher-forced forward passes, trajectory indexing formulas, delimiter localization, and filtering steps.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "Standard public benchmarks are used; no participant recruitment is involved.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Full pipeline from benchmark loading → tokenization → generation → teacher-forced extraction → NumPy memmap storage → geometric analysis is documented across Sections 3.2–3.4.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "Training data cutoffs for Llama-3-8B-Instruct and Llama-3.1-70B-Instruct are not stated, despite reporting operator accuracy on benchmarks that may overlap with training data.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "No discussion of potential training data overlap with any of the five benchmarks; GSM8K and HumanEval were published well before Llama-3 training.",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "GSM8K, HumanEval, and CaseHOLD were all publicly available before Llama-3 training cutoffs; potential contamination affecting geometric and accuracy results is not addressed.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants involved.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants involved.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants involved.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants involved.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants involved.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants involved.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants involved.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "Hardware (8× NVIDIA B200 GPUs, 180GB VRAM each) is mentioned but no inference latency, throughput, or cost per trajectory is reported.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Compute node specification is given but total GPU-hours or cost to extract 25,000+ trajectories across two model scales is not stated.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Scale triggers domain-specific geometric phase transitions: legal reasoning undergoes 'Crystallization' (45% d95 collapse from 501→274, 31% alignment increase, 10× manifold untangling), while Science and Math remain geometrically invariant despite 9× parameter increase.",
    371       "evidence": "PCA-based d95, displacement alignment, and G/L ratio measured on 25,000+ chain-of-thought trajectories at 8B and 70B across four domains; bootstrapped CIs reported for alignment changes.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "A universal oscillatory constant (step-to-step coherence ≈−0.4) persists across all four domains and both model scales, indicating an architectural invariant of transformer dynamics.",
    376       "evidence": "Cosine similarity of consecutive velocity vectors reported as −0.40 to −0.42 uniformly across all eight experimental conditions; visualized in Figure 8.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Neural Reasoning Operators achieve 63.6% probe decoding accuracy on held-out legal classification tasks, exceeding identity and mean baselines by 10 percentage points.",
    381       "evidence": "Adapter/probe decoding on 70/15/15 held-out test split; trained on 8B legal trajectories using AdamW with cosine annealing.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Code reasoning at 70B forms a 'Lattice' of 5 discrete strategic modes, with silhouette score increasing 213% (0.133→0.417) compared to 8B.",
    386       "evidence": "K-means clustering on PCA-projected (50-dim) start states with silhouette score optimization over k values.",
    387       "supported": "weak"
    388     },
    389     {
    390       "claim": "Intrinsic dimensionality (dmle ≈20–25) is invariant across all domains and scales, decoupled from global dimensionality variation.",
    391       "evidence": "Levina-Bickel MLE with k=10 nearest neighbors estimated on random subsamples of start states {h0} per condition.",
    392       "supported": "weak"
    393     }
    394   ],
    395   "methodology_tags": [
    396     "observational",
    397     "benchmark-eval"
    398   ],
    399   "key_findings": "Analyzing 25,000+ chain-of-thought trajectories from Llama-3-8B-Instruct and Llama-3.1-70B-Instruct across four domains, this paper finds scaling laws are domain-dependent at the geometric level: legal reasoning undergoes 'Crystallization' (45% dimensional collapse, 10× manifold untangling, 31% alignment increase), while scientific and mathematical reasoning remain geometrically invariant ('Liquid' phase), and code organizes into discrete strategic clusters ('Lattice' phase). A universal oscillatory signature (step-to-step coherence ≈−0.4) persists across all conditions regardless of domain or scale, interpreted as an architectural invariant of transformer dynamics. Neural Reasoning Operators trained on crystalline legal trajectories achieve 63.6% probe decoding accuracy on held-out classification, supporting the hypothesis that favorable manifold geometry enables amortized inference.",
    400   "red_flags": [
    401     {
    402       "flag": "Single model family",
    403       "detail": "All experiments use only Llama-3-Instruct variants; 'universal' and 'architectural invariant' properties are demonstrated solely within one architecture family, making cross-architecture generalization unverified."
    404     },
    405     {
    406       "flag": "Only two scale points",
    407       "detail": "Comparing 8B and 70B cannot distinguish a sharp phase transition from smooth continuous change; the 'phase transition' framing is a metaphor, not an empirically established discontinuity."
    408     },
    409     {
    410       "flag": "Code not released",
    411       "detail": "Extraction and analysis code promised 'upon acceptance' or 'available upon request'; independent replication is currently impossible."
    412     },
    413     {
    414       "flag": "Causal language without causal design",
    415       "detail": "Paper claims scale 'triggers,' 'induces,' and 'restructures' reasoning throughout, but the observational comparison of two static pre-trained checkpoints cannot support causal inference; the limitations section itself acknowledges this."
    416     },
    417     {
    418       "flag": "No funding or COI disclosure",
    419       "detail": "Single author from commercial entity Scrivly.AI; no funding source, competing interests statement, or acknowledgments section is present."
    420     },
    421     {
    422       "flag": "Proxy-outcome conflation",
    423       "detail": "Geometric changes in hidden-state dimensionality are interpreted as evidence that models 'reason differently' without demonstrating that dimensional collapse tracks improved task performance or different behavioral outputs."
    424     },
    425     {
    426       "flag": "LogicBench excluded without investigation",
    427       "detail": "LogicBench extraction 'did not complete successfully' and was excluded without investigation; this unexplained failure potentially biases the domain coverage underlying the phase taxonomy."
    428     },
    429     {
    430       "flag": "Operator validated on 8B only",
    431       "detail": "The 63.6% accuracy result is for operators trained and evaluated on 8B trajectories; 70B operator validation is deferred to 'future work' despite the paper's central focus on the 70B Crystallization event."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Scaling Laws for Neural Language Models",
    437       "relevance": "Kaplan et al. foundational paper this work claims to extend from behavioral metrics to representational geometry."
    438     },
    439     {
    440       "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
    441       "relevance": "Hoffmann et al. behavioral-only scaling analysis that the paper argues misses geometric heterogeneity across domains."
    442     },
    443     {
    444       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    445       "relevance": "Wei et al. foundational CoT paper; this work characterizes CoT trajectories geometrically across domains and scales."
    446     },
    447     {
    448       "title": "Emergent Abilities of Large Language Models",
    449       "relevance": "Wei et al. phase transition framing of emergent abilities; this paper proposes a complementary geometric phase taxonomy."
    450     },
    451     {
    452       "title": "REMA: A Unified Reasoning Manifold Framework for Interpreting Large Language Models",
    453       "relevance": "Li et al. (2025) most closely related prior work formalizing the reasoning manifold concept and using geometric deviation for failure diagnosis."
    454     },
    455     {
    456       "title": "A Statistical Physics of Language Model Reasoning",
    457       "relevance": "Carson & Reisizadeh (2025) drift-diffusion framing of sentence-level trajectories; directly compared prior work in the same space."
    458     },
    459     {
    460       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    461       "relevance": "Schaeffer et al. challenges phase transition framing of LLM capabilities; directly relevant counterpoint to this paper's claims."
    462     },
    463     {
    464       "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
    465       "relevance": "Primary scientific reasoning dataset used for trajectory extraction in this study."
    466     },
    467     {
    468       "title": "LexGLUE: A Benchmark Dataset for Legal Language Understanding in English",
    469       "relevance": "Primary legal reasoning benchmark (SCOTUS subset) for the main cross-scale Crystallization analysis."
    470     },
    471     {
    472       "title": "Fast Inference from Transformers via Speculative Decoding",
    473       "relevance": "Key inference acceleration baseline that endpoint prediction is proposed to improve upon by bypassing sequential trajectory traversal."
    474     }
    475   ],
    476   "engagement_factors": {
    477     "practical_relevance": {
    478       "score": 2,
    479       "justification": "Findings have direct implications for domain-specific model compression and inference acceleration, but code is not released making immediate application impossible."
    480     },
    481     "surprise_contrarian": {
    482       "score": 3,
    483       "justification": "Directly and empirically challenges the dominant 'scale uniformly improves reasoning' narrative with domain-specific null results (Science/Math invariance) alongside the Crystallization finding."
    484     },
    485     "fear_safety": {
    486       "score": 1,
    487       "justification": "Brief discussion of domain-dependent failure modes and interpretability implications for AI safety, but not a primary focus of the paper."
    488     },
    489     "drama_conflict": {
    490       "score": 1,
    491       "justification": "Interesting physics-metaphor framing with phase transitions, but no direct conflict with named researchers or particularly controversial claims."
    492     },
    493     "demo_ability": {
    494       "score": 1,
    495       "justification": "Uses open-source Llama models and public benchmarks so replication is theoretically possible, but code is not released."
    496     },
    497     "brand_recognition": {
    498       "score": 0,
    499       "justification": "Single author from unknown commercial entity Scrivly.AI; no major lab or university affiliation."
    500     }
    501   },
    502   "hn_data": {
    503     "threads": [],
    504     "top_points": 0,
    505     "total_points": 0,
    506     "total_comments": 0
    507   }
    508 }

Impressum · Datenschutz