ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27597B)


      1 {
      2   "paper": {
      3     "title": "The Geometry of Thought: How Scale Restructures Reasoning in Large Language Models",
      4     "authors": ["Samuel Cyrenius Anderson"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.13358",
      8     "doi": "10.48550/arXiv.2601.13358"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Scale induces domain-specific geometric reorganizations in LLM reasoning representations rather than uniform improvement. Legal reasoning 'crystallizes' at 70B (45% dimensionality collapse, 31% alignment increase), scientific/math reasoning remains geometrically invariant, and code reasoning forms discrete strategic clusters. A universal oscillatory signature (coherence ≈ −0.4) persists across all domains and scales, suggesting an architectural invariant of transformers.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "Paper states code will be released 'upon acceptance' and is 'available to qualified researchers upon request' — not currently publicly available."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All datasets used (GSM8K, GPQA, HumanEval, CaseHOLD, LexGLUE-SCOTUS) are publicly available under their respective licenses, as stated in the Data and Code Availability section."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Hardware is described (8× NVIDIA B200 GPUs, bfloat16) and HuggingFace Transformers is mentioned, but no requirements.txt, library versions, or environment specification is provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology but does not include scripts, commands, or a README-style guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Figure 9 reports 95% bootstrapped confidence intervals for ∆Alignment across domains (e.g., Law: +0.22 [0.14, 0.30])."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No formal significance tests (p-values, t-tests, etc.) are reported. Confidence intervals are shown for alignment changes but comparative claims between domains lack formal statistical testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported throughout with baseline context: 45% dimensional collapse (d95: 501→274), 31% alignment increase (0.72→0.94), 213% clustering increase (silhouette: 0.133→0.417), and percentage changes with invariance zones."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Dataset sizes are stated (GSM8K N=7473, CaseHOLD N=5000, GPQA N=500, HumanEval N=164) but no justification is given for why these sizes are adequate, especially the small N=164 for code and N=500 for science."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Greedy decoding makes generation deterministic, but operator training results report only point accuracy (63.6%) with no variance across seeds or runs. Alignment variance (σ=0.027) is reported for trajectory statistics but not for the operator learning experiments."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Operator learning experiments include baselines: identity operator (ĥT = h0) and mean predictor (ĥT = E[hT]). Three operator architectures are compared (Linear, MLP, DeepONet). Section 3.5.1."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The baselines are simple mathematical baselines (identity, mean predictor). No comparison to existing inference acceleration methods (speculative decoding, early exit) or prior trajectory analysis work is made experimentally."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple operator architectures are compared (Linear, MLP, DeepONet, Spectral KAN, Turbo), and the Turbo operator's velocity-conditioning (h1−h0) is evaluated as a component. Section 2.8 discusses why Spectral KAN underperforms."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Six geometric metrics are reported: global dimension (d95), intrinsic dimension (dmle), trajectory alignment, step-to-step coherence, silhouette score, and G/L ratio. Operator evaluation uses MSE, latent similarity, and probe decoding accuracy."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a geometric analysis of hidden-state trajectories; human evaluation of system outputs is not relevant to the claims."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Operator learning uses a fixed 70/15/15 train/val/test split with seed 42, and accuracy is reported on held-out test trajectories. Section 3.5.1."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by domain (Law, Science, Code, Math) and by scale (8B, 70B), with per-domain values reported for all six geometric metrics. Figure 4 provides the full metrics matrix."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The Liquid phase (Science, Math) is presented as a null result where scale does not reorganize geometry. LogicBench extraction failure is acknowledged. The Spectral KAN operator's underperformance is discussed. Section 4.2 discusses why science resists crystallization."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results: Spectral KAN underperforms simpler Turbo operator, scientific/mathematical reasoning shows no geometric improvement with scale, LogicBench extraction did not complete. Section 2.8 explicitly frames KAN as an informative negative result."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims (45% dimensionality collapse, 31% alignment increase, 10× manifold untangling, silhouette 0.13→0.42, coherence ≈−0.4, 63.6% accuracy) are all supported with corresponding figures and numerical results in the paper."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper uses causal language throughout: 'scale triggers Crystallization', 'scale reshapes reasoning', 'scale induces domain-specific phase transitions.' These claims rest on observational comparisons across two model sizes, not controlled causal interventions. The paper acknowledges 'correlation, not causation' in Section 4.7 but continues using causal framing in the conclusion."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 4.7 (Limitations) explicitly bounds generalization: single model family (Llama-3), English-only data, two scale points, dataset confounds. The paper acknowledges these prevent claims about universal transformer behavior."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 4.1 presents two competing interpretations (Expertise vs. Compression) for crystallization. Section 4.7 discusses dataset confounds as an alternative explanation. Section 4.2 proposes domain structure (open vs. closed systems) as explanation for the liquid state."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper is careful to distinguish geometric measurements (dimensionality, alignment, coherence) from reasoning quality. It does not claim geometry equals capability; it claims geometry predicts operator learnability, which is separately tested. The probe decoding accuracy is presented as a proxy with clear framing."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Exact model identifiers given: 'meta-llama/Meta-Llama-3-8B-Instruct (hidden size d=4096, 32 layers)' and 'meta-llama/Llama-3.1-70B-Instruct (hidden size d=8192, 80 layers)'. Section 3.2.1."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Prompts are described in natural language ('Prompts instruct step-by-step reasoning and require an explicit delimiter Final:') but the actual prompt text is not provided in the paper or appendix."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Decoding: greedy (do_sample=False), max_new_tokens=512. Operator training: AdamW lr=1e-4, cosine annealing, batch size 64, 50 epochs, seed 42. MLE: k=10 nearest neighbors. Section 3.3.1 and 3.5.1."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper extracts hidden states from standard model inference."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.3 describes the two-pass generate-then-extract protocol, trajectory indexing, delimiter localization, filtering criteria (discard empty generations), minimum-length requirements, and float16→float32 casting. Section 3.2.2 states subsampling sizes."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4.7 is titled 'Limitations' and provides substantive discussion across five specific limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 4.7 lists specific threats: single model family (Llama-3), English-only data, only two scale points preventing characterization of transition dynamics, dataset confounds (size, difficulty, format differ across domains), correlation vs. causation."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 4.7 explicitly states what results do NOT show: cannot claim generality beyond Llama-3, cannot distinguish sharp transition from smooth compression with only two scale points, cannot claim causal relationships. English-only limitation noted for legal reasoning specifically."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper states it will not 'bundle large raw hidden-state dumps' and code/data will only be available upon acceptance or request. Raw trajectory data is not publicly available."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.3 describes the full trajectory extraction procedure: deterministic greedy decoding, teacher-forced hidden-state capture, trajectory indexing relative to prompt boundary, memory-mapped storage in float16."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data sources are standard public benchmarks (GSM8K, GPQA, HumanEval, CaseHOLD, LexGLUE-SCOTUS)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from dataset loading through tokenization, generation, teacher-forced extraction, trajectory indexing, filtering, and geometric analysis is documented across Sections 3.3-3.4. Filtering criteria and minimum-length requirements are stated."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed anywhere in the paper. The author is affiliated with Scrivly.AI but no funding source is mentioned."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliation is listed as 'Scrivly.AI' with contact email sam@scrivly.ai."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information is disclosed, so independence of funder cannot be assessed. The author's company Scrivly.AI could potentially benefit from inference acceleration findings."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present. The author is affiliated with Scrivly.AI which may have commercial interests related to the inference acceleration applications discussed."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not state the training data cutoff dates for Llama-3-8B-Instruct or Llama-3.1-70B-Instruct, despite evaluating them on benchmarks that existed before training."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the models may have seen GSM8K, HumanEval, GPQA, or LexGLUE data during training, despite these being public benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "HumanEval (2021) and GSM8K (2021) were published well before Llama-3's training. The paper does not discuss contamination risk, though it claims to use benchmarks for geometric characterization rather than leaderboard reporting."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or wall-clock time is reported for trajectory extraction or operator training, despite using 8× B200 GPUs and processing 25,000+ trajectories."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is described (8× NVIDIA B200 GPUs, 180GB VRAM each) but total compute budget (GPU hours, training time, total extraction time) is not quantified."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Generation uses greedy decoding (deterministic), but operator training uses a single seed (42) with no sensitivity analysis across seeds."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of operator training runs is not stated. It appears to be a single run per architecture. Best checkpoint selected by validation MSE but no multi-run statistics."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The operator architectures use fixed hyperparameters (lr=1e-4, batch 64, 50 epochs) with no indication of how these were selected."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Best checkpoint selected by validation MSE on the held-out validation split (15% of data), and reported on test split. Section 3.5.1."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The author implements all operator baselines and architectures personally, and the paper does not acknowledge the potential bias from implementing and tuning one's own baselines."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "The operator architectures differ in parameter count but compute differences are small (all are lightweight adapters). The main comparison is about geometric structure, not compute-performance tradeoffs."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper explicitly discusses using benchmarks for geometric characterization rather than leaderboard-style evaluation: 'We use these splits as a fixed corpus for geometric characterization and operator/probe training rather than leaderboard-style benchmark reporting.' Section 3.2.2."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. The paper extracts hidden states from standard model inference without agentic scaffolding."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage despite using benchmarks (HumanEval 2021, GSM8K 2021) published well before Llama-3's training. Though the paper uses benchmarks for geometric analysis not accuracy evaluation, contamination could still affect hidden-state structure."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information. For example, the legal classification task provides the full court opinion and asks for issue area classification — no discussion of whether prompt format affects trajectory geometry."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of potential overlap or structural similarity between the benchmark datasets and model training data."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Legal reasoning undergoes 'Crystallization' at scale: 45% collapse in global dimensionality (d95: 501→274), 31% alignment increase (0.72→0.94), and 10× manifold untangling (G/L: 9.82→0.98) between 8B and 70B parameters.",
    365       "evidence": "Figures 2, 3, 5, 7 and Section 4.1 present the dimensional, alignment, and G/L metrics. 95% CI for alignment gain: [0.14, 0.30] (Figure 9).",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Scientific and mathematical reasoning are geometrically invariant to scale (Liquid phase): d95 237→235 for Science, 501→501 for Math.",
    370       "evidence": "Figures 2, 3, 10. Math alignment change CI spans zero: −0.01 [−0.07, +0.05] (Figure 9).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Code reasoning forms a discrete Lattice of strategic modes with silhouette score increasing from 0.13 to 0.42 and optimal clusters expanding from k=2 to k=5 at scale.",
    375       "evidence": "Figures 1, 4. Silhouette scores and cluster counts reported in Results section.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "A universal oscillatory signature (coherence ≈ −0.4) is invariant across all domains and scales, suggesting an architectural invariant of transformer dynamics.",
    380       "evidence": "Figure 8 shows coherence values: Law −0.40/−0.40, Science −0.40/−0.40, Code −0.42/−0.40, Math −0.42/−0.40.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Neural Reasoning Operators achieve 63.6% accuracy on held-out legal classification tasks via probe decoding, exceeding baselines by 10 percentage points.",
    385       "evidence": "Sections 1.4 and 5 (Conclusion) report 63.6% accuracy and 74.7% latent similarity for the Turbo operator. Trained on 8B trajectories with 70/15/15 split.",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Single model family",
    392       "detail": "All experiments use Llama-3-family Instruct models only (Meta-Llama-3-8B-Instruct and Llama-3.1-70B-Instruct). The three-phase taxonomy (Crystal/Liquid/Lattice) may reflect Llama-specific training dynamics rather than universal properties of transformer reasoning. The paper acknowledges this but still frames its findings broadly."
    393     },
    394     {
    395       "flag": "Only two scale points",
    396       "detail": "With only 8B and 70B comparisons, the paper cannot distinguish a sharp phase transition from gradual compression, yet uses phase transition framing extensively. Intermediate scales (14B, 33B) would be needed to characterize the transition shape."
    397     },
    398     {
    399       "flag": "Operator validation limited to pilot on 8B",
    400       "detail": "The headline 63.6% operator accuracy is achieved only on 8B legal trajectories with probe decoding. No 70B operator results are reported despite Crystallization at 70B being the central finding. The paper speculates 70B would be better but provides no evidence."
    401     },
    402     {
    403       "flag": "No contamination analysis",
    404       "detail": "Uses public benchmarks (HumanEval 2021, GSM8K 2021) that likely appeared in Llama-3 training data. While the paper claims to study geometry rather than accuracy, if models have memorized solutions, the hidden-state trajectories may reflect retrieval rather than reasoning, undermining the entire geometric analysis."
    405     },
    406     {
    407       "flag": "Dataset confounds across domains",
    408       "detail": "Domains differ dramatically in dataset size (164 to 7473), answer format, and difficulty distribution. The paper acknowledges this but does not control for it — observed geometric differences may reflect data characteristics rather than domain properties."
    409     },
    410     {
    411       "flag": "Causal language from observational design",
    412       "detail": "Despite acknowledging 'correlation, not causation' in limitations, the paper consistently uses causal framing: 'scale triggers', 'scale induces', 'scale reshapes.' Title and abstract use causal framing throughout."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Scaling laws for neural language models",
    418       "authors": ["J. Kaplan", "S. McCandlish", "T. Henighan"],
    419       "year": 2020,
    420       "arxiv_id": "2001.08361",
    421       "relevance": "Foundational scaling laws paper establishing power-law relationships between model parameters, data, and performance."
    422     },
    423     {
    424       "title": "Training compute-optimal large language models",
    425       "authors": ["J. Hoffmann", "S. Borgeaud", "A. Mensch"],
    426       "year": 2022,
    427       "arxiv_id": "2203.15556",
    428       "relevance": "Chinchilla scaling analysis showing optimal compute allocation requires scaling data and parameters in tandem."
    429     },
    430     {
    431       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    432       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    433       "year": 2022,
    434       "arxiv_id": "2201.11903",
    435       "relevance": "Introduces chain-of-thought prompting showing intermediate reasoning steps improve LLM performance on reasoning tasks."
    436     },
    437     {
    438       "title": "Emergent abilities of large language models",
    439       "authors": ["J. Wei", "Y. Tay", "R. Bommasani"],
    440       "year": 2022,
    441       "arxiv_id": "2206.07682",
    442       "relevance": "Documents capabilities that appear discontinuously at specific scale thresholds, relevant to phase transition analysis."
    443     },
    444     {
    445       "title": "Are emergent abilities of large language models a mirage?",
    446       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    447       "year": 2023,
    448       "arxiv_id": "2304.15004",
    449       "relevance": "Challenges emergence framing, arguing apparent discontinuities may reflect metric choice — relevant context for phase transition claims."
    450     },
    451     {
    452       "title": "Evaluating large language models trained on code",
    453       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    454       "year": 2021,
    455       "arxiv_id": "2107.03374",
    456       "relevance": "Introduces the HumanEval benchmark, which this study uses as its code-domain corpus for trajectory analysis."
    457     },
    458     {
    459       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    460       "authors": ["S. Yao", "D. Yu", "J. Zhao"],
    461       "year": 2023,
    462       "arxiv_id": "2305.10601",
    463       "relevance": "Structures reasoning as search over alternative continuations, relevant to understanding reasoning trajectory geometry."
    464     },
    465     {
    466       "title": "Representation engineering: A top-down approach to AI transparency",
    467       "authors": ["A. Zou", "L. Phan", "S. Chen"],
    468       "year": 2023,
    469       "arxiv_id": "2310.01405",
    470       "relevance": "Demonstrates linear probes can identify high-level concepts in activation space, foundational to representation analysis approaches."
    471     },
    472     {
    473       "title": "GPQA: A graduate-level google-proof Q&A benchmark",
    474       "authors": ["D. Rein", "B. L. Hou", "A. C. Stickland"],
    475       "year": 2023,
    476       "arxiv_id": "2311.12022",
    477       "relevance": "Expert-level scientific reasoning benchmark used in this study."
    478     },
    479     {
    480       "title": "Fast inference from transformers via speculative decoding",
    481       "authors": ["Y. Leviathan", "M. Kalman", "Y. Matias"],
    482       "year": 2023,
    483       "arxiv_id": "2211.17192",
    484       "relevance": "Key inference acceleration method against which this paper positions its endpoint prediction approach."
    485     }
    486   ]
    487 }

Impressum · Datenschutz