ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (20733B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Energy-Aware Routing to Large Reasoning Models",
      6     "authors": [
      7       "Austin R. Ellis-Mohr",
      8       "Max Hartman",
      9       "Lav R. Varshney"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2601.00823",
     14     "doi": "10.48550/arXiv.2601.00823"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract's claims about three operating regimes, volatility-limited performance, and variance-aware routing are supported by Theorem 2 and the simulation results in Fig. 2 and Fig. 4.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper's causal claims (e.g., 'Routing error induces per-task excess energy, which shifts the mean battery drift') are justified by mathematical proofs (Theorems 1-2) and confirmed by simulation. The causal mechanisms are analytically derived.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Assumptions are clearly stated (Assumption 1: i.i.d. arrivals, Poisson tasks, empty initial battery). Section V explicitly notes these assumptions and discusses relaxation as future work. The paper states simulations use 'abstract model, hardware, and task parameters (not deployed models).'",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not discuss alternative explanations for the observed simulation scaling behavior or whether other mathematical frameworks could yield different predictions. Section V discusses extensions but not alternatives.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper's claims match the granularity of its measurements. It measures auxiliary energy deficit E[D_T] and claims this characterizes energy-aware routing. No proxy gap exists — the measured quantity is exactly what is being optimized.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "There is no dedicated limitations section. Section V (Discussion) mentions future work directions (relaxing i.i.d. assumptions, dependency structures, limited-capacity batteries, stochastic tolerance) but frames them as extensions, not limitations of the current work.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The Discussion mentions specific assumptions that could be relaxed (i.i.d. arrivals, unlimited parallel processing, oracle stopping, independent tasks) but frames these as future work opportunities rather than threats to the validity of current results.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper clearly states its modeling assumptions: Assumption 1 (i.i.d., Poisson), oracle stopping (Sec II-C), unlimited parallel capacity (Sec II), and abstract parametric models (Appendix B: 'not deployed models'). Section V explicitly notes aspects not addressed.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source or acknowledgments section is present in the paper.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly stated: Department of Electrical and Computer Engineering, University of Illinois Urbana-Champaign, and AI Innovation Institute, Stony Brook University.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is itself a gap.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms precisely defined: 'Large reasoning models (LRMs)' (usage throughout); 'auxiliary energy' (Section II.D as supplemental energy); 'thinking time' τ (Section II.B); 'task descriptor' θ (Section II.A); all mathematical quantities carefully formalized.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Abstract and Introduction explicitly state contributions: 'introduce a mathematically principled formulation' (abstract); 'first- and second-order characterizations' (Section I); 'practical guidance for dispatch policies in deployed AI factories' (Section I).",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section I discusses prior work (Clover, EcoServe, FrugalGPT, pruning) and explicitly states how this work differs: 'does not explicitly address deployment settings in which renewable energy availability, inference cost heterogeneity, and deadline constraints must be jointly considered.' Cites formal similarity to energy harvesting systems.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "theoretical": {
    118       "formal_quality": {
    119         "assumptions_stated_explicitly": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Assumption 1 formally stated with explicit conditions (Poisson arrivals, i.i.d. tasks/harvests). Oracle stopping assumption stated in Section II.C: 'once the router dispatches a task and chooses (i,τ) satisfying (3), the task is guaranteed to stop'.",
    123           "source": "haiku"
    124         },
    125         "proofs_complete_or_sketched": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Theorem 1 (Appendix A, 2 pages): complete proof by induction. Theorem 2 (Appendix A, 3 pages): full case analysis for three regimes with detailed integral evaluations. Lemmas 1-3 have complete proofs. No 'left to reader' statements.",
    129           "source": "haiku"
    130         },
    131         "bounds_tight_or_discussed": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Lemma 1 proves lumped-myopic is 'pathwise pessimistic' (conservative bound); its looseness acknowledged. Theorem 2 derives exact scaling from first principles (Brownian motion), not loose approximations. Section III.C explicitly discusses bound quality via κ parameter.",
    135           "source": "haiku"
    136         },
    137         "counterexamples_explored": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Paper explores parameter regimes (Figure 2: drift-dominated, fluctuation-dominated, critical μ=0). Figure 4 shows regime transitions between square-root and linear scaling. Appendix B explores different task difficulties [1.7, 1.9] and model sizes (1B, 10B) affecting crossing behavior.",
    141           "source": "haiku"
    142         },
    143         "notation_consistent": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Mathematical notation consistent throughout: B_t/B̃_t for battery states, C_t for consumption rate, E_i(x,τ) for energy functions, Θ/ε/λ for task properties. Standard symbols (Φ for normal CDF, W_t for Brownian motion) used appropriately.",
    147           "source": "haiku"
    148         },
    149         "constructive_vs_existence_noted": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Theorem 1 is constructive: provides an explicit greedy rule G_t := (-(B̃_t + R_t - C_t))^+ for computing minimum injections. Theorem 2 gives closed-form formulas (Eq. 27). Section IV provides an explicit numerical algorithm for scaling-law-based dispatch.",
    153           "source": "haiku"
    154         }
    155       },
    156       "connections": {
    157         "connection_to_practice_discussed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Section I states 'provides practical guidance for dispatch policies in deployed AI factories'. Section IV grounds analysis in empirical scaling laws from Hoffmann et al [25] and Ellis-Mohr et al [2]. Figure 3 demonstrates practical 1B/10B model tradeoff. Section III.C discusses dispatcher overhead practical constraints.",
    161           "source": "haiku"
    162         },
    163         "relationship_to_prior_work_clear": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Explicitly positions relative to energy harvesting literature: 'has formal similarity to information-theoretic investigations of optimal packet scheduling in energy harvesting systems [18][19]'. Distinguishes contribution: 'we study first- and second-order characterizations...directly using properties of Brownian motion' vs Berry-Esseen approaches.",
    167           "source": "haiku"
    168         },
    169         "computational_complexity_discussed": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Section III.C explicitly analyzes dispatcher computational overhead: 'dispatcher itself consumes energy E_π^self(X) and time ξ_π^self(X), reducing available slack and precluding lower-energy allocations'. Motivates use of scaling laws for 'lightweight routing without heavy precomputation'.",
    173           "source": "haiku"
    174         },
    175         "limitations_of_formal_model_stated": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Oracle stopping isolates 'routing/dispatch limits from early-stopping design' (II.C). Section V lists explicit model limitations: 'relaxing the i.i.d. assumptions', 'limited-capacity or leaky battery models', 'tolerance constraint ε as deterministic [vs] stochastic', task dependencies currently assume independence.",
    179           "source": "haiku"
    180         }
    181       }
    182     }
    183   },
    184   "claims": [
    185     {
    186       "claim": "Energy-aware routing exhibits three distinct operating regimes determined by drift parameter μ = E[R_t] - E[C_t^my]",
    187       "evidence": "Theorem 2 proves expected deficit E[D_T] takes three forms: |μ|T + σ²/(2|μ|) for μ<0, σ²/(2μ) for μ>0, and σ√(2T/π) for μ=0. Figure 4 validates regime transitions numerically.",
    188       "supported": "strong"
    189     },
    190     {
    191       "claim": "At the critical regime (μ=0), auxiliary energy deficit is volatility-limited and scales as O(√T) rather than linearly in T",
    192       "evidence": "Theorem 2 case μ=0 yields E[D_T] ≈ σ√(2T/π) (Eq. 28). Figure 4 shows zero-error policy maintains √T scaling throughout. This is the key insight motivating variance-aware routing.",
    193       "supported": "strong"
    194     },
    195     {
    196       "claim": "Routing error (excess energy beyond minimum required) acts as a first-order effect that eventually dominates second-order fluctuation costs",
    197       "evidence": "Section III.C shows routing error ∆E_i^π(X) shifts mean drift by ĒΔE, while fluctuations contribute σ√T. For large T where |μ|T >> σ√T, drift dominates. Figure 4 shows transition point between regimes marked by vertical lines.",
    198       "supported": "strong"
    199     },
    200     {
    201       "claim": "The cumulative cost of auxiliary energy injections equals the maximum deficit of the unconstrained battery trajectory",
    202       "evidence": "Theorem 1 proves D_T = (-min_{0≤t≤T} B_t)^+, with complete inductive proof in Appendix A. This fundamental identity simplifies analysis from constrained to unconstrained dynamics.",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "Variance-aware routing is most beneficial in the critical regime where drift and fluctuation effects are comparable in magnitude",
    207       "evidence": "Figure 2 plots deviation from drift-only scaling vs relative mean-variance ratio κ. Shows maximum deviation (suggesting fluctuations matter most) when κ ≈ 1, i.e., when |μ|T and σ√T are of similar scale.",
    208       "supported": "moderate"
    209     },
    210     {
    211       "claim": "Scaling laws for training-compute and inference-compute can enable lightweight dispatch without heavy computational overhead",
    212       "evidence": "Section IV.C proposes using empirical scaling laws to compute feasible allocations, reducing dispatcher to 'difficulty prediction' task. Figure 3 demonstrates energy/latency tradeoff but uses toy parameters; no real-system validation provided.",
    213       "supported": "moderate"
    214     }
    215   ],
    216   "methodology_tags": [
    217     "theoretical",
    218     "numerical-simulation"
    219   ],
    220   "key_findings": "The paper establishes that energy-aware routing for large reasoning models under renewable energy constraints exhibits three distinct operating regimes: (1) persistent deficit (μ<0) where auxiliary energy grows linearly in time, dominated by routing accuracy; (2) persistent surplus (μ>0) where auxiliary energy remains bounded but renewable energy is wasted; (3) critical regime (μ=0) where the system operates at a volatility-limited equilibrium with deficit scaling as O(√T). The critical insight is that at the operating point where neither regime dominates, stochastic fluctuations become as important as routing accuracy in determining energy costs. The paper provides explicit scaling relationships grounded in Brownian motion theory, suggesting that lightweight, scaling-law-based dispatch policies can achieve near-optimal energy efficiency.",
    221   "red_flags": [
    222     {
    223       "flag": "Oracle stopping assumption unrealistic",
    224       "detail": "Section II.C assumes tasks 'halt immediately when instructed' at chosen thinking time τ. Real systems have variable stopping times, early-exit failures, and stopping overhead. Authors explicitly acknowledge this isolates the problem but the assumption is strong."
    225     },
    226     {
    227       "flag": "Independence assumptions violate real workload structure",
    228       "detail": "Assumption 1 requires Poisson task arrivals and i.i.d. energy harvesting. Real ML workloads and renewable energy exhibit temporal correlations, time-of-day patterns, and burstiness. Section V acknowledges this as future work."
    229     },
    230     {
    231       "flag": "Scaling laws grounding is preliminary and unpublished",
    232       "detail": "Section IV grounds inference-compute scaling in Ellis-Mohr et al [2] which is marked 'to appear' (not yet published). Empirical validation that these specific scaling laws hold for energy prediction is not provided in this paper."
    233     },
    234     {
    235       "flag": "Numerical validation uses toy parameters only",
    236       "detail": "Appendix B uses abstract 1B/10B models with synthetic task difficulty ranges [1.7,1.9] and fixed hardware specs. No validation on actual LRM inference costs from GPT/Claude/Gemini or realistic task distributions. Figure 3 crossing point may not reflect real behavior."
    237     },
    238     {
    239       "flag": "No explicit dispatch policy proposed for critical regime",
    240       "detail": "Paper identifies the critical regime (μ≈0) as most relevant for real systems but does not propose an actual routing policy. Mentions backpressure algorithms [27] in Discussion as future work but leaves implementation to the reader."
    241     },
    242     {
    243       "flag": "Unbounded computational capacity assumption unrealistic",
    244       "detail": "Section II assumes 'unlimited parallel processing capacity: any number of tasks may be processed concurrently.' Real systems are limited by GPU/TPU availability, fundamentally changing the optimization landscape and feasibility constraints."
    245     }
    246   ],
    247   "cited_papers": [
    248     {
    249       "title": "Scaling laws for neural language models",
    250       "authors": "Kaplan et al.",
    251       "year": 2020,
    252       "relevance": "Foundational empirical scaling laws that ground the theoretical model in Section IV; provides experimental basis for token-level pretraining loss functions."
    253     },
    254     {
    255       "title": "Training compute-optimal large language models",
    256       "authors": "Hoffmann et al.",
    257       "year": 2022,
    258       "relevance": "Introduces training-compute optimal scaling frontier used in Section IV.C; provides parametric form L_i(N_i) = L_irr + ΓN^(-γ) for capability prediction."
    259     },
    260     {
    261       "title": "A Theory of Inference Compute Scaling: Reasoning through Directed Stochastic Skill Search",
    262       "authors": "Ellis-Mohr, Nayak, Varshney",
    263       "year": 2026,
    264       "relevance": "Provides theoretical grounding for inference-compute scaling with reasoning tasks; defines success probability ψ_i(X, Ω) used in Section IV.C (to appear)."
    265     },
    266     {
    267       "title": "Clover: Toward sustainable AI with carbon-aware machine learning inference service",
    268       "authors": "Li et al.",
    269       "year": 2023,
    270       "relevance": "Practical system demonstrating mixture-of-models approach for energy efficiency; validates practical relevance of multi-model routing strategies."
    271     },
    272     {
    273       "title": "Optimal packet scheduling in an energy harvesting communication system",
    274       "authors": "Yang and Ulukus",
    275       "year": 2012,
    276       "relevance": "Foundational theoretical work on energy harvesting optimization; establishes formal analogy drawn in Section I for the routing problem formulation."
    277     },
    278     {
    279       "title": "Energy harvesting wireless communications: A review of recent advances",
    280       "authors": "Ulukus et al.",
    281       "year": 2015,
    282       "relevance": "Comprehensive survey of energy harvesting optimization theory; provides mathematical foundations for battery dynamics and deficit analysis."
    283     },
    284     {
    285       "title": "Estimating the carbon footprint of BLOOM, a 176B parameter language model",
    286       "authors": "Luccioni et al.",
    287       "year": 2023,
    288       "relevance": "Empirical energy measurements of LLM inference; demonstrates heterogeneous energy costs across models that motivate the routing problem."
    289     },
    290     {
    291       "title": "How hungry is AI? benchmarking energy, water, and carbon footprint of LLM inference",
    292       "authors": "Jegham et al.",
    293       "year": 2025,
    294       "relevance": "Recent comprehensive energy benchmarking of LLMs; provides practical context showing energy variability that the paper's routing theory addresses."
    295     }
    296   ],
    297   "engagement_factors": {
    298     "practical_relevance": {
    299       "score": 2,
    300       "justification": "Problem is timely (AI factory energy optimization) but theory assumes unrealistic conditions: oracle stopping, unlimited capacity, i.i.d. arrivals. Practical guidance limited to 'use scaling laws'—significant adaptation required for deployment."
    301     },
    302     "surprise_contrarian": {
    303       "score": 2,
    304       "justification": "Finding that critical regime is volatility-limited is somewhat surprising but follows naturally from Brownian motion theory. Paper validates existing scaling law approaches rather than challenging them. Incremental rather than contrarian."
    305     },
    306     "fear_safety": {
    307       "score": 0,
    308       "justification": "Pure optimization paper on energy efficiency. No discussion of AI safety, alignment, capability risks, or security. Sustainability is addressed but not fear-inducing from a safety perspective."
    309     },
    310     "drama_conflict": {
    311       "score": 0,
    312       "justification": "Entirely technical theoretical contribution with no controversy, conflicting empirical claims, or disagreement with prior work. No dramatic angle or conflict narrative. Incremental advancement in optimization theory."
    313     },
    314     "demo_ability": {
    315       "score": 1,
    316       "justification": "Numerical simulation (Appendix B) is reproducible with published equations but code is not provided. Applying to real LRMs requires proprietary energy profiles and hardware specs not disclosed. Limited practical demo-ability."
    317     },
    318     "brand_recognition": {
    319       "score": 1,
    320       "justification": "Authors from University of Illinois Urbana-Champaign and Stony Brook University (Varshney known for information theory). Publication on arXiv only (not yet accepted to major venue). Low brand recognition compared to DeepMind, OpenAI, or Meta research papers."
    321     }
    322   },
    323   "hn_data": {
    324     "threads": [],
    325     "top_points": 0,
    326     "total_points": 0,
    327     "total_comments": 0
    328   }
    329 }

Impressum · Datenschutz