scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23005B)
      1 {
      2   "paper": {
      3     "title": "Energy-Aware Routing to Large Reasoning Models",
      4     "authors": [
      5       "Austin R. Ellis-Mohr",
      6       "Max Hartman",
      7       "Lav R. Varshney"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2601.00823",
     12     "doi": "10.48550/arXiv.2601.00823"
     13   },
     14   "scan_version": 2,
     15   "active_modules": [],
     16   "methodology_tags": ["theoretical"],
     17   "key_findings": "This paper introduces a mathematical formulation for energy-aware routing of tasks to large reasoning models (LRMs) powered by renewable energy. Using a Brownian-motion diffusion approximation, the authors prove that three operating regimes exist for auxiliary energy scaling: linear growth (persistent deficit), bounded (persistent surplus), and sqrt(T) at critical balance. At criticality, performance is volatility-limited, making variance-aware routing a principled design axis. Training-compute and inference-compute scaling laws are incorporated to enable lightweight dispatch without running a heavy AI dispatcher.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No source code or repository link is provided anywhere in the paper. The numerical simulations in Appendix B describe parameters but no code is released."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No datasets are released. The paper uses synthetic parametric simulations with parameters described in Appendix B but does not release simulation data or scripts."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No environment specification, requirements file, or software dependencies are mentioned. The simulation implementation details are not provided."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Appendix B provides simulation parameter values (model sizes, energy costs, hardware parameters, scaling law coefficients), which is helpful, but there are no step-by-step reproduction instructions or scripts."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Fig. 4 caption states 'Shaded regions indicate standard error over 100 trials.' Standard error bands are shown for the simulation results."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No statistical significance tests are used. The paper compares scaling regimes and routing policies based on analytical proofs and simulation plots without formal statistical comparison."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper characterizes scaling regimes (linear, sqrt(T), bounded) but does not report traditional effect sizes (Cohen's d, relative improvement with context) for the simulation comparisons."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "100 simulation trials are used (Fig. 4 caption) but no justification is given for why 100 trials is sufficient."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Standard error bands are shown in Fig. 4. Appendix B states 'The standard deviation of D_T across trials is generally on the same order of magnitude as the expectation demonstrating the possibly significant variation in the accrued deficit.'"
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The myopic dispatcher serves as the baseline policy. The paper compares zero-error (optimal) routing against the myopic policy and various nonzero-error policies in Fig. 4."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "As a new problem formulation, the paper defines its own baseline (myopic dispatcher) which is appropriate. Prior work (Clover, EcoServe, FrugalGPT) is discussed qualitatively but addresses different problem formulations."
     78       },
     79       "ablation_study": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "This is a mathematical framework, not a system with discrete components to ablate. Parameter sensitivity is explored (different error rates, task difficulties) but this is inherent to the theoretical analysis."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The primary evaluation metric is E[D_T] (expected auxiliary energy deficit). Per-task energy and latency are shown in Fig. 3 as model properties, but the system-level evaluation focuses on a single metric."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Human evaluation is irrelevant to this theoretical/analytical paper about energy optimization."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "Not applicable — this is a theoretical paper with parametric simulations, not an ML training/evaluation setup."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Fig. 3 provides per-task difficulty breakdowns for both model sizes. Fig. 4 shows results across different prediction error rates. Appendix B provides specific numerical values at different difficulty levels."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "The paper does not discuss where the framework breaks down or fails. The three regimes are all characterized positively. No discussion of scenarios where the assumptions lead to poor predictions."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "No negative results are reported. All theoretical results and simulations confirm the framework's predictions. Fig. 4 shows degradation with routing errors, but this is expected behavior, not a negative finding."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract's claims about three operating regimes, volatility-limited performance, and variance-aware routing are supported by Theorem 2 and the simulation results in Fig. 2 and Fig. 4."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper's causal claims (e.g., 'Routing error induces per-task excess energy, which shifts the mean battery drift') are justified by mathematical proofs (Theorems 1-2) and confirmed by simulation. The causal mechanisms are analytically derived."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Assumptions are clearly stated (Assumption 1: i.i.d. arrivals, Poisson tasks, empty initial battery). Section V explicitly notes these assumptions and discusses relaxation as future work. The paper states simulations use 'abstract model, hardware, and task parameters (not deployed models).'"
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for the observed simulation scaling behavior or whether other mathematical frameworks could yield different predictions. Section V discusses extensions but not alternatives."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper's claims match the granularity of its measurements. It measures auxiliary energy deficit E[D_T] and claims this characterizes energy-aware routing. No proxy gap exists — the measured quantity is exactly what is being optimized."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper uses abstract parametric models (1B and 10B parameters) with all relevant parameters fully specified in Appendix B: n_layers=48, d_attn=2048, energy costs, hardware parameters, scaling law coefficients. These are not real deployed models but are fully reproducible."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No prompting is involved. The paper uses mathematical models and parametric simulations, not LLM API calls."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Appendix B provides extensive simulation parameters: E_mem=10^-11 J/param, E_comp=10^-12 J/FLOP, BW=5×10^12, TP=2×10^13, Δ=1s, K=1, scaling law parameters (L_irr=1.69, γ≈0.34, Γ≈900), sigmoid steepness b=5, m=50 skills, ω=20 tokens/skill, ε=0.1."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. This is a theoretical/mathematical framework."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The simulation data generation is fully specified in Appendix B, including task parameterization (difficulty l∈[1.7,1.9], linearly spaced across ten tasks), energy budget (CLB≈593.5), and renewable energy distribution (Gamma with specified variance)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "There is no dedicated limitations section. Section V (Discussion) mentions future work directions (relaxing i.i.d. assumptions, dependency structures, limited-capacity batteries, stochastic tolerance) but frames them as extensions, not limitations of the current work."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The Discussion mentions specific assumptions that could be relaxed (i.i.d. arrivals, unlimited parallel processing, oracle stopping, independent tasks) but frames these as future work opportunities rather than threats to the validity of current results."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper clearly states its modeling assumptions: Assumption 1 (i.i.d., Poisson), oracle stopping (Sec II-C), unlimited parallel capacity (Sec II), and abstract parametric models (Appendix B: 'not deployed models'). Section V explicitly notes aspects not addressed."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No simulation data, intermediate results, or raw outputs are released for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The simulation data generation procedure is fully described in Appendix B with all parameter values, distributions, and task specifications."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. All data is synthetically generated from specified parametric distributions."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The simulation pipeline from parameter specification to result generation is documented in Appendix B, including model characterization, task generation, energy budget calibration, and regime detection (BIC model selection)."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source or acknowledgments section is present in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly stated: Department of Electrical and Computer Engineering, University of Illinois Urbana-Champaign, and AI Innovation Institute, Stony Brook University."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is itself a gap."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper does not evaluate any pre-trained model on benchmarks. It uses abstract parametric models in a theoretical framework."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No pre-trained model evaluation on benchmarks. Simulations use synthetic parametric tasks."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No standard benchmarks are used. All evaluation is through parametric simulations of the theoretical framework."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this theoretical paper."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this theoretical paper."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this theoretical paper."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this theoretical paper."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this theoretical paper."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this theoretical paper."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this theoretical paper."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "This is a theoretical paper. It models energy costs abstractly but does not run real inference requiring cost reporting."
    289       },
    290       "compute_budget_stated": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "This is a theoretical paper. The 100-trial simulations with abstract parameters are lightweight and do not require compute budget reporting."
    294       }
    295     }
    296   },
    297   "claims": [
    298     {
    299       "claim": "Three operating regimes emerge for expected auxiliary energy deficit: linear scaling for persistent deficit (CLB > R), bounded for persistent surplus (CLB < R), and sqrt(T) at critical balance (CLB ≈ R).",
    300       "evidence": "Theorem 2 provides closed-form expressions for E[D_T] in all three regimes, proved in Appendix A. Fig. 4 confirms the sqrt(T) scaling at zero error and transition to linear scaling with routing errors.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "At criticality, performance is volatility-limited and governed by variance-aware routing, not mean optimality alone.",
    305       "evidence": "Equation (28) shows E[D_T] ≈ σ_Bmy √(2T/π) at zero drift, depending only on the volatility parameter. Fig. 2 characterizes the deviation from drift-only scaling.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "Routing errors are first-order effects that eventually dominate the second-order fluctuation costs.",
    310       "evidence": "Section III-C provides analytical comparison of drift vs. fluctuation terms. Fig. 4 shows nonzero-error policies transitioning from sqrt(T) to linear scaling as cumulative drift overtakes diffusion, confirmed over 100 trials with regime transition detected via BIC.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "Training-compute and inference-compute scaling laws can enable lightweight dispatch policies that avoid the energy overhead of running a large AI model as dispatcher.",
    315       "evidence": "Section IV derives explicit energy-latency expressions as functions of model size and task difficulty using scaling laws from Hoffmann et al. and Ellis-Mohr et al. Fig. 3 illustrates the crossover point. However, no comparison is made against an actual AI-based dispatcher.",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "For certain task difficulty ranges, smaller models dominate larger models on both energy and latency, but a crossover exists beyond which larger models are preferred on both axes.",
    320       "evidence": "Fig. 3 shows energy and latency per task for 1B and 10B parameter models across difficulty levels l∈[1.7,1.9], with specific numerical values in Appendix B. However, this is based on abstract parametric models, not real deployed systems.",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "red_flags": [
    325     {
    326       "flag": "Abstract parametric models only",
    327       "detail": "All simulations use 'abstract model, hardware, and task parameters (not deployed models)' (Appendix B). The practical applicability to real AI factories with real LRMs is undemonstrated. The gap between the theoretical framework and deployment is significant."
    328     },
    329     {
    330       "flag": "Oracle stopping assumption",
    331       "detail": "The paper assumes models halt exactly when instructed (Section II-C), which 'isolates routing/dispatch limits from early-stopping design.' This removes a major practical challenge and may make the theoretical results overly optimistic for real systems."
    332     },
    333     {
    334       "flag": "No real system validation",
    335       "detail": "Despite motivating the work with real AI factories and renewable energy, the paper provides no experimental validation with actual models, hardware, or energy systems. The 100-trial simulation uses synthetic parameters."
    336     },
    337     {
    338       "flag": "I.i.d. assumption unrealistic",
    339       "detail": "Assumption 1 requires i.i.d. task arrivals and energy harvests, which the paper itself acknowledges 'in practice may depend on temporal factors such as time of day and day of week.' Real workloads have strong temporal correlations."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Large language model routing with benchmark datasets",
    345       "authors": ["T. Shnitzer", "A. Ou", "M. Silva", "K. Soule", "Y. Sun", "J. Solomon", "N. Thompson", "M. Yurochkin"],
    346       "year": 2024,
    347       "relevance": "Directly addresses LLM routing problem using benchmark datasets, core prior work for model dispatch in AI systems."
    348     },
    349     {
    350       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    351       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    352       "year": 2024,
    353       "relevance": "Proposes cost-reduction strategies for LLM usage including prompt adaptation, approximation, and cascading — directly relevant to efficient LLM deployment."
    354     },
    355     {
    356       "title": "Clover: Toward sustainable AI with carbon-aware machine learning inference service",
    357       "authors": ["B. Li", "S. Samsi", "V. Gadepally", "D. Tiwari"],
    358       "year": 2023,
    359       "relevance": "Experimental demonstration that routing to a mixture of low- and high-quality models can improve energy efficiency while maintaining performance."
    360     },
    361     {
    362       "title": "EcoServe: Designing carbon-aware AI inference systems",
    363       "authors": ["Y. Li", "Z. Hu", "E. Choukse", "R. Fonseca", "G. E. Suh", "U. Gupta"],
    364       "year": 2025,
    365       "arxiv_id": "2502.05043",
    366       "relevance": "Focuses on GPU/CPU optimization for carbon-aware AI inference, complementary approach to energy-efficient AI serving."
    367     },
    368     {
    369       "title": "Estimating the carbon footprint of BLOOM, a 176B parameter language model",
    370       "authors": ["A. S. Luccioni", "S. Viguier", "A.-L. Ligozat"],
    371       "year": 2023,
    372       "relevance": "Empirical analysis of LLM energy costs and carbon footprint, providing real-world evidence for the energy optimization problem this paper addresses."
    373     },
    374     {
    375       "title": "Scaling laws for neural language models",
    376       "authors": ["J. Kaplan", "S. McCandlish", "T. Henighan", "T. B. Brown", "B. Chess", "R. Child", "S. Gray", "A. Radford", "J. Wu", "D. Amodei"],
    377       "year": 2020,
    378       "arxiv_id": "2001.08361",
    379       "relevance": "Foundational scaling laws paper that this work builds upon for characterizing model energy expenditure as a function of model size."
    380     },
    381     {
    382       "title": "Training compute-optimal large language models",
    383       "authors": ["J. Hoffmann", "S. Borgeaud", "A. Mensch"],
    384       "year": 2022,
    385       "arxiv_id": "2203.15556",
    386       "relevance": "Chinchilla scaling laws used directly in this paper's theoretical framework to model training-compute optimal frontier and pretraining loss."
    387     },
    388     {
    389       "title": "A Theory of Inference Compute Scaling: Reasoning through Directed Stochastic Skill Search",
    390       "authors": ["Austin R. Ellis-Mohr", "Anuj K. Nayak", "Lav R. Varshney"],
    391       "year": 2026,
    392       "relevance": "Inference-compute scaling theory directly adopted in Section IV for modeling reasoning model token usage and success probability."
    393     },
    394     {
    395       "title": "How hungry is AI? benchmarking energy, water, and carbon footprint of LLM inference",
    396       "authors": ["N. Jegham", "M. Abdelatti", "C. Y. Koh", "L. Elmoubarki", "A. Hendawi"],
    397       "year": 2025,
    398       "arxiv_id": "2505.09598",
    399       "relevance": "Empirical benchmarking of LLM inference energy costs, directly relevant to understanding heterogeneous energy costs across models."
    400     },
    401     {
    402       "title": "Compact language models via pruning and knowledge distillation",
    403       "authors": ["S. Muralidharan", "S. T. Sreenivas", "R. B. Joshi"],
    404       "year": 2024,
    405       "relevance": "Model compression techniques that reduce LLM energy consumption, an alternative approach to the routing strategy proposed here."
    406     }
    407   ]
    408 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs