scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30431B)
      1 {
      2   "paper": {
      3     "title": "Sustainable LLM Inference using Context-Aware Model Switching",
      4     "authors": [
      5       "Yuvarani",
      6       "Akashdeep Singh",
      7       "Zahra Fathanah",
      8       "Salsabila Harlen",
      9       "Syeikha Syafura Al-Zahra binti Zahari",
     10       "Hema Subramaniam"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv",
     14     "arxiv_id": "2602.22261"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "A context-aware model switching system using cache, rule-based, and ML-based routing across three small open-source models (1B–4B parameters) achieved 67.5% energy reduction and 68% latency improvement on 150 hand-crafted queries compared to always using the largest (4B) model, while retaining 93.6% BERTScore F1 quality. Routing accuracy was 79.3% overall, with high simple-query recall (98%) but low complex-query recall (52%) due to conservative escalation. The evaluation was limited to a single local host with very small models, making generalizability to production-scale deployments unclear.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No source code repository URL, GitHub link, or archive is provided anywhere in the paper."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper states 'The dataset is available as supplementary material' (Section 3, Methodology, Phase 4), indicating the 150-query evaluation dataset is released."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper describes hardware (AMD Ryzen 7 5800H, GTX 1650 Ti, 32GB RAM) and mentions Python 3.10, CUDA 11.8, PyTorch, Ollama, and sentence-transformers, but provides no library version numbers, requirements.txt, Dockerfile, or environment file sufficient to recreate the software environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments are provided."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Standard deviations are reported for main results: latency (σ = 2.1s baseline, σ = 1.4s adaptive), energy (σ = 6.3 kJ baseline, σ = 1.8 kJ adaptive), and BERTScore F1 (σ = 0.04) across three repetitions."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No statistical significance tests are reported. Claims of 67.5% energy reduction and 68% latency improvement are based solely on comparing mean values without any t-tests, Mann-Whitney U, or other tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Effect sizes are reported with baseline context: energy 84.2 kJ → 22.0 kJ (67.5% reduction), latency 13.8s → 3.5s (68% reduction), throughput 25.4 → 61.3 tokens/s (141% increase), BERTScore F1 93.6%."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The evaluation uses 150 queries (50 per category) with no justification for why this number was chosen and no power analysis. No discussion of whether 150 queries is sufficient to draw the conclusions made."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Each query was executed three times and standard deviations are reported: σ = 2.1s and σ = 1.4s for latency, σ = 6.3 kJ and σ = 1.8 kJ for energy, σ = 0.04 for BERTScore F1."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "A clear baseline is defined: 'all incoming queries are routed to a single large LLM that remains continuously loaded in memory, regardless of query complexity.' All metrics are compared against this baseline."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The only baseline is the trivial 'always use the largest model' approach. The paper discusses FrugalGPT and RouteLLM as prior routing work but does not compare against either, despite both being directly relevant and available."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The system has multiple components (LRU cache, 96 regex rules, ML semantic classifier, user-adaptive component) but no ablation study tests the contribution of each routing level."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics are reported: response latency, throughput (tokens/s), energy consumption (kJ), CO2 emissions, routing accuracy, precision, recall, weighted F1, and BERTScore F1."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation is included. Output quality is assessed only via BERTScore F1 against the large model's output. The paper acknowledges 'Quality assessment relied on automated similarity metrics rather than human evaluation' as a limitation."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The paper states 'Initial experimental runs are used to identify misclassification patterns, which inform subsequent adjustments to routing rules and confidence thresholds.' This iterative tuning used the same evaluation pipeline with no described dev/test split, so it is unclear whether the reported 150-query results are on data independent of the tuning process."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by query category (simple, medium, complex) for latency, energy, routing accuracy (precision, recall, F1), and quality. E.g., simple queries: ~300ms latency, 98% recall; complex queries: 96.3% precision, 52% recall."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper discusses failure modes: complex query recall of only 52%, quality degradation in 'borderline cases where smaller models processed intermediate or complex queries,' and the conservative escalation strategy that sacrifices recall for quality."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Complex query recall of 52% is reported as a clear weakness. The paper also notes quality degradation in borderline routing cases and acknowledges that 'absolute latency grew under continuous load' during stress testing."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims of 67.5% energy reduction, 93.6% quality retention, and ~68% latency improvement are all supported by results in Section 4 with specific numbers: 84.2 kJ → 22.0 kJ, BERTScore F1 93.6%, and 13.8s → 3.5s."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper makes causal claims like 'performance gains derive primarily from directing simple and intermediate queries to smaller, more efficient models.' The study design is a controlled comparison (same queries, same hardware, routing vs. no routing), which is adequate for these component-level causal claims."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The abstract claims the approach offers 'a practical and scalable path toward more energy-efficient and sustainable AI systems' and the conclusion states 'sustainability and performance are complementary rather than competing objectives.' These broad claims go well beyond the tested setting: 150 hand-crafted queries, a single host with a GTX 1650 Ti, and three models all ≤4B parameters."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No alternative explanations for the results are discussed. For example, the energy savings could be primarily attributable to the trivially detectable simple queries (greetings) rather than the sophisticated routing architecture. The limitations mention deployment scope and metric choice but do not consider alternative explanations for the observed efficiency gains."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "BERTScore F1 measured against the large model's own output is framed as 'response quality,' but similarity to a 4B model's output is not the same as actual quality. The paper does not acknowledge this proxy gap — being semantically similar to the Qwen3 4B response does not establish that the response is high quality in absolute terms."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Models are identified as 'Gemma3 1B, Gemma3 4B and Qwen3 4B' and 'all-MiniLM-L6-v2.' No specific version identifiers, checkpoint dates, quantization variants, or Ollama model tags are provided. Multiple variants of these models exist."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "No actual prompts sent to the language models are provided. The 96 precompiled regular expressions and predefined task vectors used for routing are described in general terms but not listed. The actual text sent to models during inference is not shown."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Some routing parameters are mentioned (LRU cache TTL of 300s, complexity score range 0–100, keep_alive: 0), but critical inference parameters (temperature, top-p, max tokens) are not reported for any of the three models. The actual complexity score thresholds for routing to small/medium/large tiers are not stated."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The system is a deterministic routing pipeline (cache → rules → ML classifier) directing queries to models via Ollama API, not an agentic system with tools, retry logic, or feedback loops."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The evaluation dataset construction is described: 150 prompts manually curated across 3 categories (50 simple, 50 medium, 50 complex) with examples of each type, reviewed for ambiguity. The evaluation procedure (3 repetitions, identical conditions) is documented."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "A substantive limitations paragraph appears in Section 4: 'Several constraints warrant acknowledgment. The evaluation concentrated on single-host deployment and did not examine effects of high concurrency or distributed architecture. Quality assessment relied on automated similarity metrics rather than human evaluation...'"
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Specific threats are identified: single-host deployment without high concurrency testing, reliance on automated BERTScore rather than human evaluation ('potentially overlooking subtle semantic distinctions'), and qualitative rather than quantitative stress test analysis."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper explicitly states what was not tested: 'evaluation concentrated on single-host deployment and did not examine effects of high concurrency or distributed architecture,' 'evaluation limited to conversational workloads,' and stress test results received 'qualitative rather than quantitative treatment.'"
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper mentions 'Comprehensive stress test logs remain available in evaluation artifacts' and 'execution logs provided the empirical foundation,' but no URL or archive is provided for accessing raw telemetry data, per-query measurements, or execution logs."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Data collection is described: 150 manually curated queries across 3 categories, executed 3 times each on specified hardware, with energy measured via NVML GPU power telemetry (mean GPU power draw × inference duration in seconds) and quality via BERTScore F1."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. The evaluation dataset was hand-crafted by the authors, not collected from a population."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The four-phase methodology describes the pipeline: requirement analysis → architecture design → implementation with iterative refinement → comparative evaluation. The evaluation pipeline (queries → routing → inference → telemetry → averaging across 3 runs) is documented."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding information, acknowledgments section, or grant numbers are provided anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All authors are listed as affiliated with the Faculty of Computer Science and Information Technology, Universiti Malaya, Kuala Lumpur, Malaysia. No conflicts of interest with the developers of the evaluated models (Google Gemma, Alibaba Qwen) are apparent."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Funding is not disclosed, so independence of funding from outcomes cannot be assessed. University affiliation suggests academic funding, but this is not stated."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial disclosure is included in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "The paper evaluates a routing system's energy efficiency, not a pre-trained model's capability on a benchmark. The models are used as inference components, and results do not depend on whether models memorized evaluation data."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "The paper evaluates a routing architecture, not model knowledge. The 150 evaluation queries test routing accuracy and energy efficiency, not whether models have memorized answers."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The evaluation measures routing-level metrics (energy, latency, routing accuracy) rather than model knowledge or capability, making benchmark contamination structurally irrelevant."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. The evaluation is entirely system-level using automated metrics."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Inference cost is thoroughly reported: energy per dataset (84.2 kJ baseline vs 22.0 kJ adaptive), response latency (13.8s vs 3.5s), throughput (25.4 vs 61.3 tokens/s), and estimated CO2 emissions (11.1 vs 2.9 gCO2e). Per-category breakdowns are also provided."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Hardware is specified (AMD Ryzen 7 5800H, GTX 1650 Ti 4GB, 32GB RAM, CUDA 11.8) and total energy consumption for the evaluation is reported (84.2 kJ baseline, 22.0 kJ adaptive for 150 queries)."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Each query was executed three times and standard deviations across runs are reported for energy (σ = 6.3 kJ, σ = 1.8 kJ) and latency (σ = 2.1s, σ = 1.4s), capturing run-to-run variance from model non-determinism."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Explicitly stated: 'Each query was executed three times to minimize variance and ensure measurement stability.'"
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Routing thresholds were tuned through 'pilot testing' and 'iterative optimization' (Section 3, Phase 3), but no search budget is stated: the number of configurations tried, the search method, and the compute spent on tuning are not reported."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The paper describes iterative refinement of routing rules and confidence thresholds but does not explain how the final configuration was selected, whether it was chosen on a validation set, or what selection criterion was used."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors designed, implemented, tuned, and evaluated their own system against a trivial baseline they defined, without acknowledging author-evaluation bias or employing independent evaluation."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "The paper's core contribution is analyzing performance as a function of compute budget. Energy and latency are reported across model tiers (small/medium/large), and the routing system explicitly trades compute budget for performance quality."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The 150-query evaluation dataset is described as covering 'the distribution of queries typical in real-world conversational AI deployments,' but no evidence validates this claim. The distribution was assumed, not empirically derived from actual production logs or usage data."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "The routing scaffold IS the system being tested. There are no scaffold-confounded model comparisons — the evaluation compares routing vs. no-routing on the same models using the same Ollama backend."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "No discussion of whether the evaluation queries or expected response patterns could have appeared in the training data of Gemma3 or Qwen3 models, which could affect BERTScore quality measurements."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the routing system's evaluation setup provides information that would not be available in real deployment (e.g., pre-labeled query categories used to evaluate routing accuracy)."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether pilot testing queries (used to tune routing thresholds) overlap with or are structurally similar to the 150 evaluation queries."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No leakage detection or prevention method is used or discussed."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Model switching reduces energy consumption by up to 67.5% compared to always using the largest model.",
    371       "evidence": "Section 4: baseline consumed mean 84.2 kJ (σ = 6.3 kJ) vs adaptive 22.0 kJ (σ = 1.8 kJ) for 150 queries across 3 repetitions.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "Response quality is maintained at 93.6% (BERTScore F1) compared to baseline.",
    376       "evidence": "Section 4: mean BERTScore F1 quality retention rate of 93.6% (σ = 0.04) measured against large-model baseline outputs.",
    377       "supported": "weak"
    378     },
    379     {
    380       "claim": "Response time for simple queries improved by approximately 68%.",
    381       "evidence": "Section 4: mean latency reduced from 13.8s (σ = 2.1s) to 3.5s (σ = 1.4s). Simple queries achieved ~300ms average latency.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Throughput increased by 141% under adaptive routing.",
    386       "evidence": "Section 4: 25.4 tokens/s (baseline) → 61.3 tokens/s (adaptive).",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Overall routing accuracy of 79.3% with weighted F1 of 78.1%.",
    391       "evidence": "Section 4: directly measured routing accuracy with per-category breakdown — simple recall 98%, complex precision 96.3%, complex recall 52%.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "This is the first study to demonstrate that hybrid routing on locally hosted open-source models can achieve >67% energy reduction while retaining >93% quality.",
    396       "evidence": "No prior work comparison is provided to validate novelty. FrugalGPT achieved 98% cost reduction and RouteLLM achieved 2x cost reduction, both cited but not compared against.",
    397       "supported": "weak"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Misleading model size labeling",
    403       "detail": "The 'large' model (Qwen3 4B) and 'medium' model (Gemma3 4B) are both 4B parameters — essentially the same size. The entire model range spans only 1B to 4B parameters, which is tiny by current standards. Calling a 4B model 'large' is misleading, and the narrow parameter range limits generalizability of efficiency claims to production-scale systems using models 10–100× larger."
    404     },
    405     {
    406       "flag": "Circular quality metric",
    407       "detail": "Response quality is measured as BERTScore F1 similarity to the 'large' (Qwen3 4B) model's own output, making the quality baseline the output of a very small model. 93.6% similarity to a 4B model's output does not establish absolute response quality — it only shows the routing system produces similar outputs to the default model."
    408     },
    409     {
    410       "flag": "No comparison with cited routing systems",
    411       "detail": "The paper discusses FrugalGPT (up to 98% cost reduction) and RouteLLM (2× cost reduction) in detail but compares only against the trivial 'always use large model' baseline. This omission makes it impossible to assess whether the proposed approach advances the state of the art."
    412     },
    413     {
    414       "flag": "Very small evaluation dataset",
    415       "detail": "The evaluation uses only 150 hand-crafted queries. No justification is given for this sample size, no power analysis is performed, and no statistical significance tests are applied to the results. With only 50 queries per category, the per-category breakdowns are based on very small samples."
    416     },
    417     {
    418       "flag": "Incomplete energy measurement",
    419       "detail": "The paper acknowledges 'CPU-side energy was not independently instrumented; reported figures reflect GPU-dominant inference energy.' For 1B–4B models on a GTX 1650 Ti with 4GB VRAM and shared memory, CPU computation may be significant, meaning energy savings could be overstated."
    420     },
    421     {
    422       "flag": "No statistical significance testing",
    423       "detail": "All comparative claims (67.5% energy reduction, 68% latency improvement, 141% throughput increase) are based on comparing mean values across only 3 repetitions without any statistical tests. The differences are large, but the methodology does not formally establish significance."
    424     },
    425     {
    426       "flag": "Potential tuning-test contamination",
    427       "detail": "Routing thresholds were iteratively refined using 'initial experimental runs' but no held-out test split is described. The final reported results may be on data influenced by the tuning process."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    433       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    434       "year": 2023,
    435       "arxiv_id": "2305.05176",
    436       "relevance": "Directly relevant prior work on LLM cascade routing that achieved up to 98% cost reduction matching GPT-4 quality — key comparison point for routing efficiency claims."
    437     },
    438     {
    439       "title": "RouteLLM: Learning to route LLMs with preference data",
    440       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu", "Wei-Lin Chiang", "Tianhao Wu", "Joseph E. Gonzalez", "M. Waleed Kadous", "Ion Stoica"],
    441       "year": 2024,
    442       "arxiv_id": "2406.18665",
    443       "relevance": "Learned LLM routing framework using human preference data for single-pass model selection, demonstrating 2x cost reduction with quality maintenance."
    444     },
    445     {
    446       "title": "RouterBench: A benchmark for multi-LLM routing system",
    447       "authors": ["Qitian Jason Hu", "Jacob Bieker", "Xiuyu Li", "Nan Jiang", "Benjamin Keigwin", "Gaurav Ranganath", "Kurt Keutzer", "Shriyash Kaustubh Upadhyay"],
    448       "year": 2024,
    449       "arxiv_id": "2403.12031",
    450       "relevance": "Benchmark for evaluating multi-LLM routing systems showing oracle routers can outperform single models while reducing cost — establishes performance ceiling for routing approaches."
    451     },
    452     {
    453       "title": "Energy and policy considerations for deep learning in NLP",
    454       "authors": ["Emma Strubell", "Ananya Ganesh", "Andrew McCallum"],
    455       "year": 2019,
    456       "doi": "10.18653/v1/P19-1355",
    457       "relevance": "Foundational work quantifying energy consumption and carbon emissions of NLP model training, establishing computational cost as an evaluation criterion."
    458     },
    459     {
    460       "title": "Green AI",
    461       "authors": ["Roy Schwartz", "Jesse Dodge", "Noah A. Smith", "Oren Etzioni"],
    462       "year": 2020,
    463       "doi": "10.1145/3381831",
    464       "relevance": "Introduced the Red AI vs Green AI framework distinguishing efficiency-indifferent from efficiency-aware AI research, proposing FLOPs as a universal efficiency metric."
    465     },
    466     {
    467       "title": "Estimating the carbon footprint of BLOOM, a 176B parameter language model",
    468       "authors": ["Alexandra Sasha Luccioni", "Sylvain Viguier", "Anne-Laure Ligozat"],
    469       "year": 2023,
    470       "relevance": "Quantified inference-phase carbon footprint of a large language model, finding inference emissions can rival training costs — directly motivates inference efficiency work."
    471     },
    472     {
    473       "title": "The power hungry processing: Watts driving the cost of AI deployment?",
    474       "authors": ["Sasha Liao", "Jordi Montes-Sanchez", "Chris Cummins", "Hugh Leather"],
    475       "year": 2023,
    476       "arxiv_id": "2311.05610",
    477       "relevance": "Empirical study showing inference optimization techniques don't uniformly reduce energy use across GPU configurations — motivates system-level approaches to sustainable inference."
    478     },
    479     {
    480       "title": "Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity",
    481       "authors": ["William Fedus", "Barret Zoph", "Noam Shazeer"],
    482       "year": 2022,
    483       "arxiv_id": "2101.03961",
    484       "relevance": "Introduced sparse mixture-of-experts routing architecture where tokens are routed to subsets of specialized layers, achieving parameter scaling without proportional compute increase."
    485     },
    486     {
    487       "title": "DeeBERT: Dynamic early exiting for accelerating BERT inference",
    488       "authors": ["Ji Xin", "Raphael Tang", "Jaejun Lee", "Yaoliang Yu", "Jimmy Lin"],
    489       "year": 2020,
    490       "arxiv_id": "2004.12993",
    491       "relevance": "Demonstrated early exit inference can skip up to 40% of BERT computation on simple inputs without accuracy loss — architectural-level approach to adaptive inference efficiency."
    492     }
    493   ]
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs