scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25640B)
      1 {
      2   "paper": {
      3     "title": "Characterizing LLM Inference Energy-Performance Tradeoffs across Workloads and GPU Scaling",
      4     "authors": [
      5       "Paul Joe Maliakel",
      6       "Shashikant Ilager",
      7       "Ivona Brandic"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv preprint",
     11     "arxiv_id": "2501.08219"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. No supplementary material links are provided."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses four publicly available NLP benchmarks: BoolQ, HellaSwag, TruthfulQA, and NarrativeQA. All are standard public datasets referenced with citations. Models are also publicly available (Llama, Qwen)."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Section IV-B specifies the hardware (NVIDIA RTX PRO 6000 Blackwell GPU, 96 GB memory), measurement infrastructure (NVML telemetry via nvidia-smi at 10ms sampling), inference precision (FP16), and torch.cuda.synchronize() usage. Section IV-C specifies model names and parameter counts. However, no requirements.txt or software dependency list is provided."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "While the experimental setup is described in detail (Section IV), there are no step-by-step reproduction instructions, scripts, or a README with commands to run. The methodology is described but not packaged for replication."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as point estimates (e.g., '42% energy savings', '1-6% latency increase'). No confidence intervals, error bars, or uncertainty measures are reported despite each configuration being repeated three times."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes comparative claims (e.g., semantic features predict difficulty better than input length, DVFS achieves energy savings) but does not use any statistical significance tests. Comparisons are based solely on point estimates."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports effect sizes with baseline context throughout, e.g., '42% energy savings' relative to 2842 MHz baseline, '17.5 percentage point improvement' over length-only baseline (Table VI), energy per query from 2.9 J (1B) to 21.0 J (32B). Correlation coefficients are also reported (e.g., r = 0.002, r = -0.29)."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper uses 1,000 queries per dataset (3,817 total) and 3 runs per configuration, but no justification is given for why these sample sizes were chosen. No power analysis is discussed."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Section IV-B states 'Each configuration is repeated three times, and we report mean values' but no standard deviation, variance, or spread measures across the three runs are reported. Only means are given."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper uses the maximum GPU frequency (2842 MHz) as the baseline for all DVFS comparisons (Section IV-B). For workload characterization, a length-only heuristic (51.1% accuracy) serves as baseline against semantic features (68.6% accuracy, Table VI)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The baselines are appropriate: maximum GPU frequency is the natural reference point for DVFS energy studies, and the length-based heuristic is the standard assumption the paper challenges. The models used (Llama-3.1/3.2, Qwen2.5) are contemporary."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table VI presents a feature ablation for difficulty classification: length only (51.1%), + entity density (66.6%), + causal question score (68.4%), semantic features only (68.6%). Section V-D also discusses feature selection rationale for excluding token entropy, reasoning complexity, and complexity score."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports energy consumption (joules), latency (end-to-end and per-phase), Energy-Delay Product (EDP), accuracy (for classification tasks), and ROUGE-L (for generation tasks). Multiple dimensions of the energy-performance tradeoff are captured."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "This is a hardware benchmarking study measuring energy consumption and latency. Human evaluation is not relevant to the claims about GPU frequency scaling behavior and energy efficiency."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The difficulty classification ablation (Table VI) uses 5-fold stratified cross-validation, which provides proper train-test separation. The NLP benchmark datasets are used as-is without modification."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by model (5 models), dataset (4 benchmarks), batch size (1, 4, 8), GPU frequency (7 levels), and inference phase (prefill vs decode). Tables XI, XII, XIII, XVI provide detailed per-configuration breakdowns."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses 'Always Hard' queries (32.6%) where even 32B models struggle (Table IX), notes that model routing introduces quality tradeoffs (6.8 percentage point drop, Section VII-C), and acknowledges TruthfulQA's low scores across all models (Table VII)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that input length is a weak predictor (r = 0.002), that reasoning complexity has near-zero correlation with quality (r = -0.03) despite theoretical relevance, and that 32.6% of queries remain hard regardless of model size. Token entropy is excluded due to redundancy with length."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims are well-supported: '44.5% of queries achieving comparable quality across model sizes' is in Table IX; 'decode phase dominates inference time (77-91%)' is in Table XI; '42% energy savings with only a 1-6% latency increase' is in Tables XI/XIV. All claims match the data."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper makes causal claims about why DVFS works (memory-bound decode phase is insensitive to frequency). These are supported by controlled experiments varying a single factor (GPU frequency) while holding others constant, and by phase-level instrumentation showing the mechanism. The ablation study (Table VI) supports causal claims about feature contributions."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper is explicit about scope: 'Our goal is not to propose a new serving system or scheduling algorithm' (Section I). The threats-to-validity paragraph (Section VII) acknowledges that the study is limited to 'offline replay-based benchmarking on a single GPU' and notes that it 'does not capture production serving dynamics.' The title specifies 'across Workloads and GPU Scaling' rather than making universal claims."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The threats to validity paragraph (Section VII) discusses alternative factors: production serving dynamics (continuous batching, speculative decoding, multi-GPU), system-level overheads beyond GPU, classification accuracy for routing, and switching overhead. The paper also discusses why token entropy is correlated with length rather than being an independent predictor."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Table I and Section IV-C specify exact model names: Llama-3.2-1B, Llama-3.2-3B, Llama-3.1-8B, Qwen2.5-14B, Qwen2.5-32B. These are specific versioned model identifiers with parameter counts, not vague marketing names."
    137       },
    138       "prompts_provided": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "The paper does not use prompting in the traditional sense. It runs standard NLP benchmark evaluations (log-likelihood for classification, greedy decoding for generation) on existing datasets. No custom prompts are designed."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section IV-C specifies greedy decoding (do_sample=False, temperature=0), max 100 generated tokens, early stopping on EOS. Section IV-B specifies seven SM frequency levels (180-2842 MHz), batch sizes (1, 4, 8), FP16 precision, and NVML sampling at 10ms. The logistic regression uses L2 regularization with C=1.0."
    147       },
    148       "scaffolding_described": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No agentic scaffolding is used. This is a benchmarking study that runs models directly for inference on standard datasets."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The paper documents data handling: 1,000 queries per dataset (817 for TruthfulQA), quality scores min-max normalized within each dataset before averaging (Section V-E), 5-fold stratified cross-validation with standardization (Section V-D). NER features extracted via spaCy en_core_web_sm. Difficulty labels based on dataset-median quality threshold."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section VII includes a 'Threats to validity' paragraph that discusses specific limitations of the study including offline-only evaluation, single GPU, and excluded production dynamics."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The threats to validity are specific to this study: 'offline replay-based benchmarking on a single GPU', 'does not capture production serving dynamics such as continuous batching, speculative decoding, or multi-GPU execution', 'Energy measurements reflect GPU consumption only and exclude system-level overheads', and the restriction to English-language workloads."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The paper explicitly states scope boundaries: 'Our goal is not to propose a new serving system or scheduling algorithm' (Section I), 'these findings are intended as empirical observations rather than deployment prescriptions' (Section VII), and lists what was not tested: distributed inference, heterogeneous hardware, non-English workloads."
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No raw measurement data (power traces, latency logs, per-query results) is made available. Only aggregated statistics are reported in tables."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section IV-B describes the data collection procedure in detail: NVML telemetry via nvidia-smi at 10ms sampling, torch.cuda.synchronize() for timing, three repetitions per configuration, offline replay-based setup on an otherwise idle system."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants. Data sources are standard NLP benchmarks (BoolQ, HellaSwag, TruthfulQA, NarrativeQA) which are publicly available."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The data pipeline is documented: queries are loaded from benchmark datasets, characterized using semantic features (Section V), inference is run under specified configurations, power and latency are measured via NVML (Section IV-B), and results are aggregated. The study workflow is shown in Figure 1."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The Acknowledgment section lists funding from the Austrian Science Fund (FWF) through the Themis project (Grant DOI: 10.55776/PAT1668223), the Standalone Project Triton (Grant No. P 36870-N), the Austrian Research Promotion Agency (FFG) through Virtual Shepherd (Grant No. FO999910627), and the Vienna Science and Technology Fund (WWTF, Grant No. ESR24-053)."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Author affiliations are clearly stated: Paul Joe Maliakel and Ivona Brandic are at TU Wien (Computational Sustainability Group), Shashikant Ilager is at University of Amsterdam (Multiscale Networked Systems Group). No commercial affiliations with evaluated products."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Funding comes from Austrian government science agencies (FWF, FFG, WWTF). These are public research funders with no financial stake in GPU DVFS or LLM inference energy optimization outcomes."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests or financial interests statement is present in the paper. The absence of such a declaration is noted."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "The paper does not evaluate model capability on benchmarks in the traditional sense (testing whether the model 'knows' the answers). It measures energy consumption and latency during inference. The quality scores are secondary metrics used for workload characterization, not for evaluating model knowledge."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Same rationale as training_cutoff_stated. The study measures hardware energy-performance behavior, not model capability. Contamination would not affect the energy or latency measurements that are the primary contribution."
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Same rationale. The benchmarks are used as representative workloads to exercise inference, not to evaluate whether models have memorized answers. The primary outcomes (energy, latency, phase behavior) are independent of contamination."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants. This is a hardware benchmarking study."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants. This is a hardware benchmarking study."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants. This is a hardware benchmarking study."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. This is a hardware benchmarking study."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants. This is a hardware benchmarking study."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants. This is a hardware benchmarking study."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants. This is a hardware benchmarking study."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "This is the core contribution of the paper. Energy consumption per query is reported in joules (e.g., Table XVI: 2.92 J for 1B to 20.97 J for 32B at baseline), energy per token (Figure 3), and latency measurements are provided across all configurations."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "While per-query energy is reported (the paper's contribution), the total computational budget for running all experiments is not stated. No mention of total GPU hours, wall-clock time for the full study, or total energy consumed for the benchmarking campaign."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "Semantic query features predict inference difficulty better than input length, with input length showing near-zero correlation with quality (r = 0.002).",
    290       "evidence": "Table V and Figure 2 show input length has r = 0.002 correlation with quality. Table VI shows semantic features achieve 68.6% classification accuracy vs. 51.1% for length-only baseline. Entity density shows r = -0.29 correlation (Section V).",
    291       "supported": "strong"
    292     },
    293     {
    294       "claim": "44.5% of queries achieve comparable quality across all model sizes (1B-32B), suggesting they can be routed to smaller models.",
    295       "evidence": "Table IX shows 44.5% 'Always Easy' queries. Table X validates this classification: easy queries score 0.661 average quality vs. 0.405 for hard queries across all five models (Section V-E).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "The decode phase dominates inference time (77-91%) and is largely insensitive to GPU frequency.",
    300       "evidence": "Table XI shows decode time fraction ranges from 72.5% to 90.6%, and decode latency changes by less than ±1% across all frequency settings. Detailed phase-level measurements across 5 models and 3 batch sizes support this (Section VI-C).",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "Reducing GPU frequency from 2842 MHz to 180 MHz achieves an average 42% energy savings with only 1-6% latency increase.",
    305       "evidence": "Table XI shows energy savings of 39.9-44.2% across all model-batch configurations, with end-to-end latency increases of -0.1% to +5.6%. Table XIV summarizes: 40-44% energy savings (avg 42%), +1-3% latency change (Section VI-B).",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "Combining workload-aware model selection with phase-aware DVFS could reduce energy by up to 87%.",
    310       "evidence": "Table XVII estimates combined savings: weighted average 87% across all query categories. However, this is an upper-bound projection based on combining independently measured savings, not a deployed system. The paper acknowledges this is illustrative (Section VII).",
    311       "supported": "moderate"
    312     }
    313   ],
    314   "methodology_tags": [
    315     "benchmark-eval"
    316   ],
    317   "key_findings": "LLM inference time is dominated by the decode phase (77-91%), which is memory-bound and largely insensitive to GPU frequency scaling. Reducing GPU SM frequency from 2842 MHz to 180 MHz achieves ~42% energy savings with only a 1-6% latency increase across five decoder-only models (1B-32B). Semantic query features (entity density, causal question presence) predict inference difficulty better than input length (correlation with quality: r = -0.29 for entity density vs. r = 0.002 for input length), with 44.5% of queries achieving comparable quality across all model sizes, suggesting significant potential for workload-aware model routing combined with phase-aware DVFS.",
    318   "red_flags": [
    319     {
    320       "flag": "No variance or uncertainty reported",
    321       "detail": "Despite running each configuration three times, the paper reports only mean values with no standard deviation, confidence intervals, or error bars. Given that hardware measurements can vary, the stability of the reported energy savings and latency figures cannot be assessed."
    322     },
    323     {
    324       "flag": "No code or raw data released",
    325       "detail": "The benchmarking code, scripts, and raw measurement data are not released, making independent verification or replication impossible despite the paper's emphasis on measurement-driven analysis."
    326     },
    327     {
    328       "flag": "Combined savings estimate is an upper bound, not validated",
    329       "detail": "The 87% combined energy savings claim (Table XVII) is an arithmetic combination of independently measured factors, not a validated end-to-end system. The paper acknowledges this but the headline number may overstate practical achievability."
    330     },
    331     {
    332       "flag": "Single GPU evaluation",
    333       "detail": "All experiments are run on a single NVIDIA RTX PRO 6000 (Blackwell) GPU. DVFS behavior may differ substantially on other GPU architectures, on data center GPUs (e.g., A100, H100), or in multi-GPU setups. The paper acknowledges this limitation."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "Efficient memory management for large language model serving with PagedAttention",
    339       "authors": ["W. Kwon", "Z. Li"],
    340       "year": 2023,
    341       "relevance": "Foundational LLM serving system (vLLM) that the paper builds upon for understanding inference optimization."
    342     },
    343     {
    344       "title": "FlashAttention: Fast and memory-efficient exact attention with IO-awareness",
    345       "authors": ["T. Dao"],
    346       "year": 2022,
    347       "relevance": "Key inference optimization technique that reduces memory traffic, relevant to understanding compute vs memory-bound phases."
    348     },
    349     {
    350       "title": "Splitwise: Efficient generative LLM inference using phase splitting",
    351       "authors": ["P. Patel", "E. Choukse"],
    352       "year": 2024,
    353       "relevance": "Directly related work on phase-aware LLM inference optimization, splitting prefill and decode phases."
    354     },
    355     {
    356       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    357       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    358       "year": 2023,
    359       "arxiv_id": "2305.05176",
    360       "relevance": "Addresses cost-aware LLM inference through model routing cascades, closely related to the workload-aware routing concept in this paper."
    361     },
    362     {
    363       "title": "GreenLLM: SLO-aware dynamic frequency scaling for energy-efficient LLM serving",
    364       "authors": ["Q. Liu", "D. Huang", "M. Zapater", "D. Atienza"],
    365       "year": 2025,
    366       "arxiv_id": "2508.16449",
    367       "relevance": "Directly related work on DVFS for LLM inference energy efficiency with SLO awareness."
    368     },
    369     {
    370       "title": "SLO-aware GPU DVFS for energy-efficient LLM inference serving",
    371       "authors": ["A. K. Kakolyris", "D. Masouros", "S. Xydis", "D. Soudris"],
    372       "year": 2024,
    373       "relevance": "Prior work on GPU DVFS for LLM inference with service-level objective awareness."
    374     },
    375     {
    376       "title": "Zeus: Understanding and optimizing GPU energy consumption of DNN training",
    377       "authors": ["J. You", "J.-W. Chung", "M. Chowdhury"],
    378       "year": 2023,
    379       "relevance": "Framework for understanding and optimizing GPU energy in deep learning, applicable to inference optimization."
    380     },
    381     {
    382       "title": "Energy and policy considerations for deep learning in NLP",
    383       "authors": ["E. Strubell", "A. Ganesh", "A. McCallum"],
    384       "year": 2019,
    385       "relevance": "Seminal paper on energy costs of NLP/deep learning that motivated the 'Green AI' movement."
    386     },
    387     {
    388       "title": "Green AI",
    389       "authors": ["R. Schwartz", "J. Dodge", "N. A. Smith", "O. Etzioni"],
    390       "year": 2020,
    391       "doi": "10.1145/3381831",
    392       "relevance": "Foundational position paper on energy-efficient AI research that motivates sustainability-focused benchmarking."
    393     },
    394     {
    395       "title": "MLPerf Inference benchmark",
    396       "authors": ["V. J. Reddi"],
    397       "year": 2020,
    398       "doi": "10.1109/ISCA45697.2020.00045",
    399       "relevance": "Standard inference benchmarking suite that provides context for LLM inference evaluation methodology."
    400     },
    401     {
    402       "title": "Accelerating LLM inference with staged speculative decoding",
    403       "authors": ["B. Spector", "C. Re"],
    404       "year": 2023,
    405       "arxiv_id": "2308.04623",
    406       "relevance": "Alternative approach to LLM inference efficiency through speculative decoding, complementary to DVFS optimization."
    407     }
    408   ]
    409 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs