scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25497B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Green-Code: Learning to Optimize Energy Efficiency in LLM-Based Code Generation",
      6     "authors": [
      7       "Shashikant Ilager",
      8       "Lukas Florian Briem",
      9       "Ivona Brandić"
     10     ],
     11     "year": 2025,
     12     "venue": "IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing",
     13     "arxiv_id": "2501.11006",
     14     "doi": "10.1109/ccgrid64434.2025.00068"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "The abstract claims '23–50% energy reduction without significantly affecting accuracy,' but at the aggressive T=0.6 threshold, RougeL drops from ~0.42 to ~0.29 (~31% accuracy loss), which is significant; the qualifier 'without significantly affecting accuracy' only holds for conservative thresholds (T=0.9–0.92) where savings are 23–29%.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper directly compares GREEN-CODE against baseline models with and without early exits on the same hardware and datasets, providing adequate empirical support for the causal claim that the RL-based dynamic early exit reduces energy consumption.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Results are based on two small-scale models (2.7B–3B parameters) on a single GPU (NVIDIA RTX 8000), but conclusions are presented without explicit scope boundaries; the paper concludes 'demonstrates the feasibility of such techniques in real-world usage' without bounding to the tested setting.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No alternative explanations are discussed for why energy savings occur (e.g., whether aggregated-loss fine-tuning itself changes model behavior independently of early exits, or whether results would differ on different hardware architectures).",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper uses RougeL, BLEU, and CodeBLEU as proxies for 'accuracy' in code generation but does not discuss whether these metrics capture functional correctness or developer utility; the abstract conflates these proxy scores with general 'accuracy'.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "There is no dedicated limitations or threats-to-validity section; limitations such as single-GPU evaluation, no hyperparameter optimization, and KV-cache incompatibility are scattered as inline remarks in the methodology and evaluation sections.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No threats-to-validity are formally enumerated; while the paper notes 'we did not perform hyperparameter optimization' and that models are small-scale due to testbed constraints, these are not framed as validity threats.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not explicitly state what the results do NOT show; for instance, no statement that results may not hold for larger models, different hardware, or non-code generation tasks beyond what is mentioned in future work.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Funding is disclosed in the acknowledgments: HPQC (FFG Nr. 897481), Triton (FWF DOI: 10.55776/P36870), and Themis (FWF DOI: 10.55776/PAT1668223).",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly disclosed on the first page: University of Amsterdam (Ilager) and TU Wien (Briem, Brandić).",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Funders are Austrian public research agencies (FFG, FWF), which are independent of the specific framework being proposed and have no commercial interest in the outcome.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms such as 'early exit,' 'dynamic early exiting,' 'energy consumption,' and 'RL agent' are defined or described in context; the system model section clearly delineates the framework components.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper explicitly lists four bullet-point contributions at the end of Section I, including the fine-tuning method, the RL-based framework, the prototype implementation, and the experimental evaluation.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section VII (Related Work) explicitly compares GREEN-CODE against ConsistentEE, LITE, LayerSkip, and Sun et al., distinguishing the proposed approach by its single LM head and purely dynamic RL-based exit strategy.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper provides a GitHub URL (https://github.com/Large-scale-Sustainable-Computing-LSC/green-code) with 'prototype implementation, deployment configuration scripts, and details, including libraries and dependencies.'",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Both datasets (JavaCorpus and PY150) are publicly available via the CodeXGLUE benchmark; the paper follows the standard train/test splits from CodeXGLUE.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions Python 3.12, Gymnasium, Stable-Baselines3, and PyTorch by name, but provides no requirements.txt, Dockerfile, or explicit version pins in the paper itself; the GitHub repo is referenced for 'libraries and dependencies' but not confirmed.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided in the paper; the GitHub repo is referenced but the paper itself does not describe how to reproduce experiments.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results are presented as single-run bar charts with no error bars or confidence intervals reported for any main results.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are applied to any comparative claims, despite multiple comparisons between GREEN-CODE and baseline models across metrics.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Absolute metric values and percentage improvements are reported (e.g., '23–50% energy reduction,' 'RougeL of 0.41 vs. 0.425'), providing meaningful effect size context.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The evaluation uses 1,000 samples from test sets, but no justification is provided for why this number is sufficient or whether it yields adequate statistical power.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or spread across runs is reported; results appear to be from single experimental runs.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Two baselines are used: (i) the non-fine-tuned base model and (ii) the fine-tuned model using all layers without early exits.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The baselines are only full-layer versions of the same models; no comparison is made against competing early exit methods (ConsistentEE, LayerSkip, Sun et al. [18]) that are discussed in related work as directly comparable.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The preliminary experiment in Section II with fixed exits (Figure 1) serves as an ablation of the RL component, and the threshold sweep (T=0.6 to 0.92) effectively ablates the aggressiveness of the policy.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Multiple performance metrics are used (RougeL, BLEU, CodeBLEU with syntax/dataflow sub-metrics) alongside efficiency metrics (energy, latency, throughput, layers skipped).",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "No human evaluation of generated code quality or developer experience is conducted, despite the paper claiming practical usability via a VS Code extension.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "The paper explicitly states 'we always evaluate on 1000 samples from the test sets of the datasets,' using the held-out test splits from CodeXGLUE.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down per model (OPT vs. Llama), per dataset (JavaCorpus vs. PY150), and per threshold setting, with separate figures for each combination.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "The paper does not show or analyze specific failure cases where early exits produce incorrect or syntactically invalid code; only aggregate metric differences are reported.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper reports accuracy degradation at aggressive thresholds (T=0.6, RougeL drops ~30%), worse performance with higher context lengths, and OPT underperforming Llama in several settings.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Models are specified as 'OPT 2.7B' and 'Llama 3.2 3B' but without HuggingFace model IDs, commit hashes, or snapshot dates; this is insufficient for exact reproduction.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": false,
    241           "answer": false,
    242           "justification": "The paper does not use prompt engineering; it is an autoregressive code completion task where the first 20% of code tokens are provided as context.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Table III explicitly reports PPO hyperparameters (steps, batch size, buffer size, epochs, learning rate, discount factor, hidden layer sizes); fine-tuning hyperparameters (learning rate 1e-5, batch size 4, gradient accumulation 32) are also stated.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The RL scaffolding is described in detail including state space (hidden states), action space (exit/continue), reward function (Equations 2–4), and the PPO training procedure.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The paper describes tokenization, normalization, train/test splits from CodeXGLUE, maximum context length of 512 tokens, first 20% of file as context, and packing for small samples.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The evaluation datasets (JavaCorpus, PY150 via CodeXGLUE) are publicly available; however, the raw experimental output data (energy measurements, per-sample predictions) is not explicitly released.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section III-B describes the datasets, their sources (CodeXGLUE, GitHub), sizes, and the train/test split followed; the paper refers to original dataset papers for full collection details.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants; standard public benchmarks are used.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline from code file selection through tokenization, LLM fine-tuning, RL training, and evaluation is described in Sections III–VI with a system diagram in Figure 2.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "No training data cutoffs are stated for OPT or Llama 3.2; the models are referenced only by parameter count and HuggingFace availability.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "JavaCorpus (GitHub Java, 2013) and PY150 (GitHub Python, 2016) are likely included in the pretraining data of both OPT and Llama 3.2, but the paper does not discuss or acknowledge this potential overlap.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "Both benchmarks predate the LLMs' training by several years and are publicly available GitHub-sourced datasets; the paper does not address whether the models' fine-tuning or pretraining includes these exact code files.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Energy consumption (measured in Watt-seconds via ZeusMonitor/pynvml), latency, and throughput are all measured and reported as primary evaluation metrics in Section VI.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Fine-tuning duration is reported (~24 hours for Llama, ~19 hours for OPT on NVIDIA RTX 8000); RL training convergence is stated at 200k–500k steps.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "GREEN-CODE reduces energy consumption by 23–50% compared to full-layer inference without significantly affecting accuracy",
    373       "evidence": "Bar charts (Figures 8–11) show energy savings at T=0.92 of ~23–29% with RougeL within ~2–5% of baseline; T=0.6 shows ~50% energy savings but ~30% accuracy loss",
    374       "supported": "weak"
    375     },
    376     {
    377       "claim": "RL-based dynamic early exit outperforms static early exit and baseline models on the accuracy-efficiency tradeoff",
    378       "evidence": "Figure 1 shows fixed exit layer results; GREEN-CODE with RL achieves comparable accuracy at higher thresholds while saving energy, but no direct quantitative comparison against static methods is provided",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "The aggregated-loss fine-tuning enables effective early exiting with a single LM head, avoiding multiple LM head overhead",
    383       "evidence": "Section III-D describes the LITE-based aggregated loss method; the approach is compared against the baseline full model, but not against multi-LM-head approaches in direct experiments",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "GREEN-CODE overhead (RL agent forward pass + softmax) remains below 1/5th of total runtime",
    388       "evidence": "Table IV shows relative energy/time overhead ranging from 4.38% to 9.37% for Llama and from 4.85% to 8.53% for OPT across thresholds, all well below 20%",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Results generalize across two model families (OPT, Llama) and two programming languages (Java, Python)",
    393       "evidence": "Figures 8–11 show consistent energy savings and comparable accuracy patterns on both models and datasets, with similar trends",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Higher context lengths lead to larger accuracy gaps between GREEN-CODE and the full model",
    398       "evidence": "Figure 12 shows that at context 0.5, the CodeBLEU gap at T=0.92 is 8.9% vs. 5.28% at context 0.2, roughly 1.7 times larger",
    399       "supported": "strong"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "empirical"
    405   ],
    406   "key_findings": "GREEN-CODE, an RL-based dynamic early exit framework for LLM code generation, achieves 23–29% energy savings with minimal accuracy loss at conservative thresholds (T=0.9–0.92) on OPT 2.7B and Llama 3.2 3B across Java and Python code completion tasks. The approach uses a PPO-trained agent that dynamically selects exit layers at inference time without requiring multiple LM heads, with overhead below 20% of total runtime. Results are consistent across two datasets (JavaCorpus, PY150) but are limited to small-scale models on a single GPU without comparison against competing early exit methods.",
    407   "red_flags": [
    408     {
    409       "flag": "No comparison against competing early exit methods",
    410       "detail": "The paper discusses ConsistentEE, LayerSkip, and Sun et al. as closely related work in the related work section but compares GREEN-CODE only against non-early-exit baselines (base model, full fine-tuned model), making it impossible to assess whether the RL approach actually outperforms simpler alternatives."
    411     },
    412     {
    413       "flag": "Abstract accuracy claim misleading",
    414       "detail": "The abstract states '23–50% energy reduction without significantly affecting accuracy,' but the 50% end of the range corresponds to T=0.6, where RougeL drops ~31% — a significant accuracy loss that the abstract's framing obscures."
    415     },
    416     {
    417       "flag": "No error bars or statistical tests",
    418       "detail": "All results are single-run measurements with no variance reported and no significance tests applied to any comparative claims, making it unclear whether differences are meaningful."
    419     },
    420     {
    421       "flag": "Benchmark contamination not addressed",
    422       "detail": "JavaCorpus (2013) and PY150 (2016) are GitHub-sourced datasets likely present in OPT and Llama 3.2 pretraining corpora; this overlap is not acknowledged or mitigated."
    423     },
    424     {
    425       "flag": "Small-scale models only",
    426       "detail": "Only 2.7B–3B parameter models are evaluated due to 'testbed resource constraints'; claims about feasibility in real-world code tools (GitHub Copilot scale) are not supported by these results."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "When Neural Code Completion Models Size Up the Situation: Attaining Cheaper and Faster Completion through Dynamic Model Inference",
    432       "relevance": "Most directly related prior work on early exiting for code generation with classifier-based exit and multiple LM heads"
    433     },
    434     {
    435       "title": "ConsistentEE: A Consistent and Hardness-Guided Early Exiting Method for Accelerating Language Models Inference",
    436       "relevance": "Prior RL-based early exit method that GREEN-CODE directly contrasts with, uses multiple LM heads"
    437     },
    438     {
    439       "title": "LayerSkip: Enabling Early Exit Inference and Self-Speculative Decoding",
    440       "relevance": "Competing early exit approach using dropout during training and self-speculative decoding"
    441     },
    442     {
    443       "title": "Confident Adaptive Language Modeling",
    444       "relevance": "Early exit method using confidence/entropy-based exit triggers on T5 models"
    445     },
    446     {
    447       "title": "Accelerating LLaMA Inference by Enabling Intermediate Layer Decoding via Instruction Tuning with LITE",
    448       "relevance": "The aggregated loss fine-tuning method (LITE) that GREEN-CODE adapts for early exit capability"
    449     },
    450     {
    451       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    452       "relevance": "Benchmark providing both evaluation datasets (JavaCorpus, PY150) used in this study"
    453     },
    454     {
    455       "title": "The Growing Energy Footprint of Artificial Intelligence",
    456       "relevance": "Motivating statistic cited for LLM inference energy consumption (ChatGPT 564 MWh/day)"
    457     }
    458   ],
    459   "engagement_factors": {
    460     "practical_relevance": {
    461       "score": 2,
    462       "justification": "VS Code extension integration and GitHub Copilot framing make the work tangible for practitioners, but small model sizes limit immediate applicability."
    463     },
    464     "surprise_contrarian": {
    465       "score": 1,
    466       "justification": "Applying RL to early exit selection is a reasonable engineering contribution but not a surprising or contrarian finding."
    467     },
    468     "fear_safety": {
    469       "score": 1,
    470       "justification": "Addresses AI energy consumption concerns, which are societally relevant but are not framed as an urgent safety risk."
    471     },
    472     "drama_conflict": {
    473       "score": 0,
    474       "justification": "No controversy or conflict angle; straightforward systems engineering paper."
    475     },
    476     "demo_ability": {
    477       "score": 2,
    478       "justification": "A working VS Code extension is demonstrated and code is released on GitHub, making it tryable in principle."
    479     },
    480     "brand_recognition": {
    481       "score": 0,
    482       "justification": "TU Wien and University of Amsterdam are respected but not brand-recognized labs in the LLM space."
    483     }
    484   },
    485   "hn_data": {
    486     "threads": [],
    487     "top_points": 0,
    488     "total_points": 0,
    489     "total_comments": 0
    490   }
    491 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs