scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29842B)
      1 {
      2   "paper": {
      3     "title": "GREEN-CODE: Learning to Optimize Energy Efficiency in LLM-based Code Generation",
      4     "authors": [
      5       "Shashikant Ilager",
      6       "Lukas Florian Briem",
      7       "Ivona Brandić"
      8     ],
      9     "year": 2025,
     10     "venue": "IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing",
     11     "arxiv_id": "2501.11006",
     12     "doi": "10.1109/ccgrid64434.2025.00068"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "GREEN-CODE uses RL-based dynamic early exiting to reduce LLM inference energy consumption by 23-50% for code generation tasks. Fine-tuning with weighted aggregated loss enables intermediate layer decoding with a single LM head, avoiding the overhead of multiple LM heads. Evaluated on Llama 3.2 3B and OPT 2.7B with JavaCorpus and PY150, the approach achieves configurable accuracy-efficiency trade-offs controlled by a softmax temperature threshold.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository URL provided in Section V: https://github.com/Large-scale-Sustainable-Computing-LSC/green-code, described as containing 'prototype implementation, deployment configuration scripts, and details, including libraries and dependencies used.'"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Uses publicly available JavaCorpus and PY150 datasets from the CodeXGlue benchmark. Table I cites the original sources and Section III-B describes following CodeXGlue's train-test splits."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Hardware is described (NVIDIA RTX 8000, AMD EPYC 7452, Ubuntu 20.04.6 LTS, Python 3.12) and libraries are named (Gymnasium, Stable-Baselines3, PyTorch, ZeusMonitor), but no library versions are specified in the paper. The GitHub repo is said to contain dependency details, but the paper itself lacks a requirements file or version-level environment specification."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper points to a GitHub repository but does not include step-by-step reproduction instructions within the paper itself. No 'Reproducing Results' section or specific commands are provided."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Figures 8-13 are presented as point estimates (bar charts) without error bars, confidence intervals, or ± notation."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Claims such as '23-50% energy reduction' and performance comparisons between GC variants and baselines are made without any statistical significance tests. No p-values, t-tests, or other statistical tests appear anywhere in the paper."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Effect sizes with baseline context are reported throughout: e.g., 'RougeL score of about 0.41, while the full model achieves around 0.425, while saving 23% in energy and time' (Section VI-E1). Percentage reductions and absolute score comparisons are consistently provided."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper states 'We always evaluate on 1000 samples from the test sets' (Section VI-C) but provides no justification for this number and no power analysis."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be from single runs. No mention of multiple experimental runs or result stability across repetitions."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Two baselines are compared: (i) the base (non-fine-tuned) model using all layers, and (ii) the fine-tuned model using all layers without early exits. Results are compared across all metrics in Figures 8-11."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The baselines are the authors' own base and fine-tuned models. No comparison against contemporary competing early-exit methods such as LayerSkip (2024), ConsistentEE (2024), or Sun et al. (2024), all of which are discussed in related work (Section VII) as directly relevant approaches."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The comparison between base model (no fine-tuning, no RL), full model (fine-tuned, all layers), and GC variants (fine-tuned + RL at various thresholds) shows the contribution of the fine-tuning and RL agent components. The sensitivity analysis of context length (Section VI-F) and KV cache impact (Section VI-G) provide additional ablation-like analyses."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Multiple metrics are used: RougeL, BLEU, CodeBLEU (with syntax and dataflow sub-metrics), energy consumption (Ws), latency (seconds), throughput (tokens/second), and number of layers skipped."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "All evaluation is automated using text-matching and code-specific metrics. No human evaluation of generated code quality was conducted, despite the paper acknowledging that 'Standard LLM metrics like BLEU and ROUGE...may not align with the evaluation of the functional aspects of the code.'"
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section VI-C states: 'We follow the train-test split introduced in CodeXGlue.' The RL agent is trained on the training set, and evaluation uses 1000 samples from the test sets."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down per model (Llama, OPT), per dataset (JavaCorpus, PY150), per threshold (T = 0.6 to 0.92), and per context length (0.2 to 0.6) in Figures 8-12."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No qualitative error analysis or specific failure examples are shown. The paper notes performance degradation at high context settings (Section VI-F) and KV cache limitations (Section VI-G) but does not examine what types of code completions the early-exit approach gets wrong."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several negative findings reported: fine-tuning slightly reduces accuracy vs base model on most metrics (Section VI-E1), performance gap widens at higher context lengths ('almost double the loss observed,' Section VI-F), and KV cache is natively incompatible with early exits (Section VI-G). OPT is noted to perform worse than Llama."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims 'reduces the energy consumption between 23-50% on average...without significantly affecting accuracy.' Results in Section VI-E show 23% savings at T=0.92 for Llama/JavaCorpus and ~40-50% at lower thresholds, with accuracy trade-offs. The claim is supported, though 'without significantly affecting accuracy' applies mainly to the highest threshold settings."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper claims the RL-based early exit causes energy reduction. This is justified through controlled manipulation: comparing the same model with and without the RL agent component, isolating the effect of early exiting. The design allows for valid causal attribution of energy savings to the early exit mechanism."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title 'Learning to Optimize Energy Efficiency in LLM-based Code Generation' implies broad applicability, but the paper tests only two small models (2.7B, 3B parameters) on two datasets. The conclusion acknowledges plans to 'extend this framework to incorporate larger models for different LLM tasks' but the title and abstract do not bound claims to the tested settings."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations for the results are discussed. The paper does not consider whether energy savings could be attributed to other factors (e.g., reduced batch processing, GPU power management effects), or whether the RL agent's behavior might be task-specific rather than generalizable."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section VI-A2 explicitly acknowledges the proxy gap: 'Standard LLM metrics like BLEU and ROUGE, provide a broad performance estimate, prioritizing lexical precision, which may not align with the evaluation of the functional aspects of the code.' They add CodeBLEU with syntax and dataflow sub-metrics to partially address this gap."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Specific models are identified: 'Llama 3.2' with 3B parameters and 28 layers, and 'OPT' with 2.7B parameters and 32 layers (Table II). These are specific open-source model releases with fixed weights on Huggingface."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "The paper does not use prompting. The models perform autoregressive code completion given code context, using fine-tuning and RL-based inference rather than prompt-based interaction."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Table III reports PPO hyperparameters (steps, batch size, buffer size, epochs, learning rate, discount factor, network architecture). Section III-D reports fine-tuning hyperparameters (learning rate 1e-5, batch size 4, gradient accumulation 32, number of epochs, weight budget α values)."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The system is a model-level inference optimization (early exiting), not an agent pipeline."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section III-B describes dataset selection from CodeXGlue, train-test splits, and preprocessing including tokenization and normalization. Section VI-C describes context selection (first 20% of tokens), maximum context length (512 tokens), and maximum new tokens (15)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No dedicated limitations section exists. The conclusion mentions future plans ('we plan to extend this framework to incorporate larger models for different LLM tasks') but does not substantively discuss limitations of the current work."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats to validity are discussed. There is no consideration of internal or external validity threats specific to the study."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No explicit scope boundaries are stated. The paper does not identify what the results do NOT show, what model sizes or tasks are excluded, or what claims the authors are NOT making."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "While the benchmark datasets are public and code is on GitHub, the raw experimental outputs (energy measurements, model predictions, per-sample results) are not made available for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Data sources are described: JavaCorpus and PY150 from CodeXGlue (Section III-B), with sample counts and token statistics (Table I). Energy is measured using ZeusMonitor/pynvml. Evaluation uses 1000 test samples with specific context settings."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard public benchmarks (JavaCorpus, PY150)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline is documented across Sections III-IV: dataset selection → preprocessing (tokenization, splitting, sequence length limits) → fine-tuning with aggregated loss → RL agent training → evaluation. Context selection and evaluation procedures are specified in Section VI-C."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Acknowledgments section lists three funding sources: HPQC (Austrian Research Promotion Agency FFG Nr. 897481), Triton (Austrian Science Fund FWF DOI: 10.55776/P36870), and Themis (FWF DOI: 10.55776/PAT1668223)."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are listed: University of Amsterdam and TU Wien. No product of a specific company is being evaluated, so no conflict arises."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Funding is from Austrian public research agencies (FFG and FWF), which are government bodies with no financial stake in the outcome of the research."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is included in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff dates are stated for either Llama 3.2 or OPT. The paper does not discuss when the training data for these models was collected."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of potential train/test overlap. JavaCorpus (2013) and PY150 (2016) were published well before Llama 3.2 (2024) and OPT (2022), meaning these benchmark solutions could be in the models' training data."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "Both JavaCorpus (2013) and PY150 (2016) were available online years before the training of Llama 3.2 and OPT. The paper does not discuss this contamination risk. Although contamination distorts the relative comparisons (every configuration shares the same underlying pre-trained model) less than it distorts the absolute scores, it is still not addressed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Inference cost is a central focus. Energy consumption (Ws), latency (seconds), and throughput (tokens/second) are reported for all configurations in Figures 8-11. Table IV reports overhead percentages."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Hardware specified (NVIDIA RTX 8000, AMD EPYC 7452). Fine-tuning time reported: ~24 hours for Llama, ~19 hours for OPT (Section III-D). RL training took 200k-500k steps to converge (Section V)."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of multiple random seeds. Results appear to be from single runs with no seed sensitivity analysis, despite RL training being known to be sensitive to random initialization."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is not stated. Results are presented without indicating whether they come from single or multiple runs."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper explicitly states: 'we did not perform hyperparameter optimization due to the significant computational costs associated with it' (Section VI-B). Default library parameters were used."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Rather than selecting a single 'best' configuration, the paper transparently presents results across multiple thresholds (T = 0.6, 0.8, 0.9, 0.91, 0.92) and lets readers choose based on their own accuracy-efficiency trade-off preferences. For fine-tuning, default hyperparameters are acknowledged."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. The paper makes numerous comparative claims (across thresholds, models, datasets) purely from point estimates."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors compare their GREEN-CODE system against their own implementations of baselines (base model and fine-tuned model) without acknowledging potential self-comparison bias. No independent evaluation or external baseline implementations are used."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "This is a central contribution of the paper. Performance is systematically reported as a function of compute/energy at each threshold level, and Table IV reports overhead explicitly relative to the full model."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Section VI-A2 discusses the limitations of BLEU and ROUGE for code evaluation and adds CodeBLEU with syntax and dataflow sub-metrics: 'code generation demands additional syntactic and semantic correctness to ensure functionality. Standard LLM metrics like BLEU and ROUGE...may not align with the evaluation of the functional aspects of the code.'"
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. The system performs model-level inference optimization (early exiting), not agent-based code generation."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "JavaCorpus (2013) and PY150 (2016) were published years before Llama 3.2 (2024) and OPT (2022) were trained. The models could have seen the benchmark solutions during pre-training. This temporal leakage is not discussed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks information. The context is drawn from the same code files as the ground truth, which is standard practice but not discussed as a potential concern."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of independence between training and test data. The paper uses CodeXGlue splits but does not verify whether pre-trained model training data overlaps with test examples."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "GREEN-CODE reduces energy consumption between 23-50% on average for code generation tasks without significantly affecting accuracy.",
    369       "evidence": "Figures 8-11 show energy savings across models and datasets. For Llama/JavaCorpus at T=0.92, RougeL drops from ~0.425 to ~0.41 (3.5% drop) with 23% energy savings. At T=0.6, ~50% energy savings but RougeL drops to ~0.29 (~31% drop). Similar patterns for OPT and PY150.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "Fine-tuning with aggregated loss enables intermediate layer decoding with a single LM head, eliminating the need for multiple LM heads.",
    374       "evidence": "Section III-D describes the LITE-based fine-tuning method. Figure 4 shows loss convergence. Figure 1 demonstrates that fine-tuned models achieve reasonable CodeBLEU even at shallow layers (e.g., 0.3 at layer 10 vs 0.4 at final layer for Llama).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The RL agent learns to balance trade-offs between accuracy, latency, and energy consumption through dynamic early exiting.",
    379       "evidence": "Figure 6 shows PPO training convergence across 2000-2500 episodes. Figure 7 shows distribution of optimal exit points. Section VI-E shows the agent achieves different accuracy-efficiency trade-offs at different thresholds.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "The overhead of the RL agent is reasonable — always below 1/5th of the total runtime.",
    384       "evidence": "Table IV reports energy and time overhead for different thresholds. Maximum overhead is 19.43%/11.98% for OPT at T=0.92. The overhead scales with threshold since higher thresholds cause more 'continue' actions.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "GREEN-CODE achieves comparable accuracy to the non-fine-tuned base model using all layers.",
    389       "evidence": "Figures 8-11 show that at high thresholds (T=0.91-0.92), GC approaches base model accuracy. However, at T=0.92 for Llama/JavaCorpus, RougeL is ~0.41 vs ~0.425 for the fine-tuned full model, which itself trails the base model on most metrics (still a gap), and at lower thresholds the gap widens substantially.",
    390       "supported": "weak"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "No error bars or variance reporting",
    396       "detail": "All experiments appear to be single-run results with no variance, standard deviation, or confidence intervals. RL training is known to be highly sensitive to random seeds (Henderson et al. 2018 found results can vary by 2x), making single-run PPO results unreliable."
    397     },
    398     {
    399       "flag": "No comparison with competing methods",
    400       "detail": "Despite discussing LayerSkip (2024), ConsistentEE (2024), and Sun et al. (2024) in related work as directly relevant early-exit approaches for LLMs and code generation, the paper does not compare against any of them experimentally."
    401     },
    402     {
    403       "flag": "Overstated abstract claim",
    404       "detail": "The abstract claims energy reduction 'without significantly affecting accuracy,' but at aggressive settings (T=0.6), RougeL drops ~31% and CodeBLEU drops ~43%. Even at the most conservative setting (T=0.92), accuracy is still measurably lower. The claim applies only to the narrowest interpretation."
    405     },
    406     {
    407       "flag": "Limited model scale",
    408       "detail": "Only 2.7B and 3B parameter models are tested, while production code generation tools use models orders of magnitude larger. The paper's title suggests general applicability to 'LLM-based Code Generation' but results may not transfer to larger models where layer dynamics differ."
    409     },
    410     {
    411       "flag": "No statistical testing",
    412       "detail": "Numerous comparative claims are made (across thresholds, models, datasets) without any statistical significance testing. Differences are presented as meaningful based purely on point estimate comparisons."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Evaluating the code quality of AI-assisted code generation tools: An empirical study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT",
    418       "authors": ["B. Yetistiren", "I. Özsoy", "M. Ayerdem", "E. Tüzün"],
    419       "year": 2023,
    420       "arxiv_id": "2304.10778",
    421       "relevance": "Empirical evaluation of AI code generation tool quality, directly relevant to the survey's coverage of LLM code generation assessment."
    422     },
    423     {
    424       "title": "CodeXGlue: A machine learning benchmark dataset for code understanding and generation",
    425       "authors": ["S. Lu", "D. Guo", "S. Ren"],
    426       "year": 2021,
    427       "relevance": "Major code understanding/generation benchmark suite used in this paper; central to evaluating LLM code capabilities."
    428     },
    429     {
    430       "title": "When neural code completion models size up the situation: Attaining cheaper and faster completion through dynamic model inference",
    431       "authors": ["Z. Sun", "X. Du", "F. Song", "S. Wang", "L. Li"],
    432       "year": 2024,
    433       "relevance": "Most closely related work on early exiting for code generation with classifier-based approach and multiple LM heads."
    434     },
    435     {
    436       "title": "ConsistentEE: A consistent and hardness-guided early exiting method for accelerating language models inference",
    437       "authors": ["Z. Zeng", "Y. Hong", "H. Dai", "H. Zhuang", "C. Chen"],
    438       "year": 2024,
    439       "relevance": "RL-based early exiting method with policies and LM heads at every exit layer, directly comparable approach to GREEN-CODE."
    440     },
    441     {
    442       "title": "LayerSkip: Enabling early exit inference and self-speculative decoding",
    443       "authors": ["M. Elhoushi", "A. Shrivastava", "D. Liskovich"],
    444       "year": 2024,
    445       "relevance": "Contemporary early-exit method using dropout during training with self-speculative decoding verification, relevant to LLM inference efficiency."
    446     },
    447     {
    448       "title": "LLM-pruner: On the structural pruning of large language models",
    449       "authors": ["X. Ma", "G. Fang", "X. Wang"],
    450       "year": 2023,
    451       "relevance": "Structural pruning approach for LLMs, alternative technique for reducing LLM inference cost relevant to the survey."
    452     },
    453     {
    454       "title": "QLoRA: Efficient finetuning of quantized LLMs",
    455       "authors": ["T. Dettmers", "A. Pagnoni", "A. Holtzman", "L. Zettlemoyer"],
    456       "year": 2024,
    457       "relevance": "Efficient LLM fine-tuning via quantization, relevant to resource-efficient LLM deployment approaches."
    458     },
    459     {
    460       "title": "CodeBLEU: A method for automatic evaluation of code synthesis",
    461       "authors": ["S. Ren", "D. Guo", "S. Lu"],
    462       "year": 2020,
    463       "relevance": "Code-specific evaluation metric incorporating syntax and dataflow matching, key to assessing LLM code generation quality."
    464     },
    465     {
    466       "title": "Out of the BLEU: How should we assess quality of the code generation models?",
    467       "authors": ["M. Evtikhiev", "E. Bogomolov", "Y. Sokolov", "T. Bryksin"],
    468       "year": 2023,
    469       "relevance": "Critical assessment of code generation evaluation metrics, directly relevant to methodology quality in AI code generation research."
    470     },
    471     {
    472       "title": "OPT: Open pre-trained transformer language models",
    473       "authors": ["S. Zhang", "S. Roller", "N. Goyal"],
    474       "year": 2022,
    475       "relevance": "Open-source LLM family used in this study, relevant to evaluation of open-source code generation models."
    476     },
    477     {
    478       "title": "LLaMA: Open and efficient foundation language models",
    479       "authors": ["H. Touvron", "T. Lavril", "G. Izacard"],
    480       "year": 2023,
    481       "relevance": "Foundation model family (Llama) used in this study, central to open-source LLM capability evaluation."
    482     }
    483   ]
    484 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs