scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21918B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Evaluating Language Models for Efficient Code Generation",
      6     "authors": [
      7       "Jiawei Liu",
      8       "Songrun Xie",
      9       "Junhao Wang",
     10       "Yuxiang Wei",
     11       "Yifeng Ding",
     12       "Lingming Zhang"
     13     ],
     14     "year": 2024,
     15     "venue": "COLM 2024",
     16     "arxiv_id": "2408.06450",
     17     "doi": "10.48550/arXiv.2408.06450"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims about the DPE framework, instruction tuning benefiting efficiency, the scaling law not holding for efficiency, and DPE-generated inputs being 4.8× more performance-exercising than prior-art inputs are all supported by results in §4.1 and §4.2.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
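     30         "justification": "Claims like 'instruction tuning benefits efficiency' and 'model size does not reliably improve efficiency' are supported by paired comparisons within model families that isolate the variable of interest as far as the released checkpoints allow (e.g., same model, base vs instruct; same family, different sizes), although base and instruct variants also differ in training data composition and volume, so these comparisons are not fully controlled.",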
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title claims 'Evaluating Language Models for Efficient Code Generation' broadly, but experiments are limited to Python tasks from HumanEval+ and MBPP+. The abstract and conclusion make general claims about 'code generation' without bounding to Python or to algorithmic-style programming problems.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No alternative explanations are discussed for the key findings. For example, the instruction tuning efficiency benefit could stem from training data quality rather than the tuning process itself, and the scaling law failure could have multiple explanations, none of which are explored.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper explicitly distinguishes between their proxy metric (hardware instruction count / DPS) and actual runtime efficiency. §2 and Appendix A.2 discuss limitations of runtime measurement and why DPS provides a more meaningful compound metric than raw speedup, carefully defining what DPS measures.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No dedicated limitations or threats-to-validity section exists. The paper has sections for Introduction, DPE, EVALPERF, Evaluation, Related Work, Conclusion, and Appendix, with no explicit limitations discussion.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No specific threats to validity are discussed anywhere in the paper.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what settings were excluded, or what claims the authors are not making.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgment section states: 'This work was partially supported by NSF grant CCF-2131943 and Kwai Inc, as well as API credits from the OpenAI Researcher Access Program.'",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are listed: University of Illinois Urbana-Champaign and Tongji University. No undisclosed industry affiliations are apparent.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "OpenAI provided API credits through the Researcher Access Program, and the paper evaluates GPT-4 Turbo (an OpenAI product), which achieves the best DPS in the evaluation. OpenAI has a financial interest in GPT-4 performing well.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are defined precisely: 'code efficiency' is scoped to execution efficiency (not readability), DPS is formally defined with its cumulative ratio formula (§2.5), and 'performance-exercising' inputs are defined via the filtering criteria (§2.3).",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Four contributions are explicitly enumerated in §1: a new evaluation dimension (efficiency), the DPE technique, the EVALPERF benchmark, and an empirical study—each clearly scoped.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "§5 engages substantively with HumanEval, MBPP, EvalPlus, PIE, and contemporaneous sibling benchmarks (EffiBench, ECCO), explaining specific technical limitations of each and how DPE addresses them rather than just listing citations.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "benchmark-creation": {
    121       "construct_design": {
    122         "construct_validity_argued": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "The paper argues why DPE measures code efficiency better than alternatives: §2 explains that efficiency-demanding inputs are required because light computation makes all complexities 'equal,' and §A.2 argues that DPS is a more interpretable compound metric than average speedup, which is skewed by outlier tasks.",
    126           "source": "haiku"
    127         },
    128         "difficulty_distribution_characterized": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Difficulty is operationalized through performance clustering (§2.4) requiring K≥4 clusters per task, and Table 1 documents the filtering funnel. The adaptive clustering algorithm produces tasks with distinct performance tiers rather than assuming difficulty.",
    132           "source": "haiku"
    133         },
    134         "ceiling_floor_effects_checked": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The benchmark is designed against ceiling/floor effects through filtering (sufficient computation >10k instructions, ≥4 performance clusters), but the paper does not explicitly report or analyze ceiling/floor effects at the model-performance level after benchmark construction.",
    138           "source": "haiku"
    139         },
    140         "human_baseline_included": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No human baseline is included or discussed. The evaluation covers only LLM variants; there is no measurement of how human programmers perform on EVALPERF tasks.",
    144           "source": "haiku"
    145         },
    146         "scoring_rubric_justified": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "DPS is formally defined (§2.5) and extensively justified against average speedup in §A.2, including a concrete counterexample showing how speedup averaging can mislead. Edge cases (DPS=0 for failing or slowest solutions) are addressed in the definition.",
    150           "source": "haiku"
    151         }
    152       },
    153       "robustness": {
    154         "contamination_resistance_designed": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "EVALPERF tasks are derived from HumanEval+ and MBPP+, both widely used training data sources. The paper cites LiveCodeBench's contamination-free design in related work but applies no contamination resistance (temporal splits, canary strings, or dynamic generation) to EVALPERF itself.",
    158           "source": "haiku"
    159         },
    160         "temporal_robustness_discussed": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "The paper mentions future plans to extend EVALPERF (§3) but does not discuss whether or when models will saturate the benchmark, what update cadence is planned, or what mechanisms would prevent gaming as models improve.",
    164           "source": "haiku"
    165         },
    166         "failure_modes_discussed": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "§A.2 discusses measurement trade-offs (hardware counters vs. simulators vs. physical runtime) and related work critiques sibling benchmarks, but the paper does not systematically discuss what EVALPERF itself fails to measure or how it could be gamed.",
    170           "source": "haiku"
    171         },
    172         "baseline_implementations_provided": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "The paper explicitly states: 'we fully open-source and maintain the data curation pipeline and evaluator at github.com/evalplus/evalplus,' providing reference solutions, profiling code, and the evaluation framework.",
    176           "source": "haiku"
    177         }
    178       },
    179       "documentation": {
    180         "dataset_documentation_complete": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The data curation pipeline is documented in detail: source tasks (HumanEval+ + MBPP+), solution sampling (21 LLMs, 50 samples each), input generator creation (DeepSeekCoder-33B, 16 samples), filtering criteria with explicit thresholds, and clustering parameters are all specified.",
    184           "source": "haiku"
    185         },
    186         "licensing_and_access_clear": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "While the benchmark is stated to be open-sourced at github.com/evalplus/evalplus, the paper itself does not specify a license for EVALPERF or clarify under what terms others may use, modify, or redistribute it.",
    190           "source": "haiku"
    191         },
    192         "intended_use_specified": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "The intended use (evaluating LLM code efficiency on algorithmic Python tasks) is clear from context, but the paper does not state what should NOT be concluded from EVALPERF results—for example, that DPS scores don't reflect real-world performance on non-algorithmic tasks or other languages.",
    196           "source": "haiku"
    197         }
    198       }
    199     }
    200   },
    201   "claims": [
    202     {
    203       "claim": "DPE generates inputs that are 4.8× more performance-exercising than EvalPlus baselines, measured by tasks passing the clustering criterion",
    204       "evidence": "Table 1: SAS passes 121 tasks with ≥4 clusters vs. EvalPlus's 25 tasks (271 vs. 204 with computation filter alone)",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "EVALPERF produces consistent evaluation results across diverse hardware platforms, with maximum coefficient of variation <0.4%",
    209       "evidence": "Table 2 shows DPS scores for three models across four test beds (desktop i7, desktop i9-12900K, workstation, server) with CV ranging 0.1–0.4%",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "General instruction tuning improves code efficiency alongside correctness, even though it was not designed to optimize efficiency",
    214       "evidence": "Figure 4 shows instruct variants outperform base models for most model families (e.g., DeepSeekCoder-6.7B: +19% DPS), with StarCoder2-15B as the only exception",
    215       "supported": "moderate"
    216     },
    217     {
    218       "claim": "The scaling law does not hold for code efficiency — larger models within a family do not consistently generate more efficient code",
    219       "evidence": "Figure 5 shows 7/12 pairs where larger models outperform smaller, but 4 cases of >1% degradation including StarCoder2 3B→7B (−6%)",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "Performance-encouraging prompts (perf-instruct, perf-CoT) do not consistently or noticeably improve code efficiency",
    224       "evidence": "Figure 4 shows no systematic advantage for perf-instruct or perf-CoT over instruct prompting; Table 3 shows performance-encouraging prompts commonly cause correctness degradation",
    225       "supported": "moderate"
    226     },
    227     {
    228       "claim": "GPT-4 Turbo achieves the highest DPS among evaluated models",
    229       "evidence": "Table 3 shows GPT-4 Turbo reaching 91.5% avg DPS (perf-CoT), highest of all models; Figure 7 visualizes this",
    230       "supported": "strong"
    231     }
    232   ],
    233   "methodology_tags": [
    234     "benchmark-eval",
    235     "benchmark-creation"
    236   ],
    237   "key_findings": "DPE (Differential Performance Evaluation) addresses two core failures of existing code benchmarks—light-computation tasks and misleading speedup metrics—by using LLM-synthesized performance-exercising inputs and a percentile-rank DPS metric. EVALPERF (121 Python tasks) is 4.8× more discriminating than EvalPlus baselines and produces consistent rankings across hardware platforms (CV <0.4%). Instruction tuning generally improves code efficiency alongside correctness (in most model families, with StarCoder2-15B as the exception), but the scaling law that holds for correctness does not extend to efficiency, and performance-encouraging prompts provide no reliable efficiency gains.",
    238   "red_flags": [
    239     {
    240       "flag": "Contamination not addressed",
    241       "detail": "EVALPERF tasks are drawn from HumanEval+ and MBPP+, both widely used LLM training sources. The paper cites contamination-free benchmarks in related work but applies no contamination mitigation to EVALPERF itself."
    242     },
    243     {
    244       "flag": "Causal language for observational findings",
    245       "detail": "'General instruction tuning benefits both code correctness and efficiency' is presented causally, but base and instruct variants differ in training data composition and volume, not just instruction tuning—confounds are uncontrolled."
    246     },
    247     {
    248       "flag": "Python-only generalization",
    249       "detail": "All 121 tasks are Python algorithmic problems from HumanEval/MBPP lineage. Findings about scaling laws and instruction tuning are presented as general conclusions without bounding to Python or algorithmic tasks."
    250     },
    251     {
    252       "flag": "No human baseline",
    253       "detail": "There is no measurement of how human programmers perform on EVALPERF tasks, making it impossible to calibrate where LLM scores fall relative to human-level efficiency."
    254     },
    255     {
    256       "flag": "No limitations section",
    257       "detail": "The paper contains no dedicated limitations or threats-to-validity discussion. Implicit limitations (Python only, HumanEval/MBPP task distribution, hardware counter limitations for interpreted languages) are not systematically acknowledged."
    258     }
    259   ],
    260   "cited_papers": [
    261     {
    262       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation (EvalPlus)",
    263       "relevance": "Direct predecessor; EVALPERF uses EvalPlus's rigorous tests for correctness validation and EvalPlus inputs as a baseline comparison for performance difficulty"
    264     },
    265     {
    266       "title": "Evaluating large language models trained on code (HumanEval)",
    267       "relevance": "Source of 164 base tasks used to construct EVALPERF; the standard code generation correctness benchmark"
    268     },
    269     {
    270       "title": "Program synthesis with large language models (MBPP)",
    271       "relevance": "Source of 399 base tasks used to construct EVALPERF alongside HumanEval+"
    272     },
    273     {
    274       "title": "Learning performance-improving code edits (PIE)",
    275       "relevance": "Most closely related benchmark for code efficiency evaluation; DPE explicitly compares its methodology against PIE's C++ optimization focus and simulator-based profiling"
    276     },
    277     {
    278       "title": "LiveCodeBench: Holistic and contamination-free evaluation of LLMs for code",
    279       "relevance": "Contamination-aware benchmark cited as addressing a limitation that EVALPERF does not address"
    280     },
    281     {
    282       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    283       "relevance": "Foundational technique used by DPE's Synthesizing a Synthesizer (SAS) for input generator creation"
    284     },
    285     {
    286       "title": "EffiBench: Benchmarking the efficiency of automatically generated code",
    287       "relevance": "Contemporaneous sibling benchmark for code efficiency evaluation; DPE claims to address its limitations"
    288     },
    289     {
    290       "title": "How efficient is LLM-generated code? A rigorous & high-standard benchmark",
    291       "relevance": "Another contemporaneous efficiency benchmark; DPE contrasts its metric design against speedup-based approaches used here"
    292     },
    293     {
    294       "title": "ALGO: Synthesizing algorithmic programs with generated oracle verifiers",
    295       "relevance": "Related work on LLM-generated test input synthesis; SAS is explicitly differentiated from ALGO's approach"
    296     }
    297   ],
    298   "engagement_factors": {
    299     "practical_relevance": {
    300       "score": 2,
    301       "justification": "Practitioners evaluating LLMs for code generation can directly use the open-sourced EVALPERF benchmark and DPE framework to assess efficiency, a gap not addressed by existing benchmarks."
    302     },
    303     "surprise_contrarian": {
    304       "score": 2,
    305       "justification": "The finding that scaling laws don't hold for code efficiency—while they hold for correctness—challenges a dominant assumption in LLM scaling research."
    306     },
    307     "fear_safety": {
    308       "score": 0,
    309       "justification": "The paper raises no AI safety or risk concerns; it is a methodological contribution to benchmark design."
    310     },
    311     "drama_conflict": {
    312       "score": 1,
    313       "justification": "The paper critiques existing benchmarks (EvalPlus, contemporaneous efficiency benchmarks) as inadequate, but the critique is technical rather than controversial."
    314     },
    315     "demo_ability": {
    316       "score": 3,
    317       "justification": "EVALPERF and the full DPE pipeline are fully open-sourced at github.com/evalplus/evalplus; practitioners can run evaluations immediately."
    318     },
    319     "brand_recognition": {
    320       "score": 1,
    321       "justification": "UIUC is a well-known institution and EvalPlus has prior community recognition, but no major industry lab (Google, OpenAI, Meta) is the primary author."
    322     }
    323   },
    324   "hn_data": {
    325     "threads": [
    326       {
    327         "hn_id": "41567138",
    328         "title": "Can Generative Multi-Agents Spontaneously Form a Society?",
    329         "points": 48,
    330         "comments": 5,
    331         "url": "https://news.ycombinator.com/item?id=41567138",
    332         "created_at": "2024-09-17T12:55:14Z"
    333       },
    334       {
    335         "hn_id": "43921813",
    336         "title": "Human-Like Episodic Memory for Infinite Context LLMs",
    337         "points": 27,
    338         "comments": 0,
    339         "url": "https://news.ycombinator.com/item?id=43921813",
    340         "created_at": "2025-05-08T00:21:21Z"
    341       },
    342       {
    343         "hn_id": "40021906",
    344         "title": "Wu's Method Can Boost AlphaGeometry to Outperform Gold Medalists at IMO Geometry",
    345         "points": 7,
    346         "comments": 1,
    347         "url": "https://news.ycombinator.com/item?id=40021906",
    348         "created_at": "2024-04-13T10:06:11Z"
    349       },
    350       {
    351         "hn_id": "24247130",
    352         "title": "Manticore: A 4096-core RISC-V Chiplet Arch for Ultra-efficient FP Computing",
    353         "points": 7,
    354         "comments": 1,
    355         "url": "https://news.ycombinator.com/item?id=24247130",
    356         "created_at": "2020-08-22T20:45:30Z"
    357       },
    358       {
    359         "hn_id": "40015493",
    360         "title": "Show HN: Symbolic AI at Silver Medal, Boosts AlphaGeometry to Beat IMO Geo Gold",
    361         "points": 6,
    362         "comments": 0,
    363         "url": "https://news.ycombinator.com/item?id=40015493",
    364         "created_at": "2024-04-12T17:36:41Z"
    365       },
    366       {
    367         "hn_id": "39691144",
    368         "title": "Adding NVMe SSDs to Enable and Accelerate 100B Model Fine-Tuning on a Single GPU",
    369         "points": 3,
    370         "comments": 1,
    371         "url": "https://news.ycombinator.com/item?id=39691144",
    372         "created_at": "2024-03-13T13:44:20Z"
    373       },
    374       {
    375         "hn_id": "41317807",
    376         "title": "Human-Like Episodic Memory for Infinite Context LLMs",
    377         "points": 3,
    378         "comments": 0,
    379         "url": "https://news.ycombinator.com/item?id=41317807",
    380         "created_at": "2024-08-22T07:52:12Z"
    381       },
    382       {
    383         "hn_id": "40001562",
    384         "title": "Symbolic AI at Silver Medal, Boosts AlphaGeometry to Beat Gold at IMO Geometry",
    385         "points": 3,
    386         "comments": 0,
    387         "url": "https://news.ycombinator.com/item?id=40001562",
    388         "created_at": "2024-04-11T12:53:16Z"
    389       },
    390       {
    391         "hn_id": "37165307",
    392         "title": "Taboo and Collaborative Knowledge Production: Evidence from Wikipedia",
    393         "points": 3,
    394         "comments": 0,
    395         "url": "https://news.ycombinator.com/item?id=37165307",
    396         "created_at": "2023-08-17T17:34:16Z"
    397       },
    398       {
    399         "hn_id": "24208779",
    400         "title": "Manticore: A 4096-core RISC-V Chiplet Arch for Ultra-efficient FP Computing",
    401         "points": 2,
    402         "comments": 1,
    403         "url": "https://news.ycombinator.com/item?id=24208779",
    404         "created_at": "2020-08-19T10:00:12Z"
    405       }
    406     ],
    407     "top_points": 48,
    408     "total_points": 109,
    409     "total_comments": 9
    410   }
    411 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs