scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21069B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Evaluating Language Models for Efficient Code Generation",
      6     "authors": [
      7       "Jiawei Liu",
      8       "Songrun Xie",
      9       "Junhao Wang",
     10       "Yuxiang Wei",
     11       "Yifeng Ding",
     12       "Lingming Zhang"
     13     ],
     14     "year": 2024,
     15     "venue": "COLM 2024",
     16     "arxiv_id": "2408.06450",
     17     "doi": "10.48550/arXiv.2408.06450"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All abstract claims are substantiated: the 121-task EVALPERF is built and evaluated, scaling law failure is demonstrated in Figure 5, and instruction tuning efficiency gains are shown in Figure 4.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "Claims like 'general instruction tuning benefits code efficiency' are supported only by observational model comparisons (base vs. instruct variants), not controlled experiments; confounders such as training data quality are not ruled out.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper draws broad conclusions ('scaling law fails for code efficiency') based on one language (Python) and tasks from HumanEval+/MBPP+ only, without bounding claims to these specific settings.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No alternative explanations are considered for why instruction tuning improves efficiency or why scaling fails; the paper presents a single interpretation without considering competing hypotheses.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper explicitly distinguishes instruction count (#instructions) from physical runtime, justifies the proxy in Appendix A.2, and separately defines DPS vs. DPSnorm to capture different aspects of performance.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "There is no dedicated limitations or threats-to-validity section; only a brief mention of future extensions in the conclusion ('our future efforts will continuously extend EVALPERF').",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No specific threats are articulated; cross-platform variation is addressed empirically but framed as a positive result, not a threat, and benchmark coverage and contamination risk are not discussed.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not explicitly state what EVALPERF results do NOT show; Python-only coverage and reliance on HumanEval/MBPP source tasks are not framed as scope limitations.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Funding is disclosed in the Acknowledgment: NSF grant CCF-2131943, Kwai Inc, and OpenAI Researcher Access Program API credits.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are listed: University of Illinois Urbana-Champaign and Tongji University, with email contact provided.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
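     88         "justification": "Neither Kwai Inc nor NSF produces any of the evaluated LLMs (CodeLlama, DeepSeekCoder, StarCoder, GPT-4), so those funders have no direct financial stake in the outcome; the OpenAI Researcher Access Program API credits noted in the funding disclosure are the one link between a supporter and an evaluated model (GPT-4).",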
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is provided beyond the funding acknowledgment; patents, equity, or consulting arrangements are not declared.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are defined: 'code efficiency' is operationalized as instruction count, 'DPS' and 'DPSnorm' are formally defined with equations, and 'performance-exercising' criteria are specified numerically.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Four explicit contributions are enumerated in the introduction: a new evaluation dimension, the DPE technique, the EVALPERF benchmark, and an empirical study.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 5 compares DPE against HumanEval, MBPP, EvalPlus, PIE, and contemporaneous sibling benchmarks (EffiBench, ECCO), explaining specific shortcomings of each that DPE addresses.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "benchmark-creation": {
    121       "construct_design": {
    122         "construct_validity_argued": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "The paper explicitly argues in Section 1 that existing benchmarks fail due to light computation and inadequate metrics, and that performance-exercising inputs with a reference-relative compound metric are necessary for valid efficiency measurement.",
    126           "source": "haiku"
    127         },
    128         "difficulty_distribution_characterized": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "The paper requires at least 4 performance clusters per task but does not characterize the distribution of task difficulty across EVALPERF (no histogram, no easy/medium/hard tiers reported).",
    132           "source": "haiku"
    133         },
    134         "ceiling_floor_effects_checked": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "Filtering criteria explicitly guard against floor effects (>10k instruction minimum) and ensure performance diversity (≥4 clusters required); ceiling effects are mitigated by requiring the strongest input within 20-second / 16GB limits.",
    138           "source": "haiku"
    139         },
    140         "human_baseline_included": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No human performance baseline is provided; the paper evaluates only LLMs and uses LLM-generated solutions as reference clusters.",
    144           "source": "haiku"
    145         },
    146         "scoring_rubric_justified": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "DPS is formally defined with a mathematical formula, Appendix A.2 extensively justifies the choice over relative speedup and physical runtime, and edge cases (e.g., solution slower than all references → score 0) are addressed.",
    150           "source": "haiku"
    151         }
    152       },
    153       "robustness": {
    154         "contamination_resistance_designed": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "Tasks are drawn directly from HumanEval and MBPP, which are public benchmarks almost certainly in LLM training data; no temporal splits, canary strings, or anti-gaming measures are discussed.",
    158           "source": "haiku"
    159         },
    160         "temporal_robustness_discussed": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "The paper only briefly notes 'our future efforts will continuously extend EVALPERF'; there is no analysis of how quickly benchmark utility will degrade as models improve or as tasks become memorized.",
    164           "source": "haiku"
    165         },
    166         "failure_modes_discussed": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "Failure modes of the benchmark itself (e.g., gaming via memorized solutions, instruction-count as proxy breaking down, Python-only coverage) are not systematically discussed; DPS cost is acknowledged but framed positively.",
    170           "source": "haiku"
    171         },
    172         "baseline_implementations_provided": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "The full pipeline and evaluator are open-sourced at github.com/evalplus/evalplus, and reference solutions per cluster are embedded in the dataset to enable reproduction of reported DPS scores.",
    176           "source": "haiku"
    177         }
    178       },
    179       "documentation": {
    180         "dataset_documentation_complete": {
    181           "applies": true,
    182           "answer": false,
    183           "justification": "No formal data card is provided; while the curation methodology is described in detail, there is no structured documentation of dataset statistics, splits, preprocessing edge cases, or known biases.",
    184           "source": "haiku"
    185         },
    186         "licensing_and_access_clear": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "The benchmark is released as part of EvalPlus on GitHub, but no license is mentioned in the paper, leaving reuse terms unclear.",
    190           "source": "haiku"
    191         },
    192         "intended_use_specified": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "The paper explains what the benchmark is for but does not explicitly state what conclusions should NOT be drawn from EVALPERF results (e.g., no guidance on extrapolating to non-Python languages or non-algorithmic tasks).",
    196           "source": "haiku"
    197         }
    198       }
    199     }
    200   },
    201   "claims": [
    202     {
    203       "claim": "SAS generates performance-exercising inputs that result in 4.8× more tasks passing all quality filters compared to EvalPlus inputs.",
    204       "evidence": "Table 1: SAS yields 121 qualifying tasks vs. 25 for EvalPlus after applying the ≥4-cluster criterion on 342 tasks with ≥10 solutions.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "Scaling laws hold for code correctness but not for code efficiency.",
    209       "evidence": "Figure 5 shows 7/12 pairs where larger models outperform on DPS, but 4 pairs show >1% degradation; the result is mixed, not a clean reversal.",
    210       "supported": "moderate"
    211     },
    212     {
    213       "claim": "Instruction tuning consistently improves code efficiency beyond correctness.",
    214       "evidence": "Figure 4 shows instruct > base for most model families (e.g., DeepSeekCoder-6.7B: +19% DPS), with StarCoder2-15B as the only clear exception.",
    215       "supported": "strong"
    216     },
    217     {
    218       "claim": "Performance-encouraging prompts (perf-instruct, perf-CoT) do not consistently improve efficiency and often degrade correctness.",
    219       "evidence": "Figure 4 shows no consistent advantage for perf-instruct/perf-CoT over instruct; Table 3 shows correctness degradation for most models under performance prompts.",
    220       "supported": "strong"
    221     },
    222     {
    223       "claim": "EVALPERF produces highly consistent results across platforms (max CV 0.4%).",
    224       "evidence": "Table 2 shows DPS for three models across four hardware configurations with coefficient of variation ≤0.4% in all cases.",
    225       "supported": "strong"
    226     },
    227     {
    228       "claim": "GPT-4 Turbo achieves the best DPS but not the best DPSnorm, where DeepSeekCoder-6.7B-instruct leads.",
    229       "evidence": "Table 3 and Figure 8: GPT-4 Turbo has the highest average DPS (88.5–91.5), while DeepSeekCoder-6.7B-instruct has the highest average DPSnorm (81.4) across models.",
    230       "supported": "strong"
    231     }
    232   ],
    233   "methodology_tags": [
    234     "benchmark-eval"
    235   ],
    236   "key_findings": "DPE successfully addresses two core failures of existing coding benchmarks — insufficient computational load and misleading speedup metrics — by synthesizing performance-exercising test inputs and introducing the Differential Performance Score (DPS). Instruction tuning consistently improves both code correctness and efficiency across model families, while scaling laws that hold for correctness do not cleanly extend to code efficiency. EVALPERF achieves near-zero cross-platform variance (CV ≤ 0.4%) by using hardware instruction counters and relative performance ranking, making it reliably reproducible across diverse hardware configurations.",
    237   "red_flags": [
    238     {
    239       "flag": "No contamination analysis",
    240       "detail": "EVALPERF tasks originate from HumanEval and MBPP, both widely used public benchmarks almost certainly in the training data of evaluated models; no analysis of whether models have memorized these tasks is presented."
    241     },
    242     {
    243       "flag": "No limitations section",
    244       "detail": "The paper lacks a dedicated limitations or threats-to-validity section; key scope restrictions (Python-only, 121 tasks, LLM-as-oracle for reference solutions) are not framed as limitations."
    245     },
    246     {
    247       "flag": "No human baseline",
    248       "detail": "Although EVALPERF is a benchmark for code efficiency, the paper never establishes how humans perform on it, making it impossible to contextualize LLM scores relative to human capability."
    249     },
    250     {
    251       "flag": "Broad scaling law claim from limited evidence",
    252       "detail": "The claim that 'scaling law fails for code efficiency' is based on 12 within-family pairwise comparisons across three model families; 7/12 pairs actually show larger-is-better, making the headline finding overstated."
    253     },
    254     {
    255       "flag": "Python-only benchmark with language-general framing",
    256       "detail": "All 121 tasks are Python; the framework is described as general, but no validation in other languages is provided to support that generalization."
    257     },
    258     {
    259       "flag": "No licensing information",
    260       "detail": "The benchmark is released on GitHub but no license is stated in the paper, leaving downstream use rights ambiguous."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    266       "relevance": "Primary source benchmark from which EVALPERF tasks are derived; canonical correctness evaluation baseline."
    267     },
    268     {
    269       "title": "Program Synthesis with Large Language Models (MBPP)",
    270       "relevance": "Second primary source benchmark for EVALPERF tasks; used for pass@1 correctness evaluation alongside HumanEval+."
    271     },
    272     {
    273       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus)",
    274       "relevance": "Framework providing rigorous correctness tests used as prerequisite filter; also serves as SAS baseline comparison."
    275     },
    276     {
    277       "title": "Learning Performance-Improving Code Edits (PIE)",
    278       "relevance": "Closest prior work on LLM code efficiency evaluation; DPE differentiates itself from PIE's program optimization setting and simulator-based profiling."
    279     },
    280     {
    281       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    282       "relevance": "Sibling benchmark addressing contamination in coding evaluation; motivates the contamination problem DPE does not fully address."
    283     },
    284     {
    285       "title": "EffiBench: Benchmarking the Efficiency of Automatically Generated Code",
    286       "relevance": "Contemporaneous sibling benchmark for code efficiency; DPE claims to address its limitation of variation-sensitive speedup metrics."
    287     },
    288     {
    289       "title": "ECCO: Can We Improve Model-Generated Code Efficiency without Sacrificing Functional Correctness?",
    290       "relevance": "Another contemporaneous sibling benchmark for code efficiency evaluation mentioned in related work comparison."
    291     },
    292     {
    293       "title": "ALGO: Synthesizing Algorithmic Programs with Generated Oracle Verifiers",
    294       "relevance": "Prior work using LLM-generated test input generators; DPE's SAS approach is contrasted against ALGO's reliance on ChatGPT Code Interpreter."
    295     },
    296     {
    297       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    298       "relevance": "Foundational technique used by SAS for few-shot CoT input generator synthesis."
    299     },
    300     {
    301       "title": "Scaling Laws for Neural Language Models",
    302       "relevance": "Established scaling law that DPE's experiments find does not hold for code efficiency, a central empirical finding."
    303     }
    304   ],
    305   "engagement_factors": {
    306     "practical_relevance": {
    307       "score": 3,
    308       "justification": "EVALPERF is open-sourced, actively maintained as part of EvalPlus, and directly usable by practitioners to benchmark LLM code efficiency."
    309     },
    310     "surprise_contrarian": {
    311       "score": 2,
    312       "justification": "The finding that scaling laws fail for code efficiency while holding for correctness challenges common assumptions about larger models being uniformly better."
    313     },
    314     "fear_safety": {
    315       "score": 0,
    316       "justification": "No AI safety or risk concerns; the paper is focused on performance optimization evaluation methodology."
    317     },
    318     "drama_conflict": {
    319       "score": 1,
    320       "justification": "Mild challenge to existing benchmark methodology (HumanEval, MBPP inadequacy for efficiency), but framed constructively rather than controversially."
    321     },
    322     "demo_ability": {
    323       "score": 3,
    324       "justification": "Fully open-sourced benchmark and pipeline at github.com/evalplus/evalplus; anyone can run EVALPERF on their model immediately."
    325     },
    326     "brand_recognition": {
    327       "score": 1,
    328       "justification": "EvalPlus community has some recognition in the code generation space, but no major lab affiliation; published at COLM 2024, a newer venue."
    329     }
    330   },
    331   "hn_data": {
    332     "threads": [
    333       {
    334         "hn_id": "41567138",
    335         "title": "Can Generative Multi-Agents Spontaneously Form a Society?",
    336         "points": 48,
    337         "comments": 5,
    338         "url": "https://news.ycombinator.com/item?id=41567138",
    339         "created_at": "2024-09-17T12:55:14Z"
    340       },
    341       {
    342         "hn_id": "43921813",
    343         "title": "Human-Like Episodic Memory for Infinite Context LLMs",
    344         "points": 27,
    345         "comments": 0,
    346         "url": "https://news.ycombinator.com/item?id=43921813",
    347         "created_at": "2025-05-08T00:21:21Z"
    348       },
    349       {
    350         "hn_id": "40021906",
    351         "title": "Wu's Method Can Boost AlphaGeometry to Outperform Gold Medalists at IMO Geometry",
    352         "points": 7,
    353         "comments": 1,
    354         "url": "https://news.ycombinator.com/item?id=40021906",
    355         "created_at": "2024-04-13T10:06:11Z"
    356       },
    357       {
    358         "hn_id": "24247130",
    359         "title": "Manticore: A 4096-core RISC-V Chiplet Arch for Ultra-efficient FP Computing",
    360         "points": 7,
    361         "comments": 1,
    362         "url": "https://news.ycombinator.com/item?id=24247130",
    363         "created_at": "2020-08-22T20:45:30Z"
    364       },
    365       {
    366         "hn_id": "40015493",
    367         "title": "Show HN: Symbolic AI at Silver Medal, Boosts AlphaGeometry to Beat IMO Geo Gold",
    368         "points": 6,
    369         "comments": 0,
    370         "url": "https://news.ycombinator.com/item?id=40015493",
    371         "created_at": "2024-04-12T17:36:41Z"
    372       },
    373       {
    374         "hn_id": "39691144",
    375         "title": "Adding NVMe SSDs to Enable and Accelerate 100B Model Fine-Tuning on a Single GPU",
    376         "points": 3,
    377         "comments": 1,
    378         "url": "https://news.ycombinator.com/item?id=39691144",
    379         "created_at": "2024-03-13T13:44:20Z"
    380       },
    381       {
    382         "hn_id": "41317807",
    383         "title": "Human-Like Episodic Memory for Infinite Context LLMs",
    384         "points": 3,
    385         "comments": 0,
    386         "url": "https://news.ycombinator.com/item?id=41317807",
    387         "created_at": "2024-08-22T07:52:12Z"
    388       },
    389       {
    390         "hn_id": "40001562",
    391         "title": "Symbolic AI at Silver Medal, Boosts AlphaGeometry to Beat Gold at IMO Geometry",
    392         "points": 3,
    393         "comments": 0,
    394         "url": "https://news.ycombinator.com/item?id=40001562",
    395         "created_at": "2024-04-11T12:53:16Z"
    396       },
    397       {
    398         "hn_id": "37165307",
    399         "title": "Taboo and Collaborative Knowledge Production: Evidence from Wikipedia",
    400         "points": 3,
    401         "comments": 0,
    402         "url": "https://news.ycombinator.com/item?id=37165307",
    403         "created_at": "2023-08-17T17:34:16Z"
    404       },
    405       {
    406         "hn_id": "24208779",
    407         "title": "Manticore: A 4096-core RISC-V Chiplet Arch for Ultra-efficient FP Computing",
    408         "points": 2,
    409         "comments": 1,
    410         "url": "https://news.ycombinator.com/item?id=24208779",
    411         "created_at": "2020-08-19T10:00:12Z"
    412       }
    413     ],
    414     "top_points": 48,
    415     "total_points": 109,
    416     "total_comments": 9
    417   }
    418 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs