scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22061B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
      6     "authors": [
      7       "Jain, N.",
      8       "Han, K.",
      9       "Gu, A.",
     10       "Li, W.",
     11       "Yan, F.",
     12       "Zhang, T.",
     13       "Wang, S. I.",
     14       "Solar-Lezama, A.",
     15       "Sen, K.",
     16       "Stoica, I."
     17     ],
     18     "year": 2024,
     19     "venue": "arXiv",
     20     "arxiv_id": "2403.07974",
     21     "doi": null
     22   },
     23   "checklist": {
     24     "claims_and_evidence": {
     25       "abstract_claims_supported": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "All abstract claims are substantiated: 500+ problems from 3 platforms, 18 base + 34 instruction-tuned models evaluated, contamination findings (Figure 1), holistic evaluation across 4 scenarios, and HumanEval overfitting evidence (Figure 5) are all present in the paper.",
     29         "source": "haiku"
     30       },
     31       "causal_claims_justified": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The contamination claim is appropriately hedged as 'likely' and 'potential,' supported by temporal performance drops that match known cutoff dates. The overfitting claim for HumanEval uses similarly qualified language ('might be overfitting'), keeping causal language appropriate to the observational evidence.",
     35         "source": "haiku"
     36       },
     37       "generalization_bounded": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 7 explicitly bounds results to competition programming in Python across 3 platforms, stating 'we recommend using LiveCodeBench as a starting point' and noting it may not represent 'the most general notion of LLM programming capabilities.'",
     41         "source": "haiku"
     42       },
     43       "alternative_explanations_discussed": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The contamination analysis does not fully rule out difficulty increasing over time (AtCoder stability is cited but not systematically controlled), and the HumanEval 'overfitting' cluster analysis does not discuss whether fine-tuned models may simply be trained on less diverse data distributions generally.",
     47         "source": "haiku"
     48       },
     49       "proxy_outcome_distinction": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper explicitly distinguishes competition programming performance from general coding ability, noting in the limitations that competition problems differ from real-world software development tasks and recommending domain-specific evaluations.",
     53         "source": "haiku"
     54       }
     55     },
     56     "limitations_and_scope": {
     57       "limitations_section_present": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section 7 'Limitations' is a dedicated multi-paragraph section covering benchmark size, Python-only focus, prompt robustness, and problem domain constraints.",
     61         "source": "haiku"
     62       },
     63       "threats_to_validity_specific": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Specific threats are quantified: 1–1.5% performance variance from bootstrapping 349 problems, concrete prompt sensitivity observed specifically for open models on code execution with CoT, and specific difficulty cutoffs used to exclude intractable problems.",
     67         "source": "haiku"
     68       },
     69       "scope_boundaries_stated": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper explicitly states what LCB does not show: real-world software development capability, multi-language coding ability, and performance on open-ended unconstrained problems faced by users.",
     73         "source": "haiku"
     74       }
     75     },
     76     "conflicts_of_interest": {
     77       "funding_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Acknowledgements list NSF grants (CCF:1900968, CCF:1908870, CCF:2217064) and SKY Lab industrial sponsors including Google, IBM, Intel, Microsoft, and others.",
     81         "source": "haiku"
     82       },
     83       "affiliations_disclosed": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Author affiliations are clearly listed: UC Berkeley, MIT, and Cornell. No author is identified as an employee of any company whose models are evaluated.",
     87         "source": "haiku"
     88       },
     89       "funder_independent_of_outcome": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "SKY Lab industrial sponsors include Google and Microsoft, whose products (Gemini, GPT-4 series) are directly evaluated and ranked in the benchmark, creating a potential conflict of interest even if unintentional.",
     93         "source": "haiku"
     94       },
     95       "financial_interests_declared": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No competing interests statement or financial interests declaration is present beyond the general funding acknowledgment; there is no 'no competing interests' statement.",
     99         "source": "haiku"
    100       }
    101     },
    102     "scope_and_framing": {
    103       "key_terms_defined": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Key terms are defined: 'contamination' (models trained on benchmark problems), 'live updates' (evaluating on post-cutoff-date problems), 'holistic evaluation' (multiple code scenarios), and all four evaluation scenarios are formally defined with examples in Figure 3.",
    107         "source": "haiku"
    108       },
    109       "intended_contribution_clear": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper explicitly states its contribution as a continuously updated, contamination-free benchmark with four evaluation scenarios, and clearly distinguishes it from prior benchmarks through four named design principles in the introduction.",
    113         "source": "haiku"
    114       },
    115       "engagement_with_prior_work": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 6 provides thorough related work and the concurrent work paragraph directly compares to Huang et al. 2023 and Li et al. 2023c with specific differentiators, not just a listing.",
    119         "source": "haiku"
    120       }
    121     }
    122   },
    123   "type_checklist": {
    124     "benchmark-creation": {
    125       "construct_design": {
    126         "construct_validity_argued": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The paper argues competition problems measure coding capability because they are human-vetted by thousands of participants, and the multi-scenario design is motivated by AlphaCodium and real-world software engineering workflows that require debugging, comprehension, and test generation beyond mere code generation.",
    130           "source": "haiku"
    131         },
    132         "difficulty_distribution_characterized": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "Table 1 presents exact counts across Easy/Medium/Hard tiers per platform with numeric rating brackets specified (e.g., AtCoder [0–200), [200–400), [400–500]), and average test counts per tier.",
    136           "source": "haiku"
    137         },
    138         "ceiling_floor_effects_checked": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "The paper explicitly excludes the hardest CodeForces problems because most models achieve near-zero performance, and the results tables confirm good discrimination at Easy/Medium levels with appropriately near-zero Hard performance for weaker models.",
    142           "source": "haiku"
    143         },
    144         "human_baseline_included": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "No human performance baseline on LCB tasks is reported. Only GPT-4's Elo rating on CodeForces (bottom 5th percentile) is cited as an indirect reference to human competition performance.",
    148           "source": "haiku"
    149         },
    150         "scoring_rubric_justified": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "Pass@1 is justified by citation to Kulal et al. 2019 and Chen et al. 2021; 10-candidate sampling with temperature 0.2 and nucleus sampling is fully specified; scenario-specific correctness criteria (functional correctness, execution equivalence) are defined.",
    154           "source": "haiku"
    155         }
    156       },
    157       "robustness": {
    158         "contamination_resistance_designed": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Temporal splitting is the core anti-contamination mechanism: every problem is tagged with a contest release date, and models are evaluated only on problems released after their training cutoff date.",
    162           "source": "haiku"
    163         },
    164         "temporal_robustness_discussed": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "The benchmark is explicitly designed as continuously updated. The limitations section discusses plans for adding new platforms, a private test set, and strategies mirroring Kaggle's approach to maintain evaluations as models improve.",
    168           "source": "haiku"
    169         },
    170         "failure_modes_discussed": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Failure modes discussed include: small effective evaluation set after cutoff filtering (349 problems, 1–1.5% variance), Python-only coverage, prompt sensitivity for open models on CoT tasks, and the competition domain not representing real-world programming.",
    174           "source": "haiku"
    175         },
    176         "baseline_implementations_provided": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "All prompts for all four scenarios are provided in Appendix C (Sections C.2–C.5), and the authors commit to releasing all model completions and a general evaluation toolkit at livecodebench.github.io.",
    180           "source": "haiku"
    181         }
    182       },
    183       "documentation": {
    184         "dataset_documentation_complete": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Section 3 and Appendix A document collection methodology, platform-specific curation procedures, generator-based test generation (Section A.2), code execution filtering criteria and statistics (A.3), and overall dataset statistics (Table 1, Figure 8).",
    188           "source": "haiku"
    189         },
    190         "licensing_and_access_clear": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Appendix A.1 explicitly addresses licensing via Fair Use §107, specifying academic-only use and no training on collected problems. The benchmark is publicly accessible at livecodebench.github.io.",
    194           "source": "haiku"
    195         },
    196         "intended_use_specified": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The limitations section explicitly states 'we recommend using LiveCodeBench as a starting point for evaluating LLMs and further using domain-specific evaluations,' and notes it should not be used to represent general-purpose or real-world programming capability.",
    200           "source": "haiku"
    201         }
    202       }
    203     }
    204   },
    205   "claims": [
    206     {
    207       "claim": "Models show dramatic performance drops on problems released after their training cutoff dates, indicating contamination of pre-cutoff problems in training data",
    208       "evidence": "Figure 1 shows DS-Ins-33B dropping sharply after August 2023 problems and GPT-4-O dropping after November 2023 problems, matching their known cutoff dates; pattern is consistent across multiple scenarios (Figure 10)",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Fine-tuned open-source models likely overfit to HumanEval, with high HumanEval scores not translating to LiveCodeBench performance",
    213       "evidence": "Figure 5 scatter plot shows two distinct clusters; DS-Ins-1.3B achieves 59.8% on HumanEval+ but only 26.3% on LCB-Easy; CodeQwen and DS-Ins-6.7B outperform Claude-3-Sonnet on HumanEval+ but are >20 points behind on LCB-Easy",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Model performance is highly correlated across LiveCodeBench scenarios (r>0.88) but meaningful relative differences exist, particularly on reasoning-heavy tasks",
    218       "evidence": "Figure 13 shows correlations including r=0.98 for generation/repair and r=0.89 for generation/execution; Claude-3-Opus outperforms GPT-4-Turbo on test output prediction despite trailing in code generation",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "Closed-access models consistently outperform open-access models, with only three large instruction-tuned models (LLaMA-3-70B-Ins, Mixtral, DS-Ins-33B) approaching closed-model performance",
    223       "evidence": "Table 3 shows GPT-4-Turbo-2024-04-09 at 41.1% total vs. most open models below 25%; Figure 2 right confirms the gap narrows for only three named open models",
    224       "supported": "strong"
    225     },
    226     {
    227       "claim": "LiveCodeBench differentiates model capabilities better than HumanEval, especially between GPT-4 and other models",
    228       "evidence": "DS-Ins-33B is only 4.3 points behind GPT-4-Turbo on HumanEval+ but 16.2 points (69%) behind on LCB code generation; gap amplifies further on test output prediction (96%) and code execution (134%)",
    229       "supported": "strong"
    230     },
    231     {
    232       "claim": "GPT-4-Turbo generates code with significantly more inline comments than other models (19.5x more comment tokens than GPT-4)",
    233       "evidence": "Quantitative token count analysis is reported but the methodology for this measurement is not described in detail",
    234       "supported": "weak"
    235     }
    236   ],
    237   "methodology_tags": [
    238     "benchmark-eval"
    239   ],
    240   "key_findings": "LiveCodeBench introduces temporal problem release date tagging to enable contamination-free evaluation, finding clear evidence of training data contamination in specific models (DeepSeek, GPT-4-O, Codestral) through performance drops after their respective cutoff dates. The benchmark reveals that many fine-tuned open-source models likely overfit to HumanEval, with DS-Ins-1.3B scoring ~60% on HumanEval+ but only 26% on comparable LCB-Easy problems. Across 52 models, GPT-4-Turbo and Claude-3-Opus consistently lead, with only three large open models approaching closed-model performance; the gap between SoTA and others is far more visible in LCB than in HumanEval. Model rankings are highly correlated across four coding scenarios (r>0.88) but meaningful relative differences emerge, with Claude-3-Opus outperforming GPT-4 on reasoning-heavy tasks.",
    241   "red_flags": [
    242     {
    243       "flag": "No human baseline",
    244       "detail": "The benchmark evaluates 52 LLMs but provides no human performance baseline on any of the four tasks, making absolute capability assessment impossible."
    245     },
    246     {
    247       "flag": "Industrial sponsor conflict",
    248       "detail": "SKY Lab sponsors include Google and Microsoft, whose products (Gemini, GPT series) are directly evaluated and ranked. No competing interests statement is provided."
    249     },
    250     {
    251       "flag": "LLM-generated tests for LeetCode",
    252       "detail": "LeetCode hidden tests are not directly available, so GPT-4-Turbo is used to generate test inputs via generator programs. This creates a potential circularity when evaluating GPT-4-based models against GPT-4-generated tests."
    253     },
    254     {
    255       "flag": "HumanEval overfitting is observational",
    256       "detail": "The overfitting claim is based on cluster separation in a scatter plot. The alternative explanation—that fine-tuned open models use less diverse training data generally—is neither discussed nor ruled out in the paper."
    257     },
    258     {
    259       "flag": "Small effective evaluation set",
    260       "detail": "After filtering for DeepSeek contamination, only 349 problems are used, producing 1–1.5% performance variance; models with more recent cutoffs face even smaller evaluation sets."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    266       "relevance": "Primary benchmark being superseded; demonstrates the contamination problem that motivates LiveCodeBench's design"
    267     },
    268     {
    269       "title": "Is Your Code Generated by ChatGPT Really Correct? (HumanEval+)",
    270       "relevance": "Enhanced HumanEval used as the primary comparison benchmark in the overfitting analysis"
    271     },
    272     {
    273       "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution",
    274       "relevance": "Directly inspires the code execution scenario; LiveCodeBench extends it with live, harder, human-produced functions"
    275     },
    276     {
    277       "title": "Competition-Level Problems are Effective LLM Evaluators",
    278       "relevance": "Concurrent work using time-segmented CodeForces evaluation; LiveCodeBench extends to multiple platforms and additional scenarios"
    279     },
    280     {
    281       "title": "Quantifying Contamination in Evaluating Code Generation Capabilities",
    282       "relevance": "Alternative AST/edit-distance contamination detection approach, contrasted with LiveCodeBench's temporal method"
    283     },
    284     {
    285       "title": "Measuring Coding Challenge Competence With APPS",
    286       "relevance": "Competition programming benchmark used as prior work baseline and prompt format reference"
    287     },
    288     {
    289       "title": "Code Generation with AlphaCodium",
    290       "relevance": "Demonstrates the importance of multi-step coding pipelines (generation + repair + test), motivating LiveCodeBench's holistic multi-scenario design"
    291     },
    292     {
    293       "title": "Demystifying GPT Self-Repair for Code Generation",
    294       "relevance": "Prior work motivating and shaping the self-repair scenario evaluation setup"
    295     },
    296     {
    297       "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?",
    298       "relevance": "Complementary benchmark for real-world software engineering, contrasted with LiveCodeBench's competition programming scope"
    299     },
    300     {
    301       "title": "Rethinking Benchmark and Contamination for Language Models with Rephrased Samples",
    302       "relevance": "Shows that decontamination via fuzzy matching can be evaded by paraphrasing, motivating the temporal live-update approach"
    303     }
    304   ],
    305   "engagement_factors": {
    306     "practical_relevance": {
    307       "score": 3,
    308       "justification": "A continuously updated public benchmark with interactive UI for comparing 50+ models is immediately usable by anyone selecting or evaluating code LLMs."
    309     },
    310     "surprise_contrarian": {
    311       "score": 2,
    312       "justification": "Providing empirical evidence that specific popular models (DeepSeek, GPT-4-O) are contaminated and that HumanEval is gamed by fine-tuned models directly challenges the field's primary evaluation methodology."
    313     },
    314     "fear_safety": {
    315       "score": 0,
    316       "justification": "No AI safety concerns are raised; the paper focuses purely on benchmark methodology and model evaluation."
    317     },
    318     "drama_conflict": {
    319       "score": 2,
    320       "justification": "Naming specific commercial models (DeepSeek, GPT-4-O, Codestral) as likely contaminated carries reputational implications and implicitly challenges model providers' evaluation claims."
    321     },
    322     "demo_ability": {
    323       "score": 3,
    324       "justification": "The benchmark is publicly accessible at livecodebench.github.io with an interactive UI for scrolling through time windows and comparing models, fully demonstrable by anyone immediately."
    325     },
    326     "brand_recognition": {
    327       "score": 2,
    328       "justification": "UC Berkeley, MIT, Cornell authors with Ion Stoica and Armando Solar-Lezama as senior authors; the paper evaluates and names GPT-4, Claude-3, Gemini, and other high-profile models."
    329     }
    330   },
    331   "hn_data": {
    332     "threads": [
    333       {
    334         "hn_id": "40938701",
    335         "title": "Training a time series model using transformers at Datadog",
    336         "points": 27,
    337         "comments": 0,
    338         "url": "https://news.ycombinator.com/item?id=40938701",
    339         "created_at": "2024-07-11T17:19:07Z"
    340       },
    341       {
    342         "hn_id": "39703474",
    343         "title": "Show HN: WebAssembly Instrumentation in the Wizard Research Engine",
    344         "points": 7,
    345         "comments": 0,
    346         "url": "https://news.ycombinator.com/item?id=39703474",
    347         "created_at": "2024-03-14T13:08:14Z"
    348       },
    349       {
    350         "hn_id": "40018963",
    351         "title": "Ferret-v2: An Improved Baseline for Referring and Grounding with LLMs",
    352         "points": 5,
    353         "comments": 0,
    354         "url": "https://news.ycombinator.com/item?id=40018963",
    355         "created_at": "2024-04-13T00:04:37Z"
    356       },
    357       {
    358         "hn_id": "42330945",
    359         "title": "Algorithmic Bayesian Epistemology",
    360         "points": 3,
    361         "comments": 0,
    362         "url": "https://news.ycombinator.com/item?id=42330945",
    363         "created_at": "2024-12-05T18:27:12Z"
    364       },
    365       {
    366         "hn_id": "39788551",
    367         "title": "LiveCodeBench: Holistic, Contamination Free Evaluation of LLMs for Code",
    368         "points": 2,
    369         "comments": 0,
    370         "url": "https://news.ycombinator.com/item?id=39788551",
    371         "created_at": "2024-03-22T08:36:13Z"
    372       },
    373       {
    374         "hn_id": "39061405",
    375         "title": "Hijacking Attacks Against Neural Networks by Analyzing Training Data",
    376         "points": 1,
    377         "comments": 1,
    378         "url": "https://news.ycombinator.com/item?id=39061405",
    379         "created_at": "2024-01-19T21:25:51Z"
    380       },
    381       {
    382         "hn_id": "39451057",
    383         "title": "ScreenAgent: A Vision Language Model-Driven Computer Control Agent",
    384         "points": 1,
    385         "comments": 0,
    386         "url": "https://news.ycombinator.com/item?id=39451057",
    387         "created_at": "2024-02-21T07:31:11Z"
    388       },
    389       {
    390         "hn_id": "30697988",
    391         "title": "Born’s Rule in a Timeless Universe",
    392         "points": 1,
    393         "comments": 0,
    394         "url": "https://news.ycombinator.com/item?id=30697988",
    395         "created_at": "2022-03-16T12:38:46Z"
    396       }
    397     ],
    398     "top_points": 27,
    399     "total_points": 47,
    400     "total_comments": 1
    401   }
    402 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs