scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22753B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "LeetCodeDataset: A Temporal Dataset for Robust Evaluation and Efficient Training of Code LLMs",
      6     "authors": [
      7       "Yunhui Xia",
      8       "Wei Shen",
      9       "Yan Wang",
     10       "Jason Klein Liu",
     11       "Huifeng Sun",
     12       "Siyue Wu",
     13       "Jian Hu",
     14       "Xiaolong Xu"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2504.14655",
     19     "doi": "10.48550/arXiv.2504.14655"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "All abstract claims are substantiated: high-quality benchmark (validated through 100+ test cases and metadata), temporal splits enabling contamination-free eval (demonstrated in Figure 3), and SFT efficiency (Table 4 shows 2.6K-sample model matches 110K-sample baselines on HumanEval/MBPP).",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper makes a causal claim: model-generated training data improves SFT performance vs. human-written (79.9% vs. 55.5% on HumanEval). Study design holds all hyperparameters constant and varies only data source, which is adequate for this comparison.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Scope is appropriately bounded to competitive-programming code generation (LeetCode problems). Paper acknowledges limitations on hard benchmarks (Section 4.2: 'small-scale SFT primarily develops basic programming skills'). No overgeneralization to all code or software tasks.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Paper reports that reasoning models outperform non-reasoning (DeepSeek-R1 65.23% vs. Claude 50.78%) but does not discuss whether advantage stems from architecture, training procedure, reasoning time, or data. Model-generated vs. human-written superiority (Table 4) lacks mechanism analysis.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Paper claims to assess 'reasoning abilities' (Abstract, Section 1) but measures pass@1 rate on test cases. Passing test cases is not the same as demonstrating reasoning—a model could brute-force solutions. The distinction between measurement (pass@1) and claim (reasoning ability) is not acknowledged.",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 6 'Limitations' is a dedicated subsection discussing false positive risks, complexity analysis gaps, and coverage gaps (multiple entry points).",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Specific threats are listed: 'our dataset lacks extremely complex input patterns and suffers from an imbalanced test case distribution' (false positive risk); 'Determining time/space complexity...exceeds our current scope' (specific capability gap); 'We haven't included...problems with multiple solution entry points' (concrete scope limitation).",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Explicit scope boundaries: Python-only (Section 2.1: '3,115 supported Python submissions'), single-function entry points (Section 2.1: 'our implementation focuses exclusively on single-function starter code scenarios'), temporal split (pre/post July 2024).",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No funding acknowledgment or statement appears in the paper. No acknowledgments section provided.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Author names and email addresses are listed, but no institutional affiliations are provided. No disclosure of potential conflicts with LeetCode platform.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No funding source disclosed, so independence cannot be assessed.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests statement or financial disclosures (patents, equity, consulting) are declared.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "Critical terms lack formal definition: 'reasoning models' vs 'non-reasoning models' (used by example, not defined—what makes DeepSeek-R1 'reasoning'?), 'contamination' (used contextually without definition), 'pass@1' (used throughout without introduction).",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Intended contributions are explicit: (1) LeetCodeDataset—a new benchmark with 2,869 problems, 100+ test cases per problem; (2) contamination-free evaluation via temporal splits; (3) efficient SFT through curated training data.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 5 engages with code-generation benchmarks (HumanEval, MBPP, LiveCodeBench, APPS, CodeContests) and fine-tuning datasets (Magicoder, CodeAlpaca, Open-R1). Paper positions LeetCodeDataset as improving on LiveCodeBench (broader coverage, richer metadata) and providing training splits.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "benchmark-creation": {
    123       "construct_design": {
    124         "construct_validity_argued": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "Paper claims benchmark measures 'reasoning abilities' and 'coding capability' but provides no explicit argument for construct validity. No analysis of whether pass@1 rate on test cases actually reflects reasoning or coding skill (vs. memorization, pattern matching, etc.).",
    128           "source": "haiku"
    129         },
    130         "difficulty_distribution_characterized": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "Table 1 reports difficulty distribution: Easy 23.91%, Medium 52.21%, Hard 23.88%. Table 2 confirms discrimination across difficulty tiers—pass rates decline from Easy (94.44% max) to Hard (41.86% max), validating difficulty stratification.",
    134           "source": "haiku"
    135         },
    136         "ceiling_floor_effects_checked": {
    137           "applies": true,
    138           "answer": true,
    139           "justification": "Table 2 reveals ceiling effect on Easy (DeepSeek-R1 94.44%, leaving 5.6pp margin). Hard problems avoid floor effects (even best model 41.86%). Data demonstrates discrimination exists, though not explicitly discussed as ceiling/floor analysis.",
    140           "source": "haiku"
    141         },
    142         "human_baseline_included": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "Paper provides no human baseline. Cannot validate whether Easy is actually easy for humans or whether Hard poses meaningful human-level difficulty. Only machine performance is reported.",
    146           "source": "haiku"
    147         },
    148         "scoring_rubric_justified": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Paper uses pass@1 (binary: code passes all test cases or not) without justification. No explanation for why pass@1 over pass@k, partial credit, test coverage, or other metrics.",
    152           "source": "haiku"
    153         }
    154       },
    155       "robustness": {
    156         "contamination_resistance_designed": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Contamination resistance is explicitly designed: temporal split (pre/post July 2024) prevents training data leakage. Figure 3 analyzes monthly accuracy by problem release month to detect post-release performance decline (signature of contamination). Approach mirrors LiveCodeBench's strategy.",
    160           "source": "haiku"
    161         },
    162         "temporal_robustness_discussed": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "No discussion of temporal robustness or planned updates. Unlike LiveCodeBench ('live updates'), this benchmark provides static splits. No mention of how the dataset will remain challenging as models improve or when new problems should be added.",
    166           "source": "haiku"
    167         },
    168         "failure_modes_discussed": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "Section 6 lists limitations (false positives, complexity gap, coverage) but does not discuss failure modes of the benchmark itself (e.g., Can solutions be gamed? Are test cases brittle to prompt engineering? Does the benchmark systematically underweight certain coding skills?). Limitations are about coverage, not robustness.",
    172           "source": "haiku"
    173         },
    174         "baseline_implementations_provided": {
    175           "applies": true,
    176           "answer": true,
    177           "justification": "Paper states 'The dataset and evaluation framework are available on Hugging Face and Github.' Mention of 'evaluation framework' indicates baseline evaluation code is provided, enabling reproduction of reported results.",
    178           "source": "haiku"
    179         }
    180       },
    181       "documentation": {
    182         "dataset_documentation_complete": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Data collection methodology is documented: Section 2.1 describes metadata acquisition (slug, problem_id, difficulty, description, tags), canonical solution verification, test case generation, and preprocessing. Source (LeetCode GraphQL API) and collection process are detailed.",
    186           "source": "haiku"
    187         },
    188         "licensing_and_access_clear": {
    189           "applies": true,
    190           "answer": false,
    191           "justification": "Paper states dataset is 'available on Hugging Face and Github' but does not specify licensing terms, usage restrictions, or whether derivatives are permitted. No explicit license (MIT, Apache, CC, etc.) is mentioned.",
    192           "source": "haiku"
    193         },
    194         "intended_use_specified": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Intended uses are explicit: (1) 'benchmarking' code models to evaluate reasoning and coding capability; (2) supervised fine-tuning via training split; (3) support for RL training via test cases as verifiers. Context is competitive-programming code generation.",
    198           "source": "haiku"
    199         }
    200       }
    201     }
    202   },
    203   "claims": [
    204     {
    205       "claim": "Reasoning models significantly outperform non-reasoning counterparts on code generation.",
    206       "evidence": "Table 2: DeepSeek-R1 (reasoning, pass@1=65.23%) and QwQ-Plus (reasoning, 56.25%) exceed non-reasoning models (GPT-4o 35.55%, DeepSeek-V3 35.55%, Claude 50.78%). Table 3 shows consistency across topic tags.",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "SFT with 2.6K curated model-generated samples achieves performance comparable to 110K-sample datasets.",
    211       "evidence": "Table 4: LeetCodeDataset 2.6K samples achieves 79.9% HumanEval, 77.5% MBPP, exceeding Magicoder 111.1K (77.4% HumanEval, 74.1% MBPP).",
    212       "supported": "moderate"
    213     },
    214     {
    215       "claim": "Model-generated training data outperforms human-written responses for code generation SFT.",
    216       "evidence": "Table 4: LeetCodeDataset with model-generated responses: 79.9% HumanEval, 77.5% MBPP vs. human responses: 55.5% HumanEval, 53.4% MBPP (same sample size, 2.6K).",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Temporal problem splits prevent contamination and enable authentic capability measurement.",
    221       "evidence": "Figure 3 shows minimal monthly accuracy decline post-release for GPT-4o-0806 (overlap Aug 2024 vs test set post-July 2024), interpreted as evidence of clean evaluation. Applies LiveCodeBench methodology.",
    222       "supported": "moderate"
    223     },
    224     {
    225       "claim": "100+ test cases per problem minimize false positives in solution verification.",
    226       "evidence": "Section 2.1 describes multi-stage test case generation (LLM one-shot, LLM complex inputs, multiple samples). Section 6 acknowledges residual false-positive risk despite effort.",
    227       "supported": "moderate"
    228     },
    229     {
    230       "claim": "Difficulty tiers (Easy/Medium/Hard) effectively discriminate model capability.",
    231       "evidence": "Table 2 shows clear stratification: Easy 81–94% pass rates, Medium 26–69%, Hard 10–42% across models. Performance correlates with difficulty classification.",
    232       "supported": "strong"
    233     }
    234   ],
    235   "methodology_tags": [
    236     "benchmark-creation",
    237     "benchmark-eval",
    238     "observational"
    239   ],
    240   "key_findings": "LeetCodeDataset curates 2,869 Python LeetCode problems (90% of platform) with 100+ verified test cases per problem and temporal splits (pre/post July 2024) to enable contamination-free evaluation. Reasoning models (DeepSeek-R1: 65.23%, QwQ-Plus: 56.25%) substantially outperform non-reasoning models (Claude: 50.78%, GPT-4o: 35.55%) on the 256-problem test set, indicating an advantage for reasoning models in competition-level code generation as measured by pass@1. Supervised fine-tuning with 2.6K model-generated samples achieves 79.9% on HumanEval and 77.5% on MBPP—matching or exceeding models trained on much larger datasets (75–111K samples)—suggesting strong data efficiency from high-quality curation.",
    241   "red_flags": [
    242     {
    243       "flag": "No human baseline",
    244       "detail": "Cannot validate whether Easy, Medium, Hard tiers match human skill levels. Benchmark is calibrated only to machine performance."
    245     },
    246     {
    247       "flag": "Scoring rubric not justified",
    248       "detail": "Pass@1 (binary correctness) chosen without justification. No comparison to pass@k, partial credit, test coverage, or complexity-aware metrics."
    249     },
    250     {
    251       "flag": "Proxy outcome mismatch",
    252       "detail": "Paper claims to assess 'reasoning abilities' but measures test-case pass rate. Passing tests does not necessarily demonstrate reasoning; solutions could succeed through pattern matching."
    253     },
    254     {
    255       "flag": "Ceiling effects on Easy problems",
    256       "detail": "DeepSeek-R1 achieves 94.44% on Easy tier, leaving only 5.6pp discrimination margin. May not adequately differentiate high-performing models."
    257     },
    258     {
    259       "flag": "No conflict of interest disclosure",
    260       "detail": "No funding statement or author affiliation disclosure. Unclear if any authors have ties to LeetCode platform."
    261     },
    262     {
    263       "flag": "Key terms undefined",
    264       "detail": "'Reasoning models' vs 'non-reasoning' not formally defined. 'Contamination' and 'pass@1' used throughout without introduction."
    265     },
    266     {
    267       "flag": "No plan for temporal updates",
    268       "detail": "Unlike LiveCodeBench (live updates), this is a static dataset. No mechanism described for keeping benchmark fresh as models improve."
    269     },
    270     {
    271       "flag": "Limited hard benchmark performance",
    272       "detail": "Section 4.2 acknowledges that the SFT model underperforms on hard benchmarks (12.5% on LiveCodeBench). This raises questions about the practical utility of the training split for harder tasks."
    273     },
    274     {
    275       "flag": "No failure mode analysis",
    276       "detail": "Paper lists coverage gaps but does not discuss what the benchmark systematically fails to measure or how solutions might be gamed."
    277     },
    278     {
    279       "flag": "License terms not specified",
    280       "detail": "Dataset availability on HuggingFace/GitHub is mentioned, but no explicit license (MIT, Apache, CC) is stated."
    281     }
    282   ],
    283   "cited_papers": [
    284     {
    285       "title": "LiveCodeBench: Holistic and contamination-free evaluation of large language models for code",
    286       "relevance": "Directly related prior work on temporal contamination avoidance in code benchmarking; paper cites LiveCodeBench's approach as motivation for temporal splits."
    287     },
    288     {
    289       "title": "Evaluating Large Language Models Trained on Code",
    290       "relevance": "Introduces HumanEval, a foundational code-generation benchmark; the SFT model trained on LeetCodeDataset is evaluated on HumanEval (79.9%) for calibration."
    291     },
    292     {
    293       "title": "Program Synthesis with Large Language Models",
    294       "relevance": "Introduces MBPP, a standard Python code benchmark; used to validate SFT training efficiency (77.5% with 2.6K samples)."
    295     },
    296     {
    297       "title": "Measuring Coding Challenge Competence With APPS",
    298       "relevance": "Competitive programming dataset; compared as existing baseline for code-generation SFT datasets."
    299     },
    300     {
    301       "title": "Competition-Level Code Generation with AlphaCode",
    302       "relevance": "Introduces CodeContests, a related competitive programming benchmark; positioned as precursor to LeetCodeDataset improvements."
    303     },
    304     {
    305       "title": "Magicoder: Empowering code generation with OSS-instruct",
    306       "relevance": "SFT baseline dataset (75K samples, 73.8% HumanEval); the model trained on LeetCodeDataset's 2.6K samples scores higher (79.9% HumanEval), showing greater data efficiency."
    307     },
    308     {
    309       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
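    310       "relevance": "Evaluated as the top-performing reasoning model (65.23% pass@1); illustrates the advantage of reasoning models in code generation, though the paper does not isolate whether it stems from architecture, training procedure, or reasoning time."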
    311     },
    312     {
    313       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    314       "relevance": "Mentioned as an alternative training method (alongside SFT, RL) that LeetCodeDataset could support."
    315     }
    316   ],
    317   "engagement_factors": {
    318     "practical_relevance": {
    319       "score": 3,
    320       "justification": "Practitioners can immediately use LeetCodeDataset for model training (SFT) and evaluation. Public release on HuggingFace/GitHub enables direct adoption."
    321     },
    322     "surprise_contrarian": {
    323       "score": 1,
    324       "justification": "Finding that reasoning models outperform non-reasoning models aligns with expected model capabilities. Data efficiency result (2.6K vs 110K samples) is moderately surprising but presented without mechanistic insight."
    325     },
    326     "fear_safety": {
    327       "score": 0,
    328       "justification": "Paper does not raise AI safety, alignment, or risk concerns. Focus is purely on code-generation capability and data efficiency."
    329     },
    330     "drama_conflict": {
    331       "score": 0,
    332       "justification": "No controversial findings, conflicting methodologies, or debate angles. Paper presents a benchmark contribution straightforwardly."
    333     },
    334     "demo_ability": {
    335       "score": 3,
    336       "justification": "LeetCodeDataset is publicly available on HuggingFace and GitHub with evaluation framework. Users can immediately download and run models on the benchmark."
    337     },
    338     "brand_recognition": {
    339       "score": 2,
    340       "justification": "Evaluated models come from well-known AI labs: OpenAI (GPT-4o), Anthropic (Claude), and DeepSeek, which raises credibility. The paper lists no institutional affiliations for the authors, so their own brand recognition cannot be assessed."
    341     }
    342   },
    343   "hn_data": {
    344     "threads": [
    345       {
    346         "hn_id": "44436031",
    347         "title": "Show HN: Arch-Router – 1.5B model for LLM routing by preferences, not benchmarks",
    348         "points": 66,
    349         "comments": 15,
    350         "url": "https://news.ycombinator.com/item?id=44436031",
    351         "created_at": "2025-07-01T17:13:11Z"
    352       },
    353       {
    354         "hn_id": "27075013",
    355         "title": "MarioNette: Self-Supervised Sprite Learning",
    356         "points": 47,
    357         "comments": 1,
    358         "url": "https://news.ycombinator.com/item?id=27075013",
    359         "created_at": "2021-05-07T12:09:34Z"
    360       },
    361       {
    362         "hn_id": "44771836",
    363         "title": "Arch-Router: Aligning LLM Routing with Human Preferences",
    364         "points": 9,
    365         "comments": 2,
    366         "url": "https://news.ycombinator.com/item?id=44771836",
    367         "created_at": "2025-08-02T21:42:16Z"
    368       },
    369       {
    370         "hn_id": "44597819",
    371         "title": "Show HN: 1.5B LLM routing model that aligns to preferences, not leaderboards",
    372         "points": 4,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=44597819",
    375         "created_at": "2025-07-17T20:29:12Z"
    376       },
    377       {
    378         "hn_id": "45324991",
    379         "title": "Show HN: Model-literals, model-aliases, and preference-aligned routing for LLMs",
    380         "points": 2,
    381         "comments": 0,
    382         "url": "https://news.ycombinator.com/item?id=45324991",
    383         "created_at": "2025-09-21T17:45:54Z"
    384       },
    385       {
    386         "hn_id": "44650696",
    387         "title": "Show HN: RouteGPT – model routing on ChatGPT aligned to user preferences",
    388         "points": 2,
    389         "comments": 0,
    390         "url": "https://news.ycombinator.com/item?id=44650696",
    391         "created_at": "2025-07-22T17:52:01Z"
    392       },
    393       {
    394         "hn_id": "43191958",
    395         "title": "Volume estimates for unions of convex sets, and the Kakeya set conjecture in d=3",
    396         "points": 2,
    397         "comments": 1,
    398         "url": "https://news.ycombinator.com/item?id=43191958",
    399         "created_at": "2025-02-27T06:47:24Z"
    400       },
    401       {
    402         "hn_id": "45340869",
    403         "title": "Wan-Animate: Unified Character Animation, Replacement with Holistic Replication",
    404         "points": 2,
    405         "comments": 0,
    406         "url": "https://news.ycombinator.com/item?id=45340869",
    407         "created_at": "2025-09-22T23:23:18Z"
    408       },
    409       {
    410         "hn_id": "43206059",
    411         "title": "A Solution of the Kakeya Conjecture",
    412         "points": 2,
    413         "comments": 0,
    414         "url": "https://news.ycombinator.com/item?id=43206059",
    415         "created_at": "2025-02-28T14:34:55Z"
    416       },
    417       {
    418         "hn_id": "44774539",
    419         "title": "Show HN: Arch-Router – Aligning LLM Routing with Human Preferences",
    420         "points": 1,
    421         "comments": 0,
    422         "url": "https://news.ycombinator.com/item?id=44774539",
    423         "created_at": "2025-08-03T06:32:04Z"
    424       }
    425     ],
    426     "top_points": 66,
    427     "total_points": 137,
    428     "total_comments": 19
    429   }
    430 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs