scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21604B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DSCodeBench: A Realistic Benchmark for Data Science Code Generation",
      6     "authors": [
      7       "Shuyin Ouyang",
      8       "Dong Huang",
      9       "Jingwen Guo",
     10       "Zeyu Sun",
     11       "Qihao Zhu",
     12       "Jie M. Zhang"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv / AAAI 2026",
     16     "arxiv_id": "2505.15621",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims are supported: 1,000 problems across 10 libraries (confirmed in statistics section), robust scaling behavior (Table 2 shows consistent within-family scaling), GPT-4o pass@1 of 0.392 (Table 2).",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper claims 'architectural and training optimizations play a critical role beyond model scaling alone' (comparing GPT-4o-mini vs GPT-3.5-turbo) but cannot isolate these factors. Scaling claims are based on observational comparisons within model families where training data, architecture, and scale all change simultaneously.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title 'A Realistic Benchmark for Data Science Code Generation' and abstract claim 'realistic data science code generation tasks' are broader than the tested scope of Python-only with 10 specific libraries. The Limitations section acknowledges Python-only scope, but the title and abstract do not bound the claims.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No alternative explanations are discussed for observed results. For example, why Matplotlib/Seaborn scores are low (training data distribution? API complexity?), or why scaling behavior is clear on DSCodeBench but not DS-1000, is not explored.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper measures functional correctness via test suites and explicitly acknowledges the proxy gap: 'DSCodeBench evaluates functional correctness based on unit tests, without explicitly assessing other important dimensions of code quality, such as computational efficiency, coding style, readability, or security.'",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "A dedicated 'Limitation' section is present in the Appendix with substantive discussion of three specific limitations (Python-only, simplified error handling, functional correctness only).",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats discussed: Python-only coverage, error-handling simplification ('error-raising code segments are either removed or replaced with default behaviors'), exclusion of multi-file codebases, and no efficiency/security/readability evaluation.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Explicit scope boundaries stated: 'DSCodeBench focuses exclusively on Python and ten popular data science libraries,' does not assess 'runtime performance, security, and adherence to coding best practices,' and 'primarily targets single-function or single-file tasks.'",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Funding disclosed: ITEA Genius and ITEA GreenCode projects (InnovateUK), UKRI CDT in Safe and Trusted AI (EP/S023356/1), and NSFC (62402482).",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations listed: King's College London, NUS, Chinese Academy of Sciences, Peking University. No author is affiliated with a company whose models are evaluated.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Funders are government and academic bodies (InnovateUK, UKRI, NSFC) with no financial interest in the benchmark results or evaluated models.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "'Realistic,' 'complicated,' and 'data science code generation' are central to the paper's claims but never formally defined; 'realistic' is operationalized implicitly through comparison metrics (line length, word count) without a stated definition.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper explicitly lists three contributions: the DSCodeBench benchmark, the automated construction pipeline (scope determination through manual editing), and a comprehensive empirical evaluation of 10 LLMs.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper provides a detailed Table 1 comparing DSCodeBench to 9 existing benchmarks on multiple dimensions and a Related Work section discussing how DSCodeBench extends and addresses specific limitations of DS-1000, DA-Code, and DataSciBench.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "benchmark-creation": {
    121       "construct_design": {
    122         "construct_validity_argued": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "The paper argues that scaling behavior 'validates its ability to distinguish model capabilities,' but this conflates construct validity with discriminative ability; no formal argument is made for why pass@k on these GitHub-sourced function-level tasks measures 'realistic data science programming capability' as a construct.",
    126           "source": "haiku"
    127         },
    128         "difficulty_distribution_characterized": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "Figure 2 shows per-library averages of problem word count and solution line count, but no difficulty tiers (easy/medium/hard) are defined or measured; difficulty is characterized only implicitly through model performance gaps.",
    132           "source": "haiku"
    133         },
    134         "ceiling_floor_effects_checked": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Ceiling and floor effects are not explicitly checked or discussed; some models approach floor on specific libraries (DeepSeek-1.3B scores 0.000 on SciPy) and some tasks show very low scores across all models, but no systematic analysis is conducted.",
    138           "source": "haiku"
    139         },
    140         "human_baseline_included": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No human baseline performance is reported; the paper uses LLM-as-a-judge for alignment verification but does not measure how human developers perform on the benchmark tasks.",
    144           "source": "haiku"
    145         },
    146         "scoring_rubric_justified": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Pass@k is standard and described; however, the 0.5 structural similarity threshold for visualization tasks (Matplotlib/Seaborn) is stated without justification for why this threshold is appropriate or how sensitive results are to this choice.",
    150           "source": "haiku"
    151         }
    152       },
    153       "robustness": {
    154         "contamination_resistance_designed": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "The paper explicitly designs contamination resistance via code reconstruction (AST transformation, parameter refactoring), systematic perturbations to function signatures and control flow, and reports text similarity <0.4 and AST similarity <0.5 between LLM-generated and ground truth code.",
    158           "source": "haiku"
    159         },
    160         "temporal_robustness_discussed": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "Future work mentions extending coverage but does not discuss how long the benchmark will remain valid before models are trained on it, nor does it provide a plan for versioning or dynamic generation to prevent gaming.",
    164           "source": "haiku"
    165         },
    166         "failure_modes_discussed": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "The limitations section identifies key failure modes: Python-only scope limits generalizability, error-handling simplification reduces fidelity, single-function focus misses project-level complexity, and functional-correctness-only evaluation misses efficiency and security.",
    170           "source": "haiku"
    171         },
    172         "baseline_implementations_provided": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "The paper states 'The benchmark, code, and experiment results are available at https://github.com/ShuyinOuyang/DSCodeBench,' providing the construction pipeline, evaluation framework, and all reported results for reproduction.",
    176           "source": "haiku"
    177         }
    178       },
    179       "documentation": {
    180         "dataset_documentation_complete": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The construction pipeline is thoroughly documented across five steps (scope determination, ground truth code, test case generation, problem description, manual editing) with filtering criteria, pipeline statistics, and per-library breakdowns.",
    184           "source": "haiku"
    185         },
    186         "licensing_and_access_clear": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "A GitHub URL is provided but no license or terms of use are mentioned anywhere in the paper; it is unclear whether the benchmark is freely usable, what restrictions apply to the GitHub-sourced code, or whether attribution is required.",
    190           "source": "haiku"
    191         },
    192         "intended_use_specified": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "The broader impact section explicitly states intended use (evaluating LLMs on data science code generation) and cautions against over-optimizing for benchmark performance and using it as the sole evaluation method.",
    196           "source": "haiku"
    197         }
    198       }
    199     }
    200   },
    201   "claims": [
    202     {
    203       "claim": "DSCodeBench is more challenging than DS-1000: GPT-4o achieves pass@1=0.392 on DSCodeBench vs 0.451 on DS-1000.",
    204       "evidence": "Table 2 shows consistent pass@1 decline across all 10 models when moving from DS-1000 to DSCodeBench.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "DSCodeBench exhibits robust scaling behavior (larger models consistently outperform smaller), which DS-1000 does not.",
    209       "evidence": "Table 2 shows monotonic scaling in DeepSeek and Qwen families on DSCodeBench; DS-1000 shows irregular scaling (e.g., DeepSeek-33B scores lower than V2-Lite on DS-1000).",
    210       "supported": "moderate"
    211     },
    212     {
    213       "claim": "GPT-4o achieves the best pass@1 of 0.392, indicating LLMs have significant room to improve on realistic data science code generation.",
    214       "evidence": "Table 2 reports GPT-4o pass@1=0.392, pass@3=0.438, with all other models substantially lower.",
    215       "supported": "strong"
    216     },
    217     {
    218       "claim": "Open-source models significantly underperform closed-source: best open-source (DeepSeek-33B) achieves pass@1=0.222 vs GPT-4o's 0.392.",
    219       "evidence": "Table 2 directly compares pass@1 scores across open and closed-source models.",
    220       "supported": "strong"
    221     },
    222     {
    223       "claim": "Visualization libraries (Matplotlib, Seaborn) are hardest for LLMs, with GPT-4o scoring only 0.210 and 0.141 respectively.",
    224       "evidence": "Table 3 shows Matplotlib and Seaborn consistently yield lowest pass@1 across all models.",
    225       "supported": "strong"
    226     },
    227     {
    228       "claim": "Automated test case scripts achieve 97.8% mean line coverage across ground truth code.",
    229       "evidence": "Figure 5 reports per-library coverage ranging from 95.5% (Pandas) to 99.7% (Matplotlib), overall mean 97.8%.",
    230       "supported": "strong"
    231     },
    232     {
    233       "claim": "Data leakage is minimal: text similarity <0.4 and AST similarity <0.5 between LLM-generated and ground truth code.",
    234       "evidence": "Figures 3 and 4 show similarity distributions across all models and libraries remain below these thresholds.",
    235       "supported": "moderate"
    236     }
    237   ],
    238   "methodology_tags": [
    239     "benchmark-eval",
    240     "benchmark-creation"
    241   ],
    242   "key_findings": "DSCodeBench is a harder, more discriminating benchmark than DS-1000, with the best model (GPT-4o) achieving only 0.392 pass@1 compared to 0.451 on DS-1000. The benchmark reliably shows scaling behavior (larger models outperform smaller ones), which DS-1000 fails to show; this difference is attributable to DSCodeBench's more numerous test cases, longer solutions, and structured problem descriptions. Visualization tasks remain the hardest domain for all models. The construction pipeline achieves 97.8% test case coverage with contamination mitigation reducing code similarity below 0.4 (text) and 0.5 (AST).",
    243   "red_flags": [
    244     {
    245       "flag": "No human baseline",
    246       "detail": "Despite claiming to measure 'realistic data science code generation capability,' no human developer baseline is reported, making it impossible to assess task difficulty relative to human performance or to judge whether the benchmark targets the right problem."
    247     },
    248     {
    249       "flag": "Visualization threshold unjustified",
    250       "detail": "The 0.5 structural similarity threshold for Matplotlib/Seaborn tasks is stated without justification; this threshold may be systematically biased against correct but stylistically different plots."
    251     },
    252     {
    253       "flag": "LLM-generated problem descriptions",
    254       "detail": "Problem descriptions are generated by LLMs from ground truth code, creating potential circular bias: the descriptions may be better aligned with LLM capabilities than with human natural language. In addition, the 97.4% alignment agreement relies on LLM-as-a-judge (GPT-4o-mini/GPT-4o), which may have style biases."
    255     },
    256     {
    257       "flag": "Scaling as construct validity evidence is weak",
    258       "detail": "The paper claims scaling behavior 'validates' the benchmark's ability to distinguish model capabilities, but larger models are generally trained on more code data including GitHub—scaling on GitHub-sourced tasks does not prove the benchmark measures 'realistic data science programming' rather than familiarity with GitHub code patterns."
    259     },
    260     {
    261       "flag": "No license disclosed",
    262       "detail": "The benchmark is built from GitHub code but no license or terms of use are stated, creating legal uncertainty about benchmark reuse and redistribution, particularly given the GitHub code provenance."
    263     }
    264   ],
    265   "cited_papers": [
    266     {
    267       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    268       "relevance": "Primary baseline; DSCodeBench explicitly addresses DS-1000's limitations (short solutions, few test cases, ambiguous descriptions) and compares against it throughout."
    269     },
    270     {
    271       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    272       "relevance": "Foundational code generation benchmark used as comparison baseline in Table 1 and as methodological precedent for pass@k metric."
    273     },
    274     {
    275       "title": "Program Synthesis with Large Language Models (MBPP)",
    276       "relevance": "Comparison benchmark in Table 1; baseline for beginner Python programming task evaluation."
    277     },
    278     {
    279       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls",
    280       "relevance": "Recent comparison benchmark with complex instructions; compared in Table 1 as more advanced general benchmark."
    281     },
    282     {
    283       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    284       "relevance": "Comparison benchmark addressing contamination; DSCodeBench's contamination mitigation approach is contextualized against it."
    285     },
    286     {
    287       "title": "DA-Code: Agent Data Science Code Generation Benchmark for Large Language Models",
    288       "relevance": "Data science–specific comparison benchmark; compared in Table 1 for task diversity vs. evaluation depth tradeoff."
    289     },
    290     {
    291       "title": "DataSciBench: An LLM Agent Benchmark for Data Science",
    292       "relevance": "Recent data science benchmark comparison; shown to have insufficient test cases (2.3 per task) motivating DSCodeBench's 200-test design."
    293     },
    294     {
    295       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    296       "relevance": "Cited as example of realistic software engineering evaluation; DSCodeBench positions itself in same tradition of GitHub-grounded benchmarks."
    297     },
    298     {
    299       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    300       "relevance": "One of the evaluated open-source model families; used to demonstrate scaling behavior on DSCodeBench."
    301     },
    302     {
    303       "title": "An Empirical Study of the Non-Determinism of ChatGPT in Code Generation",
    304       "relevance": "Authored by first author (Ouyang et al. 2025a); motivates the randomness control methodology (temperature=0.2, 3 runs)."
    305     }
    306   ],
    307   "engagement_factors": {
    308     "practical_relevance": {
    309       "score": 2,
    310       "justification": "The benchmark is publicly available on GitHub and directly usable by ML practitioners evaluating LLM code generation, with a customizable test case framework."
    311     },
    312     "surprise_contrarian": {
    313       "score": 1,
    314       "justification": "The finding that GPT-4o achieves only 0.392 on 'realistic' data science tasks (vs 0.451 on DS-1000) mildly challenges assumptions about LLM coding capability."
    315     },
    316     "fear_safety": {
    317       "score": 0,
    318       "justification": "No safety or AI risk concerns raised; the broader impact section discusses skill atrophy risk but this is a minor point."
    319     },
    320     "drama_conflict": {
    321       "score": 0,
    322       "justification": "No controversy or conflict; the paper is a straightforward benchmark construction paper with no contentious claims."
    323     },
    324     "demo_ability": {
    325       "score": 2,
    326       "justification": "Benchmark and code are publicly available on GitHub; anyone can run their model against DSCodeBench immediately."
    327     },
    328     "brand_recognition": {
    329       "score": 1,
    330       "justification": "King's College London and National University of Singapore are reputable but not flagship AI labs; no OpenAI/Google/DeepMind brand association."
    331     }
    332   },
    333   "hn_data": {
    334     "threads": [
    335       {
    336         "hn_id": "36184838",
    337         "title": "Reverse Engineering Self-Supervised Learning",
    338         "points": 86,
    339         "comments": 16,
    340         "url": "https://news.ycombinator.com/item?id=36184838",
    341         "created_at": "2023-06-04T11:43:46Z"
    342       },
    343       {
    344         "hn_id": "43870679",
    345         "title": "Show HN: I built an AI tool to practice technical interviews with",
    346         "points": 12,
    347         "comments": 1,
    348         "url": "https://news.ycombinator.com/item?id=43870679",
    349         "created_at": "2025-05-02T14:57:13Z"
    350       },
    351       {
    352         "hn_id": "45300655",
    353         "title": "Generalizable Geometric Image Caption Synthesis",
    354         "points": 3,
    355         "comments": 0,
    356         "url": "https://news.ycombinator.com/item?id=45300655",
    357         "created_at": "2025-09-19T12:05:01Z"
    358       },
    359       {
    360         "hn_id": "43405094",
    361         "title": "Politicians' misinformation behavior and public engagement, in 4 countries",
    362         "points": 3,
    363         "comments": 0,
    364         "url": "https://news.ycombinator.com/item?id=43405094",
    365         "created_at": "2025-03-18T21:03:45Z"
    366       },
    367       {
    368         "hn_id": "44324675",
    369         "title": "ProtoReasoning: Prototypes as the Foundation for Generalizable Reasoning in LLMs",
    370         "points": 2,
    371         "comments": 0,
    372         "url": "https://news.ycombinator.com/item?id=44324675",
    373         "created_at": "2025-06-20T04:10:28Z"
    374       },
    375       {
    376         "hn_id": "43776339",
    377         "title": "The Bitter Lesson Learned from 2k Multilingual Benchmarks",
    378         "points": 2,
    379         "comments": 0,
    380         "url": "https://news.ycombinator.com/item?id=43776339",
    381         "created_at": "2025-04-23T20:31:54Z"
    382       },
    383       {
    384         "hn_id": "40488690",
    385         "title": "Neuromorphic dreaming: A pathway to efficient learning in artificial agents",
    386         "points": 2,
    387         "comments": 0,
    388         "url": "https://news.ycombinator.com/item?id=40488690",
    389         "created_at": "2024-05-27T08:03:31Z"
    390       }
    391     ],
    392     "top_points": 86,
    393     "total_points": 110,
    394     "total_comments": 17
    395   }
    396 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs