scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21421B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DSCodeBench: A Realistic Benchmark for Data Science Code Generation",
      6     "authors": [
      7       "Shuyin Ouyang",
      8       "Dong Huang",
      9       "Jingwen Guo",
     10       "Zeyu Sun",
     11       "Qihao Zhu",
     12       "Jie M. Zhang"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2505.15621",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All key abstract claims (1,000 problems, 10 libraries, GPT-4o pass@1 0.392, scaling behavior, comparison metrics vs DS-1000) are directly supported by Table 1, Table 2, and Figure 2.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper claims scaling behavior 'validates' the benchmark's discrimination ability, but this is a circular post-hoc argument; it does not rule out that task selection or perturbation methodology specifically favors larger models rather than reflecting inherent capability differences.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper claims DSCodeBench is 'a rigorous and trustworthy foundation for advancing LLM-based data science programming' broadly, but only 10 models and 1,000 Python-only single-function tasks were tested; the framing does not adequately reflect this scope.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not discuss alternative explanations for its main finding—that DSCodeBench shows scaling behavior while DS-1000 does not—such as whether perturbation design or task filtering biases the difficulty distribution toward features favored by larger models.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The limitations section explicitly acknowledges that pass@k measures functional correctness only and not efficiency, security, or readability, with the evaluation scope clearly bounded accordingly.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "A dedicated 'Limitation' section appears in the Appendix covering Python-only scope, single-function restriction, simplified error-handling, and functional-correctness-only evaluation.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats are named: error-handling simplification (replacing raises with None/defaults), exclusion of multi-file and project-level tasks, SSIM-only visualization scoring, and restriction to Python.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly states what DSCodeBench does NOT cover: other languages (R), multi-file workflows, runtime efficiency, security, code style, and readability—both in limitations and future work sections.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Funding is disclosed in the Acknowledgement section: ITEA Genius and GreenCode (InnovateUK), UKRI CDT (EP/S023356/1), and NSFC (62402482).",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are listed in the header: King's College London, NUS, CAS, and Peking University—none are affiliated with the commercial LLM vendors evaluated.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Funders (InnovateUK, UKRI, NSFC) are government/public research agencies unrelated to the commercial vendors (OpenAI, DeepSeek, Alibaba/Qwen) whose models are evaluated.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing-interests statement or financial-interests declaration appears anywhere in the paper; the only related disclosure is the funding acknowledgement.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "The key term 'realistic' is used as the central descriptor throughout the paper but is never formally defined; it is only operationalized implicitly via proxy metrics (code length, test count, GitHub source) without explicit definition.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Three contributions are explicitly enumerated in bullet form: the DSCodeBench dataset, the construction pipeline, and the empirical evaluation of 10 state-of-the-art LLMs.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Table 1 provides direct metric-by-metric comparison against 9 prior benchmarks, and the introduction explains specific limitations of each predecessor that motivate DSCodeBench's design choices.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "benchmark-creation": {
    121       "construct_design": {
    122         "construct_validity_argued": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "The paper asserts GitHub-sourced code is more realistic than Stack Overflow snippets by definition, but provides no independent validation (no domain expert surveys, no analysis of what tasks data scientists actually perform) to confirm the construct maps to real-world data science work.",
    126           "source": "haiku"
    127         },
    128         "difficulty_distribution_characterized": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "Per-library model performance variation is shown (Table 3, pass@1 range 0.029–0.591) but no explicit difficulty tiers are defined or characterized; difficulty is inferred post-hoc from model pass rates rather than measured a priori.",
    132           "source": "haiku"
    133         },
    134         "ceiling_floor_effects_checked": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "The best model (GPT-4o) achieves 0.392 pass@1 and the weakest (DeepSeek-1.3B) achieves 0.076, confirming no ceiling or severe floor effects; the benchmark clearly discriminates across the evaluated model range.",
    138           "source": "haiku"
    139         },
    140         "human_baseline_included": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No human performance baseline is provided; the alignment analysis (97.4% agreement) tests whether a human can understand the problem description, not whether humans can solve the coding tasks.",
    144           "source": "haiku"
    145         },
    146         "scoring_rubric_justified": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Pass@k is adopted from Chen et al. 2021 without discussing alternatives; the SSIM > 0.5 threshold for visualization tasks is stated but not justified or ablated, and no edge-case scoring analysis is provided.",
    150           "source": "haiku"
    151         }
    152       },
    153       "robustness": {
    154         "contamination_resistance_designed": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "The paper applies systematic code perturbations (signature changes, line insertions/removals, control flow restructuring) and validates decontamination via text similarity < 0.4 and AST similarity < 0.5 across all models and libraries (Figures 3–4).",
    158           "source": "haiku"
    159         },
    160         "temporal_robustness_discussed": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "No discussion of how long the current 1,000 problems will remain uncontaminated or ungamed; future work mentions expansions but does not address plans for benchmark updates or versioning as models improve.",
    164           "source": "haiku"
    165         },
    166         "failure_modes_discussed": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "Limitations discuss what the benchmark does not measure but do not analyze failure modes of the benchmark itself—e.g., whether high test-case coverage metrics can be gamed, or whether perturbation-based decontamination introduces systematic biases in task difficulty.",
    170           "source": "haiku"
    171         },
    172         "baseline_implementations_provided": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "The paper states 'The benchmark, code, and experiment results are available at https://github.com/ShuyinOuyang/DSCodeBench', providing all components needed to reproduce reported numbers.",
    176           "source": "haiku"
    177         }
    178       },
    179       "documentation": {
    180         "dataset_documentation_complete": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The paper provides detailed construction documentation covering all 5 pipeline stages, filtering criteria, per-library statistics (Table 1, Figure 2), alignment procedures, and test case generation methodology including prompts.",
    184           "source": "haiku"
    185         },
    186         "licensing_and_access_clear": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "A GitHub URL is provided but no license is stated anywhere in the paper; it is unclear under what terms the benchmark can be used, modified, or redistributed by others.",
    190           "source": "haiku"
    191         },
    192         "intended_use_specified": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "The 'Broader Impact' section explicitly specifies intended uses (research, education, industry) and cautions that the benchmark should be used alongside diverse evaluation methods to avoid over-optimization.",
    196           "source": "haiku"
    197         }
    198       }
    199     }
    200   },
    201   "claims": [
    202     {
    203       "claim": "DSCodeBench is more challenging than DS-1000, as GPT-4o achieves pass@1 of 0.392 on DSCodeBench vs 0.451 on DS-1000.",
    204       "evidence": "Table 2 shows direct head-to-head comparison on all 10 evaluated models; DSCodeBench consistently yields lower pass@1 and pass@3 than DS-1000 across all models.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "DSCodeBench exhibits robust scaling behavior where larger models systematically outperform smaller ones, while DS-1000 shows irregular scaling.",
    209       "evidence": "Table 2 shows consistent within-family ordering for DeepSeek (1.3B < 6.7B < 33B) and Qwen (7B < 14B < 32B) on DSCodeBench; DS-1000 shows reversals (e.g., DeepSeek-33B underperforms V2-Lite).",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "Automatically generated test cases achieve 97.8% mean line coverage across all 1,000 benchmark problems.",
    214       "evidence": "Figure 5 reports per-library coverage from 95.5% (Pandas) to 99.7% (Matplotlib) with an overall mean of 97.8%.",
    215       "supported": "strong"
    216     },
    217     {
    218       "claim": "Contamination is effectively mitigated: LLM-generated code has text similarity < 0.4 and AST similarity < 0.5 to ground truth.",
    219       "evidence": "Figures 3 and 4 show similarity distributions across 10 models and 10 libraries all below stated thresholds, using text and AST similarity metrics.",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "DSCodeBench provides more reliable evaluation than DS-1000, evidenced by lower variance in model performance scores.",
    224       "evidence": "Table 2 standard deviations are reported (e.g., GPT-4o correct: 391.7±4.6 on DSCodeBench vs 450.7±2.9 on DS-1000); the comparison is inconsistent and not uniformly lower on DSCodeBench.",
    225       "supported": "weak"
    226     },
    227     {
    228       "claim": "Problem descriptions achieve 97.4% alignment between the description and ground truth code as judged by human experts and LLM judges.",
    229       "evidence": "Alignment section describes dual-stage validation using two author-experts and GPT-4o-mini/GPT-4o as judges; however, the LLM judges are among the models being benchmarked, introducing potential bias.",
    230       "supported": "weak"
    231     }
    232   ],
    233   "methodology_tags": [
    234     "benchmark-eval"
    235   ],
    236   "key_findings": "DSCodeBench introduces 1,000 realistic data science coding problems sourced from GitHub across 10 Python libraries, with average solutions of 22.5 lines and 200 test cases per problem—far exceeding prior benchmarks like DS-1000 (3.6 lines, 2.1 tests). The best evaluated model (GPT-4o) achieves only 0.392 pass@1, confirming the benchmark's difficulty, and unlike DS-1000, all model families exhibit consistent scaling behavior where larger models outperform smaller ones. Visualization libraries (Matplotlib, Seaborn) remain the hardest across all models with pass@1 as low as 0.010–0.141 for GPT-4o, revealing a persistent capability gap. Automatically generated test case scripts achieve 97.8% mean code coverage, supporting the benchmark's evaluation reliability.",
    237   "red_flags": [
    238     {
    239       "flag": "No human performance baseline",
    240       "detail": "The benchmark claims to measure realistic data science coding ability and to be calibrated to real-world difficulty, but provides no human performance baseline—making it impossible to contextualize LLM scores or validate difficulty calibration against human capability."
    241     },
    242     {
    243       "flag": "Circular construct validity argument",
    244       "detail": "'GitHub code is more realistic than Stack Overflow' is an assumption used as its own justification for construct validity; no domain expert surveys, observational studies of real data scientist workflows, or external validation criteria are used."
    245     },
    246     {
    247       "flag": "LLM-as-judge using benchmarked models",
    248       "detail": "Alignment validation used GPT-4o-mini and GPT-4o as LLM judges to assess whether problems are solvable, yet these are among the primary models being evaluated on the benchmark, creating a potential circularity in quality assessment."
    249     },
    250     {
    251       "flag": "Visualization scoring threshold unjustified",
    252       "detail": "The SSIM > 0.5 threshold for plot-drawing task evaluation is stated without justification, ablation, or discussion of sensitivity—an arbitrary threshold that may penalize visually acceptable solutions or accept poor-quality plots."
    253     },
    254     {
    255       "flag": "Scaling behavior used as post-hoc validity proxy",
    256       "detail": "Adherence to scaling laws is treated as evidence of benchmark quality ('validating its ability to distinguish model capabilities'), but scaling behavior could equally reflect task selection or perturbation methodology biased toward larger model strengths rather than being an independent quality signal."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    262       "relevance": "Primary benchmark being compared against and improved upon; DSCodeBench is explicitly positioned as addressing DS-1000's three key limitations in code length, test coverage, and problem structure."
    263     },
    264     {
    265       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    266       "relevance": "Introduces pass@k metric adopted throughout this paper and establishes the baseline paradigm for functional-correctness-based code generation benchmarking."
    267     },
    268     {
    269       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    270       "relevance": "Compared directly in Table 1 as the leading general code generation benchmark; provides the competitive upper bound for complex benchmark design."
    271     },
    272     {
    273       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    274       "relevance": "Compared in Table 1; represents an approach to contamination-resistant benchmarking that informs DSCodeBench's decontamination design."
    275     },
    276     {
    277       "title": "DA-Code: Agent Data Science Code Generation Benchmark for Large Language Models",
    278       "relevance": "Directly compared as a data science–specific benchmark focusing on task diversity; noted for insufficient test coverage that DSCodeBench addresses."
    279     },
    280     {
    281       "title": "DataSciBench: An LLM Agent Benchmark for Data Science",
    282       "relevance": "Contemporary data science benchmark compared in Table 1; contrasted for smaller scale (222 problems, 2.3 avg test cases) and limited library coverage."
    283     },
    284     {
    285       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming – The Rise of Code Intelligence",
    286       "relevance": "One of the primary open-source model families evaluated; key for assessing whether open-source coding models can compete with closed-source on realistic tasks."
    287     },
    288     {
    289       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    290       "relevance": "Cited as an example of realistic software engineering benchmarks sourced from GitHub; contextualizes DSCodeBench's GitHub-based construction philosophy."
    291     }
    292   ],
    293   "engagement_factors": {
    294     "practical_relevance": {
    295       "score": 3,
    296       "justification": "Practitioners evaluating LLMs for data science can directly use the publicly released benchmark to compare models across 10 specific Python libraries with a rigorous 200-test evaluation framework."
    297     },
    298     "surprise_contrarian": {
    299       "score": 1,
    300       "justification": "The finding that GPT-4o achieves only 39% on realistic data science tasks is modestly surprising given the hype around LLM coding capability, but the direction is consistent with prior findings."
    301     },
    302     "fear_safety": {
    303       "score": 0,
    304       "justification": "No AI safety or risk concerns are raised; the paper focuses entirely on capability evaluation for data science productivity use cases."
    305     },
    306     "drama_conflict": {
    307       "score": 1,
    308       "justification": "The paper directly critiques DS-1000 as unreliable and misleading (irregular scaling, format ambiguity, insufficient tests), which is a mild controversy in the benchmarking community."
    309     },
    310     "demo_ability": {
    311       "score": 3,
    312       "justification": "The benchmark is publicly released on GitHub with code, evaluation framework, per-library problems, and full experiment results enabling immediate use and replication."
    313     },
    314     "brand_recognition": {
    315       "score": 1,
    316       "justification": "Authors are from King's College London, NUS, CAS, and Peking University—reputable institutions but not the top-tier AI labs that drive outsized HN and community attention."
    317     }
    318   },
    319   "hn_data": {
    320     "threads": [
    321       {
    322         "hn_id": "36184838",
    323         "title": "Reverse Engineering Self-Supervised Learning",
    324         "points": 86,
    325         "comments": 16,
    326         "url": "https://news.ycombinator.com/item?id=36184838",
    327         "created_at": "2023-06-04T11:43:46Z"
    328       },
    329       {
    330         "hn_id": "43870679",
    331         "title": "Show HN: I built an AI tool to practice technical interviews with",
    332         "points": 12,
    333         "comments": 1,
    334         "url": "https://news.ycombinator.com/item?id=43870679",
    335         "created_at": "2025-05-02T14:57:13Z"
    336       },
    337       {
    338         "hn_id": "45300655",
    339         "title": "Generalizable Geometric Image Caption Synthesis",
    340         "points": 3,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=45300655",
    343         "created_at": "2025-09-19T12:05:01Z"
    344       },
    345       {
    346         "hn_id": "43405094",
    347         "title": "Politicians' misinformation behavior and public engagement, in 4 countries",
    348         "points": 3,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=43405094",
    351         "created_at": "2025-03-18T21:03:45Z"
    352       },
    353       {
    354         "hn_id": "44324675",
    355         "title": "ProtoReasoning: Prototypes as the Foundation for Generalizable Reasoning in LLMs",
    356         "points": 2,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=44324675",
    359         "created_at": "2025-06-20T04:10:28Z"
    360       },
    361       {
    362         "hn_id": "43776339",
    363         "title": "The Bitter Lesson Learned from 2k Multilingual Benchmarks",
    364         "points": 2,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=43776339",
    367         "created_at": "2025-04-23T20:31:54Z"
    368       },
    369       {
    370         "hn_id": "40488690",
    371         "title": "Neuromorphic dreaming: A pathway to efficient learning in artificial agents",
    372         "points": 2,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=40488690",
    375         "created_at": "2024-05-27T08:03:31Z"
    376       }
    377     ],
    378     "top_points": 86,
    379     "total_points": 110,
    380     "total_comments": 17
    381   }
    382 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs