ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27834B)


      1 {
      2   "paper": {
      3     "title": "DSCodeBench: A Realistic Benchmark for Data Science Code Generation",
      4     "authors": [
      5       "Shuyin Ouyang",
      6       "Dong Huang",
      7       "Jingwen Guo",
      8       "Zeyu Sun",
      9       "Qihao Zhu",
     10       "Jie M. Zhang"
     11     ],
     12     "year": 2025,
     13     "venue": "AAAI 2026",
     14     "arxiv_id": "2505.15621"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "DSCodeBench is a 1,000-problem benchmark for data science code generation across 10 Python libraries, sourced from GitHub with extensive test suites (200 tests per problem by default). Evaluation of 10 LLMs shows GPT-4o achieves best pass@1 of 0.392, with clear scaling behavior within model families. DSCodeBench is substantially harder than DS-1000 and exhibits lower evaluation variance. Visualization libraries (Matplotlib, Seaborn) remain particularly challenging for all models.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "GitHub repository URL provided: 'The benchmark, code, and experiment results are available at https://github.com/ShuyinOuyang/DSCodeBench.'"
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The benchmark dataset (1,000 problems with ground truth code, test case scripts, and problem descriptions) is released at the GitHub repository."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Hardware is described (Intel Xeon Platinum 8336C, 8×A100, 2.0TiB memory) but no software environment specification is provided — no requirements.txt, Python version, or library versions listed in the paper."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but no reproduction guide or README is described in the paper itself."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Standard deviations are reported across 3 runs in Tables 2 and 4 (e.g., '391.7±4.6' for GPT-4o correct outputs, '105.0±4.3' for partially correct)."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No statistical significance tests are used. Claims like 'GPT-4o achieves the highest pass@1' and scaling behavior are based on raw number comparisons without p-values, t-tests, or any formal testing."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Results are reported with baseline context: 'GPT-4o obtains pass@1 and pass@3 of 0.451 and 0.545 on DS-1000 but only 0.392 and 0.438 on DSCodeBench.' Absolute performance values and cross-benchmark differences are provided throughout."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification is given for the choice of 1,000 problems, 10 models, or 3 experimental runs. No power analysis or sample size rationale is discussed."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Standard deviations across 3 identical runs are reported in Tables 2 and 4 (e.g., '391.7±4.6', '307.3±5.6')."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "DSCodeBench is compared against 9 other benchmarks in Table 1 (HumanEval, MBPP, APPS, BigCodeBench, LiveCodeBench, DSP, DA-Code, DataSciBench, DS-1000). Model results are compared across both DSCodeBench and DS-1000 in Table 2."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Evaluated models include contemporary options: GPT-4o (2024), Qwen2.5-Coder (2024), DeepSeek-Coder-V2 (2024). Benchmark comparisons include recent work like BigCodeBench (2024) and LiveCodeBench (2024)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The benchmark construction pipeline has multiple components (seed code collection, context reconstruction, filtering, test case generation, manual editing) but no ablation study examines which components contribute most to benchmark quality or difficulty."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics reported: pass@1, pass@3, average correct/partially correct/wrong counts, per-library breakdowns, test case coverage (97.8%), and text/AST similarity for leakage analysis."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Evaluation of LLM-generated code is entirely automated via test suites. Human evaluation is used only to validate alignment during benchmark construction (97.4% agreement among the authors and LLM judges), not to evaluate model outputs."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Models are evaluated on the full 1,000-problem benchmark without any fine-tuning or tuning on DSCodeBench data. The entire benchmark functions as a held-out test set for pre-trained models."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table 3 provides per-library pass@1 breakdowns across all 10 libraries for each model. Tables 5 and 6 provide similar breakdowns at temperature=0.6."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Common failure modes discussed: 'data structure mismatches, such as incompatible shapes or types during transformation steps, particularly in NumPy, TensorFlow, and PyTorch tasks. Visualization libraries also exhibit frequent issues, with models often mis-specifying figures or plotting arguments.'"
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative results reported: Matplotlib and Seaborn scores remain very low across all models, increasing temperature to 0.6 significantly degrades Qwen2.5-Coder-7B and 14B performance, and DeepSeek-Coder-33B performs worse than 6.7B on DS-1000."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims are supported: 1,000 problems across 10 libraries (confirmed in statistics section), robust scaling behavior (Table 2 shows consistent within-family scaling), GPT-4o pass@1 of 0.392 (Table 2)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper claims 'architectural and training optimizations play a critical role beyond model scaling alone' (comparing GPT-4o-mini vs GPT-3.5-turbo) but cannot isolate these factors. Scaling claims are based on observational comparisons within model families where training data, architecture, and scale all change simultaneously."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title 'A Realistic Benchmark for Data Science Code Generation' and abstract claim 'realistic data science code generation tasks' are broader than the tested scope of Python-only with 10 specific libraries. The Limitations section acknowledges Python-only scope, but the title and abstract do not bound the claims."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No alternative explanations are discussed for the observed results. For example, the paper does not explore why Matplotlib/Seaborn scores are low (training data distribution? API complexity?) or why scaling behavior is clear on DSCodeBench but not on DS-1000."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures functional correctness via test suites and explicitly acknowledges the proxy gap: 'DSCodeBench evaluates functional correctness based on unit tests, without explicitly assessing other important dimensions of code quality, such as computational efficiency, coding style, readability, or security.'"
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Models identified by marketing names only: 'GPT-3.5-turbo', 'GPT-4o-mini', 'GPT-4o' without API versions or snapshot dates. Open-source models list parameter sizes but no checkpoint version identifiers."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Full prompt text provided in the Appendix for all prompt types: Code Generation Prompt, Test Case Script Generation/Repair Prompts, Code Problem Description Generation Prompt, and LLM Judge Prompt."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Temperature set to 0.2 (and 0.6 for sensitivity): 'We set the temperature of all models to 0.2, while keeping all other parameters at their default values.' Random seed set to 42 for test case generation."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. Models are prompted directly with code problem descriptions and generate solutions in a single pass."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Full pipeline documented in detail: seed code collection from DS-1000 and Stack Overflow, GitHub REST API search yielding 807,198 candidates, AST-based context reconstruction, two-stage filtering to 7,623, test case generation to 2,407, manual editing to final 1,000."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "A dedicated 'Limitation' section is present in the Appendix with substantive discussion of three specific limitations (Python-only, simplified error handling, functional correctness only)."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Specific threats discussed: Python-only coverage, error-handling simplification ('error-raising code segments are either removed or replaced with default behaviors'), exclusion of multi-file codebases, and no efficiency/security/readability evaluation."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Explicit scope boundaries stated: 'DSCodeBench focuses exclusively on Python and ten popular data science libraries,' does not assess 'runtime performance, security, and adherence to coding best practices,' and 'primarily targets single-function or single-file tasks.'"
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The benchmark dataset including code problems, ground truth code, test case scripts, and problem descriptions is available at the GitHub repository."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Data collection is described in detail: seed code from DS-1000 reference code and Stack Overflow top-voted answers (up to 500 per library), GitHub REST API search using line-level queries, deduplication, and metadata preservation."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Code sample selection described: seed code from DS-1000 and Stack Overflow, GitHub search via REST API, filtering by star count (≥10), minimum API calls (≥3), and compilation success. However, potential selection bias toward popular repositories is not discussed."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Full pipeline with counts at each stage: 807,198 initial candidates → property filtering → functional filtering → 7,623 validated → test case generation → 2,407 retained → manual editing → 1,000 final problems."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Funding disclosed: ITEA Genius and ITEA GreenCode projects (InnovateUK), UKRI CDT in Safe and Trusted AI (EP/S023356/1), and NSFC (62402482)."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations listed: King's College London, NUS, Chinese Academy of Sciences, Peking University. None affiliated with companies whose models are evaluated."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Funders are government and academic bodies (InnovateUK, UKRI, NSFC) with no financial interest in the benchmark results or evaluated models."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the 10 evaluated models (GPT-3.5-turbo, GPT-4o-mini, GPT-4o, DeepSeek variants, Qwen variants)."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Discussed extensively: 'using raw GitHub code directly poses a risk of data leakage, as large language models may have been pre-trained on the same public code.' Mitigation via code perturbation, context reconstruction, and similarity analysis (text <0.4, AST <0.5)."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Contamination addressed through systematic code perturbation ('modifying function signatures, adding or removing lines of code, restructuring control flows'), context reconstruction, and quantitative similarity analysis confirming low overlap."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study. The paper evaluates LLMs on a code generation benchmark."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No inference cost, latency, or per-problem cost is reported. API costs for closed-source models and inference time for open-source models are not mentioned."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Hardware is described (Intel Xeon Platinum 8336C, 8×A100, 2.0TiB memory), but total GPU hours, wall-clock time, and API spend are not quantified."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Results reported across 3 runs with standard deviations. Random seed set to 42. The paper also verifies that different seeds in test case generation produce consistent results."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "'We also run all the models 3 times identically to mitigate the randomness that affects our experiment results.'"
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Temperature set to 0.2 without justification for this specific value. No search budget or method for selecting this hyperparameter is described. Temperature=0.6 is tested as sensitivity analysis but not as part of a systematic search."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "A uniform configuration (temperature=0.2, default parameters) is applied across all models, avoiding cherry-picking. Additional temperature=0.6 results are provided for robustness in Table 4."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors create the benchmark and evaluate models on it without acknowledging potential biases of self-evaluation, such as benchmark design choices that might inflate difficulty claims or favor certain model families."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Performance is compared across models of vastly different sizes (from 1.3B-parameter open-source models up to GPT-4o) without discussing or normalizing for compute budget. Scaling claims do not account for compute differences between models."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "Construct validity is discussed through comparison with 9 other benchmarks (Table 1), test case coverage analysis (97.8%), alignment validation (97.4% agreement), and explicit acknowledgment that functional correctness doesn't capture efficiency, style, or security."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding involved. Models are prompted directly without any agentic framework."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "The paper discusses general data leakage risk from GitHub code but does not specifically address temporal leakage: no model training cutoff dates are stated, and the timing of GitHub code availability relative to model training is not analyzed."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the detailed problem descriptions (including step-by-step logic, function signatures, and examples) provide information beyond what would be available in realistic coding scenarios."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether training and test data share structural similarities. Problems are constructed from GitHub repositories that models may have trained on; while code is perturbed, structural patterns may overlap."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Concrete detection methods applied: text similarity analysis (scores <0.4 across libraries), AST similarity analysis (scores <0.5), and semantic comparison via pass@k performance gap between model outputs and ground truth."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "DSCodeBench is substantially more challenging than DS-1000, with all models achieving lower scores.",
    371       "evidence": "Table 2: GPT-4o drops from 0.451 to 0.392 pass@1; all 10 models show lower scores on DSCodeBench vs DS-1000.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "DSCodeBench exhibits robust scaling behavior where larger models systematically outperform smaller ones.",
    376       "evidence": "Table 2: Within DeepSeek family, pass@1 increases 0.076→0.163→0.205→0.222 with model size. Within Qwen family, 0.116→0.213→0.229.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "The best LLM (GPT-4o) achieves pass@1 of 0.392, indicating significant room for improvement.",
    381       "evidence": "Table 2: GPT-4o pass@1 = 0.392 averaged across 3 runs (391.7±4.6 correct out of 1,000).",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "DSCodeBench provides more reliable evaluation than DS-1000 with lower variance in results.",
    386       "evidence": "The paper observes that variance is smaller on DSCodeBench than on DS-1000, which is supported by comparing the ± values in Table 2. No formal statistical test of this claim is reported.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Data leakage is effectively mitigated through code perturbation and context reconstruction.",
    391       "evidence": "Text similarity <0.4, AST similarity <0.5 between LLM-generated code and ground truth (Figures 3-4). Code perturbation and context reconstruction described in Manual Editing section.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Visualization libraries (Matplotlib, Seaborn) remain particularly challenging for all current LLMs.",
    396       "evidence": "Table 3: Matplotlib pass@1 ranges 0.010–0.311, Seaborn 0.052–0.177 across all models, consistently the lowest-scoring libraries.",
    397       "supported": "strong"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "No statistical significance tests",
    403       "detail": "All claims of model superiority and scaling behavior are based on raw number comparisons across 3 runs with no formal statistical testing. With only 3 runs and sometimes overlapping standard deviations, claims of 'systematic' differences are not statistically validated."
    404     },
    405     {
    406       "flag": "No model version specification",
    407       "detail": "Closed-source models are identified by marketing names only (GPT-4o, GPT-4o-mini, GPT-3.5-turbo) without snapshot dates or API versions. Results may not be reproducible as model behavior changes over time."
    408     },
    409     {
    410       "flag": "Overclaiming in title and abstract",
    411       "detail": "The title claims 'Data Science Code Generation' broadly but the benchmark covers only Python with 10 libraries. Error handling is simplified (removed or replaced with defaults), and only single-file tasks are included, limiting the 'realistic' claim."
    412     },
    413     {
    414       "flag": "No inference cost reporting",
    415       "detail": "Evaluating 10 models on 1,000 problems × 3 runs (plus temperature=0.6 experiments) involves substantial compute and, for closed-source models, API spend. Because no cost information is reported, the practical cost of reproducing the results is unclear."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "DS-1000: A natural and reliable benchmark for data science code generation",
    421       "authors": ["Y. Lai", "C. Li", "Y. Wang", "T. Zhang", "R. Zhong", "L. Zettlemoyer", "W. Yih", "D. Fried", "S. Wang", "T. Yu"],
    422       "year": 2023,
    423       "relevance": "Primary baseline benchmark for data science code generation being compared against and improved upon."
    424     },
    425     {
    426       "title": "Evaluating large language models trained on code",
    427       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    428       "year": 2021,
    429       "arxiv_id": "2107.03374",
    430       "relevance": "Introduced HumanEval and pass@k metric, foundational for code generation evaluation."
    431     },
    432     {
    433       "title": "Program synthesis with large language models",
    434       "authors": ["J. Austin", "A. Odena", "M. Nye"],
    435       "year": 2021,
    436       "arxiv_id": "2108.07732",
    437       "relevance": "Introduced MBPP benchmark for Python code generation evaluation."
    438     },
    439     {
    440       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    441       "authors": ["T. Y. Zhuo", "M. C. Vu", "J. Chim"],
    442       "year": 2024,
    443       "arxiv_id": "2406.15877",
    444       "relevance": "Contemporary code generation benchmark with diverse function calls, compared in Table 1."
    445     },
    446     {
    447       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    448       "authors": ["N. Jain", "K. Han", "A. Gu"],
    449       "year": 2024,
    450       "arxiv_id": "2403.07974",
    451       "relevance": "Contamination-free code generation benchmark addressing temporal leakage."
    452     },
    453     {
    454       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    455       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig"],
    456       "year": 2023,
    457       "arxiv_id": "2310.06770",
    458       "relevance": "Landmark benchmark for realistic software engineering tasks from GitHub issues."
    459     },
    460     {
    461       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming—The Rise of Code Intelligence",
    462       "authors": ["D. Guo", "Q. Zhu", "D. Yang"],
    463       "year": 2024,
    464       "arxiv_id": "2401.14196",
    465       "relevance": "Open-source code generation model family evaluated in this study."
    466     },
    467     {
    468       "title": "Qwen2.5-Coder Technical Report",
    469       "authors": ["B. Hui", "J. Yang", "Z. Cui"],
    470       "year": 2024,
    471       "arxiv_id": "2409.12186",
    472       "relevance": "Open-source code generation model family evaluated in this study."
    473     },
    474     {
    475       "title": "DataSciBench: An LLM Agent Benchmark for Data Science",
    476       "authors": ["D. Zhang", "S. Zhoubian", "M. Cai"],
    477       "year": 2025,
    478       "arxiv_id": "2502.13897",
    479       "relevance": "Contemporary data science benchmark compared against DSCodeBench."
    480     },
    481     {
    482       "title": "DA-Code: Agent Data Science Code Generation Benchmark for Large Language Models",
    483       "authors": ["Y. Huang", "J. Luo", "Y. Yu"],
    484       "year": 2024,
    485       "arxiv_id": "2410.07331",
    486       "relevance": "Data science code generation benchmark focusing on task diversity."
    487     },
    488     {
    489       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    490       "authors": ["D. Huang", "J. M. Zhang", "M. Luck"],
    491       "year": 2023,
    492       "arxiv_id": "2312.13010",
    493       "relevance": "Multi-agent approach to code generation with iterative testing and refinement."
    494     },
    495     {
    496       "title": "An empirical study of the non-determinism of ChatGPT in code generation",
    497       "authors": ["S. Ouyang", "J. M. Zhang", "M. Harman", "M. Wang"],
    498       "year": 2025,
    499       "relevance": "Studies LLM non-determinism in code generation, motivating randomness control design in DSCodeBench."
    500     }
    501   ]
    502 }

Impressum · Datenschutz