scan.json (18611B)
1 { 2 "paper": { 3 "title": "DA-Code: Agent Data Science Code Generation Benchmark for Large Language Models", 4 "authors": ["Yiming Huang", "Jianwen Luo", "Yan Yu", "Yitong Zhang", "Fangyu Lei", "Yifan Wei", "Shizhu He", "Lifu Huang", "Xiao Liu", "Jun Zhao", "Kang Liu"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2410.07331" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper states 'We release our benchmark at https://da-code-bench.github.io' in the abstract." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The benchmark with 500 examples is released at the benchmark website. The data sources come from Kaggle, GitHub, and other web sources." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions a Docker-based environment with Python, SQL, Conda, and database engines but does not provide a requirements.txt, Dockerfile, or detailed dependency list with versions." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. The benchmark website is referenced but no README with commands to replicate experiments is described." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Tables 3 and 4 report only point estimates (e.g., '30.5% score') with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper compares multiple models and frameworks (e.g., DA-Agent vs OpenDevin) and claims superiority without any statistical significance tests."
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "Raw score differences are reported but no standardized effect sizes (Cohen's d, etc.) are provided. Percentage differences lack baseline context beyond the raw numbers." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The benchmark has 500 examples and the subset DA-Code-100 has 100 randomly sampled tasks, but no justification or power analysis is provided for these sizes." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "All results are single-run numbers with greedy sampling. No variance, standard deviation, or multiple-run results are reported." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Table 4 compares DA-Agent against X-Agent, AutoGen, and OpenDevin. Table 1 compares DA-Code against DS-1000, Arcade, MLAgentBench, and DA-Bench." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include OpenDevin (2024), AutoGen (2023), and X-Agent (2023), which are contemporary agent frameworks." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 5.3 presents ablation studies on reference plan provision and max history length (Table 4)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper reports Score, Completion Rate (%), #Avg Steps, and Executable Code (%) in Table 3, plus per-category breakdowns." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of the system outputs is performed. Evaluation is entirely automated via execution-based scoring." 
84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The full 500-example benchmark serves as the test set, and DA-Code-100 is a randomly sampled subset. No tuning was done on these examples." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 3 provides breakdowns by DW, ML, EDA and by difficulty level (Easy, Medium, Hard). Figure 3 shows fine-grained per-category performance." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 5.4 discusses error analysis with four categories: hallucination issues, inability to follow instructions, persistent code errors, and misinterpretation of task context." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that even the best model achieves only 30.5%, and discusses where models fail (e.g., DW tasks, prolonged task sequences not improving outcomes)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims 30.5% accuracy for the best LLM, which matches GPT-4's score in Table 3. The claim that DA-Agent outperforms other frameworks is supported by Table 4." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The ablation on reference plans (Section 5.3) uses controlled single-variable manipulation, showing that adding reference plans improves score from 31.5 to 39.7. The history length ablation is similarly controlled." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title claims 'Agent Data Science Code Generation Benchmark for Large Language Models' broadly, but the benchmark covers only specific data science subcategories and only tests a limited set of models. No explicit scope boundaries are stated." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for observed performance differences. For instance, DA-Agent's advantage over other frameworks could be due to environment-specific tuning rather than general superiority." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper says 'GPT-4', 'GPT-4o', 'Claude-3-Opus' without specific version identifiers or snapshot dates (e.g., no 'gpt-4-0613')." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The agent's system prompt and prompt templates are not provided in the paper or appendix. Only the action space format is described." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 5.1 states greedy sampling strategy, maximum step length of 20, max history length of 15 steps, and action execution time limit of 300 seconds." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 4 describes the DA-Agent framework in detail: Docker environment, action space (Bash, Python, SQL, Terminate), response mechanism, and memory windows." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3.5 describes the full annotation pipeline: data source selection, task rewriting/creation, task implementation, evaluation setup, and cross-validation with red team testing." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "A dedicated 'Limitations' section appears after the conclusion, discussing unexplored fine-tuning and the need for deeper investigation." 
160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "The limitations section is generic — it mentions fine-tuning LLMs was not explored and the benchmark warrants more investigation, but does not discuss specific threats like annotator bias, benchmark coverage gaps, or evaluation robustness concerns." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit statements about what the results do NOT show. The paper does not bound its claims to specific model families, data types, or task domains." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The benchmark is released at https://da-code-bench.github.io, which should include the 500 task examples and evaluation suites." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 3.5 describes the annotation pipeline: data sources from Kaggle/GitHub, task rewriting process, environment setup, and cross-validation." 182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper states 'We recruit ten annotators who are highly proficient in data analysis, SQL, and Python' but does not describe how they were recruited or their specific backgrounds." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Figure 2 and Section 3.5 document the full pipeline from data source selection through task definition, implementation, evaluation setup, and cross-validation with red team testing." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgements section lists National Key R&D Program of China (No. 2022ZD0160503), NSFC (No.62376270), and CCF-BaiChuan-Ebtech Foundation Model Fund." 
199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: Chinese Academy of Sciences, UC Davis, Microsoft Research Asia, Shanghai AI Laboratory." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funding is from government research programs (NSFC, National Key R&D Program) which have no financial stake in the benchmark results." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper evaluates GPT-4, GPT-4o, Claude-3-Opus, and others on the benchmark but does not state any model's training data cutoff date." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether benchmark tasks or their source data appeared in any model's training data, despite data being sourced from Kaggle and GitHub." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "The benchmark data comes from Kaggle and GitHub which are common training data sources. This contamination risk is not discussed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in the study. Annotators created the benchmark but are not research subjects." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in the study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in the study." 
248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No API costs, tokens consumed, or wall-clock time per task are reported despite the agent making multiple LLM calls per task." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget, GPU hours, or API spend is reported for the experiments." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "The best LLM (GPT-4) achieves only 30.5% score on DA-Code using DA-Agent.", 286 "evidence": "Table 3 shows GPT-4 achieving 30.5 total score across all 500 tasks.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "DA-Agent outperforms existing agent frameworks (OpenDevin, AutoGen, X-Agent) on DA-Code.", 291 "evidence": "Table 4 shows DA-Agent scoring 31.5 vs OpenDevin 26.2, AutoGen 18.6, X-Agent 6.7 on DA-Code-100.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Providing a reference plan improves agent performance from 31.5 to 39.7 score.", 296 "evidence": "Table 4 shows DA-Agent with a reference plan achieving 39.7 vs 31.5 without one on DA-Code-100.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Model performance decreases with increasing task difficulty, validating the difficulty grading.", 301 "evidence": "Table 3 shows GPT-4 scoring 45.4 (Easy), 27.8 (Medium), 23.4 (Hard).", 302 "supported": "strong" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval"], 306 "key_findings": "DA-Code is a 500-example benchmark for evaluating LLMs on agent-based data science tasks covering data wrangling, machine learning, and exploratory data analysis. The best-performing model (GPT-4) achieves only 30.5% score, indicating substantial room for improvement. The DA-Agent framework outperforms existing frameworks like OpenDevin and AutoGen. Analysis reveals that models struggle with data wrangling tasks and that providing reference plans substantially boosts performance.", 307 "red_flags": [ 308 { 309 "flag": "No uncertainty quantification", 310 "detail": "All results are single-run with greedy decoding. No error bars, confidence intervals, or multiple-run variance is reported for any experiment." 311 }, 312 { 313 "flag": "Contamination risk unaddressed", 314 "detail": "Benchmark data is sourced from Kaggle and GitHub, which are common in LLM training corpora.
No discussion of whether models may have seen this data during training." 315 }, 316 { 317 "flag": "Framework comparison on small subset", 318 "detail": "The comparison with competing frameworks (Table 4) uses only DA-Code-100 (100 randomly sampled tasks), not the full 500-task benchmark, without justification for this sample size." 319 }, 320 { 321 "flag": "No model version specificity", 322 "detail": "Models are identified only by marketing names (GPT-4, GPT-4o, Claude-3-Opus) without API versions or snapshot dates, making results non-reproducible." 323 } 324 ], 325 "cited_papers": [ 326 { 327 "title": "Evaluating large language models trained on code", 328 "authors": ["Mark Chen"], 329 "year": 2021, 330 "arxiv_id": "2107.03374", 331 "relevance": "Introduces HumanEval, a foundational LLM code generation benchmark." 332 }, 333 { 334 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 335 "authors": ["Carlos E Jimenez"], 336 "year": 2023, 337 "relevance": "Major repository-level code generation benchmark for LLM agents." 338 }, 339 { 340 "title": "Benchmarking large language models as AI research agents", 341 "authors": ["Qian Huang"], 342 "year": 2023, 343 "arxiv_id": "2310.03302", 344 "relevance": "MLAgentBench defines auto ML tasks in interactive environments, directly compared in this paper." 345 }, 346 { 347 "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation", 348 "authors": ["Qingyun Wu"], 349 "year": 2023, 350 "relevance": "Multi-agent framework used as baseline comparison in DA-Code experiments." 351 }, 352 { 353 "title": "Executable code actions elicit better LLM agents", 354 "authors": ["Xingyao Wang"], 355 "year": 2024, 356 "arxiv_id": "2402.01030", 357 "relevance": "CodeAct framework that underpins OpenDevin, a baseline in this paper." 
358 }, 359 { 360 "title": "SWE-agent: Agent computer interfaces enable software engineering language models", 361 "authors": ["John Yang"], 362 "year": 2024, 363 "relevance": "Agent framework for software engineering tasks with specialized action design." 364 }, 365 { 366 "title": "InterCode: Standardizing and benchmarking interactive coding with execution feedback", 367 "authors": ["John Yang"], 368 "year": 2024, 369 "relevance": "Inspired DA-Code's interactive sandbox environment design." 370 }, 371 { 372 "title": "Reflexion: Language agents with verbal reinforcement learning", 373 "authors": ["Noah Shinn"], 374 "year": 2024, 375 "relevance": "Foundational agent method for self-reflection and iterative improvement in code generation." 376 }, 377 { 378 "title": "DS-1000: A natural and reliable benchmark for data science code generation", 379 "authors": ["Yuhang Lai"], 380 "year": 2023, 381 "relevance": "Prior data science code generation benchmark directly compared in Table 1." 382 }, 383 { 384 "title": "InfiAgent-DABench: Evaluating agents on data analysis tasks", 385 "authors": ["Xueyu Hu"], 386 "year": 2024, 387 "arxiv_id": "2401.05507", 388 "relevance": "Concurrent data analysis benchmark compared in Table 1." 389 } 390 ] 391 }