scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18611B)
      1 {
      2   "paper": {
      3     "title": "DA-Code: Agent Data Science Code Generation Benchmark for Large Language Models",
      4     "authors": ["Yiming Huang", "Jianwen Luo", "Yan Yu", "Yitong Zhang", "Fangyu Lei", "Yifan Wei", "Shizhu He", "Lifu Huang", "Xiao Liu", "Jun Zhao", "Kang Liu"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2410.07331"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states 'We release our benchmark at https://da-code-bench.github.io' in the abstract."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The benchmark with 500 examples is released at the benchmark website. The data sources come from Kaggle, GitHub, and other web sources."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions Docker-based environment with Python, SQL, Conda, and database engines but does not provide a requirements.txt, Dockerfile, or detailed dependency list with versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The benchmark website is referenced but no README with commands to replicate experiments is described."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables 3 and 4 report only point estimates (e.g., '30.5% score') with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares multiple models and frameworks (e.g., DA-Agent vs OpenDevin) and claims superiority without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Raw score differences are reported but no standardized effect sizes (Cohen's d, etc.) are provided. Percentage differences lack baseline context beyond the raw numbers."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The benchmark has 500 examples and the subset DA-Code-100 has 100 randomly sampled tasks, but no justification or power analysis is provided for these sizes."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "All results are single-run numbers with greedy sampling. No variance, standard deviation, or multiple-run results are reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 4 compares DA-Agent against X-Agent, AutoGen, and OpenDevin. Table 1 compares DA-Code against DS-1000, Arcade, MLAgentBench, and DA-Bench."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include OpenDevin (2024), AutoGen (2023), and X-Agent (2023), which are contemporary agent frameworks."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 5.3 presents ablation studies on reference plan provision and max history length (Table 4)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports Score, Completion Rate (%), #Avg Steps, and Executable Code (%) in Table 3, plus per-category breakdowns."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the system outputs is performed. Evaluation is entirely automated via execution-based scoring."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The full 500-example benchmark serves as the test set, and DA-Code-100 is a randomly sampled subset. No tuning was done on these examples."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 3 provides breakdowns by DW, ML, EDA and by difficulty level (Easy, Medium, Hard). Figure 3 shows fine-grained per-category performance."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5.4 discusses error analysis with four categories: hallucination issues, inability to follow instructions, persistent code errors, and misinterpretation of task context."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that even the best model achieves only 30.5%, and discusses where models fail (e.g., DW tasks, prolonged task sequences not improving outcomes)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 30.5% accuracy for the best LLM, which matches GPT-4's score in Table 3. The claim that DA-Agent outperforms other frameworks is supported by Table 4."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The ablation on reference plans (Section 5.3) uses controlled single-variable manipulation, showing that adding reference plans improves score from 31.5 to 39.7. The history length ablation is similarly controlled."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Agent Data Science Code Generation Benchmark for Large Language Models' broadly, but the benchmark covers only specific data science subcategories and only tests a limited set of models. No explicit scope boundaries are stated."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for observed performance differences. For instance, DA-Agent's advantage over other frameworks could be due to environment-specific tuning rather than general superiority."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper says 'GPT-4', 'GPT-4o', 'Claude-3-Opus' without specific version identifiers or snapshot dates (e.g., no 'gpt-4-0613')."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The agent's system prompt and prompt templates are not provided in the paper or appendix. Only the action space format is described."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 5.1 states greedy sampling strategy, maximum step length of 20, max history length of 15 steps, and action execution time limit of 300 seconds."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 4 describes the DA-Agent framework in detail: Docker environment, action space (Bash, Python, SQL, Terminate), response mechanism, and memory windows."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.5 describes the full annotation pipeline: data source selection, task rewriting/creation, task implementation, evaluation setup, and cross-validation with red team testing."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitations' section appears after the conclusion, discussing unexplored fine-tuning and the need for deeper investigation."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations section is generic — it mentions fine-tuning LLMs was not explored and the benchmark warrants more investigation, but does not discuss specific threats like annotator bias, benchmark coverage gaps, or evaluation robustness concerns."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statements about what the results do NOT show. The paper does not bound its claims to specific model families, data types, or task domains."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The benchmark is released at https://da-code-bench.github.io, which should include the 500 task examples and evaluation suites."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.5 describes the annotation pipeline: data sources from Kaggle/GitHub, task rewriting process, environment setup, and cross-validation."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper states 'We recruit ten annotators who are highly proficient in data analysis, SQL, and Python' but does not describe how they were recruited or their specific backgrounds."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Figure 2 and Section 3.5 document the full pipeline from data source selection through task definition, implementation, evaluation setup, and cross-validation with red team testing."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgements section lists National Key R&D Program of China (No. 2022ZD0160503), NSFC (No.62376270), and CCF-BaiChuan-Ebtech Foundation Model Fund."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Chinese Academy of Sciences, UC Davis, Microsoft Research Asia, Shanghai AI Laboratory."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is from government research programs (NSFC, National Key R&D Program) which have no financial stake in the benchmark results."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates GPT-4, GPT-4o, Claude-3-Opus, and others on the benchmark but does not state any model's training data cutoff date."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether benchmark tasks or their source data appeared in any model's training data, despite data being sourced from Kaggle and GitHub."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The benchmark data comes from Kaggle and GitHub which are common training data sources. This contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the study. Annotators created the benchmark but are not research subjects."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in the study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, tokens consumed, or wall-clock time per task are reported despite the agent making multiple LLM calls per task."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, GPU hours, or API spend is reported for the experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The best LLM (GPT-4) achieves only 30.5% score on DA-Code using DA-Agent.",
    286       "evidence": "Table 3 shows GPT-4 achieving 30.5 total score across all 500 tasks.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "DA-Agent outperforms existing agent frameworks (OpenDevin, AutoGen, X-Agent) on DA-Code.",
    291       "evidence": "Table 4 shows DA-Agent scoring 31.5 vs OpenDevin 26.2, AutoGen 18.6, X-Agent 6.7 on DA-Code-100.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Providing a reference plan improves agent performance from 31.5 to 39.7 score.",
    296       "evidence": "Table 4 shows DA-Agent with reference plan achieving 39.7 vs 31.5 without on DA-Code-100.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Model performance decreases with increasing task difficulty, validating the difficulty grading.",
    301       "evidence": "Table 3 shows GPT-4 scoring 45.4 (Easy), 27.8 (Medium), 23.4 (Hard).",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "DA-Code is a 500-example benchmark for evaluating LLMs on agent-based data science tasks covering data wrangling, machine learning, and exploratory data analysis. The best-performing model (GPT-4) achieves only 30.5% score, indicating substantial room for improvement. The DA-Agent framework outperforms existing frameworks like OpenDevin and AutoGen. Analysis reveals that models struggle with data wrangling tasks and that providing reference plans substantially boosts performance.",
    307   "red_flags": [
    308     {
    309       "flag": "No uncertainty quantification",
    310       "detail": "All results are single-run with greedy decoding. No error bars, confidence intervals, or multiple-run variance is reported for any experiment."
    311     },
    312     {
    313       "flag": "Contamination risk unaddressed",
    314       "detail": "Benchmark data is sourced from Kaggle and GitHub, which are common in LLM training corpora. No discussion of whether models may have seen this data during training."
    315     },
    316     {
    317       "flag": "Framework comparison on small subset",
    318       "detail": "The comparison with competing frameworks (Table 4) uses only DA-Code-100 (100 randomly sampled tasks), not the full 500-task benchmark, without justification for this sample size."
    319     },
    320     {
    321       "flag": "No model version specificity",
    322       "detail": "Models are identified only by marketing names (GPT-4, GPT-4o, Claude-3-Opus) without API versions or snapshot dates, making results non-reproducible."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Evaluating large language models trained on code",
    328       "authors": ["Mark Chen"],
    329       "year": 2021,
    330       "arxiv_id": "2107.03374",
    331       "relevance": "Introduces HumanEval, a foundational LLM code generation benchmark."
    332     },
    333     {
    334       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    335       "authors": ["Carlos E Jimenez"],
    336       "year": 2023,
    337       "relevance": "Major repository-level code generation benchmark for LLM agents."
    338     },
    339     {
    340       "title": "Benchmarking large language models as AI research agents",
    341       "authors": ["Qian Huang"],
    342       "year": 2023,
    343       "arxiv_id": "2310.03302",
    344       "relevance": "MLAgentBench defines auto ML tasks in interactive environments, directly compared in this paper."
    345     },
    346     {
    347       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation",
    348       "authors": ["Qingyun Wu"],
    349       "year": 2023,
    350       "relevance": "Multi-agent framework used as baseline comparison in DA-Code experiments."
    351     },
    352     {
    353       "title": "Executable code actions elicit better LLM agents",
    354       "authors": ["Xingyao Wang"],
    355       "year": 2024,
    356       "arxiv_id": "2402.01030",
    357       "relevance": "CodeAct framework that underpins OpenDevin, a baseline in this paper."
    358     },
    359     {
    360       "title": "SWE-agent: Agent computer interfaces enable software engineering language models",
    361       "authors": ["John Yang"],
    362       "year": 2024,
    363       "relevance": "Agent framework for software engineering tasks with specialized action design."
    364     },
    365     {
    366       "title": "InterCode: Standardizing and benchmarking interactive coding with execution feedback",
    367       "authors": ["John Yang"],
    368       "year": 2024,
    369       "relevance": "Inspired DA-Code's interactive sandbox environment design."
    370     },
    371     {
    372       "title": "Reflexion: Language agents with verbal reinforcement learning",
    373       "authors": ["Noah Shinn"],
    374       "year": 2024,
    375       "relevance": "Foundational agent method for self-reflection and iterative improvement in code generation."
    376     },
    377     {
    378       "title": "DS-1000: A natural and reliable benchmark for data science code generation",
    379       "authors": ["Yuhang Lai"],
    380       "year": 2023,
    381       "relevance": "Prior data science code generation benchmark directly compared in Table 1."
    382     },
    383     {
    384       "title": "InfiAgent-DABench: Evaluating agents on data analysis tasks",
    385       "authors": ["Xueyu Hu"],
    386       "year": 2024,
    387       "arxiv_id": "2401.05507",
    388       "relevance": "Concurrent data analysis benchmark compared in Table 1."
    389     }
    390   ]
    391 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs