scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21201B)
      1 {
      2   "paper": {
      3     "title": "CORE-Bench: Fostering the Credibility of Published Research Through a Computational Reproducibility Agent Benchmark",
      4     "authors": ["Zachary S. Siegel", "Sayash Kapoor", "Nitya Nadgir", "Benedikt Stroebl", "Arvind Narayanan"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2409.11363"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository URL provided: https://github.com/siegelz/core-bench (footnote 3, Section 2.1)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The benchmark dataset is released alongside the code at the GitHub repository. Tasks are based on publicly available CodeOcean capsules."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The evaluation harness specifies Azure VM types (Standard_E2as_v5, Standard_NC4as_T4_v3), Ubuntu Linux, 80 GB disk (Appendix B). Agents run in isolated VMs with specified configurations."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper describes an evaluation harness with detailed instructions for running agents on VMs, including resume flags and resource management (Appendix B). The GitHub repository provides the harness code."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "95% confidence intervals reported for CORE-Agent accuracy across three runs in Table A3 (e.g., '60.60% ± 4.51%'). Error bars shown in Figure 8."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CORE-Agent outperforms AutoGPT and GPT-4o outperforms GPT-4o-mini, but no statistical significance tests are reported for these comparisons."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Absolute accuracy differences with baseline context are reported throughout (e.g., AutoGPT 6.7% vs CORE-Agent 21.48% on Hard; GPT-4o-mini 8.9% → 44.44% with task-specific modifications)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The benchmark has 90 papers (45 train/45 test) and CORE-Agent was run 3 times, but no justification is given for why 90 papers or 3 runs are sufficient. No power analysis is reported."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviations reported from three trials in Figure 8 error bars, and confidence intervals in Table A3. AutoGPT was only run once due to cost constraints, which is acknowledged."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Two baselines compared: unmodified AutoGPT (general-purpose) and CORE-Agent (task-specific), each with two LLM backends (Section 3, Table 5)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "AutoGPT is a contemporary general-purpose agent. This is a new benchmark so there are no prior baselines to compare against; the chosen baselines are reasonable for establishing initial results."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The comparison between AutoGPT and CORE-Agent across difficulty levels effectively serves as an ablation of task-specific modifications. Table 4 details incremental modifications. The cost limit analysis (Figure 7) ablates the budget constraint."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Reports task accuracy (pass@1), average cost, pass@k, pass^k reliability metric, and breakdowns by vision vs. text questions (Sections 4, C.3, C.4)."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a benchmark paper measuring automated agent accuracy against ground-truth reproductions. Human evaluation of agent outputs is not relevant — correctness is objectively verifiable."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Explicit 45/45 train/test split. 'We report all results in this section on the test split unless otherwise mentioned, since we used the train split while developing the agent' (Section 4)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results broken down by difficulty level (Easy/Medium/Hard), discipline (CS/Social Science/Medicine), programming language (Python/R), and question modality (vision/text) in Section 4 and Figure 8."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.6 qualitatively analyzes common failure cases at each difficulty level: retrieval from multiple files, dependency installation loops, competing instructions. Appendices D.3.1-D.3.4 provide detailed examples."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Increasing cost limit from $4 to $10 did not significantly improve GPT-4o-mini performance (Section 4.3, Figure 7). Section 4.7 reports an agent attempting unsafe behavior (creating online accounts)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims (best agent 21% on hardest task, 270 tasks from 90 papers, three disciplines) are all supported by results in Tables 5 and A3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about task-specific modifications improving accuracy are supported by controlled comparisons (same LLM, same benchmark, only modifications differ). The ablation design is adequate for these claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Claims are bounded to the tested setting. The paper acknowledges limitations of CodeOcean sourcing and specific disciplines. Title and abstract appropriately scope to 'computational reproducibility' rather than broader claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations: Python vs R performance differences may be due to output format rather than language difficulty (Section 4.5), CS tasks being easier may be confounded with Python prevalence, cost limits may constrain performance."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Exact model versions specified: 'GPT-4o-2024-05-13 and GPT-4o-mini-2024-07-18' (Section 3)."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full CORE-Agent prompts provided in Appendix D.2, including the --ai-role and --best-practice arguments for each difficulty level."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No mention of temperature, top-p, or other API sampling parameters for the LLM calls. Only the $4 cost limit per task is specified."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "AutoGPT architecture described including tools (execute_shell, query_vision_language_model), auto-summarization of past actions, and CORE-Agent modifications (Table 4, Appendix D)."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Capsule selection process documented with 10 filtering criteria (Table 2), filtering pipeline from 5,090 capsules to 90 (Figure 3), and task question construction process (Appendix A.3)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section. Some limitations are scattered throughout (e.g., CodeOcean sourcing, cost constraints) but there is no substantive dedicated discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity discussed. The paper does not address construct validity threats from using CodeOcean (pre-verified reproducible capsules) vs. real-world papers, or selection bias from the 10 filtering criteria."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not discuss that the benchmark only covers papers already known to be reproducible, limiting generalization to real-world reproducibility checking."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The benchmark data and evaluation harness are released at the GitHub repository. CodeOcean capsules are publicly accessible."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection via CodeOcean webscraping described (Appendix A.1), with 10 filtering criteria (Table 2) and manual verification of each capsule's local reproducibility."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data sourced from CodeOcean public repositories."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Pipeline documented: 5,090 capsules scraped → filtered by discipline/language/10 criteria → 90 capsules selected → 3 manual reproductions per capsule → 181 task questions created (Figure 3, Appendix A)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section mentions compute support from Princeton University's Center for Statistics and Machine Learning and OpenAI's researcher access program."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors affiliated with Princeton University, clearly stated. No authors are affiliated with OpenAI or AutoGPT."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "OpenAI provided compute support via researcher access program, and the paper exclusively evaluates OpenAI models (GPT-4o, GPT-4o-mini). OpenAI has a financial interest in demonstrating model capabilities."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper does not state the training data cutoff for GPT-4o or GPT-4o-mini, despite evaluating them on tasks derived from public CodeOcean repositories."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The paper acknowledges contamination concerns: 'CORE-Bench's foundation in public repositories enables periodic updates of the benchmark tasks, which could mitigate concerns about contamination and saturation' (Section 2.1)."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "While the paper mentions the possibility of periodic updates to mitigate contamination, it does not analyze whether the CodeOcean capsules used were available before GPT-4o's training cutoff. Many capsules likely predate the model's training data."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Average API costs reported per task and per difficulty level (Figure 6, Table A3). Successful tasks averaged $0.54 vs $2.59 for failed tasks with CORE-Agent GPT-4o."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Azure VM types are specified (Appendix B), a $4 per-task cost limit is stated, and total evaluation time is reported (~2 hours with parallelization vs 20+ days sequential). Cost limit sensitivity analysis is in Figure 7."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The best agent (CORE-Agent with GPT-4o) achieves 21% accuracy on CORE-Bench-Hard, 58% on Medium, and 60% on Easy.",
    286       "evidence": "Table 5 and Table A3 report these results on the test set across three runs with confidence intervals.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Task-specific modifications to general-purpose agents yield significant performance improvements, especially for weaker models.",
    291       "evidence": "Section 4.2: AutoGPT GPT-4o-mini went from 8.9% to 44.44% on Easy with minimal modifications. Table 5 shows consistent improvements across all settings.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Stronger models (GPT-4o) outperform weaker models (GPT-4o-mini) despite having a lower token budget due to higher per-token costs.",
    296       "evidence": "Section 4.3 and Table 5: GPT-4o outperforms GPT-4o-mini across all agents and difficulty levels despite the same $4 cost limit.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Increasing the cost limit from $4 to $10 does not significantly improve accuracy because agents get stuck rather than needing more budget.",
    301       "evidence": "Section 4.3 and Figure 7: GPT-4o-mini unchanged, GPT-4o modest increase from 26% to 31% on train set Hard tasks.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Computational reproducibility is a widespread problem across scientific fields, with significant proportions of studies failing despite available code and data.",
    306       "evidence": "Table 1 summarizes 18 studies across 15+ fields showing reproducibility failure rates. The ML Reproducibility Challenge 2022 data shows 10/28 papers had errors.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CORE-Bench introduces 270 computational reproducibility tasks from 90 papers across CS, social science, and medicine at three difficulty levels. The best baseline agent (CORE-Agent with GPT-4o) achieves only 21% on the hardest tasks, showing substantial room for improvement. Task-specific modifications to general-purpose agents yield large gains (e.g., 8.9% to 44.4% for GPT-4o-mini on Easy), and stronger models outperform weaker ones even with lower token budgets. Agents primarily fail due to dependency installation loops and difficulty retrieving results from multiple output files.",
    312   "red_flags": [
    313     {
    314       "flag": "No limitations section",
    315       "detail": "The paper lacks a dedicated limitations or threats-to-validity section, which is notable for a benchmark paper making claims about real-world utility."
    316     },
    317     {
    318       "flag": "Selection bias in benchmark construction",
    319       "detail": "All 90 papers are pre-verified as reproducible on CodeOcean, with 10 filtering criteria that exclude many real-world scenarios (e.g., >45 min runtime, >10GB, complex bash commands). This limits the construct validity of claims about real-world reproducibility, and the paper does not discuss this limitation."
    320     },
    321     {
    322       "flag": "OpenAI compute support with exclusive OpenAI model evaluation",
    323       "detail": "OpenAI provided researcher access program support, and only OpenAI models were evaluated. No non-OpenAI models tested despite the benchmark being model-agnostic."
    324     },
    325     {
    326       "flag": "AutoGPT only run once",
    327       "detail": "Due to cost constraints, AutoGPT was only run once on the test set, so no confidence intervals are available for the baseline comparison. This weakens the comparative claims."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Evaluating Large Language Models Trained on Code",
    333       "authors": ["Mark Chen"],
    334       "year": 2021,
    335       "arxiv_id": "2107.03374",
    336       "relevance": "Foundational code generation benchmark (HumanEval) that CORE-Bench aims to complement with more realistic tasks."
    337     },
    338     {
    339       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    340       "authors": ["Carlos E. Jimenez", "John Yang"],
    341       "year": 2023,
    342       "arxiv_id": "2310.06770",
    343       "relevance": "Key benchmark for evaluating AI agents on real-world software engineering tasks, directly compared to CORE-Bench's approach."
    344     },
    345     {
    346       "title": "AI Agents That Matter",
    347       "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S. Siegel"],
    348       "year": 2024,
    349       "arxiv_id": "2407.01502",
    350       "relevance": "Companion paper on agent evaluation methodology, addressing cost-accuracy tradeoffs and benchmark design principles."
    351     },
    352     {
    353       "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery",
    354       "authors": ["Chris Lu"],
    355       "year": 2024,
    356       "arxiv_id": "2408.06292",
    357       "relevance": "Proposes automating the full research pipeline; CORE-Bench argues reproducibility is a necessary prerequisite step."
    358     },
    359     {
    360       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    361       "authors": ["John Yang", "Carlos E. Jimenez"],
    362       "year": 2024,
    363       "relevance": "Demonstrates importance of task-specific agent modifications for software engineering, paralleling CORE-Agent's approach."
    364     },
    365     {
    366       "title": "Lessons from the Trenches on Reproducible Evaluation of Language Models",
    367       "authors": ["Stella Biderman"],
    368       "year": 2024,
    369       "arxiv_id": "2405.14782",
    370       "relevance": "Discusses challenges in reproducible LLM evaluation, directly relevant to benchmarking methodology."
    371     },
    372     {
    373       "title": "Benchmarking Large Language Models As AI Research Agents",
    374       "authors": ["Qian Huang"],
    375       "year": 2023,
    376       "arxiv_id": "2310.03302",
    377       "relevance": "Benchmark for LLMs as research agents conducting ML experiments, complementary to CORE-Bench's reproducibility focus."
    378     },
    379     {
    380       "title": "SciCode: A Research Coding Benchmark Curated by Scientists",
    381       "authors": ["Minyang Tian"],
    382       "year": 2024,
    383       "arxiv_id": "2407.13168",
    384       "relevance": "Research coding benchmark for scientific tasks, a related benchmark in the AI-for-science agent space."
    385     },
    386     {
    387       "title": "Security of AI Agents",
    388       "authors": ["Yifeng He"],
    389       "year": 2024,
    390       "arxiv_id": "2406.08689",
    391       "relevance": "Addresses safety concerns for AI agents, relevant to CORE-Bench's finding about agents attempting unsafe web actions."
    392     },
    393     {
    394       "title": "DiscoveryBench: Towards Data-Driven Discovery with Large Language Models",
    395       "authors": ["Bodhisattwa Prasad Majumder"],
    396       "year": 2024,
    397       "arxiv_id": "2407.01725",
    398       "relevance": "Benchmark for LLM scientific discovery capabilities, complementary to CORE-Bench's reproducibility evaluation."
    399     }
    400   ]
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs