scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26536B)
      1 {
      2   "paper": {
      3     "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research",
      4     "authors": ["Giulio Starace", "Oliver Jaffe", "Dane Sherburn", "James Aung", "Chan Jun Shern", "Leon Maksin", "Rachel Dias", "Evan Mays", "Benjamin Kinsella", "Wyatt Thompson", "Johannes Heidecke", "Amelia Glaese", "Tejal Patwardhan"],
      5     "year": 2025,
      6     "venue": "International Conference on Machine Learning",
      7     "arxiv_id": "2504.01848",
      8     "doi": "10.48550/arXiv.2504.01848"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "PaperBench evaluates AI agents' ability to replicate 20 ICML 2024 papers from scratch, with 8,316 individually gradable rubric tasks co-developed with original authors. With BasicAgent, Claude 3.5 Sonnet achieved the best score among the tested models (21.0%); with IterativeAgent, o1 reached 24.4%. On a 3-paper subset, ML PhD human baselines achieved 41.4% (best@3) after 48 hours, outperforming agents at longer horizons. An LLM-based judge (o3-mini) achieves F1=0.83 on JudgeEval, enabling scalable automated grading.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'We open-source our code to facilitate future research' and the conclusion reiterates open-sourcing PaperBench."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The benchmark dataset (20 papers with rubrics, addenda, JudgeEval gold labels) is released as part of the open-source package."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper specifies Ubuntu 24.04 and A10 GPU for agent environments but does not provide a requirements.txt, Dockerfile, or detailed dependency list for reproducing the evaluation pipeline itself."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The code is open-sourced but specific commands to run evaluations are not detailed in the paper text."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Tables 4, 5, 6 report standard error of the mean (±) for all model results. Figure 3 includes error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims Claude 3.5 Sonnet outperforms other models and that IterativeAgent boosts o1/o3-mini but provides no statistical significance tests — only point estimates with SEM."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Results are reported as absolute percentages with baselines (e.g., 21.0% vs 13.2% for Claude vs o1), providing sufficient context for effect magnitude."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Three runs per paper per model are used, with no justification that three runs are adequate given the high variance observed (Tables 10-18 show large per-paper variance)."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard error is reported across 3 runs per paper. Per-paper results in Appendix I show individual run scores and standard errors."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple frontier models compared (GPT-4o, o1, o3-mini, DeepSeek-R1, Claude 3.5 Sonnet, Gemini 2.0 Flash), plus a human PhD baseline."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "All evaluated models are frontier models current as of late 2024/early 2025. Human baseline uses current ML PhDs."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "BasicAgent vs IterativeAgent comparison (Section 5.3) ablates scaffold design. PaperBench Code-Dev ablates the execution/result-match requirement. Results stratified by requirement type (Table 9)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Replication Score (overall), scores stratified by Code Development/Execution/Result Match requirement types (Table 9), and per-paper breakdowns."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "JudgeEval (Section 4.2) validates the LLM judge against human expert judgments. Rubrics were co-developed with original paper authors."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper mentions maintaining 'a held-out set for internal use' (Section 3) and releases two NeurIPS papers as a development set, separate from the 20-paper test set."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Per-paper results in Appendix I (Tables 10-18), per-requirement-type breakdown in Table 9, and per-paper node counts in Tables 2 and 7."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.2 discusses specific failure modes: models finishing early, failing to strategize, o3-mini struggling with tool usage, models claiming partial reproduction was the goal."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "IterativeAgent hurts Claude 3.5 Sonnet performance (21.0% → 16.1%, Table 5). Many models score very poorly (<10%). Agents plateau after first hour (Figure 3)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims (21.0% for Claude, human baseline outperforms models, F1=0.83 for judge) are all supported by Tables 4, 5, and 3 respectively."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "Section 5.3 attributes score differences between BasicAgent and IterativeAgent to scaffold design ('prompt tuning used for IterativeAgent is differentially suited for OpenAI o-series models') without controlled ablation isolating the specific mechanism."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper bounds claims to 'replicating ML research papers' and acknowledges the 20-paper dataset limitation in Section 7. Claims about agent capabilities are carefully hedged."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 5.2 discusses scaffold weakness vs model capability as alternative explanations. Section 7 discusses contamination. The paper notes results 'do not represent the upper limit of these models' capabilities.'"
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper clearly defines what it measures (Replication Score = weighted proportion of rubric leaf nodes satisfied) and distinguishes this from broader ML R&D capability. Section 7 acknowledges the benchmark 'does not capture every aspect of real-world research.'"
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Exact model versions specified in Section 5.2 footnotes: gpt-4o-2024-08-06, o1-2024-12-17, o3-mini-2025-01-31, claude-3-5-sonnet-20241022, gemini-2.0-flash."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full system prompts for BasicAgent (Figure 10), IterativeAgent (Figures 11-12), task instructions (Figures 13-14), and judge prompts (Figures 7-9) are provided in the appendix."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Reasoning effort settings stated (o1, o3-mini use 'high'). Time limits (12h, 36h) specified. Reproduction runtime cap (12h) stated."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "BasicAgent and IterativeAgent scaffolds described in detail (Section 5.1, Appendix F), including tool-use loop, context management, file reader, web browser tools, and the 'end task' tool modification."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Paper selection pipeline documented in Appendix B with explicit filtering criteria (commercial, empirical, hardware, model dependency, data, reproducibility, framework, accessibility filters)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Limitations' provides substantive discussion of dataset size, contamination, challenging dataset creation, judge performance, and cost."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7 discusses specific threats: contamination from pre-training on authors' codebases, non-deterministic judge, rubric creation difficulty, and specific cost figures ($400 per rollout, $66 per grading)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states it covers only 20 ICML 2024 papers, acknowledges results 'do not represent the upper limit of these models' capabilities,' and that 'PaperBench does not capture every aspect of real-world research.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The benchmark, rubrics, and JudgeEval gold labels are open-sourced. Per-paper per-run results are provided in Appendix I."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Paper selection from ICML 2024 described in Appendix B with systematic filtering. Rubric creation process described in Appendix C. Human baseline recruitment described in Section 5.4."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 5.4 and footnote 17 describe human baseline recruitment: ML PhDs from named universities (Berkeley, Cambridge, CMU, etc.), CV screen followed by ML and git technical test. 8 participants, assigned to papers they were most confident about."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Full pipeline documented: paper selection (Appendix B) → rubric creation (Appendix C) → agent execution (Section 5.1) → reproduction (Section 2.2) → grading (Section 2.3). JudgeEval construction described in Section 4.2."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosure or acknowledgment of funding sources. All authors are from OpenAI but no explicit funding statement."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors listed as affiliated with OpenAI on the first page."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "OpenAI has a financial interest in demonstrating model capabilities and in the benchmark ecosystem. The paper evaluates OpenAI models (o1, o3-mini, GPT-4o) alongside competitors."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement. OpenAI employees evaluating OpenAI models constitutes an undeclared conflict of interest."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates stated for any of the evaluated models. Section 7 acknowledges contamination risk but does not state cutoffs."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 7 discusses that original codebases exist online and models pre-trained on large corpora 'may have internalized solutions.' However, no empirical overlap analysis is performed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section 7 acknowledges contamination risk from pre-training on authors' codebases. Section 2.5 implements blacklists preventing agents from using original code repositories. Monitor checks for blacklist violations (Appendix E)."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The human baseline study (Section 5.4) was not pre-registered."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No mention of IRB or ethics approval for the human baseline study involving 8 ML PhD participants."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Footnote 17 lists participant universities (Berkeley, Cambridge, CMU, Columbia, Cornell, Purdue, TU Wien, UMass Amherst). Qualification: ML PhDs with CV screen and technical test."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Section 5.4 and footnote 17: participants must be enrolled in or have completed a PhD in ML, pass a CV screen, and pass an ML and git technical test."
    263       },
    264       "randomization_described": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "Participants were assigned to papers 'they were most confident about replicating' — self-selection rather than randomization, and this non-random assignment is not discussed as a limitation."
    268       },
    269       "blinding_described": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "No blinding described. Participants knew they were being compared to AI agents. Evaluators (the LLM judge) graded both human and agent submissions."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "Section 5.4 describes the process: 3 independent attempts per paper, evaluated after one week, only the best performer extended for remaining weeks. One human attempt for test-time-model-adaptation ended at 24 hours (noted in Figure 3 caption)."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section 7: ~$400 per o1 IterativeAgent 12-hour rollout, $66 per paper for o3-mini grading, $10 per paper for Code-Dev grading. Figure 5 plots judge cost. Section 4.1 estimates judge token usage."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Agent runtime limits (12h, 36h) stated. Single A10 GPU per run. Total cost estimate: $8000 per eval run for 20 papers. 646 total runs mentioned in Section 2.5."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "3 runs per paper per model reported in Appendix I (Tables 10-18), showing substantial variance across runs. The paper explicitly notes 'we see high variance in results on the same paper' and recommends 'several seeds when evaluating.'"
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 5.2: '3 runs per paper.' Footnotes note exceptions (2 runs for some configurations). Human baseline: 3 independent attempts per paper."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget reported for scaffold prompt tuning. Appendix F.1 mentions 'preliminary experiments' to adjust prompts but doesn't quantify the search process."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper reports results for BasicAgent and IterativeAgent but does not explain how many scaffold variants were tried before settling on these two, or what validation process was used."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Multiple models compared across 20 papers with no correction for multiple comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "OpenAI authors evaluate OpenAI models using OpenAI-developed scaffolding and judges, without acknowledging potential self-comparison bias. The scaffold was tuned on preliminary experiments which may favor OpenAI models."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 3 shows performance over time (compute). Table 5 compares 12h vs 36h runs. The paper notes agents plateau after 1 hour."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Appendix A discusses future directions for rubric-based evaluation. Section 7 discusses limitations of the benchmark. The paper explicitly states it 'does not capture every aspect of real-world research' and discusses specification gaming risks (Appendix A.3)."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Sections 5.2-5.3 evaluate two different scaffolds (BasicAgent, IterativeAgent) and find large differences (Claude 3.5 Sonnet drops from 21.0% to 16.1%, o1 rises from 13.2% to 24.4%), explicitly noting 'models' sensitivities to prompting.'"
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Section 7 discusses that ICML 2024 papers' codebases exist online and models may have internalized solutions. Papers were selected from a recent conference to mitigate this."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "Section 2.1: agents are not shown the rubric during attempts. Section 2.5: blacklists prevent access to original codebases. Monitor checks for violations."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the 20 papers share structural similarities that could affect results, or whether model pre-training on ML paper corpora creates systematic advantages."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "Section 2.5 and Appendix E describe a post-hoc monitor that checks agent logs for blacklisted URLs. 10 violations found and disqualified across 646 runs."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Claude 3.5 Sonnet (New) with BasicAgent achieves the best average Replication Score of 21.0% on PaperBench.",
    365       "evidence": "Table 4 reports 21.0 ± 0.8% for Claude 3.5 Sonnet, highest among 6 models tested with BasicAgent.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "o1 with IterativeAgent achieves 24.4%, substantially outperforming its BasicAgent score of 13.2%.",
    370       "evidence": "Tables 4 and 5 show o1 BasicAgent at 13.2 ± 0.3% vs IterativeAgent at 24.4 ± 0.7%.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Human ML PhD baseline (best@3) achieves 41.4% on a 3-paper subset after 48 hours, outperforming o1 at 26.6%.",
    375       "evidence": "Section 5.4 and Figure 3 show human-agent comparison over time. Humans surpass agents after 24 hours.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The LLM-based judge (o3-mini with SimpleJudge) achieves F1=0.83 on JudgeEval, making it a reasonable stand-in for human judges.",
    380       "evidence": "Table 3 reports F1=0.83 for o3-mini at $66/paper cost. Expert human performance is treated as ideal.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Models are proficient at writing code quickly but fail to effectively work beyond the first hour to improve submissions.",
    385       "evidence": "Figure 3 shows o1 scores plateauing after 1 hour. Table 9 shows Code Development scores far exceed Execution and Result Match scores.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "PaperBench Code-Dev is weakly correlated with full PaperBench (Pearson r=0.48).",
    390       "evidence": "Footnote 5 in Section 2.6 reports r=0.48 with PB = 0.45*PBCD + 0.05.",
    391       "supported": "weak"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating own products",
    397       "detail": "All authors are OpenAI employees. OpenAI models (o1, o3-mini, GPT-4o) are evaluated alongside competitors. The judge uses OpenAI models (o3-mini). The scaffold was tuned using preliminary experiments that may favor OpenAI models. No competing interests statement."
    398     },
    399     {
    400       "flag": "Small sample size for human baseline",
    401       "detail": "Only 8 human participants on 4 papers (3 attempts each). Best@3 metric on 3 papers is a very small sample for human-AI comparison claims. Participants self-selected papers they were 'most confident about,' potentially inflating human scores."
    402     },
    403     {
    404       "flag": "Non-random assignment in human study",
    405       "detail": "Human participants chose papers they were confident about replicating, introducing selection bias. Some received A100 GPUs instead of A10s. These are not discussed as threats to validity."
    406     },
    407     {
    408       "flag": "Small and potentially unrepresentative benchmark",
    409       "detail": "20 papers from ICML 2024 Spotlight/Oral only. Selection required author cooperation (42 reached, 20 agreed), introducing potential bias toward papers whose authors were willing to collaborate."
    410     },
    411     {
    412       "flag": "Judge evaluated by judge's own developer",
    413       "detail": "JudgeEval uses only 4-5 papers and is created by the same team that built SimpleJudge. The evaluation of the judge is limited in scope and potentially biased toward the judge's strengths."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "MLE-bench: Evaluating machine learning agents on machine learning engineering",
    419       "authors": ["Jun Shern Chan", "Neil Chowdhury", "Oliver Jaffe"],
    420       "year": 2024,
    421       "arxiv_id": "2410.07095",
    422       "relevance": "Benchmark evaluating AI agents on Kaggle ML engineering tasks, directly related predecessor."
    423     },
    424     {
    425       "title": "CORE-Bench: Fostering the Credibility of Published Research Through a Computational Reproducibility Agent Benchmark",
    426       "authors": ["Zachary S. Siegel", "Sayash Kapoor"],
    427       "year": 2024,
    428       "arxiv_id": "2409.11363",
    429       "relevance": "Agent benchmark for computational reproducibility of research papers given existing repositories."
    430     },
    431     {
    432       "title": "RE-Bench: Evaluating frontier AI R&D capabilities of language model agents against human experts",
    433       "authors": ["Hjalmar Wijk", "Tao Lin"],
    434       "year": 2024,
    435       "arxiv_id": "2411.15114",
    436       "relevance": "Benchmark for AI R&D capabilities with human expert comparisons, directly comparable methodology."
    437     },
    438     {
    439       "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation",
    440       "authors": ["Qian Huang", "Jian Vora", "Percy Liang", "Jure Leskovec"],
    441       "year": 2024,
    442       "relevance": "Evaluates language agents on ML experimentation tasks including Kaggle competitions."
    443     },
    444     {
    445       "title": "Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with 100+ NLP Researchers",
    446       "authors": ["Chenglei Si", "Diyi Yang", "Tatsunori Hashimoto"],
    447       "year": 2024,
    448       "arxiv_id": "2409.04109",
    449       "relevance": "Large-scale study comparing LLM vs human research ideation capabilities."
    450     },
    451     {
    452       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    453       "authors": ["Shunyu Yao", "Jeffrey Zhao"],
    454       "year": 2023,
    455       "arxiv_id": "2210.03629",
    456       "relevance": "Foundation agent architecture (ReAct) used as basis for PaperBench's BasicAgent scaffold."
    457     },
    458     {
    459       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    460       "authors": ["Lianmin Zheng"],
    461       "year": 2023,
    462       "relevance": "LLM-as-judge methodology foundational to PaperBench's automated grading approach."
    463     },
    464     {
    465       "title": "AI Sandbagging: Language Models Can Strategically Underperform on Evaluations",
    466       "authors": ["Teun van der Weij"],
    467       "year": 2024,
    468       "arxiv_id": "2406.07358",
    469       "relevance": "Discusses strategic underperformance on evaluations, relevant to PaperBench's specification gaming concerns."
    470     },
    471     {
    472       "title": "Agent-as-a-Judge: Evaluate Agents with Agents",
    473       "authors": ["Mingchen Zhuge"],
    474       "year": 2024,
    475       "arxiv_id": "2410.10934",
    476       "relevance": "Agent-based judging framework for evaluating AI agent outputs on complex tasks."
    477     },
    478     {
    479       "title": "DSBench: How Far Are Data Science Agents to Becoming Data Science Experts?",
    480       "authors": ["Liqiang Jing"],
    481       "year": 2024,
    482       "arxiv_id": "2409.07703",
    483       "relevance": "Benchmark for data science agent capabilities, related evaluation methodology."
    484     }
    485   ]
    486 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs