scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20958B)
      1 {
      2   "paper": {
      3     "title": "From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline",
      4     "authors": ["Tianle Li", "Wei-Lin Chiang", "Evan Frick", "Lisa Dunlap", "Tianhao Wu", "Banghua Zhu", "Joseph E. González", "Ion Stoica"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2406.11939"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/lmarena/arena-hard-auto (Section 1, footnote 1)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Arena-Hard-Auto benchmark of 500 prompts is released as part of the open-source repository. The source data (Chatbot Arena, WildChat-1M) are publicly available datasets."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section found in the paper. The paper only mentions the specific models and APIs used."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The pipeline is described at a high level but specific commands or scripts to replicate results are not included."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "95% confidence intervals are computed via 100 rounds of bootstrapping on judgment results (Section 6.1). Confidence intervals shown in Figure 5 and used throughout the metrics."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No formal significance tests (p-values, t-tests, etc.) are used. Comparisons between benchmarks rely on the proposed metrics (separability, agreement) without statistical tests of whether differences are significant."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are provided in context: e.g., '3x higher separation' compared to MT-Bench, specific percentage differences in separability (87.4% vs 22.6%), agreement (90.9% vs 26.6%), and Brier scores (0.069 vs 0.09)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why 500 prompts were chosen, why 250 clusters were sampled, or why 20 models were used for evaluation. These are stated but not justified."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Variance is captured through bootstrapped confidence intervals (100 rounds of bootstrapping). Confidence intervals are shown in Figure 5 and used in the separability metric."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Comparisons are made against MT-Bench, AlpacaEval 2.0 LC, and Chatbot Arena (Table 1). Random baselines are also compared (Table 2, Appendix Table 7)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "MT-Bench (2023), AlpacaEval 2.0 LC (2024), and Chatbot Arena are all contemporary and widely-used LLM benchmarks at time of publication."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Several ablation-like analyses: quality score threshold effects (Figure 3), random vs. curated baselines (Table 2, Table 7), different annotators (Llama-3-70B vs GPT-4-Turbo, Table 8), different judges (Table 4), and style control effects (Table 5)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics used: Separability, Confidence Agreement, Spearman Correlation, Kendall Tau Correlation, and Brier Score (Table 1)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Chatbot Arena human preference rankings serve as the ground truth against which Arena-Hard-Auto is validated. The entire framework is evaluated by how well it aligns with human judgments."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a benchmark construction paper, not a model training paper. The concept of held-out test sets does not apply in the traditional sense."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Topic cluster analysis provided (Figure 4), per-model breakdowns in multiple tables, per-quality-score breakdowns (Figure 3), and per-judge breakdowns (Table 4)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Stylistic biases (length bias, self-bias) are discussed as failure modes in Sections 6.5 and 6.6. Table 5 shows how style manipulation can game the benchmark without style control."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Self-bias of GPT-4-Turbo judge is reported (Section 6.6, Appendix Table 10) showing it favors OpenAI models. Claude-3-Opus and Llama-3-70B perform worse as judges (Table 4)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of '3x higher separation' supported by Table 1 (87.4% vs 22.6%), '98.6% correlation' supported by Table 3/Appendix Table 9, '$20 cost' stated in Section 4.3/Table 1."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about the quality score filtering improving benchmark quality. This is supported by controlled comparisons: curated vs. random baselines (Table 2, Table 7), and ablations across quality score thresholds (Figure 3)."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Limitations section (Section 7) explicitly states the benchmark 'currently lacks evaluation for multi-turn and non-English interactions' and acknowledges the seven qualities 'may not fully capture the range of possible attributes, potentially skewing towards prompts in technical domains.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses stylistic bias as an alternative explanation for benchmark scores (Section 6.5), self-bias of judges (Section 6.6), and tests whether results hold under style control (Table 3)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model versions provided throughout: gpt-4-0314, gpt-4-1106-preview, gpt-4-turbo-2024-04-09, claude-3-opus-20240229, claude-3-sonnet-20240229, gemini-1.5-pro-0514, llama-3-70b-instruct, etc. (footnote 4, Table 4)."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper states that the judge prompt templates are in Appendix C. Style control system prompts are provided in Table 6 (e.g., 'You are a helpful assistant who thoroughly explains things with as much detail as possible.')."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No temperature, top-p, or other sampling parameters reported for LLM API calls used in judging or annotation."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The pipeline is a data curation workflow, not an agent system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Data pipeline documented: 200,000 initial prompts from Chatbot Arena, filtering of duplicates/multi-turn/non-English, clustering into 4,000 topics, quality scoring, threshold filtering (score <6 removed, cluster mean <5 removed), sampling 2 per cluster from 250 clusters (Section 4.2)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Dedicated Section 7 'Limitations' discusses biases in the pipeline, lack of multi-turn and non-English coverage, and potential skew toward technical domains."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats identified: seven quality criteria may skew toward technical domains, lack of multi-turn data in crowdsourced sources, primary language proficiency of authors limiting non-English evaluation."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 explicitly states what is NOT covered: multi-turn interactions, non-English evaluation. The benchmark is bounded to single-turn English queries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The benchmark prompts are released via the GitHub repository. Source datasets (Chatbot Arena conversations, WildChat-1M) are publicly available."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data sourced from Chatbot Arena (200,000 prompts) and WildChat-1M (150,000 queries). Collection via crowdsourced live platforms described in Sections 4.1-4.2."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants recruited for this study. Data comes from existing crowdsourced platforms (Chatbot Arena, WildChat). Human preferences used as ground truth are from the existing Chatbot Arena platform."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Full pipeline documented: embedding with text-embedding-3-small, UMAP dimensionality reduction, HDBSCAN clustering into 4,000 topics, LLM quality scoring on 7 criteria, threshold filtering, balanced sampling across clusters (Sections 4.1-4.2, Figure 2)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors listed with UC Berkeley affiliation. Authors are also affiliated with the LMSYS Chatbot Arena project, which is the source of the benchmark data."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosed, so independence cannot be assessed. The authors operate the Chatbot Arena platform whose data and rankings are used as ground truth, creating a potential conflict."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates multiple LLMs on the benchmark but does not state training data cutoff dates for any of the models used."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether Arena-Hard-Auto prompts (sourced from Chatbot Arena) could overlap with training data of the evaluated models. Chatbot Arena conversations are public and could be in training sets."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Section 2 discusses benchmark leakage as a motivation. The paper argues BenchBuilder enables 'continuous benchmark updates' to address contamination risk of static benchmarks. However, contamination of Arena-Hard-Auto itself is not directly tested."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants recruited for this study. Human preference data comes from existing Chatbot Arena platform."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants recruited for this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants recruited for this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants recruited for this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants recruited for this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants recruited for this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants recruited for this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Evaluation cost per model reported as $20 (Table 1, abstract). Pipeline annotation cost reported as ~$500 with GPT-4-Turbo or ~$45 with Llama-3-70B (Section 4.3)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "API costs are reported but total computational budget (GPU hours for embeddings, clustering, etc.) is not stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Arena-Hard-Auto achieves 98.6% correlation with Chatbot Arena human preference rankings.",
    286       "evidence": "Table 3 shows 98.6% Confidence Agreement and 98.6% Spearman Correlation with style-controlled Chatbot Arena English Hard Prompts ranking. Table 1 shows 93.2% Spearman and 90.9% Confidence Agreement with overall Chatbot Arena.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Arena-Hard-Auto provides 3x higher separation of model performances compared to MT-Bench.",
    291       "evidence": "Table 1: Arena-Hard-Auto separability 87.4% vs MT-Bench 22.6% (3.87x). Figure 5 visually demonstrates the difference.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Arena-Hard-Auto evaluation costs only $20 per model.",
    296       "evidence": "Table 1 lists eval cost per model as $20. Section 4.3 provides cost breakdown for the pipeline annotation step.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Style control effectively neutralizes gaming via response length or markdown manipulation.",
    301       "evidence": "Table 5 shows that with style control, the 'detail' variant of Llama-3.1-70B no longer outperforms the base model, whereas without style control it gains ~9 points.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "BenchBuilder generalizes to different data sources (WildChat).",
    306       "evidence": "Table 2 shows Wild-Hard-Auto (from WildChat) achieves 86.7% separability and 88.6% confidence agreement vs 75.6% and 36.4% for random baseline.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The paper introduces BenchBuilder, an automated pipeline for curating LLM benchmarks from crowdsourced data, and Arena-Hard-Auto, a 500-prompt benchmark. Arena-Hard-Auto achieves 87.4% separability (vs 22.6% for MT-Bench) and up to 98.6% agreement with human preferences at $20 per model evaluation. The paper also proposes style control methods that effectively mitigate length and formatting biases in LLM-as-a-judge evaluation, and demonstrates the pipeline generalizes across data sources (Chatbot Arena, WildChat).",
    312   "red_flags": [
    313     {
    314       "flag": "Authors evaluate their own platform",
    315       "detail": "The authors operate LMSYS Chatbot Arena, whose rankings serve as ground truth for validating Arena-Hard-Auto. This circular dependency means the benchmark is validated against a system the authors control. High agreement could partly reflect shared biases in data sourcing and evaluation methodology."
    316     },
    317     {
    318       "flag": "Contamination of Arena-Hard-Auto not directly tested",
    319       "detail": "While benchmark leakage is discussed as motivation, the paper does not test whether Arena-Hard-Auto prompts (sourced from public Chatbot Arena conversations) appear in training data of the evaluated models. Given that Chatbot Arena conversations are public and could be included in training sets, this is a real risk."
    320     },
    321     {
    322       "flag": "No hyperparameters for LLM calls",
    323       "detail": "Temperature, top-p, and other sampling parameters for the judge and annotator LLM calls are not reported, despite these being known to significantly affect output."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    329       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    330       "year": 2023,
    331       "relevance": "Foundational work on LLM-as-a-judge evaluation methodology that Arena-Hard-Auto builds upon."
    332     },
    333     {
    334       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    335       "authors": ["Wei-Lin Chiang", "Lianmin Zheng", "Ying Sheng"],
    336       "year": 2024,
    337       "relevance": "The crowdsourced human evaluation platform whose data and rankings serve as ground truth for this work."
    338     },
    339     {
    340       "title": "Length-Controlled AlpacaEval: A Simple Way to Debias Automatic Evaluators",
    341       "authors": ["Yann Dubois", "Balázs Galambosi", "Percy Liang", "Tatsunori B. Hashimoto"],
    342       "year": 2024,
    343       "arxiv_id": "2404.04475",
    344       "relevance": "Key baseline benchmark and introduces length bias control methods that Arena-Hard-Auto extends."
    345     },
    346     {
    347       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    348       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    349       "year": 2024,
    350       "relevance": "Prominent task-based LLM benchmark for code, relevant to the survey's scope of AI coding evaluation."
    351     },
    352     {
    353       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    354       "authors": ["Terry Yue Zhuo"],
    355       "year": 2024,
    356       "arxiv_id": "2406.15877",
    357       "relevance": "Contemporary code generation benchmark relevant to evaluating LLM programming capabilities."
    358     },
    359     {
    360       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    361       "authors": ["Naman Jain"],
    362       "year": 2024,
    363       "arxiv_id": "2403.07974",
    364       "relevance": "Live benchmark addressing contamination in code evaluation, directly relevant to benchmark methodology quality."
    365     },
    366     {
    367       "title": "AgentBench: Evaluating LLMs as Agents",
    368       "authors": ["Xiao Liu"],
    369       "year": 2023,
    370       "relevance": "Benchmark for evaluating LLMs in agentic settings, relevant to the survey's coverage of agent evaluation."
    371     },
    372     {
    373       "title": "Evaluating Large Language Models Trained on Code",
    374       "authors": ["Mark Chen"],
    375       "year": 2021,
    376       "arxiv_id": "2107.03374",
    377       "relevance": "Introduces HumanEval benchmark for code generation, foundational to LLM code evaluation methodology."
    378     },
    379     {
    380       "title": "NLP Evaluation in Trouble: On the Need to Measure LLM Data Contamination for Each Benchmark",
    381       "authors": ["Oscar Sainz"],
    382       "year": 2023,
    383       "relevance": "Directly addresses benchmark contamination measurement, a key methodological concern in the survey."
    384     },
    385     {
    386       "title": "WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild",
    387       "authors": ["Bill Yuchen Lin"],
    388       "year": 2024,
    389       "arxiv_id": "2406.04770",
    390       "relevance": "Contemporary benchmark using real user queries for LLM evaluation, comparable approach to Arena-Hard-Auto."
    391     }
    392   ]
    393 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs