scan.json (22429B)
1 { 2 "paper": { 3 "title": "Copilot Arena: A Platform for Code LLM Evaluation in the Wild", 4 "authors": ["Wayne Chi", "Valerie Chen", "Anastasios Nikolas Angelopoulos", "Wei-Lin Chiang", "Aditya Mittal", "Naman Jain", "Tianjun Zhang", "Ion Stoica", "Chris Donahue", "Ameet Talwalkar"], 5 "year": 2025, 6 "venue": "", 7 "arxiv_id": "" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repo provided: https://github.com/lm-sys/copilot-arena, and VSCode extension at https://lmarena.ai/copilot." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper states 'we will release a curated dataset' and 'Upon publication, we will also open-source more diverse examples' — future promises count as NO. Privacy concerns limit full data release." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup is mentioned in the paper." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided. The paper describes the system but does not give instructions for replicating the experiments or analysis." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "95% confidence intervals are computed via bootstrapping for the BT model rankings (Section 4.1, Table 4, Table 5). Lower and upper bounds are reported for each model's beta estimate." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "Spearman rank correlations are reported but no formal significance tests (p-values) are provided for claims that rankings differ between platforms or that win-rate changes are 'significant' — the paper uses a percentile threshold (top 90th) rather than a statistical test." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Spearman rank correlation coefficients are reported (rs = 0.62, 0.48, ≤0.1) for leaderboard comparisons. BT coefficients with confidence intervals provide magnitude context. Win-rate differences are quantified." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper reports 11k+ pairwise judgments and 4.5M suggestions but does not justify whether this sample size is sufficient for the claims made, nor is a power analysis discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Bootstrap confidence intervals across 100 rounds are reported in Tables 4 and 5, showing spread of BT coefficient estimates." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares Copilot Arena rankings against multiple existing evaluations: Chatbot Arena (general and coding), LiveBench, LiveCodeBench, and BigCodeBench (Section 4.2, Figure 5)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include recent and maintained benchmarks: LiveBench, LiveCodeBench, BigCodeBench, and Chatbot Arena — all contemporary at time of writing." 
69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper includes ablations: style-controlled BT model (Appendix D, Table 5), Snip-It vs non-Snip-It prompting (Table 2), and win-rate analysis across data partitions (task type, context length, FiM, language) in Section 5.2." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple metrics used: BT coefficients, win-rates, Spearman rank correlation, pass@1 for prompting evaluation (Table 2), and pairwise win-rate matrices." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "The entire platform is built around human evaluation — users provide pairwise preference judgments on code completions in their IDE. Over 11k judgments collected." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a live preference platform, not a train/test paradigm. There is no model training on the collected data in this paper." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Extensive breakdowns by task category, programming language, context length, FiM vs completion-only, and natural language (Section 5, Figures 6 and 7)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Appendix A.2.3 shows examples of errors without Snip-It. The paper also discusses completion order bias (86% first-completion preference) in Appendix C and discusses where smaller models underperform." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that models explicitly trained for infilling 'do not experience large changes to win-rate' (Section 5.2), which is a surprising negative result. Also reports low correlation with static benchmarks (rs ≤ 0.1)." 
104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims about rankings differing from existing evaluations are supported by Spearman correlations in Section 4.2. Claims about consistency across programming languages and variation by task are supported by Figure 7 and Section 5.2." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper claims differences are attributable to 'the more realistic distribution of data and tasks' (abstract) — this is a causal claim from observational data without controlling for confounds like user population differences, model API differences, or selection effects." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The limitations section (Section 7) explicitly states 'it is unclear to what extent our results encapsulate all real-world use cases' and notes the platform doesn't perfectly mirror real-world tools like GitHub Copilot." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 5.2 considers confounding variables like response length and latency (style-controlled BT ablation in Appendix D). The paper also discusses completion order bias as an alternative explanation for preferences. Limitations section acknowledges platform differences." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Table 3 in Appendix B provides full model names with version identifiers: claude-3-5-sonnet-20240620, gpt-4o-2024-08-06, gpt-4o-mini-2024-07-18, codestral-2405, gemini-1.5-flash-002, gemini-1.5-pro-002, etc." 
133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Full prompt templates with examples are provided in Appendix A.2 (PSM, SPM, Mask, IPF formats) including the actual prompt text used for code completion, and task detection prompts in Appendix C." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No mention of temperature, top-p, or other sampling parameters used when calling the LLM APIs for code completion. The 0.5 second delay and 8000 token input limit are mentioned but not model generation hyperparameters." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "The system is a code completion platform, not an agentic scaffolding setup. Models are called directly for single-turn completions." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix C documents data preprocessing: natural language detection method (lingua detector, confidence > 0.7), programming language detection (file extension), task clustering pipeline (multi-step LLM classification), and filtering criteria for completion bias analysis (outlier removal)." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 7 'Discussion' begins with a dedicated 'Limitations' subsection with substantive discussion." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Specific threats discussed: pairwise completions and slower latency don't perfectly mirror real-world platforms, unable to include GitHub Copilot because its model is not available via API, privacy considerations limit data release, completion order bias (86% first-completion selection)." 
165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper states 'this should not be treated as the sole defining metric of model quality, but instead an informative one' and acknowledges inability to include GitHub Copilot and that results may not encapsulate all real-world use cases." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "Raw data is not released due to privacy concerns. The paper promises future curated releases with PII removal but does not provide data access at time of publication." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Sections 2-3 describe data collection: VSCode extension deployed to real users, pairwise completion interface, voting mechanism, privacy settings (Appendix B). Collection period and scale (4.5M suggestions, 11k+ votes) are stated." 182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper does not describe how users were recruited to install the Copilot Arena VSCode extension, whether this introduces selection bias, or what channels were used for recruitment." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline from user interaction to analysis is documented: code context capture → model routing → completion generation → user voting → BT analysis. Data filtering for analysis is described (Appendix C: language detection, task clustering, outlier removal for bias analysis)." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 9 'Acknowledgments' lists NSF grants (IIS1705121, IIS1838017, IIS2046613, IIS2112471) and funding from Sony AI, Meta, Morgan Stanley, Amazon, Google, and Scribe." 
199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations (Carnegie Mellon University and UC Berkeley) are clearly listed. No authors appear affiliated with the model providers being evaluated." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "Funders include Google, Amazon, and Meta; Google's Gemini models are evaluated in the leaderboard, so at least one funder has a direct stake in the rankings. This creates a potential conflict of interest that is not acknowledged." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "The paper evaluates models on live user data, not on a benchmark that could be in training data. Contamination is not a concern for a live preference platform." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not applicable — user code contexts are live and novel, not a pre-existing benchmark dataset." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not applicable — this is a live evaluation platform, not a static benchmark. The paper actually positions itself as an alternative that avoids contamination issues." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": true, 236 "answer": false, 237 "justification": "No pre-registration is mentioned for the study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": true, 241 "answer": false, 242 "justification": "No IRB or ethics board approval is mentioned despite collecting data from human users and their code."
243 }, 244 "demographics_reported": { 245 "applies": true, 246 "answer": false, 247 "justification": "No user demographics are reported — no information on experience level, geographic distribution, or professional background of the developers using the platform." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": true, 251 "answer": false, 252 "justification": "No inclusion or exclusion criteria for participants are stated. Anyone who installs the VSCode extension can participate." 253 }, 254 "randomization_described": { 255 "applies": true, 256 "answer": true, 257 "justification": "Section 2.2 describes the model routing/sampling strategy: models are randomly paired for each comparison, with a latency-optimized sampling strategy described." 258 }, 259 "blinding_described": { 260 "applies": true, 261 "answer": true, 262 "justification": "Users are blinded to model identities during evaluation — they see two anonymous completions and only learn model identities after voting (Appendix B.1: 'After the user votes, we reveal the model pair')." 263 }, 264 "attrition_reported": { 265 "applies": true, 266 "answer": false, 267 "justification": "No information on how many users installed the extension vs. how many actually provided votes, or user dropout/retention over time." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No API costs or per-completion costs are reported for running the platform, despite serving 4.5 million suggestions across 10 models. Latency is discussed only as an input to the sampling strategy and the style-controlled analysis, not as a cost figure." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget, API spend, or infrastructure costs are stated for operating the platform."
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Model rankings from Copilot Arena differ from those of existing evaluations, with highest correlation (rs = 0.62) with Chatbot Arena coding and low correlation (rs ≤ 0.1) with static benchmarks.", 286 "evidence": "Section 4.2 and Figure 5 show Spearman rank correlations between Copilot Arena and other leaderboards across 10 models.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Downstream task type significantly affects relative model win-rates (31.1% of possible changes), while programming language has little effect (6.7% of possible changes).", 291 "evidence": "Section 5.2 and Figure 7 show win-rate difference analysis across data partitions with a top-90th-percentile threshold.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Models explicitly trained for infilling do not experience large changes to win-rate on FiM vs non-FiM tasks.", 296 "evidence": "Section 5.2 and Table 6 show DeepSeek Coder performance remains consistent when using Chat API with prompting scheme vs native FiM.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Smaller models tend to overperform on data similar to static benchmarks compared to Copilot Arena's realistic distribution.", 301 "evidence": "Section 4.2 and 5.2 discuss GPT-4o mini and Qwen-2.5 Coder performing worse on frontend/backend tasks, longer contexts, and non-Python settings in Copilot Arena.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "The Snip-It prompting method improves pass@1 for most models and prompt templates.", 306 "evidence": "Table 2 in Appendix A.2.2 shows pass@1 comparisons across 9 models and 4 prompt templates with and without Snip-It.", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval", "observational"], 311 "key_findings": "Copilot Arena is a VSCode-integrated platform for evaluating code LLMs via live pairwise preference judgments, collecting 11k+ votes across 10 models. 
Rankings differ substantially from static benchmarks (Spearman rs ≤ 0.1) but correlate moderately with Chatbot Arena coding (rs = 0.62). Task type is the strongest factor affecting model rankings, while programming language has minimal effect. Code-specific models like DeepSeek Coder are competitive with general-purpose SOTA models, and smaller models tend to overperform on static benchmark-like data compared to realistic IDE usage.", 312 "red_flags": [ 313 { 314 "flag": "Completion order bias", 315 "detail": "Users selected the first completion 86% of the time. While the paper investigates this and finds non-trivial decision times, such a strong positional bias could significantly affect rankings if model assignment to positions is not perfectly balanced." 316 }, 317 { 318 "flag": "No IRB approval mentioned for human subjects research", 319 "detail": "The platform collects code and preference data from users but does not mention IRB or ethics review, despite this constituting human subjects research." 320 }, 321 { 322 "flag": "Funders include evaluated model providers", 323 "detail": "Google, Amazon, and Meta are listed among the funders, and Google's Gemini models are evaluated in the leaderboard. This conflict is not acknowledged." 324 }, 325 { 326 "flag": "Unknown user population", 327 "detail": "No demographics or recruitment methods are described. Self-selected users who install a research VSCode extension may not be representative of professional developers, limiting generalizability claims." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Judging LLM-as-a-judge with MT-bench and Chatbot Arena", 333 "authors": ["Lianmin Zheng", "Wei-Lin Chiang"], 334 "year": 2023, 335 "relevance": "Foundational work on arena-based LLM evaluation via human preferences that Copilot Arena builds upon."
336 }, 337 { 338 "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference", 339 "authors": ["Wei-Lin Chiang"], 340 "year": 2024, 341 "arxiv_id": "2403.04132", 342 "relevance": "Primary comparison platform for human preference evaluation of LLMs." 343 }, 344 { 345 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 346 "authors": ["Naman Jain", "King Han"], 347 "year": 2024, 348 "arxiv_id": "2403.07974", 349 "relevance": "Live coding benchmark designed to reduce contamination, used as baseline comparison." 350 }, 351 { 352 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 353 "authors": ["Terry Yue Zhuo"], 354 "year": 2024, 355 "arxiv_id": "2406.15877", 356 "relevance": "Code generation benchmark used as baseline comparison for model rankings." 357 }, 358 { 359 "title": "AI Agents That Matter", 360 "authors": ["Sayash Kapoor", "Benedikt Stroebl"], 361 "year": 2024, 362 "arxiv_id": "2407.01502", 363 "relevance": "Discusses evaluation methodology concerns for AI agents, directly relevant to survey scope." 364 }, 365 { 366 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 367 "authors": ["Sida Peng", "Eirini Kalliamvakou"], 368 "year": 2023, 369 "arxiv_id": "2302.06590", 370 "relevance": "Major study on AI coding assistant productivity impact." 371 }, 372 { 373 "title": "The RealHumanEval: Evaluating Large Language Models' Abilities to Support Programmers", 374 "authors": ["Hussein Mozannar", "Valerie Chen"], 375 "year": 2024, 376 "arxiv_id": "2404.02806", 377 "relevance": "Human-centric evaluation of LLMs for programming support, directly comparable methodology." 
378 }, 379 { 380 "title": "Evaluating Large Language Models Trained on Code", 381 "authors": ["Mark Chen"], 382 "year": 2021, 383 "arxiv_id": "2107.03374", 384 "relevance": "HumanEval benchmark — foundational code generation evaluation that Copilot Arena aims to complement." 385 }, 386 { 387 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 388 "authors": ["Carlos E Jimenez", "John Yang"], 389 "year": 2023, 390 "relevance": "Major real-world coding benchmark for evaluating LLM software engineering capabilities." 391 }, 392 { 393 "title": "The Productivity Effects of Generative AI: Evidence from a Field Experiment with GitHub Copilot", 394 "authors": ["Kevin Zheyuan Cui", "Mert Demirer"], 395 "year": 2024, 396 "relevance": "Field experiment on coding assistant productivity, complementary evaluation methodology." 397 }, 398 { 399 "title": "LiveBench: A Challenging, Contamination-Free LLM Benchmark", 400 "authors": ["Colin White", "Samuel Dooley"], 401 "year": 2024, 402 "arxiv_id": "2406.19314", 403 "relevance": "Live benchmark designed to avoid contamination, used as baseline comparison." 404 } 405 ] 406 }