scan.json (26983B)
1 { 2 "paper": { 3 "title": "ArtifactsBench: Bridging the Visual-Interactive Gap in LLM Code Generation Evaluation", 4 "authors": [ 5 "Chenchen Zhang", 6 "Yuhang Li", 7 "Can Xu", 8 "Jiaheng Liu", 9 "Ao Liu", 10 "Changzhi Zhou", 11 "Ken Deng", 12 "Dengpeng Wu", 13 "Guanhua Huang", 14 "Kejiao Li", 15 "Qi Yi", 16 "Ruibin Xiong", 17 "Shihui Hu", 18 "Yue Zhang", 19 "Yuhao Jiang", 20 "Zenan Xu", 21 "Yuanxing Zhang", 22 "Wiggin Zhou", 23 "Chayse Zhou", 24 "Fengzong Lian" 25 ], 26 "year": 2025, 27 "venue": "arXiv", 28 "arxiv_id": "2507.04952", 29 "doi": "10.48550/arXiv.2507.04952" 30 }, 31 "checklist": { 32 "artifacts": { 33 "code_released": { 34 "applies": true, 35 "answer": true, 36 "justification": "The paper states 'We open-source ArtifactsBench, including the benchmark, evaluation harness, and baseline results at https://artifactsbenchmark.github.io' (abstract and Section 1). A concrete URL is provided." 37 }, 38 "data_released": { 39 "applies": true, 40 "answer": true, 41 "justification": "The benchmark dataset of 1,825 tasks is stated to be released at the project website. Section A.4 states 'We release dataset specs, checklist templates, evaluation scripts, and referee settings.'" 42 }, 43 "environment_specified": { 44 "applies": true, 45 "answer": false, 46 "justification": "No requirements.txt, Dockerfile, or environment specification is described in the paper. The execution environment is described at a high level (Playwright, headless Chromium, 1024x768) but no dependency versions or environment recreation details are provided." 47 }, 48 "reproduction_instructions": { 49 "applies": true, 50 "answer": false, 51 "justification": "While the evaluation pipeline is described conceptually, no step-by-step reproduction instructions (README with commands, scripts to run) are provided in the paper itself. The project website is referenced but the paper does not contain reproduction steps." 
52 } 53 }, 54 "statistical_methodology": { 55 "confidence_intervals_or_error_bars": { 56 "applies": true, 57 "answer": false, 58 "justification": "All results in Tables 3, 6, and 7 are reported as point estimates without confidence intervals or error bars. No uncertainty quantification is provided for the benchmark scores." 59 }, 60 "significance_tests": { 61 "applies": true, 62 "answer": false, 63 "justification": "The paper makes numerous comparative claims (e.g., one model outperforms another, generalists outperform specialists) but provides no statistical significance tests. All comparisons are based on raw score differences." 64 }, 65 "effect_sizes_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No effect sizes (Cohen's d, etc.) are reported. Differences between models are presented as raw score gaps without formal quantification of effect magnitude." 69 }, 70 "sample_size_justified": { 71 "applies": true, 72 "answer": false, 73 "justification": "The benchmark contains 1,825 tasks and the human validation study uses 280 queries with 6 models, but no justification for why these specific sample sizes were chosen is provided. No power analysis is discussed." 74 }, 75 "variance_reported": { 76 "applies": true, 77 "answer": false, 78 "justification": "No variance, standard deviation, or spread measures are reported for any results. It is unclear whether experiments were run multiple times. Scores appear to be single-run evaluations." 79 } 80 }, 81 "evaluation_design": { 82 "baselines_included": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper evaluates 30+ LLMs as baselines spanning multiple families (Qwen, DeepSeek, Gemma, GPT, Claude, Gemini, Hunyuan, Seed) as described in Section 4.1 and Table 3." 
86 }, 87 "baselines_contemporary": { 88 "applies": true, 89 "answer": true, 90 "justification": "Baselines include very recent models: Claude 4.0-Sonnet, GPT-5, Gemini-2.5-Pro, DeepSeek-R1-0528, Qwen3 family — all from 2025. The baselines are contemporary and competitive." 91 }, 92 "ablation_study": { 93 "applies": true, 94 "answer": true, 95 "justification": "Tables 4 and 5 present ablation studies on the evaluation pipeline: with/without images, single vs. multiple screenshots, with/without answer text, image vs. caption-based evaluation. These demonstrate the contribution of individual components." 96 }, 97 "multiple_metrics": { 98 "applies": true, 99 "answer": true, 100 "justification": "The paper uses multiple metrics: overall score, vision-oriented vs. code-oriented scores, per-interactivity-level breakdowns (SV, MMD, HD, II), per-category breakdowns, Pair ACC for human agreement, and normalized Footrule consistency for ranking correlation with WebDev Arena." 101 }, 102 "human_evaluation": { 103 "applies": true, 104 "answer": true, 105 "justification": "Section 4.3 and A.3 describe a double-blind human study with 280 queries and 6 models judged by multiple front-end engineers. Pairwise agreement (Pair ACC) is computed between automated referee and human experts." 106 }, 107 "held_out_test_set": { 108 "applies": true, 109 "answer": false, 110 "justification": "The paper does not describe a held-out test set. All 1,825 tasks appear to be used for evaluation without a train/dev/test split. The human validation uses a random subset of 280 queries from the same pool." 111 }, 112 "per_category_breakdown": { 113 "applies": true, 114 "answer": true, 115 "justification": "Table 3 provides breakdowns by interactivity level (SV, MMD, HD, II) and by task category (GAME, SVG, WEB, SI, MS). Figure 5 shows performance across difficulty tiers." 
116 }, 117 "failure_cases_discussed": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 4.2 discusses that 'All models score lowest on Intensive Interactive tasks and complex Management System scenarios.' The paper identifies systematic failure modes on intensive-interactive tasks. Section A.1 acknowledges that discrete screenshot sampling may miss complex interaction failures." 121 }, 122 "negative_results_reported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Table 5 shows that removing images or answers degrades performance. The paper reports that specialist models (coder, VL) underperform generalists, which is a somewhat negative finding for the specialist paradigm. Distillation on limited data is shown to yield only modest gains." 126 } 127 }, 128 "claims_and_evidence": { 129 "abstract_claims_supported": { 130 "applies": true, 131 "answer": true, 132 "justification": "The abstract claims 94.4% ranking consistency with WebDev Arena (supported by Figure 7 and Section 4.4), up to 90.95% pairwise agreement with human experts (supported by Table 4), and evaluation of 30+ LLMs (supported by Table 3). All abstract claims have corresponding results." 133 }, 134 "causal_claims_justified": { 135 "applies": true, 136 "answer": true, 137 "justification": "The ablation studies (Tables 4, 5) make causal claims about the contribution of screenshots and code-aware judging through controlled single-variable manipulations: adding images improves Pair ACC, multiple screenshots further improve it, removing answers degrades quality. These ablations are adequate for the causal claims made." 138 }, 139 "generalization_bounded": { 140 "applies": true, 141 "answer": false, 142 "justification": "The title and abstract use broad language ('LLM Code Generation Evaluation') but the benchmark focuses specifically on interactive visual artifacts (web apps, games, SVGs). 
The claims about model capabilities are stated broadly ('generalist skills outperform specialist expertise') but tested only on this specific benchmark domain. Section A.1 acknowledges some limitations but does not bound the generalization of the main findings." 143 }, 144 "alternative_explanations_discussed": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper does not discuss alternative explanations for its key findings. For example, it does not consider whether the MLLM referee's own biases could explain the high correlation with WebDev Arena, or whether the finding that generalists outperform specialists could be an artifact of task selection bias. The limitations section (A.1) discusses future work directions but not alternative explanations for observed results." 148 } 149 }, 150 "setup_transparency": { 151 "model_versions_specified": { 152 "applies": true, 153 "answer": false, 154 "justification": "Most models are specified with marketing names only: 'Gemini-2.5-Pro', 'Claude 4.0-Sonnet', 'GPT-4o'. Some have snapshot dates (e.g., 'GPT-4.1-2025-04-14', 'O1-2024-12-17', 'Claude Sonnet 4 (20250514)') but many do not have version identifiers or API snapshot dates. 'Hunyuan-Turbos-Preview' and 'Hunyuan-A13B' lack version specifics." 155 }, 156 "prompts_provided": { 157 "applies": true, 158 "answer": true, 159 "justification": "The paper provides full prompt texts in the appendix: query quality filtering prompt (Figure 12), category classification prompts (Figures 13, 14), checklist generation prompts (Figures 15, 16), and the final scoring prompt (Figure 17). These are the actual prompts used, not summaries." 160 }, 161 "hyperparameters_reported": { 162 "applies": true, 163 "answer": false, 164 "justification": "Section 4.1 states 'temperature and top-p tuned per official recommendations; max tokens sufficient to prevent truncation' but does not report the actual values. 
Specific temperature, top-p, and max_tokens settings are not provided." 165 }, 166 "scaffolding_described": { 167 "applies": true, 168 "answer": true, 169 "justification": "The evaluation pipeline is described in detail: sandbox execution with Playwright (headless Chromium), deterministic seeds, 1024x768 resolution, three-step screenshot capture, dual-referee scoring, 10-dimension checklists. The pipeline architecture is shown in Figure 3." 170 }, 171 "data_preprocessing_documented": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 2.2 describes the full data construction pipeline including sourcing and filtering, de-duplication (MinHash + semantic similarity + perceptual hashing), prompt rewriting, difficulty calibration, checklist generation, solvability validation, and execution harness setup. Figure 2 provides a visual overview." 175 } 176 }, 177 "limitations_and_scope": { 178 "limitations_section_present": { 179 "applies": true, 180 "answer": true, 181 "justification": "Appendix A.1 contains a dedicated 'Limitations and Future Work' section discussing limitations of the screenshot-based evaluation approach and the single-turn evaluation scope." 182 }, 183 "threats_to_validity_specific": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section A.1 discusses specific limitations: (1) discrete screenshot sampling may miss fluid/complex interactions in long-horizon tasks, (2) the benchmark evaluates single-turn generation only, not iterative refinement or agentic development. These are specific to this study's design." 187 }, 188 "scope_boundaries_stated": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section A.1 explicitly states what the benchmark does NOT evaluate: it 'does not assess an LLM's capability to function as an autonomous agent that can iteratively refine an artifact based on feedback, debug its own code in response to errors, or plan and execute a multi-step development process.' 
The section also notes that screenshot sampling 'may not fully capture the fluidity, correctness, and robustness of the entire interactive experience.'" 192 } 193 }, 194 "data_integrity": { 195 "raw_data_available": { 196 "applies": true, 197 "answer": true, 198 "justification": "The paper states that the benchmark data, evaluation harness, and baseline results are open-sourced at https://artifactsbenchmark.github.io. Section A.4 confirms 'We release dataset specs, checklist templates, evaluation scripts, and referee settings.'" 199 }, 200 "data_collection_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section 2.2 describes the data collection procedure in detail: aggregating candidates from expert showcases, open SVG/web-snippet datasets, web case studies, and LLM visual-to-query from screenshots, followed by automated filtering, de-duplication, prompt rewriting, and solvability validation." 204 }, 205 "recruitment_methods_described": { 206 "applies": true, 207 "answer": false, 208 "justification": "For the human validation study (280 queries, Section A.3), participants are described only as 'multiple engineers with extensive front-end development experience.' No details are provided about how many annotators participated, how they were recruited, or whether they were internal Tencent employees. This is relevant because internal employees could have conflicts of interest." 209 }, 210 "data_pipeline_documented": { 211 "applies": true, 212 "answer": true, 213 "justification": "Section 2.2 and Figure 2 document the full pipeline from sourcing through filtering, de-duplication, rewriting, calibration, solvability validation, to final benchmark construction. The realized difficulty split (559/611/655) is stated. The checklist calibration process with Cohen's kappa >= 0.8 is described." 214 } 215 }, 216 "conflicts_of_interest": { 217 "funding_disclosed": { 218 "applies": true, 219 "answer": false, 220 "justification": "No funding source is disclosed. 
There is no acknowledgments section listing grants or funding. The work is from Tencent but no explicit funding statement is made." 221 }, 222 "affiliations_disclosed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Author affiliations are disclosed in Section 7 (Contributions and Acknowledgements). Most authors are from Tencent, with some from NJU and PKU. One contributor is listed as 'Independent.'" 226 }, 227 "funder_independent_of_outcome": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper is produced by Tencent's Hunyuan team and evaluates Hunyuan models (Hunyuan-Turbos-Preview, Hunyuan-A13B) within the benchmark. Tencent has a direct financial interest in their models performing well. The funder/employer is not independent of the outcome." 231 }, 232 "financial_interests_declared": { 233 "applies": true, 234 "answer": false, 235 "justification": "No competing interests statement is provided. Given that the authors work at Tencent and evaluate Tencent's Hunyuan models, a financial interest declaration would be appropriate but is absent." 236 } 237 }, 238 "contamination": { 239 "training_cutoff_stated": { 240 "applies": true, 241 "answer": false, 242 "justification": "No model training data cutoff dates are stated. The paper evaluates 30+ LLMs on the benchmark without discussing when their training data was collected." 243 }, 244 "train_test_overlap_discussed": { 245 "applies": true, 246 "answer": true, 247 "justification": "Section 2.2 describes contamination control: 'Two-stage filtering: (i) MinHash + semantic similarity over prompts, checklists, and normalized DOM/CSS/JS; (ii) screenshot perceptual hashing to catch visually near-identical artifacts.' The dataset is stated to be self-constructed with contamination control measures." 
248 }, 249 "benchmark_contamination_addressed": { 250 "applies": true, 251 "answer": true, 252 "justification": "Section 2.2 explicitly addresses contamination under 'De-duplication & contamination control' with two-stage filtering including MinHash, semantic similarity, and perceptual hashing. Flagged items are 're-authored or discarded.' The benchmark is newly constructed, reducing contamination risk." 253 } 254 }, 255 "human_studies": { 256 "pre_registered": { 257 "applies": true, 258 "answer": false, 259 "justification": "The human validation study (280 queries, 6 models) is not pre-registered. No link to a pre-registration is provided." 260 }, 261 "irb_or_ethics_approval": { 262 "applies": true, 263 "answer": false, 264 "justification": "No IRB or ethics board approval is mentioned for the human evaluation study involving front-end engineers as annotators." 265 }, 266 "demographics_reported": { 267 "applies": true, 268 "answer": false, 269 "justification": "Human evaluators are described only as 'multiple engineers with extensive front-end development experience.' No demographics (number of annotators, years of experience, etc.) are reported." 270 }, 271 "inclusion_exclusion_criteria": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inclusion or exclusion criteria for human evaluators are stated. The only qualifier is 'extensive front-end development experience.'" 275 }, 276 "randomization_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "The human study is a validation/annotation task (rating artifact quality), not an experimental study comparing treatment vs. control conditions. Randomization of condition assignment is not applicable." 
280 }, 281 "blinding_described": { 282 "applies": true, 283 "answer": true, 284 "justification": "Section A.3 states: 'The process follows a double-blind protocol: annotators remain unaware of the MLLM's scores, and the samples appear in randomized order to mitigate bias.'" 285 }, 286 "attrition_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No information is provided about how many human evaluators started vs. finished the study, or whether any annotations were excluded." 290 } 291 }, 292 "cost_and_practicality": { 293 "inference_cost_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No inference cost, API costs, or tokens consumed are reported for the evaluation pipeline despite it requiring running 30+ LLMs on 1,825 tasks plus dual-referee scoring." 297 }, 298 "compute_budget_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "No total computational budget, GPU hours, or API spend is stated. The paper does not quantify the cost of running the full evaluation pipeline." 302 } 303 } 304 }, 305 "claims": [ 306 { 307 "claim": "ArtifactsBench achieves 94.4% normalized Footrule ranking consistency with WebDev Arena.", 308 "evidence": "Section 4.4 and Figure 7 show the ranking correlation between ArtifactsBench (judged by Gemini-2.5-Pro) and WebDev Arena, reporting 94.4% consistency.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "The automated MLLM referee achieves up to 90.95% pairwise agreement with human experts.", 313 "evidence": "Table 4 shows Gemini-2.5-Pro with multiple screenshots achieves 90.95% Pair ACC. 
The human study uses 280 queries and 6 models judged by multiple front-end engineers (Section A.3).", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "Instruction-tuned generalist models consistently outperform specialist coder/VL models on visual artifact generation.", 318 "evidence": "Section 4.2 and Table 3 show Qwen-2.5-Instruct surpasses both Qwen-2.5-coder and Qwen2.5-VL-72B across the evaluated tasks.", 319 "supported": "moderate" 320 }, 321 { 322 "claim": "Performance scales with model size and deliberation time.", 323 "evidence": "Section 4.2 describes scaling trends within the Qwen2.5 and Qwen3 families in Table 3, and notes that 'slow thinkers' score higher.", 324 "supported": "moderate" 325 }, 326 { 327 "claim": "Multiple screenshots better capture dynamics than a single screenshot for MLLM-as-Judge evaluation.", 328 "evidence": "Table 4 shows consistent improvement going from 'w/o img' to 'w/ img' to 'w/ imgs' across all three referee models (e.g., Gemini-2.5-Pro: 79.06% → 87.10% → 90.95%).", 329 "supported": "strong" 330 }, 331 { 332 "claim": "Proprietary multimodal models show a clear advantage over open-source models.", 333 "evidence": "Table 3 shows Gemini-2.5-Pro and Claude 4.0-Sonnet at the top of rankings. Table 6 shows GPT-5 leading at 72.55 average score.", 334 "supported": "moderate" 335 } 336 ], 337 "methodology_tags": [ 338 "benchmark-eval" 339 ], 340 "key_findings": "ArtifactsBench introduces a 1,825-task benchmark for evaluating LLM-generated interactive visual artifacts across nine domains (games, web apps, SVG, simulations, etc.), using an automated MLLM-as-Judge evaluation pipeline with dual referees. The benchmark achieves 94.4% ranking consistency with WebDev Arena and up to 90.95% pairwise agreement with human experts. 
Key findings include that proprietary models (Gemini-2.5-Pro, Claude 4.0-Sonnet) lead the rankings, performance scales with model size, and instruction-tuned generalist models outperform specialist coder or vision-language models on this task.", 341 "red_flags": [ 342 { 343 "flag": "Conflict of interest: Tencent evaluating own models", 344 "detail": "The benchmark is created by Tencent's Hunyuan team and evaluates Tencent's Hunyuan models (Hunyuan-Turbos-Preview, Hunyuan-A13B). Hunyuan-Turbos-Preview ranks highly among closed-source models (50.97 AVG in Table 3), close to Claude 3.7-Sonnet. The checklist generation itself uses 'Hunyuan-Turbos' (Section 3.1), meaning the evaluation rubrics were generated by the same company's model. No conflict of interest statement is provided." 345 }, 346 { 347 "flag": "No uncertainty quantification", 348 "detail": "Results across 30+ models on 1,825 tasks are reported as point estimates with no confidence intervals, error bars, significance tests, or variance across runs. It is unclear whether results are from single runs or averaged." 349 }, 350 { 351 "flag": "Hyperparameters not fully specified", 352 "detail": "The paper states 'temperature and top-p tuned per official recommendations' without providing the actual values used for each model. Different temperature settings could meaningfully affect scores, and the vague description prevents reproduction." 353 }, 354 { 355 "flag": "Human study details insufficient", 356 "detail": "The human validation study (280 queries, 6 models) does not report the number of annotators, their demographics, how they were recruited, or whether they are Tencent employees. If annotators are internal, this compounds the conflict-of-interest concern." 357 }, 358 { 359 "flag": "Circular evaluation risk", 360 "detail": "Hunyuan-Turbos is used to generate the 10-dimension checklists (Section 3.1), and Hunyuan models are then evaluated using those checklists. 
While human refinement is mentioned, the initial checklist generation by the same company's model creates a potential circularity where the evaluation criteria may be biased toward what Hunyuan models produce." 361 } 362 ], 363 "cited_papers": [ 364 { 365 "title": "Evaluating large language models trained on code", 366 "authors": ["Mark Chen"], 367 "year": 2021, 368 "arxiv_id": "2107.03374", 369 "relevance": "Foundational LLM code generation benchmark (HumanEval) that ArtifactsBench aims to extend beyond algorithmic correctness." 370 }, 371 { 372 "title": "SWE-bench: Can language models resolve real-world github issues?", 373 "authors": ["Carlos E Jimenez"], 374 "year": 2023, 375 "arxiv_id": "2310.06770", 376 "relevance": "Key benchmark for LLM-based software engineering that ArtifactsBench complements by focusing on visual/interactive artifacts." 377 }, 378 { 379 "title": "Judging LLM-as-a-judge with MT-bench and chatbot arena", 380 "authors": ["Lianmin Zheng"], 381 "year": 2023, 382 "relevance": "Establishes the LLM-as-Judge paradigm that ArtifactsBench builds upon for automated evaluation." 383 }, 384 { 385 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 386 "authors": ["Daya Guo"], 387 "year": 2025, 388 "arxiv_id": "2501.12948", 389 "relevance": "One of the top-performing open-source models evaluated on ArtifactsBench, relevant to understanding LLM code generation capabilities." 390 }, 391 { 392 "title": "WebGen-Bench: Evaluating LLMs on generating interactive and functional websites from scratch", 393 "authors": ["Zimu Lu"], 394 "year": 2025, 395 "arxiv_id": "2505.03733", 396 "relevance": "Directly related benchmark for web page generation that ArtifactsBench compares against." 
397 }, 398 { 399 "title": "FullFront: Benchmarking MLLMs across the full front-end engineering workflow", 400 "authors": ["Haoyu Sun"], 401 "year": 2025, 402 "arxiv_id": "2505.17399", 403 "relevance": "Related benchmark for front-end web development evaluation, comparing comprehension and generation capabilities." 404 }, 405 { 406 "title": "Web-Bench: A LLM code benchmark based on web standards and frameworks", 407 "authors": ["Kai Xu"], 408 "year": 2025, 409 "arxiv_id": "2505.07473", 410 "relevance": "Related benchmark focusing on DOM alignment and web task automation that ArtifactsBench compares against (69.4% vs 94.4% WebDev Arena consistency)." 411 }, 412 { 413 "title": "MLLM-bench: evaluating multimodal LLMs with per-sample criteria", 414 "authors": ["Wentao Ge"], 415 "year": 2023, 416 "arxiv_id": "2311.13951", 417 "relevance": "Related work on using per-sample evaluation criteria for multimodal LLM assessment." 418 }, 419 { 420 "title": "WebArena: A realistic web environment for building autonomous agents", 421 "authors": ["Shuyan Zhou"], 422 "year": 2023, 423 "arxiv_id": "2307.13854", 424 "relevance": "Influential web agent benchmark. Note that WebArena is a different project from WebDev Arena; it is WebDev Arena that ArtifactsBench uses as the gold standard for human preference alignment." 425 }, 426 { 427 "title": "Qwen3 technical report", 428 "authors": ["An Yang"], 429 "year": 2025, 430 "arxiv_id": "2505.09388", 431 "relevance": "Technical report for one of the major model families extensively evaluated on ArtifactsBench." 432 }, 433 { 434 "title": "CodeCriticBench: A holistic code critique benchmark for large language models", 435 "authors": ["Alexander Zhang"], 436 "year": 2025, 437 "arxiv_id": "2502.16614", 438 "relevance": "Related benchmark for evaluating LLM code critique capabilities, relevant to code quality evaluation methodology." 439 } 440 ] 441 }