scan.json (32405B)
1 { 2 "paper": { 3 "title": "DesignBench: A Comprehensive Benchmark for MLLM-based Front-end Code Generation", 4 "authors": [ 5 "Jingyu Xiao", 6 "Man Ho Lam", 7 "Ming Wang", 8 "Yuxuan Wan", 9 "Junliang Liu", 10 "Yintong Huo", 11 "Michael R. Lyu" 12 ], 13 "year": 2026, 14 "venue": "Conference '26 (ACM)", 15 "arxiv_id": "2506.06251", 16 "doi": "10.48550/arXiv.2506.06251" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "DesignBench evaluates 9 MLLMs across 3 front-end frameworks (React, Vue, Angular) and vanilla HTML/CSS on generation, edit, and repair tasks using 900 webpage samples. Claude-3.7, GPT-4o, Gemini-2.0, and Pixtral-124B are the top performers, but all models struggle significantly with framework-specific syntax (especially Angular) and show very low adoption of component-based design patterns (0.24% for React). Code-only input consistently outperforms image-only and even multimodal input for edit and repair tasks, suggesting MLLMs underutilize visual information. UI issue detection accuracy averages only 27.1% across all models and frameworks.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper states 'Our code and data are available at https://github.com/WebPAI/DesignBench' in the abstract and references [43]." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The benchmark data (900 webpage samples) is released at the same GitHub repository. The paper states 'Our code and data are available at https://github.com/WebPAI/DesignBench.'" 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper specifies temperature=0 and max tokens but does not provide a requirements.txt, Dockerfile, or detailed environment setup section listing library versions needed to reproduce the evaluation pipeline." 
38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper references the GitHub repo but does not include step-by-step reproduction instructions in the paper itself. No 'Reproducing Results' section or specific commands are provided." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results in Tables 5-10 are reported as point estimates (e.g., CLIP scores, MLLM scores) with no confidence intervals or error bars." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper makes numerous comparative claims (e.g., 'Claude-3.7 achieves the highest performance', 'code-only input consistently outperforms image-only') based solely on comparing raw numbers without any statistical significance tests." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "Raw metric values are reported in tables but no formal effect sizes (Cohen's d, relative improvement with baseline context, etc.) are computed. Differences between models or conditions are described qualitatively without quantifying effect magnitudes." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The benchmark contains 900 samples (430 generation, 359 edit, 111 repair) but no justification is given for why these sizes are adequate. No power analysis is discussed." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "Results appear to come from single runs at temperature=0 (the paper does not state the number of runs). No variance, standard deviation, or spread measures across multiple experimental runs are reported." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Nine MLLMs are compared against each other across all tasks and frameworks. 
Table 1 also compares DesignBench against prior benchmarks (Pix2code, WebSight, Web2Code, WebCode2M, Design2Code)." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The evaluated models include state-of-the-art MLLMs: Claude-3.7-sonnet (Feb 2025), GPT-4o (Nov 2024), Gemini-2.0-Flash, Qwen2.5-VL, and Llama-3.2-Vision, all contemporary at the time of evaluation." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "RQ4 (Section 6.4, Table 7) conducts an input context ablation study comparing code-only, image-only, and combined inputs for Design Edit and Repair tasks, systematically varying input modalities while keeping everything else constant." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Six metrics are used: CLIP (semantic similarity), SSIM (structural similarity), CSR (compilation success rate), CMLS (code modification location similarity), CMCS (code modification content similarity), and MLLM-as-Judge scores." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": true, 96 "justification": "Five PhD students conducted human validation of the MLLM-as-Judge metric by evaluating model outputs. Inter-annotator agreement was high (Kappa 0.8648 and 0.8428) and MLLM judge accuracy was 95.54% for edit and 91.89% for repair compared to human majority votes." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "The benchmark is used as a zero-shot test set. Models are not trained or tuned on any portion of DesignBench data; all 900 samples serve as evaluation-only data." 
102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Extensive breakdowns are provided: by framework (React/Vue/Angular/Vanilla), by task (generation/edit/repair), by difficulty level (easy/medium/hard in Table 6), by input context (Table 7), by edit type, by issue type, and by failure category (Figure 9)." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "RQ6 (Section 6.6) provides systematic failure analysis with 22 failure types across the three tasks. Figure 9 shows detailed failure type distributions for each model, including compile errors, layout disorder, wrong objects, partial edits, and no-repair attempts." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Multiple negative findings are reported: very low component-based design adoption (0.24% for React, Table 9), poor UI issue detection accuracy (27.14% average, Table 10), severe Angular compilation challenges, and multimodal input providing minimal benefit over code-only." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims about framework-specific limitations (supported by RQ2, Table 5, Fig. 6), task-related bottlenecks (supported by RQ1), and performance variations (supported by RQ3-RQ4) are all substantiated by corresponding experimental results in Sections 6.1-6.6." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The main causal-ish claims come from the input context ablation (RQ4), where code-only vs image-only vs both are compared in a controlled manipulation. The claim 'code representations convey more precise semantic information' is hedged with 'suggesting' and supported by the controlled experiment varying only the input type." 
129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": true, 133 "justification": "The title specifies 'MLLM-based Front-end Code Generation' and findings are generally scoped to the tested frameworks and models. The external validity section (Section 7) explicitly acknowledges the limitation to React, Vue, and Angular frameworks." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper does not discuss alternative explanations for its findings. For example, code-only outperforming image-only could be due to token budget differences, prompt design effects, or the specific way images are encoded, but none of these alternatives are considered." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper clearly defines what each metric measures (CLIP for semantic visual similarity, SSIM for structural similarity, CSR for compilation, CMLS/CMCS for code modification quality) and validates the MLLM-as-Judge proxy against human evaluation (95.54% and 91.89% accuracy). Claims match the granularity of measurements." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Table 4 lists specific model versions for 7 of 9 models: GPT-4o-2024-11-20, Claude-3-7-sonnet-20250219, Pixtral-12B-2409, Qwen2.5-VL-7B/72B-Instruct, Llama-3.2-11B/90B-Vision-Instruct. Two exceptions lack snapshot dates: Gemini-2.0-Flash and Pixtral-large-latest (a floating tag)." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": false, 155 "justification": "Figure 5 shows only simplified prompt templates with placeholders like '<framework>', '<instruction>', '{code}', and '<image>'. The paper states 'detailed prompts available in our code [43]' but does not include the actual prompt text used in experiments." 
156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 5.2 states: 'we set the temperature to 0 and the maximum number of tokens output as the upper limit of MLLMs' maximum output token. All other parameters are kept at their default settings.'" 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. Models are prompted once and generate code output directly without tool use, retry logic, or multi-turn feedback loops." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 4.2 documents the full data collection and processing pipeline for each task: GitHub/top 500 collection, single-file tool for saving, image placeholder replacement, Selenium screenshots, V0/Vue0 crawling with filtering criteria, annotator quality assessment with majority voting, and GPT-4o translation for Angular/Vanilla edit samples." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 7 'Threats to Validity' provides substantive discussion of internal and external validity threats." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 7 discusses specific threats: MLLM-as-judge reliability (mitigated by human validation with 95.54% accuracy), data leakage from closed applications and manually-written ground truth, and limitation to three specific frameworks. These are specific to this study." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "External validity explicitly states: 'We only include limited frameworks of React, Vue, and Angular' and explains the rationale. 
Section 3.2 also notes that interactive and multi-page applications (Interaction2Code, MRWeb) are 'out of our scope.'" 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "The benchmark data is released at https://github.com/WebPAI/DesignBench, including webpage samples, annotations, and evaluation code, enabling independent verification." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 4.2 provides detailed data collection procedures: GitHub projects (152 popular projects with deployed links, 4055 avg stars), Moz top 500 websites (158 framework-based webpages), V0 platform (541 React projects), Vue0 (1,349 Vue projects), and 120 webcode2m samples." 200 }, 201 "recruitment_methods_described": { 202 "applies": true, 203 "answer": true, 204 "justification": "Sample recruitment is well-described: GitHub search for framework-specific projects with star count filtering, Moz top 500 ranking for popular websites, V0/Vue0 platforms for edit task data with compilation filtering. Annotators are described as 'five PhD students with three years of front-end development experience.'" 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The full pipeline is documented in Section 4.2 with counts at each stage: Design Edit goes from 541+1,349 crawled projects → filtered for compilation + 2+ iterations → annotator quality assessment (3 clarity levels × 3 quality levels) → majority voting → 359 high-quality samples → 146 translated to Angular/Vanilla → verified by annotators." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding sources or acknowledgments section is present in the paper. University-affiliated researchers typically have grant funding but none is disclosed." 
217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: six authors from The Chinese University of Hong Kong and one from Singapore Management University. They are not affiliated with any of the evaluated model providers." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "No funding source is disclosed, so independence cannot be assessed. Without a funding disclosure statement, this criterion is not satisfied." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "Section 5.3 reviews the training datasets of open-source models but does not state the training data cutoff dates for any of the 9 evaluated models." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": true, 243 "justification": "Section 5.3 discusses potential overlap: reviews training datasets of Qwen, Pixtral, and Llama (finding they don't include web HTML code), notes that design edit/repair data is unlikely contaminated due to proprietary instruction-code-image triples, and computes BLEU scores between generated and original code." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": true, 248 "justification": "Section 5.3 addresses contamination through multiple arguments: half the webpages come from closed applications (top 500 websites), repair ground truth is manually written by developers, and Table 3 shows low BLEU scores (0.05-0.15) between generated and original code, indicating models are not memorizing." 
249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human subjects are studied. The five PhD students serve as benchmark annotators and metric validators, not as research participants." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human subjects study is conducted. Annotators are creating/validating the benchmark, not being studied." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human subjects study. Annotators are described only as 'five PhD students with three years of front-end development experience.'" 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human subjects study is conducted." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human subjects study is conducted." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human subjects study is conducted." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human subjects study is conducted." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": true, 292 "justification": "Section 5.2 reports: 'The entire benchmark evaluation incurs an average API cost of $52 per model on average. The average processing times per sample with a single thread, are 49 seconds for generation, 29 seconds for editing, and 25 seconds for repair.'" 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": true, 297 "justification": "Total API cost ($52 per model average) and per-sample processing times are reported in Section 5.2, providing a clear picture of the computational budget." 
298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "Results appear to come from single runs at temperature=0 (the paper does not state the number of runs). No seed sensitivity analysis is reported. While temperature=0 makes outputs nominally deterministic, API non-determinism is not tested or acknowledged." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of experimental runs is not explicitly stated. Temperature is set to 0, implying single runs, but this is not confirmed and the implications for result stability are not discussed." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search is reported. The prompt design process is not described — it is unclear how the prompts in Figure 5 were developed or whether alternatives were tested." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "A single, standard configuration is used across all models (temperature=0, max tokens at model limit, default other parameters). No configuration selection or tuning is performed, eliminating cherry-picking concerns." 320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "The paper makes many implicit comparisons across 9 models × 4 frameworks × 3 tasks without any statistical testing, let alone multiple comparison correction." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors design the benchmark, prompts, and evaluation metrics, then evaluate third-party models. No discussion of potential biases from their own design choices affecting results (e.g., prompt design favoring certain models)." 
330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "While API costs and model sizes are mentioned, no analysis relates performance to compute budget. Comparing a 7B model to a 124B model without discussing the compute-performance tradeoff leaves an incomplete picture." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "The paper validates MLLM-as-Judge against human evaluation but does not discuss whether CLIP and SSIM scores are valid proxies for UI quality, or whether the benchmark tasks comprehensively capture 'front-end engineering capability.' No comparison with alternative evaluation approaches." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "No scaffolding is involved. Models are prompted directly without agentic frameworks, tool use, or multi-turn interactions." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No temporal analysis is provided. The paper does not discuss when the benchmark webpages were created relative to model training cutoffs. GitHub projects and top 500 websites may have been in training data for models trained after their publication." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the evaluation setup leaks information. For example, in design edit/repair tasks, providing the original code could contain structural hints not available in real-world scenarios." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether test examples share structural similarities. Multiple webpages from the same GitHub projects or similar website categories could introduce non-independence, but this is not addressed." 
362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": true, 366 "justification": "Table 3 computes BLEU scores between generated and original webpage code for all 9 models. Low scores (0.05-0.15) provide concrete evidence that models are not memorizing and reproducing original code." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Claude-3.7, GPT-4o, Gemini-2.0, and Pixtral-124B are the top-performing MLLMs across three front-end tasks.", 373 "evidence": "Table 5 shows these models consistently achieve the highest CLIP scores (0.60-0.83), MLLM scores (8.01-9.15 for edit, 5.28-7.32 for repair), and compilation rates across frameworks. Section 6.1, Finding 1.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Larger models consistently outperform smaller variants within the same family.", 378 "evidence": "Table 5 shows systematic improvements: Llama-90B vs 11B, Pixtral-124B vs 12B, Qwen-72B vs 7B across all metrics and tasks. Section 6.1, Finding 2.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "MLLMs demonstrate strongest performance with vanilla HTML/CSS and significant challenges with Angular implementations.", 383 "evidence": "Figure 6 shows vanilla HTML achieves highest CLIP scores (>0.72) and perfect compilation rates, while Angular has lowest compilation (0.6-0.7) and CLIP (0.45-0.55). Section 6.2, Finding 4.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Code-only input consistently outperforms image-only input for Design Edit and Design Repair tasks.", 388 "evidence": "Table 7 shows code-only MLLM scores (8.40-8.57 for edit) exceed image-only (7.37-7.68) across all models. Multimodal combinations provide minimal additional improvement. 
Section 6.4, Finding 6.", 389 "supported": "strong" 390 }, 391 { 392 "claim": "MLLMs show critical deficiencies in component-based implementation, with very low adoption rates.", 393 "evidence": "Table 9 shows component-based design adoption rates of 0.24% (React), 5% (Vue), and 19% (Angular) on average across models. The case study in Figure 8 and Listing 1 demonstrates hardcoded repetitive structures instead of v-for directives. Section 6.5.2, Finding 8.", 394 "supported": "strong" 395 }, 396 { 397 "claim": "MLLMs struggle with identifying UI design issues, with an overall average accuracy of only 27.14%.", 398 "evidence": "Table 10 shows UI issue identification accuracy ranging from 1.49% (Llama-11B) to 39.51% (GPT-4o), with framework-specific averages of 29.72% (React), 22.05% (Vue), 22.75% (Angular), and 34.03% (Vanilla). Section 6.5.3, Finding 9.", 399 "supported": "strong" 400 }, 401 { 402 "claim": "MLLM performance degrades significantly under harder task conditions.", 403 "evidence": "Table 6 shows degradation across all tasks: generation CLIP drops from 0.79-0.86 (easy) to 0.43-0.53 (hard), edit MLLM scores drop from 8.64-9.05 to 7.51-8.19, and repair MLLM scores drop from 6.93-7.21 to 4.19-6.93. Section 6.3, Finding 5.", 404 "supported": "strong" 405 } 406 ], 407 "red_flags": [ 408 { 409 "flag": "No statistical significance tests", 410 "detail": "All comparative claims across 9 models, 4 frameworks, and 3 tasks are based on comparing raw numbers without any statistical tests, confidence intervals, or error bars. Given single-run results, it is impossible to determine whether observed differences are meaningful or within noise." 411 }, 412 { 413 "flag": "GPT-4o used as both subject and judge", 414 "detail": "GPT-4o is evaluated as one of the 9 benchmark subjects while simultaneously serving as the MLLM-as-Judge for Design Edit and Design Repair scoring. This creates a potential self-evaluation bias where GPT-4o may rate its own outputs more favorably." 
415 }, 416 { 417 "flag": "GPT-4o used to create benchmark data it is then evaluated on", 418 "detail": "Section 4.2 states that 146 edit samples were translated from React/Vue to Angular and vanilla HTML/CSS 'using GPT-4o.' GPT-4o is then evaluated on these same samples in Table 5, potentially giving it an advantage on data it generated." 419 }, 420 { 421 "flag": "Single-run results with no variance reporting", 422 "detail": "All results appear to come from single experimental runs at temperature=0 (the paper does not state the number of runs). Even with deterministic settings, API non-determinism can affect results. No spread measures are reported, making it impossible to assess result stability." 423 }, 424 { 425 "flag": "No funding disclosure", 426 "detail": "No funding sources or competing interests are declared despite the research being conducted at two major universities. The absence of disclosure is not the same as absence of conflicts." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Design2Code: How Far Are We From Automating Front-End Engineering?", 432 "authors": ["Chenglei Si", "Yanzhe Zhang", "Zhengyuan Yang", "Ruibo Liu", "Diyi Yang"], 433 "year": 2024, 434 "arxiv_id": "2403.03163", 435 "relevance": "Directly competing benchmark for design-to-code evaluation using real-world webpages from Common Crawl." 436 }, 437 { 438 "title": "Webcode2m: A real-world dataset for code generation from webpage designs", 439 "authors": ["Yi Gui", "Zhen Li", "Yao Wan"], 440 "year": 2025, 441 "relevance": "Large-scale real-world webpage-to-code benchmark providing training data and test sets for MLLM evaluation." 442 }, 443 { 444 "title": "Automatically Generating UI Code from Screenshot: A Divide-and-Conquer-Based Approach", 445 "authors": ["Yuxuan Wan", "Chaozheng Wang", "Yi Dong"], 446 "year": 2025, 447 "relevance": "MLLM-based approach for UI code generation using divide-and-conquer strategy, directly evaluated by DesignBench-type tasks." 
448 }, 449 { 450 "title": "Interaction2Code: How far are we from automatic interactive webpage generation?", 451 "authors": ["Jingyu Xiao", "Yuxuan Wan", "Yintong Huo"], 452 "year": 2024, 453 "arxiv_id": "2411.03292", 454 "relevance": "Related benchmark focusing on interactive webpage generation, a complementary evaluation dimension to DesignBench." 455 }, 456 { 457 "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code", 458 "authors": ["Naman Jain", "King Han", "Alex Gu"], 459 "year": 2024, 460 "arxiv_id": "2403.07974", 461 "relevance": "Contamination-free code evaluation benchmark, relevant to data leakage and benchmark design methodology." 462 }, 463 { 464 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 465 "authors": ["Terry Yue Zhuo"], 466 "year": 2024, 467 "arxiv_id": "2406.15877", 468 "relevance": "Code generation benchmark with diverse task types, relevant to evaluation methodology for LLM code capabilities." 469 }, 470 { 471 "title": "SWE-bench Multimodal: Do AI Systems Generalize to Visual Software Domains?", 472 "authors": ["John Yang", "Carlos E Jimenez", "Alex L Zhang"], 473 "year": 2025, 474 "relevance": "Multimodal software engineering benchmark evaluating whether AI systems generalize to visual domains, closely related to DesignBench's scope." 475 }, 476 { 477 "title": "MLLM-as-a-Judge: Assessing multimodal LLM-as-a-judge with vision-language benchmark", 478 "authors": ["Dongping Chen"], 479 "year": 2024, 480 "relevance": "Establishes methodology for using MLLMs as evaluation judges, directly adopted as a key evaluation metric in DesignBench." 
481 }, 482 { 483 "title": "MMCode: Benchmarking Multimodal Large Language Models for Code Generation with Visually Rich Programming Problems", 484 "authors": ["Kaixin Li", "Yuchen Tian", "Qisheng Hu"], 485 "year": 2024, 486 "relevance": "Multimodal code generation benchmark using visual programming problems, relevant to evaluating MLLMs on code from visual inputs." 487 }, 488 { 489 "title": "ComUICoder: Component-based Reusable UI Code Generation for Complex Websites via Semantic Segmentation and Element-wise Feedback", 490 "authors": ["Jingyu Xiao"], 491 "year": 2026, 492 "arxiv_id": "2602.19276", 493 "relevance": "MLLM approach for reusable UI component generation, directly addresses the component-based design limitations identified in DesignBench." 494 }, 495 { 496 "title": "Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework for Multimodal LLMs", 497 "authors": ["Sukmin Yun"], 498 "year": 2024, 499 "arxiv_id": "2406.20098", 500 "relevance": "Webpage-to-code evaluation framework for MLLMs using synthetic data, a direct predecessor benchmark to DesignBench." 501 }, 502 { 503 "title": "CruxEval: A benchmark for code reasoning, understanding and execution", 504 "authors": ["Alex Gu", "Baptiste Rozière", "Hugh Leather"], 505 "year": 2024, 506 "arxiv_id": "2401.03065", 507 "relevance": "Code reasoning benchmark for LLMs, relevant to evaluating code understanding capabilities." 508 } 509 ], 510 "engagement_factors": { 511 "practical_relevance": { 512 "score": 2, 513 "justification": "Front-end developers can use the benchmark results to choose which MLLM works best for their framework, and the findings about code-only vs multimodal input inform practical workflow decisions." 514 }, 515 "surprise_contrarian": { 516 "score": 1, 517 "justification": "The finding that multimodal input doesn't improve over code-only is somewhat surprising, but most findings (bigger models better, vanilla easier than frameworks) confirm expectations." 
518 }, 519 "fear_safety": { 520 "score": 0, 521 "justification": "No AI safety, security, or risk concerns are raised." 522 }, 523 "drama_conflict": { 524 "score": 0, 525 "justification": "No controversy or conflict with prior work; the paper positions itself as filling gaps in existing benchmarks." 526 }, 527 "demo_ability": { 528 "score": 2, 529 "justification": "Code and data are released on GitHub, enabling researchers to reproduce evaluations, though there is no live demo or pip-installable tool." 530 }, 531 "brand_recognition": { 532 "score": 1, 533 "justification": "Authors are from CUHK, a well-regarded but not headline-grabbing lab. The paper evaluates well-known models (GPT-4o, Claude, Gemini) which adds some recognition." 534 } 535 } 536 }