scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (22377B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DesignBench: A Comprehensive Benchmark for MLLM-based Front-end Code Generation",
      6     "authors": [
      7       "Jingyu Xiao",
      8       "Ming Wang",
      9       "Man Ho Lam",
     10       "Yuxuan Wan",
     11       "Junliang Liu",
     12       "Yintong Huo",
     13       "Michael R. Lyu"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2506.06251",
     18     "doi": "10.48550/arXiv.2506.06251"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The abstract claims about framework-specific limitations (supported by RQ2, Table 5, Fig. 6), task-related bottlenecks (supported by RQ1), and performance variations (supported by RQ3-RQ4) are all substantiated by corresponding experimental results in Sections 6.1-6.6.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The main causal-ish claims come from the input context ablation (RQ4), where code-only vs image-only vs both are compared in a controlled manipulation. The claim 'code representations convey more precise semantic information' is hedged with 'suggesting' and supported by the controlled experiment varying only the input type.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The title specifies 'MLLM-based Front-end Code Generation' and findings are generally scoped to the tested frameworks and models. The external validity section (Section 7) explicitly acknowledges the limitation to React, Vue, and Angular frameworks.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not discuss alternative explanations for its findings. For example, code-only outperforming image-only could be due to token budget differences, prompt design effects, or the specific way images are encoded, but none of these alternatives are considered.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper clearly defines what each metric measures (CLIP for semantic visual similarity, SSIM for structural similarity, CSR for compilation, CMLS/CMCS for code modification quality) and validates the MLLM-as-Judge proxy against human evaluation (95.54% and 91.89% accuracy). Claims match the granularity of measurements.",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 7 'Threats to Validity' provides substantive discussion of internal and external validity threats.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 7 discusses specific threats: MLLM-as-judge reliability (mitigated by human validation with 95.54% accuracy), data leakage from closed applications and manually-written ground truth, and limitation to three specific frameworks. These are specific to this study.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "External validity explicitly states: 'We only include limited frameworks of React, Vue, and Angular' and explains the rationale. Section 3.2 also notes that interactive and multi-page applications (Interaction2Code, MRWeb) are 'out of our scope.'",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding sources or acknowledgments section is present in the paper. University-affiliated researchers typically have grant funding but none is disclosed.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly listed: six authors from The Chinese University of Hong Kong and one from Singapore Management University. They are not affiliated with any of the evaluated model providers.",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No funding source is disclosed, so independence cannot be assessed. Without a funding disclosure statement, this criterion is not satisfied.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is present in the paper.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The three tasks (Design Generation, Design Edit, Design Repair) are formally defined with mathematical notation in Section 4.1; front-end frameworks are explained in Section 2.2 with market share figures; MLLM is spelled out as Multimodal Large Language Models.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper explicitly lists three contributions in a bulleted section: first multi-framework multi-task benchmark, extensive evaluation of nine MLLMs across multiple dimensions, and identification of 22 failure types with actionable guidance.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 3 systematically reviews existing UI code generation benchmarks and Table 1 provides a structured comparison showing exactly how DesignBench addresses three specific gaps (framework integration, task coverage, evaluation dimensions) relative to each prior work.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "benchmark-creation": {
    122       "construct_design": {
    123         "construct_validity_argued": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Section 4.1 grounds the three tasks in the real-world front-end development pipeline described in Section 2.1, and metrics are each justified for what they capture: visual fidelity (CLIP/SSIM), compilation correctness (CSR), and code modification precision (CMLS/CMCS via AST comparison).",
    127           "source": "haiku"
    128         },
    129         "difficulty_distribution_characterized": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Difficulty is explicitly operationalized per task: Generation uses a weighted formula (image size, element count, color variety, layout complexity normalized 0-100 with <30/30-80/>80 thresholds); Edit uses annotator labels; Repair uses lines-of-code thresholds (<10 easy, >30 hard).",
    133           "source": "haiku"
    134         },
    135         "ceiling_floor_effects_checked": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Qwen-7B reaches near-floor CLIP scores (~0.09) for React/Vue and top models hit near-ceiling CSR (~1.0) on edit/repair tasks, but the paper does not acknowledge or discuss these boundary effects as limitations on benchmark discrimination.",
    139           "source": "haiku"
    140         },
    141         "human_baseline_included": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No human developer baseline performance is reported for any of the three tasks; human involvement is limited to quality annotation and MLLM-as-judge validation, not establishing human-level task performance.",
    145           "source": "haiku"
    146         },
    147         "scoring_rubric_justified": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Section 5.4 justifies each metric: CLIP for semantic similarity, SSIM for structural layout similarity, CSR for compilation, CMLS/CMCS via AST-based Jaccard/CodeBLEU for modification precision, and MLLM-as-judge validated with Kappa of 0.86/0.84 and 90-95% human agreement on 359/111 sampled items.",
    151           "source": "haiku"
    152         }
    153       },
    154       "robustness": {
    155         "contamination_resistance_designed": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Contamination is addressed reactively via post-hoc BLEU score measurement (Section 5.3) rather than proactively built into benchmark design; no temporal splits, canary strings, or dynamic generation mechanisms are implemented.",
    159           "source": "haiku"
    160         },
    161         "temporal_robustness_discussed": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "The paper does not discuss whether rapid MLLM improvement will obsolete the benchmark, when ceiling effects will render it uninformative, or any plan for updating the benchmark as models improve.",
    165           "source": "haiku"
    166         },
    167         "failure_modes_discussed": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Threats to Validity addresses MLLM-as-judge reliability and data leakage, but does not discuss benchmark failure modes such as whether multiple valid code solutions exist (harming CMLS/CMCS scoring), whether GPT-4o-translated samples introduce systematic bias, or whether the repair ground truth is unique.",
    171           "source": "haiku"
    172         },
    173         "baseline_implementations_provided": {
    174           "applies": true,
    175           "answer": true,
    176           "justification": "Code, data, annotation guidelines, prompts, and evaluation scripts are all publicly available at https://github.com/WebPAI/DesignBench, explicitly referenced multiple times throughout the paper.",
    177           "source": "haiku"
    178         }
    179       },
    180       "documentation": {
    181         "dataset_documentation_complete": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Section 4.2 details data collection sources (GitHub, Moz top 500, Vercel V0, Vue0), filtering criteria (compilation success, PhD annotator quality rating), annotation process (five annotators, majority voting, iterative guideline refinement), and links to annotation guidelines in the repository.",
    185           "source": "haiku"
    186         },
    187         "licensing_and_access_clear": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "The GitHub repository is linked but no explicit open-source license for the benchmark data or code is stated in the paper; the ACM copyright header covers the paper itself but does not clarify reuse terms for the dataset.",
    191           "source": "haiku"
    192         },
    193         "intended_use_specified": {
    194           "applies": true,
    195           "answer": false,
    196           "justification": "The paper describes what the benchmark evaluates but does not state what should NOT be concluded from results — no guidance on whether scores predict production development performance, how to interpret model rankings on unseen website domains, or limitations of the CLIP/SSIM metrics as proxies.",
    197           "source": "haiku"
    198         }
    199       }
    200     }
    201   },
    202   "claims": [
    203     {
    204       "claim": "MLLMs perform significantly worse on framework-based development (React, Vue, Angular) than vanilla HTML/CSS, with Angular being most challenging",
    205       "evidence": "Figure 6 shows average CLIP >0.72 for vanilla vs 0.45-0.56 for Angular; compilation rates are perfect for vanilla vs 0.60-0.75 for Angular across all nine models",
    206       "supported": "strong"
    207     },
    208     {
    209       "claim": "Larger models consistently outperform smaller counterparts within the same family across all three tasks",
    210       "evidence": "Table 5 shows systematic performance gaps for Pixtral-124B vs 12B, Qwen-72B vs 7B, and Llama-90B vs 11B across all tasks and frameworks",
    211       "supported": "strong"
    212     },
    213     {
    214       "claim": "Code-only input consistently outperforms image-only input for design edit and repair tasks",
    215       "evidence": "Table 7 shows code-only achieves 8.40-8.43 vs image-only 7.37-7.67 for top models on Design Edit; similar advantage for Design Repair (6.53-6.70 vs 5.47-5.81)",
    216       "supported": "strong"
    217     },
    218     {
    219       "claim": "Combining code and image inputs provides minimal improvement over code-only and occasionally degrades performance",
    220       "evidence": "Table 7 shows 'Both' columns nearly identical to 'Code' columns for top models, with some models (GPT-4o, Gemini on repair) slightly lower with combined input",
    221       "supported": "strong"
    222     },
    223     {
    224       "claim": "MLLMs critically underutilize component-based design with average adoption rates of 0.24%, 5%, and 19% for React, Vue, and Angular respectively",
    225       "evidence": "Table 9 shows near-zero component adoption across all nine evaluated models for React, with most models at 0% for React and <10% for Vue",
    226       "supported": "strong"
    227     },
    228     {
    229       "claim": "MLLMs can only identify UI display issues with 27% average accuracy across all frameworks and models",
    230       "evidence": "Table 10 reports per-framework averages of 0.297, 0.221, 0.228, 0.340 (React, Vue, Angular, Vanilla) averaging 0.271; even best model GPT-4o achieves only 0.395 average",
    231       "supported": "strong"
    232     },
    233     {
    234       "claim": "MLLM performance degrades significantly with task difficulty, especially for Design Repair where some models experience catastrophic failure",
    235       "evidence": "Table 6 shows Gemini-2.0 dropping from 7.07 (Easy) to 4.19 (Hard) on repair; Llama-11B from 3.91 to 2.41; difficulty impact is most severe for repair relative to generation and edit",
    236       "supported": "strong"
    237     }
    238   ],
    239   "methodology_tags": [
    240     "benchmark-eval"
    241   ],
    242   "key_findings": "DesignBench reveals that MLLMs are substantially less capable at framework-based front-end development than vanilla HTML/CSS, with Angular proving most challenging due to TypeScript module systems and component architecture. Code-only input consistently outperforms multimodal input for edit and repair tasks, exposing that current MLLMs cannot effectively leverage visual information when code context is available. MLLMs fail to adopt component-based design patterns (0.24% for React) and identify UI display issues at only 27% accuracy — pointing to fundamental gaps in framework-specific knowledge and visual diagnostic reasoning. The evaluation of nine models, including three model families (Pixtral, Qwen, Llama) at two size points each, shows consistent scale benefits but no model approaches human-level front-end engineering capability across all three task types.",
    243   "red_flags": [
    244     {
    245       "flag": "No human baseline",
    246       "detail": "No human developer performance is reported for any task, making it impossible to calibrate how close or far MLLMs are from human-level capability — the central question for a practical benchmark."
    247     },
    248     {
    249       "flag": "Judge model also evaluated",
    250       "detail": "GPT-4o serves as the MLLM-as-judge metric for design edit and repair while simultaneously being one of the nine evaluated models, creating a potential self-evaluation bias; the paper does not discuss this conflict."
    251     },
    252     {
    253       "flag": "GPT-4o-translated benchmark samples",
    254       "detail": "Angular and vanilla HTML/CSS edit samples are created by translating React/Vue samples using GPT-4o, introducing a model-generated benchmark component verified only by human spot-check; these samples may systematically reflect GPT-4o's generation patterns."
    255     },
    256     {
    257       "flag": "Small repair set with shallow per-framework n",
    258       "detail": "Design Repair has only 111 total samples (roughly 28 per framework), limiting statistical reliability of per-framework and per-difficulty subgroup analyses."
    259     },
    260     {
    261       "flag": "Ceiling effects on CSR not acknowledged",
    262       "detail": "Top models achieve CSR of 0.95-1.00 for design edit and repair tasks, indicating the compilation metric is nearly saturated for competitive models, but this ceiling limitation is not discussed."
    263     },
    264     {
    265       "flag": "No license specified for benchmark data",
    266       "detail": "Despite a public GitHub repository, no data license is stated, leaving reuse rights unclear for researchers who wish to build on or redistribute the benchmark."
    267     }
    268   ],
    269   "cited_papers": [
    270     {
    271       "title": "Design2Code: How Far Are We From Automating Front-End Engineering?",
    272       "relevance": "Direct predecessor benchmark using 484 real-world webpages; DesignBench explicitly extends its scope to multi-framework and multi-task settings"
    273     },
    274     {
    275       "title": "WebCode2M: A Real-World Dataset for Code Generation from Webpage Designs",
    276       "relevance": "Large-scale real-world benchmark with 20K samples; DesignBench samples vanilla HTML pages directly from this dataset"
    277     },
    278     {
    279       "title": "Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework for Multimodal LLMs",
    280       "relevance": "Synthetic benchmark for HTML code generation; identified as a key prior benchmark with limitations DesignBench addresses"
    281     },
    282     {
    283       "title": "Unlocking the conversion of Web Screenshots into HTML Code with the WebSight Dataset",
    284       "relevance": "Large synthetic benchmark (823K samples) for training and evaluation; represents the synthetic data paradigm DesignBench moves beyond with real-world sourcing"
    285     },
    286     {
    287       "title": "Automatically Generating UI Code from Screenshot: A Divide-and-Conquer-Based Approach (DCGen)",
    288       "relevance": "State-of-the-art MLLM approach for UI code generation from the same research group; directly relevant to understanding what DesignBench evaluates"
    289     },
    290     {
    291       "title": "MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with Vision-Language Benchmark",
    292       "relevance": "Methodological foundation for the MLLM-as-judge evaluation metric used in DesignBench for design edit and repair scoring"
    293     },
    294     {
    295       "title": "Interaction2Code: How far are we from automatic interactive webpage generation?",
    296       "relevance": "Related benchmark for interactive web application generation explicitly noted as out of scope for DesignBench; important for positioning"
    297     },
    298     {
    299       "title": "pix2code: Generating code from a graphical user interface screenshot",
    300       "relevance": "Foundational early work on UI-to-code generation; included in Table 1 comparison as the earliest benchmark in the lineage"
    301     }
    302   ],
    303   "engagement_factors": {
    304     "practical_relevance": {
    305       "score": 2,
    306       "justification": "Front-end developers can use the benchmark results to choose which MLLM works best for their framework, and the findings about code-only vs multimodal input inform practical workflow decisions."
    307     },
    308     "surprise_contrarian": {
    309       "score": 1,
    310       "justification": "The finding that multimodal input doesn't improve over code-only is somewhat surprising, but most findings (bigger models better, vanilla easier than frameworks) confirm expectations."
    311     },
    312     "fear_safety": {
    313       "score": 0,
    314       "justification": "No AI safety, security, or risk concerns are raised."
    315     },
    316     "drama_conflict": {
    317       "score": 0,
    318       "justification": "No controversy or conflict with prior work; the paper positions itself as filling gaps in existing benchmarks."
    319     },
    320     "demo_ability": {
    321       "score": 2,
    322       "justification": "Code and data are released on GitHub, enabling researchers to reproduce evaluations, though there is no live demo or pip-installable tool."
    323     },
    324     "brand_recognition": {
    325       "score": 1,
    326       "justification": "Six of the seven authors are from CUHK, a well-regarded but not headline-grabbing lab. The paper evaluates well-known models (GPT-4o, Claude, Gemini) which adds some recognition."
    327     }
    328   },
    329   "hn_data": {
    330     "threads": [
    331       {
    332         "hn_id": "44148662",
    333         "title": "Beyond Attention: Toward Machines with Intrinsic Higher Mental States",
    334         "points": 67,
    335         "comments": 19,
    336         "url": "https://news.ycombinator.com/item?id=44148662"
    337       },
    338       {
    339         "hn_id": "37070323",
    340         "title": "Transformative AGI by 2043 is <1% likely",
    341         "points": 33,
    342         "comments": 41,
    343         "url": "https://news.ycombinator.com/item?id=37070323"
    344       },
    345       {
    346         "hn_id": "43667963",
    347         "title": "Transfer between Modalities with MetaQueries",
    348         "points": 25,
    349         "comments": 12,
    350         "url": "https://news.ycombinator.com/item?id=43667963"
    351       },
    352       {
    353         "hn_id": "43628028",
    354         "title": "NNN: Next-Generation Neural Networks for Marketing Mix Modeling",
    355         "points": 25,
    356         "comments": 3,
    357         "url": "https://news.ycombinator.com/item?id=43628028"
    358       },
    359       {
    360         "hn_id": "44859559",
    361         "title": "Modern Methods in Associative Memory",
    362         "points": 5,
    363         "comments": 1,
    364         "url": "https://news.ycombinator.com/item?id=44859559"
    365       },
    366       {
    367         "hn_id": "36306353",
    368         "title": "Transformative AGI by 2043 is <1% likely",
    369         "points": 3,
    370         "comments": 4,
    371         "url": "https://news.ycombinator.com/item?id=36306353"
    372       },
    373       {
    374         "hn_id": "46908281",
    375         "title": "LLMs do plan before they genenrate tokens",
    376         "points": 3,
    377         "comments": 0,
    378         "url": "https://news.ycombinator.com/item?id=46908281"
    379       },
    380       {
    381         "hn_id": "44236081",
    382         "title": "Geopolitical biases in LLMs",
    383         "points": 2,
    384         "comments": 0,
    385         "url": "https://news.ycombinator.com/item?id=44236081"
    386       },
    387       {
    388         "hn_id": "44556736",
    389         "title": "ASK HN: Why Google's Gemini 2.5 paper has 3295 authors?",
    390         "points": 2,
    391         "comments": 4,
    392         "url": "https://news.ycombinator.com/item?id=44556736"
    393       },
    394       {
    395         "hn_id": "44256016",
    396         "title": "Can Theoretical Physics Research Benefit from Language Agents?",
    397         "points": 1,
    398         "comments": 0,
    399         "url": "https://news.ycombinator.com/item?id=44256016"
    400       }
    401     ],
    402     "top_points": 67,
    403     "total_points": 166,
    404     "total_comments": 84
    405   }
    406 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs