scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21874B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DesignBench: A Comprehensive Benchmark for MLLM-based Front-end Code Generation",
      6     "authors": ["Jingyu Xiao", "Man Ho Lam", "Ming Wang", "Yuxuan Wan", "Junliang Liu", "Yintong Huo", "Michael R. Lyu"],
      7     "year": 2025,
      8     "venue": "arXiv.org",
      9     "arxiv_id": "2506.06251",
     10     "doi": "10.48550/arXiv.2506.06251"
     11   },
     12   "checklist": {
     13     "claims_and_evidence": {
     14       "abstract_claims_supported": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The abstract's claims about benchmark coverage (900 samples, 3 frameworks, 3 tasks) and findings (framework-specific limitations, task bottlenecks) are all substantiated by Tables 2-10 and Findings 1-10.",
     18         "source": "haiku"
     19       },
     20       "causal_claims_justified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper makes causal claims such as 'increased model capacity enhances essential web development capabilities' (Finding 2) from correlational benchmark data comparing differently-sized models without controlling for architecture, training data, or instruction tuning differences.",
     24         "source": "haiku"
     25       },
     26       "generalization_bounded": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper makes broad generalizations such as 'textual code offers MLLMs more semantic information than visual data' and 'MLLMs still face challenges in fixing front-end errors' from a narrow benchmark without bounding these to the specific benchmark setting.",
     30         "source": "haiku"
     31       },
     32       "alternative_explanations_discussed": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The finding that code-only input outperforms image-only input is interpreted as code providing 'more precise semantic information,' but alternatives such as models being better pre-trained on code than UI screenshots or metric-specific artifacts are not considered.",
     36         "source": "haiku"
     37       },
     38       "proxy_outcome_distinction": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "CLIP and SSIM scores are used as proxies for visual UI generation quality without validation that they correlate with human judgments of front-end quality; only the MLLM-as-Judge metric is validated against human evaluation (>90% accuracy).",
     42         "source": "haiku"
     43       }
     44     },
     45     "limitations_and_scope": {
     46       "limitations_section_present": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Section 7 'Threats to Validity' covers both internal validity (MLLM-as-judge reliability, data leakage) and external validity (limited framework coverage), going well beyond a single sentence.",
     50         "source": "haiku"
     51       },
     52       "threats_to_validity_specific": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Internal threats are addressed with specific mitigations: MLLM judge validated at >90% accuracy with Kappa 0.86/0.84, and BLEU scores (0.06-0.15 range) are measured as evidence against data leakage.",
     56         "source": "haiku"
     57       },
     58       "scope_boundaries_stated": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper explicitly states that interactive and multi-page benchmarks are 'out of our scope' and that external validity is bounded to React, Vue, and Angular due to market dominance.",
     62         "source": "haiku"
     63       }
     64     },
     65     "conflicts_of_interest": {
     66       "funding_disclosed": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No funding acknowledgments or grant information appear anywhere in the paper; API costs alone (~$52/model × 9 models) suggest external support that is not disclosed.",
     70         "source": "haiku"
     71       },
     72       "affiliations_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "All seven authors disclose affiliations (The Chinese University of Hong Kong and Singapore Management University) in the author contact block; none are affiliated with the commercial MLLM providers being evaluated.",
     76         "source": "haiku"
     77       },
     78       "funder_independent_of_outcome": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No funder is disclosed, making independence assessment impossible; authors are from academic institutions not commercially tied to the evaluated models.",
     82         "source": "haiku"
     83       },
     84       "financial_interests_declared": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No competing interests statement, no declaration of patents or equity, and no disclosure of financial relationships with any of the evaluated MLLM providers (Anthropic, OpenAI, Google, Meta, Mistral).",
     88         "source": "haiku"
     89       }
     90     },
     91     "scope_and_framing": {
     92       "key_terms_defined": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section 4.1 formally defines Design Generation, Design Edit, and Design Repair with mathematical notation (task functions TG, TE, TR with explicit inputs and outputs); front-end frameworks are defined in Section 2.2.",
     96         "source": "haiku"
     97       },
     98       "intended_contribution_clear": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Three contributions are explicitly enumerated: the first multi-framework multi-task benchmark, extensive 9-MLLM evaluation across multiple dimensions, and a 22-type failure taxonomy with actionable guidance.",
    102         "source": "haiku"
    103       },
    104       "engagement_with_prior_work": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 3 reviews DL-based, CV-based, and MLLM-based UI code generation methods; Table 1 directly compares DesignBench against six prior benchmarks across five dimensions showing clear differentiation.",
    108         "source": "haiku"
    109       }
    110     }
    111   },
    112   "type_checklist": {
    113     "benchmark-creation": {
    114       "construct_design": {
    115         "construct_validity_argued": {
    116           "applies": true,
    117           "answer": false,
    118           "justification": "The paper identifies gaps in existing benchmarks and fills them, but does not argue why CLIP/SSIM specifically measure visual UI generation capability or why CMLS/CMCS measure code edit quality rather than syntactic similarity to ground truth.",
    119           "source": "haiku"
    120         },
    121         "difficulty_distribution_characterized": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Section 6.3 defines difficulty for all three tasks with specific criteria: a composite score (image size, UI elements, color variety, layout complexity) for generation; annotator labels for editing; lines-of-code threshold (<10/10-30/>30) for repair. Table 6 reports results by difficulty tier.",
    125           "source": "haiku"
    126         },
    127         "ceiling_floor_effects_checked": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Qwen-7B achieves near-zero CLIP scores (0.04-0.09) and zero MLLM scores on several framework-task combinations indicating floor effects, but the paper does not flag this as a benchmark limitation or discuss discriminability at the low end.",
    131           "source": "haiku"
    132         },
    133         "human_baseline_included": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Human annotators are used for data curation and MLLM judge validation but no human performance baseline is reported for any of the three benchmark tasks (generation, edit, or repair).",
    137           "source": "haiku"
    138         },
    139         "scoring_rubric_justified": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The MLLM-as-Judge metric is validated against human evaluation (95.54%/91.89% accuracy), but the choice of CLIP and SSIM as visual metrics is not justified against alternatives, and edge cases in CMLS/CMCS Jaccard-based scoring are not discussed.",
    143           "source": "haiku"
    144         }
    145       },
    146       "robustness": {
    147         "contamination_resistance_designed": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "The paper includes a post-hoc contamination check via BLEU scores (Section 5.3) but no design-level contamination resistance measures such as temporal splits, canary strings, or dynamic item generation are used.",
    151           "source": "haiku"
    152         },
    153         "temporal_robustness_discussed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No discussion of benchmark longevity, saturation risk, or update plans; top frontier models already achieve CLIP >0.80 on vanilla HTML tasks, suggesting ceiling risk as models continue to improve.",
    157           "source": "haiku"
    158         },
    159         "failure_modes_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper extensively documents failure modes of MLLMs (22 failure categories in RQ6) but does not analyze failure modes of the benchmark itself — e.g., whether CLIP scores can be gamed, whether the MLLM judge has systematic biases, or what aspects of UI quality the benchmark fails to capture.",
    163           "source": "haiku"
    164         },
    165         "baseline_implementations_provided": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Full code, data, annotation guidelines, prompts, and evaluator implementations are available at https://github.com/WebPAI/DesignBench, enabling reproduction of all reported numbers.",
    169           "source": "haiku"
    170         }
    171       },
    172       "documentation": {
    173         "dataset_documentation_complete": {
    174           "applies": true,
    175           "answer": true,
    176           "justification": "Section 4.2 describes data sources (GitHub, Moz Top 500, V0, Vue0, WebCode2M), collection tools (single-file-cli, Selenium), annotation process with PhD annotators and majority voting, and sample counts per task and framework; detailed guidelines are on GitHub.",
    177           "source": "haiku"
    178         },
    179         "licensing_and_access_clear": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "The GitHub repository is publicly linked but no license is specified in the paper or discussed; the copyright status of content collected from top-500 commercial websites and GitHub projects is not addressed.",
    183           "source": "haiku"
    184         },
    185         "intended_use_specified": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "Section 8 provides guidance on using findings, but there is no explicit statement of what benchmark results should NOT be interpreted as (e.g., not a measure of general coding ability, not a proxy for production-readiness of MLLM-generated code).",
    189           "source": "haiku"
    190         }
    191       }
    192     }
    193   },
    194   "claims": [
    195     {
    196       "claim": "MLLMs perform substantially worse on framework-based development (React, Vue, Angular) than on vanilla HTML/CSS",
    197       "evidence": "Table 5 and Fig. 6: vanilla HTML achieves CLIP scores >0.72 and near-perfect compilation rates, while Angular achieves CLIP 0.45-0.55 and compilation 0.60-0.76 for top models",
    198       "supported": "strong"
    199     },
    200     {
    201       "claim": "Larger model variants consistently outperform smaller variants within the same family across all tasks and frameworks",
    202       "evidence": "Table 5 shows consistent performance gaps for Llama-90B vs 11B, Pixtral-124B vs 12B, and Qwen-72B vs 7B across all metrics",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "Code-only input outperforms image-only input for edit and repair tasks; multimodal combination provides minimal additional improvement",
    207       "evidence": "Table 7: top models score 8.40-8.43 (code-only) vs 7.37-7.67 (image-only) for Design Edit; similar pattern for Design Repair with 6.53-6.70 vs 5.47-5.81",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "MLLMs achieve only 27.1% average accuracy in identifying UI display issues across all frameworks",
    212       "evidence": "Table 10 reports per-framework averages of 0.2972, 0.2205, 0.2275, 0.3403 on React, Vue, Angular, Vanilla, averaging 0.2714",
    213       "supported": "strong"
    214     },
    215     {
    216       "claim": "MLLMs almost never use component-based design in React (0.24% adoption rate)",
    217       "evidence": "Table 9 shows average component adoption rates of 0.24%, 5%, and 19% for React, Vue, and Angular respectively across nine models",
    218       "supported": "strong"
    219     },
    220     {
    221       "claim": "Angular framework produces the worst compilation success rates among all frameworks",
    222       "evidence": "Fig. 6 shows Angular compilation rates of 0.60-0.76 for top models vs >0.83 for React/Vue and perfect for Vanilla HTML",
    223       "supported": "strong"
    224     },
    225     {
    226       "claim": "MLLMs can fix compilation errors in approximately 53% of cases across all frameworks",
    227       "evidence": "Table 8 reports average repair rates of 0.53, 0.52, 0.53 for React, Vue, Angular based on a sample of 30 webpages with compilation errors",
    228       "supported": "moderate"
    229     }
    230   ],
    231   "methodology_tags": ["benchmark-eval"],
    232   "key_findings": "DesignBench reveals that MLLMs perform significantly worse on framework-based web development (React, Vue, Angular) than on vanilla HTML/CSS, with Angular posing the greatest challenge due to TypeScript component architecture and MLLMs achieving only 60-76% compilation rates there. Code-only input consistently outperforms image-only input for edit and repair tasks, and combining image and code (multimodal input) yields only minimal additional improvement over code alone, a counterintuitive finding suggesting current MLLMs underutilize visual information. MLLMs achieve only 27.1% average accuracy in identifying UI display issues and adopt component-based design patterns in under 1% of React generations, indicating fundamental gaps in framework-specific reasoning. A 22-type failure taxonomy is catalogued, with design repair showing the most severe limitations including high rates of no-repair attempts.",
    233   "red_flags": [
    234     {
    235       "flag": "GPT-4o self-evaluation bias",
    236       "detail": "GPT-4o is used as the MLLM judge for Design Edit and Repair tasks (Section 5.4) while also being evaluated as a model, creating potential self-serving bias where GPT-4o judges its own outputs."
    237     },
    238     {
    239       "flag": "GPT-4o generates benchmark samples it is then evaluated on",
    240       "detail": "146 Angular and vanilla HTML/CSS samples for the edit task were auto-translated by GPT-4o (Section 4.2); GPT-4o is then evaluated on those same samples, potentially giving it an advantage from familiarity with its own output style."
    241     },
    242     {
    243       "flag": "No human baseline on benchmark tasks",
    244       "detail": "No human performance measurement is provided for generation, edit, or repair tasks, making it impossible to interpret whether 0.27 issue detection accuracy or CLIP scores around 0.70 represent near-human performance, far-below-human performance, or simply a reasonable baseline."
    245     },
    246     {
    247       "flag": "Floor effects unaddressed",
    248       "detail": "Qwen-7B achieves near-zero CLIP scores (0.04-0.09) and zero MLLM scores on several framework-task combinations; the paper does not flag this as a benchmark discriminability concern."
    249     },
    250     {
    251       "flag": "CLIP/SSIM metrics unjustified for UI quality",
    252       "detail": "CLIP semantic similarity and SSIM structural similarity are used as primary visual metrics without validation that they correlate with human judgment of UI generation quality; no ablation or correlation with MLLM judge scores is reported."
    253     },
    254     {
    255       "flag": "No funding disclosure",
    256       "detail": "No acknowledgments or funding section is present despite API evaluation costs of approximately $52/model × 9 models = ~$470 minimum, plus data collection infrastructure."
    257     },
    258     {
    259       "flag": "No benchmark license",
    260       "detail": "The GitHub repository is public but no license is specified; copyright status of content collected from Moz Top-500 commercial websites and GitHub projects is unaddressed, creating legal ambiguity for benchmark users."
    261     },
    262     {
    263       "flag": "Contamination check is post-hoc and weak",
    264       "detail": "The contamination check relies solely on BLEU score between model outputs and original code being low (Section 5.3); low BLEU only shows models don't verbatim copy — it does not rule out training data advantage from exposure to the same or similar websites."
    265     }
    266   ],
    267   "cited_papers": [
    268     {
    269       "title": "Design2Code: How Far Are We From Automating Front-End Engineering?",
    270       "relevance": "Direct predecessor benchmark using 484 real-world webpages for design-to-code evaluation; DesignBench extends this with multi-framework support and additional tasks"
    271     },
    272     {
    273       "title": "WebCode2M: A Real-World Dataset for Code Generation from Webpage Designs",
    274       "relevance": "Large-scale benchmark (20K samples) that DesignBench uses as a data source for vanilla HTML samples and as a comparison baseline"
    275     },
    276     {
    277       "title": "Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework for Multimodal LLMs",
    278       "relevance": "Prior synthetic benchmark for HTML parsing evaluation; DesignBench addresses its limitation of relying on synthetic data"
    279     },
    280     {
    281       "title": "Unlocking the conversion of Web Screenshots into HTML Code with the WebSight Dataset",
    282       "relevance": "Large synthetic benchmark (823K samples) for HTML code generation compared against in Table 1"
    283     },
    284     {
    285       "title": "Automatically Generating UI Code from Screenshot: A Divide-and-Conquer-Based Approach (DCGen)",
    286       "relevance": "State-of-the-art MLLM-based UI code generation method; one of the key approaches DesignBench is designed to evaluate"
    287     },
    288     {
    289       "title": "pix2code: Generating code from a graphical user interface screenshot",
    290       "relevance": "Foundational early benchmark for UI code generation using DSL; cited as the work DesignBench extends beyond"
    291     },
    292     {
    293       "title": "MLLM-as-a-judge: Assessing multimodal LLM-as-a-judge with vision-language benchmark",
    294       "relevance": "Foundational work justifying the MLLM-as-judge evaluation methodology used in DesignBench for Design Edit and Repair scoring"
    295     },
    296     {
    297       "title": "SWE-bench Multimodal: Do AI Systems Generalize to Visual Software Domains?",
    298       "relevance": "Related multimodal code benchmark demonstrating the broader trend of evaluating MLLMs on visual software engineering tasks"
    299     }
    300   ],
    301   "engagement_factors": {
    302     "practical_relevance": {
    303       "score": 3,
    304       "justification": "Directly actionable for developers choosing MLLMs for front-end work; the finding that code-only input beats image-only input (with multimodal input adding little) changes how practitioners should prompt models for UI edit/repair."
    305     },
    306     "surprise_contrarian": {
    307       "score": 2,
    308       "justification": "The finding that multimodal input adds only minimal benefit over code-only input for edit/repair tasks contradicts the premise of using multimodal models for visual UI tasks."
    309     },
    310     "fear_safety": {
    311       "score": 0,
    312       "justification": "No safety or risk concerns; the paper evaluates front-end coding capability, not safety-critical or adversarial systems."
    313     },
    314     "drama_conflict": {
    315       "score": 1,
    316       "justification": "Shows frontier models struggle significantly on framework-based tasks and nearly never use component patterns, but results are framed constructively rather than as an indictment."
    317     },
    318     "demo_ability": {
    319       "score": 3,
    320       "justification": "Full code, data, and evaluation scripts available on GitHub; anyone can run the benchmark evaluation against their preferred MLLM with provided prompts."
    321     },
    322     "brand_recognition": {
    323       "score": 2,
    324       "justification": "Evaluates and ranks Claude-3.7, GPT-4o, and Gemini-2.0 — high-profile brands that drive reader interest even though the authoring institution (CUHK) is not a top-tier AI lab."
    325     }
    326   },
    327   "hn_data": {
    328     "threads": [
    329       {"hn_id": "44148662", "title": "Beyond Attention: Toward Machines with Intrinsic Higher Mental States", "points": 67, "comments": 19, "url": "https://news.ycombinator.com/item?id=44148662"},
    330       {"hn_id": "37070323", "title": "Transformative AGI by 2043 is <1% likely", "points": 33, "comments": 41, "url": "https://news.ycombinator.com/item?id=37070323"},
    331       {"hn_id": "43667963", "title": "Transfer between Modalities with MetaQueries", "points": 25, "comments": 12, "url": "https://news.ycombinator.com/item?id=43667963"},
    332       {"hn_id": "43628028", "title": "NNN: Next-Generation Neural Networks for Marketing Mix Modeling", "points": 25, "comments": 3, "url": "https://news.ycombinator.com/item?id=43628028"},
    333       {"hn_id": "44859559", "title": "Modern Methods in Associative Memory", "points": 5, "comments": 1, "url": "https://news.ycombinator.com/item?id=44859559"},
    334       {"hn_id": "36306353", "title": "Transformative AGI by 2043 is <1% likely", "points": 3, "comments": 4, "url": "https://news.ycombinator.com/item?id=36306353"},
    335       {"hn_id": "46908281", "title": "LLMs do plan before they genenrate tokens", "points": 3, "comments": 0, "url": "https://news.ycombinator.com/item?id=46908281"},
    336       {"hn_id": "44236081", "title": "Geopolitical biases in LLMs", "points": 2, "comments": 0, "url": "https://news.ycombinator.com/item?id=44236081"},
    337       {"hn_id": "44556736", "title": "ASK HN: Why Google's Gemini 2.5 paper has 3295 authors?", "points": 2, "comments": 4, "url": "https://news.ycombinator.com/item?id=44556736"},
    338       {"hn_id": "44256016", "title": "Can Theoretical Physics Research Benefit from Language Agents?", "points": 1, "comments": 0, "url": "https://news.ycombinator.com/item?id=44256016"}
    339     ],
    340     "top_points": 67,
    341     "total_points": 166,
    342     "total_comments": 84
    343   }
    344 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs