scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22454B)
      1 {
      2   "paper": {
      3     "title": "Modular Layout Synthesis (MLS): Front-end Code via Structure Normalization and Constrained Generation",
      4     "authors": ["Chong Liu", "Ming Zhang", "Fei Li", "Hao Zhou", "Xiaoshuang Chen", "Ye Yuan"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2512.18996",
      8     "doi": "10.48550/arXiv.2512.18996"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "MLS proposes a three-stage UI-to-Code framework (visual-to-structure, blueprint compression with motif harvesting, constraint-based multi-framework generation) that separates layout parsing, reuse extraction, and framework-specific synthesis. The paper claims significant improvements over baselines in code reusability and structural integrity across React/Vue/Angular. However, the authors explicitly state that ALL experimental results are synthetic (randomly generated), making every empirical claim unverifiable.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository URL or archive is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper evaluates on publicly available benchmarks: Design2Code (Si et al. 2025), Vision2UI (Gui et al. 2024), and WebSight (Laurençon et al. 2024). However, the 300-case multi-framework subset is described as 'manually converted' and synthetic, and is not released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements files, or dependency details are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions, scripts, or README are provided."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 1-3 are point estimates with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims MLS 'significantly outperforms' baselines but provides no statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Tables 1-3 report absolute metric values for all methods, allowing effect size computation (e.g., CLIP 0.844 vs 0.821 for next-best baseline, Reuse@K 0.412 vs 0.275)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Dataset sizes are stated (484, 2000, 300) but no justification is given for why these sizes are sufficient for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported across any runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Five baselines are compared: Direct MLLM Prompting, DCGen, LayoutCoder, WebSight-Sightseer, and WebVIA-UI2Code (Table 1)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include recent 2024-2025 work: DCGen (Wan et al. 2024), LayoutCoder (Wu et al. 2025), WebVIA-UI2Code (Xu et al. 2025)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 3 shows ablations removing component mining, type inference, and constrained decoding. However, all results are synthetic."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Five metrics are used: CLIP (visual similarity), TED (tree edit distance), Reuse@K, DupRate, and TypeCheck."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is included. All evaluation is automated. For a UI code generation system, human judgment of code quality and maintainability would be relevant."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No discussion of train/test/validation splits or held-out evaluation. The paper mentions using existing test sets but does not clarify separation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 breaks down results by framework (React, Vue, Angular). Table 3 provides per-ablation breakdowns."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Section 4.7 mentions 'qualitative analysis' and 'typical cases' but only describes successes. No failure cases are shown or discussed."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "Every experiment shows MLS improving over baselines. No negative results or failed approaches are reported."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims MLS 'significantly outperforms existing baselines' but the Section 4 disclaimer states 'All numeric results in this section are synthetic (randomly generated but internally consistent).' Synthetic results cannot support empirical claims."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The ablation study (Table 3) makes causal claims about component contributions (e.g., 'component mining mainly boosts reuse'), but results are synthetic and cannot justify causal inference."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract and conclusion claim broad superiority ('superior code reusability and structural integrity across multiple frameworks') but scope is not bounded. The title and framing suggest general applicability beyond the tested setting."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations for results are discussed. No threats to validity or confounding factors are considered."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures automated metrics (CLIP, TED, Reuse@K, DupRate, TypeCheck) but frames these as demonstrating 'production-ready code' and 'superior code reusability.' No discussion of the gap between these proxy metrics and actual developer productivity or code maintainability."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No specific model versions are mentioned. The paper references using an 'LLM' in Module C and a 'visual-semantic encoder' in Module A without naming or versioning either."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Module C uses an LLM with a 'constraint-based generation protocol' but no actual prompts or system instructions are provided."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Key hyperparameters are mentioned symbolically (λg, λr, ϵ, dmax, amin, η, γ) but no actual values are provided. No LLM temperature or sampling settings are stated."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The three-module pipeline (Module A: visual-to-structure, Module B: blueprint compression, Module C: constrained generation) is described in detail in Section 3, including the constrained decoding protocol."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No data preprocessing steps are documented. The paper references datasets but does not describe how they were prepared for the experiments."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no limitations section. The paper has only a Broader Impact section (Section 5) and a brief Conclusion (Section 6)."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed anywhere in the paper."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries are stated. The paper does not clarify what settings or scenarios are excluded from its claims."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data or experimental outputs are available for verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The 300-case multi-framework subset is described only as 'manually converted' with no detail on the conversion process. The synthetic disclaimer undermines all data descriptions."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No data pipeline is documented from raw inputs to final evaluation."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source or acknowledgments section is present."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are listed as affiliated with Nanjing University."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosures statement is present."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper uses an LLM in Module C but does not state any training data cutoff date."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the LLM's training data includes any of the benchmark examples."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Design2Code and other benchmarks may be in LLM training data. This is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The system involves an LLM and a vision encoder but no inference cost, latency, or token consumption is reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No computational budget, GPU hours, or training time is mentioned."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or seed sensitivity analysis. All results are single point estimates."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated anywhere."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Multiple hyperparameters are introduced (λg, λr, ϵ, dmax, amin, η, γ) but no search budget or tuning methodology is described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No discussion of how the reported configuration was selected from alternatives."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Multiple comparisons are made across 5 baselines and 5 metrics with no correction applied (no statistical tests at all)."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own system against their own implementations/runs of baselines with no acknowledgment of potential bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "MLS adds multiple processing stages (vision encoder, blueprint compression, constrained LLM decoding) over simpler baselines, but compute differences are not discussed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the metrics (CLIP, TED, Reuse@K, DupRate, TypeCheck) actually capture code quality or real-world usability."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "MLS uses a complex multi-stage scaffold while baselines may use simpler pipelines. This confound between the scaffold and the method's contributions is not addressed."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. The benchmarks (Design2Code, Vision2UI) may have been in the LLM's training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup provides information not available in real usage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between training and test data."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are used or mentioned."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "MLS significantly outperforms existing baselines on Design2Code, achieving CLIP 0.844, TED 0.233, Reuse@K 0.412, DupRate 0.241, TypeCheck 86.9%.",
    365       "evidence": "Table 1, Section 4.4. However, the paper explicitly states: 'All numeric results in this section are synthetic (randomly generated but internally consistent)' (Section 4 disclaimer).",
    366       "supported": "unsupported"
    367     },
    368     {
    369       "claim": "MLS achieves superior multi-framework portability (avg 0.76) compared to baselines (0.58-0.63).",
    370       "evidence": "Table 2, Section 4.5. Same synthetic disclaimer applies.",
    371       "supported": "unsupported"
    372     },
    373     {
    374       "claim": "Component mining is the key driver of reuse improvement, while constrained decoding and type inference primarily improve compilation success.",
    375       "evidence": "Table 3 ablation study, Section 4.6. Same synthetic disclaimer applies.",
    376       "supported": "unsupported"
    377     }
    378   ],
    379   "red_flags": [
    380     {
    381       "flag": "All results explicitly synthetic",
    382       "detail": "Section 4 contains a disclaimer: 'All numeric results in this section are synthetic (randomly generated but internally consistent) to provide a complete paper draft.' This means every empirical claim in the paper is unsupported by actual experiments. The abstract and conclusion do not mention this, presenting synthetic results as if they demonstrate real performance."
    383     },
    384     {
    385       "flag": "Misleading abstract and conclusion",
    386       "detail": "The abstract states 'Evaluations show that MLS significantly outperforms existing baselines' and the conclusion states results 'improve code reusability, portability across React/Vue/Angular, and structural integrity.' Neither mentions that results are synthetic, creating a false impression of validated empirical findings."
    387     },
    388     {
    389       "flag": "No model or implementation details",
    390       "detail": "Despite describing a concrete system with three modules, the paper never names the LLM used in Module C, the architecture of the visual-semantic encoder in Module A, or any concrete hyperparameter values. It is unclear whether MLS was actually implemented."
    391     },
    392     {
    393       "flag": "No limitations discussion",
    394       "detail": "The paper has no limitations section, no threats to validity, and no discussion of when or why the approach might fail."
    395     },
    396     {
    397       "flag": "Results appear too clean",
    398       "detail": "MLS beats every baseline on every metric (Tables 1-3) with no variance reported. This is consistent with the synthetic nature of the results but would be suspicious even without the disclaimer."
    399     }
    400   ],
    401   "cited_papers": [
    402     {
    403       "title": "Design2Code: Benchmarking Multimodal Code Generation for Automated Front-end Engineering",
    404       "authors": ["Chenglei Si", "Yanzhe Zhang", "Ryan Li", "Zhengyuan Yang", "Ruibo Liu", "Diyi Yang"],
    405       "year": 2025,
    406       "relevance": "Major benchmark for UI-to-code generation, directly relevant to evaluating LLM code generation capabilities."
    407     },
    408     {
    409       "title": "Automatically Generating UI Code from Screenshot: A Divide-and-Conquer-Based Approach",
    410       "authors": ["Yuxuan Wan", "Chaozheng Wang", "Yi Dong", "Wenxuan Wang", "Shuqing Li", "Yintong Huo", "Michael R. Lyu"],
    411       "year": 2024,
    412       "arxiv_id": "2406.16386",
    413       "relevance": "Divide-and-conquer prompting pipeline for UI-to-code, a key baseline for LLM-based code generation approaches."
    414     },
    415     {
    416       "title": "Unlocking the Conversion of Web Screenshots into HTML Code with the WebSight Dataset",
    417       "authors": ["Hugo Laurençon", "Léo Tronchon", "Victor Sanh"],
    418       "year": 2024,
    419       "arxiv_id": "2403.09029",
    420       "relevance": "Large-scale synthetic dataset and finetuned VLM for screenshot-to-HTML, relevant to LLM-based code generation evaluation."
    421     },
    422     {
    423       "title": "MLLM-based UI2Code Automation Guided by UI Layout Information",
    424       "authors": ["Fan Wu", "Cuiyun Gao", "Shuqing Li", "Xinjie Wen", "Qing Liao"],
    425       "year": 2025,
    426       "arxiv_id": "2506.10376",
    427       "relevance": "Layout-guided approach to LLM-based UI code generation, directly comparable baseline."
    428     },
    429     {
    430       "title": "WebVIA: A Web-based Vision-Language Agentic Framework for Interactive and Verifiable UI-to-Code Generation",
    431       "authors": ["Mingde Xu", "Zhen Yang", "Wenyi Hong"],
    432       "year": 2025,
    433       "arxiv_id": "2511.06251",
    434       "relevance": "Agentic framework for UI-to-code with interactive verification, relevant to agentic AI code generation."
    435     },
    436     {
    437       "title": "Type-Constrained Code Generation with Language Models",
    438       "authors": ["Niels Mündler", "Jingxuan He", "Hao Wang", "Koushik Sen", "Dawn Song", "Martin Vechev"],
    439       "year": 2025,
    440       "arxiv_id": "2504.09246",
    441       "relevance": "Type-constrained decoding for LLM code generation, directly relevant to constrained code synthesis methods."
    442     },
    443     {
    444       "title": "Grammar-Constrained Decoding for Structured NLP Tasks Without Finetuning",
    445       "authors": ["Saibo Geng", "Martin Josifoski", "Maxime Peyrard", "Robert West"],
    446       "year": 2023,
    447       "relevance": "Grammar-constrained decoding for LLMs, foundational work for the constrained generation approach used in MLS."
    448     },
    449     {
    450       "title": "Grammar-Aligned Decoding",
    451       "authors": ["Kanghee Park", "Jiayu Wang", "Taylor Berg-Kirkpatrick", "Nadia Polikarpova", "Loris D'Antoni"],
    452       "year": 2024,
    453       "relevance": "Studies quality distortions from grammar-constrained decoding in LLMs, relevant to understanding constrained generation tradeoffs."
    454     },
    455     {
    456       "title": "pix2code: Generating Code from a Graphical User Interface Screenshot",
    457       "authors": ["Tony Beltramelli"],
    458       "year": 2018,
    459       "relevance": "Seminal work on end-to-end screenshot-to-code generation."
    460     },
    461     {
    462       "title": "Generating Structured Outputs from Language Models",
    463       "authors": ["Anonymous"],
    464       "year": 2025,
    465       "arxiv_id": "2501.10868",
    466       "relevance": "Structured generation frameworks for LLMs, relevant to constrained code generation."
    467     }
    468   ]
    469 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs