scan.json (31351B)
1 { 2 "paper": { 3 "title": "SlidesGen-Bench: Evaluating Slides Generation via Computational and Quantitative Metrics", 4 "authors": [ 5 "Yunqiao Yang", 6 "Wenbo Li", 7 "Houxing Ren", 8 "Zimu Lu", 9 "Ke Wang", 10 "Zhiyuan Huang", 11 "Zhuofan Zong", 12 "Mingjie Zhan", 13 "Hongsheng Li" 14 ], 15 "year": 2026, 16 "venue": "arXiv", 17 "arxiv_id": "2601.09487", 18 "doi": "10.48550/arXiv.2601.09487" 19 }, 20 "scan_version": 2, 21 "active_modules": ["experimental_rigor", "data_leakage"], 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The abstract states: 'Our code and data are available at https://github.com/YunqiaoYang/SlidesGen-Bench.' A GitHub URL is provided." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper announces the Slides-Align1.5k dataset and states code and data are available at the GitHub repository. The 189 benchmark instructions and QuizBank are described as part of the released assets." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No mention of requirements.txt, Dockerfile, conda environment, or library versions anywhere in the paper or appendix." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper contains no README-like instructions, commands, or scripts to replicate experiments." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "Tables 1, 2, 4, and 5 report only point estimates. Table 2 reports standard deviation of Spearman correlation across scenarios, but no confidence intervals or error bars on the main results." 
50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims 'SlidesGen-Bench demonstrates superior performance' and 'significantly surpassing' baselines (Section 3.2) based solely on comparing point estimates in Table 2 (0.71 vs 0.57). No statistical significance tests (t-tests, bootstrap tests, etc.) are reported." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "Results are reported as raw numbers (Spearman correlations, accuracy percentages, aesthetic scores) without formal effect sizes such as Cohen's d. The paper reports absolute differences but provides no standardized effect size measures." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The benchmark uses 189 instructions, 1.5k human annotations, and 50 parameter tuning pairs. None of these sample sizes are justified with power analysis or explicit reasoning for adequacy." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "Table 2 reports Std of Spearman correlation, but this represents variability across scenarios, not across experimental runs. Tables 1, 4, and 5 report single-run results with no standard deviations across runs, seeds, or replications." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Table 2 compares SlidesGen-Bench against PPT-Eval (PPTAgent), LLM-as-Judge Rating, and LLM-as-Judge Arena. Multiple baselines are included." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Baselines include PPTAgent (Zheng et al., 2025) and LLM-as-Judge methods from Zheng et al. (2023). These are contemporary and represent the state of the art in slide evaluation." 
82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Table 3 presents a comprehensive ablation study of the aesthetic metrics, testing individual components (Engagement, Harmony, Usability, Visual HRV) and their combinations. Section 3.3.1 analyzes contributions." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "The paper evaluates across three distinct dimensions: Content (QuizBank accuracy), Aesthetics (Harmony, Engagement, Usability, Visual Rhythm), and Editability (PEI levels). Human alignment uses Spearman correlation and Identical ratio." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": true, 96 "justification": "The Slides-Align1.5k dataset provides human preference rankings from annotators using a web-based interface (Appendix O). Table 2 compares automated metrics against these human judgments." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": false, 101 "justification": "Appendix E.1.3 uses 50 slide pairs for parameter optimization via Bayesian optimization. Table 2 evaluates on Slides-Align1.5k. The paper does not explicitly state that these two sets are non-overlapping, leaving potential data leakage between tuning and evaluation." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table 1 provides per-topic and per-difficulty breakdowns of QuizBank accuracy. Table 4 breaks down aesthetics into Usability, Engagement, Harmony, and Rhythm. Table 16 (Appendix J) gives per-purpose aesthetics breakdowns." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 3.3.2 'Analysis of Errors in QuizBank' provides a detailed failure taxonomy (Table 17): Missing Content (57.7%), Value Mismatch (21.9%), VLM Extraction Failures (6.6%). Tables 18-19 give per-model breakdowns and qualitative examples." 
112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The ablation study (Table 3) shows that Engagement alone performs poorly (Spearman 0.224). Table 5 shows all systems fail at L4+ editability. The error analysis reveals systematic failures in content preservation. Business topics are identified as a bottleneck (61.61% accuracy)." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims 'SlidesGen-Bench achieves a higher degree of alignment with human judgment than existing evaluation pipelines.' Table 2 supports this with Spearman 0.71 vs 0.57 for the best baseline. The claim of 'three core principles: universality, quantification, and reliability' is supported by the evaluation across diverse systems (Table 1), computational metrics (Section 2.3), and human alignment study (Table 2)." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper's main causal claims come from the ablation study (Table 3), which uses controlled manipulation of the metric set (evaluating each metric in isolation and in combinations). The claim that 'readability and visual clarity are the primary drivers of user preference' is supported by the ablation showing Usability alone at 0.574 vs Engagement at 0.224." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": true, 133 "justification": "Section 6 (Limitation) explicitly bounds scope: 'evaluation focuses exclusively on static visual content, overlooking temporal dynamics such as animations and slide transitions' and 'remains primarily English-centric.' The paper acknowledges limits to multilingual and specialized-domain generalization." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper does not discuss alternative explanations for its main findings. 
For example, the higher human alignment could be partly due to the choice of human annotation methodology rather than intrinsic metric quality. The paper also doesn't consider whether LLM evaluator bias in the QuizBank evaluation could systematically favor certain systems." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper explicitly frames QuizBank accuracy as a 'proxy for content quality' (Section 3.2: 'accuracy results, serving as a proxy for slide content quality'). The aesthetic metrics are directly validated against human preference rankings. The gap between measurement and claim is acknowledged." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "The paper uses 'GPT-4o' for annotation (Section 2.1) without a snapshot date or API version. The evaluated systems (Gamma, Kimi, Zhipu, etc.) are referenced by product name without underlying model versions. No model version identifiers are provided anywhere." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Full prompts are provided in the appendix: Figures 18-20 (QuizBank construction), Figure 21 (slide extraction), Figure 22 (quiz evaluation), Figures 23-24 (LLM-as-Judge baselines). Actual prompt text with formatting rules is included." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "The computational aesthetic metrics have detailed parameter tables (Tables 8-13 in Appendix E). However, no hyperparameters (temperature, top-p, max tokens) are reported for the LLM-based components: GPT-4o annotation, QuizBank construction agents, or the LLM evaluator in the open-book exam." 
161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The multi-agent QuizBank construction pipeline is described in detail (Section 2.2, Figure 3): Phase I (Forensic Analyst, 100k+ token context window), Phase II (Reflexion with hallucination checks and cyclic feedback), Phase III (Exam Setter with structured output). Each agent's role and I/O is specified." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 2.1 documents the preprocessing pipeline: 30k+ slides collected → length filter (5-40 pages) → python-pptx text extraction → GPT-4o topic annotation → Wikipedia content compilation → invalid/sparse filtering → 94 topic-based + 95 purpose-based instructions = 189 total. Filtering criteria are stated." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 6 'Limitation' is a dedicated section discussing two specific limitations of the framework." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 6 discusses specific threats: (1) the evaluation 'focuses exclusively on static visual content, overlooking temporal dynamics such as animations and slide transitions,' and (2) 'remains primarily English-centric.' These are specific to this study's design choices." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 6 states what the benchmark does NOT cover: animations, slide transitions, multilingual contexts, and specialized domains like medical or legal reports. The paper explicitly names excluded capabilities." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "The paper states 'Our code and data are available at https://github.com/YunqiaoYang/SlidesGen-Bench.' 
The Slides-Align1.5k dataset and benchmark instructions are described as released artifacts." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 2.1 describes collection in detail: ~30k human-authored slides from various sources, filtered by length (5-40 pages), text extracted via python-pptx, annotated by GPT-4o for topics/purposes, Wikipedia content compiled as source documents." 200 }, 201 "recruitment_methods_described": { 202 "applies": true, 203 "answer": false, 204 "justification": "Appendix O mentions a 'web-based annotation interface for ranking' but does not describe how human annotators were recruited, how many participated, their qualifications, or whether the recruitment method could introduce selection bias." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The full pipeline is documented: raw slides (30k+) → filtering → text extraction → GPT-4o annotation → Wikipedia compilation → 189 instructions. The QuizBank pipeline (Figure 3) documents extraction → verification → generation. Filtering counts are provided (30k → 189)." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding or acknowledgments section is present in the paper. The work involves CUHK MMLab and SenseTime Research, a major AI company, but no funding sources are disclosed." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: CUHK MMLab, SenseTime Research, CPII under InnoHK, and Shanghai AI Laboratory. All institutional connections are stated in the header." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "No funding is disclosed, so independence cannot be assessed. 
SenseTime Research (author affiliation) is a major AI company that could have indirect interest in slide generation benchmarking results, but this potential conflict is not addressed." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial disclosure statement is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "The paper evaluates 9 LLM-based slide generation systems but states no training data cutoff dates for any of them. GPT-4o is used for annotation without specifying its training cutoff." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "The benchmark source documents are compiled from Wikipedia, which is in every LLM's training data. The paper does not discuss whether the evaluated systems' LLMs may have seen this content during training, which could inflate content quality scores." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "Wikipedia content (the primary source for benchmark instructions) has been publicly available for years and is in LLM training sets. The paper does not address whether this creates an unfair advantage for systems with better memorization of Wikipedia content." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "The paper uses human annotators as evaluators of system outputs, not as research subjects. The humans provide preference rankings for metric validation, not data as study participants." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "Human annotators served as judges evaluating slide quality, not as study participants in human subjects research." 
261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human subjects study was conducted. Annotators were used for evaluation validation only." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human subjects study. Annotators provided preference rankings for metric validation." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "Not applicable — no experimental study with human participants as subjects." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "Not applicable — no experimental study with human participants as subjects." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "Not applicable — no human subjects study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No inference cost or latency is reported for running the evaluation pipeline. The QuizBank construction uses GPT-4o with 100k+ token context windows, and multiple LLM calls are made per evaluation, but costs are not quantified." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No computational budget is stated. The pipeline involves processing 189 instructions through 9 generation systems, running multi-agent QuizBank construction, computing aesthetic metrics, and conducting human annotation — none of these costs are quantified." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "No mention of random seeds or seed sensitivity analysis. The LLM-based components (QuizBank evaluation, GPT-4o annotation) are stochastic, but results are reported as single-run point estimates without seed variation analysis." 
305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of experimental runs is not stated for any LLM-based component. The computational aesthetic metrics are deterministic, but the QuizBank evaluation (LLM open-book exam) has inherent stochasticity not characterized by multiple runs." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": true, 314 "justification": "Appendix E.1.3 documents parameter optimization: 50 human-ranked slide pairs used with Bayesian optimization to maximize Spearman ρ. Table 13 shows initial vs. optimized parameters and their impact on correlation. Grid search is mentioned." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "Appendix E documents a three-stage methodology: (1) empirical distribution analysis on 3,000+ human slides, (2) initial parameter selection from distribution characteristics, (3) optimization against human preference rankings. Table 13 shows which parameters changed and which retained literature values." 320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No formal statistical hypothesis tests are performed in the paper, so multiple comparison correction is structurally inapplicable. The paper compares systems via point estimates without any statistical testing." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors propose SlidesGen-Bench and compare it against baseline evaluation methods (PPT-Eval, LLM-as-Judge). No discussion of author-evaluation bias — the human alignment dataset and parameter tuning were all constructed by the authors, who also designed the metrics being evaluated." 
330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "The paper compares their computational metrics against LLM-based evaluation baselines without discussing the compute cost differences. Their method uses deterministic image processing while baselines use expensive LLM calls, but this tradeoff is not quantified." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": true, 339 "justification": "The paper directly addresses construct validity by validating metrics against human preference rankings (Table 2, Spearman correlation 0.71). The ablation study (Table 3) decomposes which metric components drive alignment. Section 4 discusses limitations of prior evaluation approaches (reference-dependent, LLM bias)." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "The evaluated systems (Gamma, Kimi, NotebookLM, etc.) are tested as bundled commercial products. The paper does not claim to isolate model capability from the generation framework — the tool IS the thing being evaluated." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "Source documents are compiled from Wikipedia, whose content was publicly available before the evaluated systems' underlying models were trained. The paper does not discuss whether LLMs having seen this Wikipedia content during training could inflate content quality scores." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "The QuizBank evaluation uses an LLM to answer questions about generated slides. If the evaluator LLM has prior knowledge of the Wikipedia source content, it might correctly answer questions even from poorly generated slides. This potential feature leakage is not discussed." 
357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether training and test data are independent. The 189 instructions are drawn from Wikipedia and real-world scenarios — no analysis of whether these overlap with the evaluated systems' training data." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No concrete leakage detection or prevention methods are applied. No canary strings, membership inference tests, temporal splits, or decontamination pipelines are used." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "SlidesGen-Bench achieves higher human alignment than existing evaluation pipelines (Spearman 0.71 vs 0.57 for the best baseline).", 373 "evidence": "Table 2 shows Average Spearman of 0.71 for SlidesGen-Bench vs 0.57 for LLM-as-Judge Rating, 0.53 for PPTAgent, and 0.52 for LLM-as-Judge Arena. Lowest Std (0.16) and highest Identical ratio (32.6 vs 20.7).", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "The full multi-dimensional aesthetic method outperforms any single metric component.", 378 "evidence": "Table 3 ablation: Full Method achieves Spearman 0.710, vs best single metric Visual HRV at 0.618, Usability at 0.574, Harmony at 0.312, Engagement at 0.224.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "Zhipu achieves the highest overall content quality at 88.29% QuizBank accuracy.", 383 "evidence": "Table 1 shows Zhipu at 88.29% average accuracy across all topics and difficulty levels, leading in High difficulty (84.07%) and Medium (88.97%) categories.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Skywork-Banana achieves the highest aesthetics score (27.28) across all evaluated systems.", 388 "evidence": "Table 4 shows Skywork-Banana leading with total Aesthetics 27.28, highest Engagement (8.30), and competitive Usability (5.62).", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "A 
'Structural Barrier' exists at Level 3 editability, with only Quark breaching it.", 393 "evidence": "Table 5 shows that 6 of 9 systems cap at L2 (Vector), 2 at L1 (Patchwork), 1 at L0 (Static). Only Quark reaches L3 (Structural). No system reaches L4 or L5.", 394 "supported": "strong" 395 }, 396 { 397 "claim": "Missing Content is the dominant failure mode, accounting for 57.7% of errors in generated slides.", 398 "evidence": "Section 3.3.2 and Table 17 analyze 2,499 incorrect instances (19.2% of 13,023 total). Missing Content: 57.7%, Value Mismatch: 21.9%, VLM Failure: 6.6%.", 399 "supported": "strong" 400 }, 401 { 402 "claim": "Usability is the primary driver of user preference among aesthetic metrics.", 403 "evidence": "Table 3 ablation shows Usability alone achieves Spearman 0.574, substantially outperforming Engagement (0.224) and Harmony (0.312) in isolation, although Visual HRV alone scores higher (0.618).", 404 "supported": "moderate" 405 } 406 ], 407 "methodology_tags": ["benchmark-eval"], 408 "key_findings": "SlidesGen-Bench introduces a three-dimensional benchmark (Content, Aesthetics, Editability) for evaluating slide generation, achieving Spearman correlation of 0.71 with human preference versus 0.57 for the best existing method. Across 9 evaluated systems, Zhipu leads in content quality (88.29%), Skywork-Banana in aesthetics (27.28), and only Quark breaches the Level 3 structural editability barrier. The error analysis reveals Missing Content (57.7%) as the dominant failure mode, and the ablation study shows text usability/readability is a strong single predictor of human aesthetic preference (Spearman 0.574), second only to Visual HRV (0.618).", 409 "red_flags": [ 410 { 411 "flag": "No statistical significance testing", 412 "detail": "All system comparisons and claims of superiority (e.g., 'significantly surpassing') are based on comparing point estimates without any statistical tests. With 9 systems evaluated across multiple metrics, the risk of spurious differences is substantial." 
413 }, 414 { 415 "flag": "Wikipedia contamination risk", 416 "detail": "Benchmark source documents are compiled from Wikipedia, which is in every LLM's training data. Systems whose underlying LLMs have better memorization of Wikipedia content could score higher on content quality without actually being better at slide generation. This is not discussed." 417 }, 418 { 419 "flag": "Circular LLM evaluation", 420 "detail": "The QuizBank evaluation uses an LLM to assess content quality of LLM-generated slides. If the evaluator LLM shares knowledge with the generation LLMs (both trained on Wikipedia), it could correctly answer questions regardless of slide quality, inflating content scores." 421 }, 422 { 423 "flag": "Sparse human annotation methodology", 424 "detail": "The human alignment study (Slides-Align1.5k) is central to the paper's claims, but annotator recruitment methods, number of annotators, inter-annotator agreement, qualifications, and demographic information are not reported. Only a screenshot of the annotation interface is provided (Appendix O)." 425 }, 426 { 427 "flag": "Potential tuning-evaluation data overlap", 428 "detail": "Aesthetic metric parameters were optimized on 50 human-ranked slide pairs (Appendix E.1.3), then evaluated on Slides-Align1.5k. The paper does not explicitly state these sets are non-overlapping." 429 }, 430 { 431 "flag": "No variance across stochastic components", 432 "detail": "The QuizBank evaluation and GPT-4o annotation are stochastic processes, but all results are reported as single-run point estimates. No seed sensitivity or multi-run variance analysis is provided for any LLM-based component." 433 } 434 ], 435 "cited_papers": [ 436 { 437 "title": "GPT-4 Technical Report", 438 "authors": ["OpenAI"], 439 "year": 2023, 440 "arxiv_id": "2303.08774", 441 "relevance": "Foundational LLM capability paper; GPT-4o is used in this benchmark's pipeline for annotation." 
442 }, 443 { 444 "title": "LLaMA: Open and Efficient Foundation Language Models", 445 "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"], 446 "year": 2023, 447 "arxiv_id": "2302.13971", 448 "relevance": "Open-source LLM foundational to many slide generation systems evaluated." 449 }, 450 { 451 "title": "Executable Code Actions Elicit Better LLM Agents", 452 "authors": ["Xingyao Wang", "Yangyi Chen", "Lifan Yuan"], 453 "year": 2024, 454 "relevance": "Code agent capabilities relevant to code-driven slide generation paradigm." 455 }, 456 { 457 "title": "OpenDevin: An Open Platform for AI Software Developers as Generalist Agents", 458 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 459 "year": 2024, 460 "arxiv_id": "2407.16741", 461 "relevance": "Open-source AI agent platform relevant to automated generation workflows." 462 }, 463 { 464 "title": "A Survey on Code Generation with LLM-Based Agents", 465 "authors": ["Yihong Dong", "Xue Jiang", "Jiaru Qian"], 466 "year": 2025, 467 "relevance": "Survey of LLM code generation capabilities relevant to code-driven slide generation." 468 }, 469 { 470 "title": "No Free Labels: Limitations of LLM-as-a-Judge without Human Grounding", 471 "authors": ["Michael Krumdick", "Charles Lovering", "Varshini Reddy"], 472 "year": 2025, 473 "relevance": "Directly relevant to LLM evaluation limitations; cited as motivation for human-aligned metrics." 474 }, 475 { 476 "title": "Judging the Judges: Evaluating Alignment and Vulnerabilities in LLMs-as-Judges", 477 "authors": ["Aman Singh Thakur", "Kartik Choudhary"], 478 "year": 2025, 479 "relevance": "Documents LLM-as-Judge vulnerabilities including stochasticity and bias, motivating computational metrics." 480 }, 481 { 482 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 483 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 484 "year": 2023, 485 "relevance": "Foundational LLM-as-Judge methodology paper; used as baseline in this benchmark." 
486 }, 487 { 488 "title": "Large Language Models are Not Fair Evaluators", 489 "authors": ["Peiyi Wang", "Lei Li", "Liang Chen"], 490 "year": 2024, 491 "relevance": "Documents verbosity bias and position bias in LLM evaluation, motivating computational alternatives." 492 }, 493 { 494 "title": "Verbosity Bias in Preference Labeling by Large Language Models", 495 "authors": ["Keita Saito", "Akifumi Wachi"], 496 "year": 2023, 497 "relevance": "Documents LLM verbosity bias in evaluation, cited as limitation of existing slide evaluation pipelines." 498 }, 499 { 500 "title": "PPTAgent: Generating and Evaluating Presentations Beyond Text-to-Slides", 501 "authors": ["Hao Zheng", "Xinyan Guan", "Hao Kong"], 502 "year": 2025, 503 "relevance": "Primary baseline for slide evaluation; PPT-Eval method is compared against in Table 2." 504 }, 505 { 506 "title": "Qwen2.5 Technical Report", 507 "authors": ["An Yang", "Baosong Yang"], 508 "year": 2024, 509 "arxiv_id": "2412.15115", 510 "relevance": "Recent LLM technical report relevant to model capabilities in generation tasks." 511 } 512 ] 513 }