scan.json (30584B)
1 { 2 "paper": { 3 "title": "StructTest: Benchmarking LLMs' Reasoning through Compositional Structured Outputs", 4 "authors": [ 5 "Hailin Chen", 6 "Fangkai Jiao", 7 "Mathieu Ravaut", 8 "Nawshad Farruque", 9 "Xuan Phi Nguyen", 10 "Chengwei Qin", 11 "Manan Dey", 12 "Bosheng Ding", 13 "Caiming Xiong", 14 "Shafiq Joty", 15 "Yingbo Zhou" 16 ], 17 "year": 2024, 18 "venue": "arXiv (Preprint, under review)", 19 "arxiv_id": "2412.18011", 20 "doi": "10.48550/arXiv.2412.18011" 21 }, 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper provides a GitHub link in the abstract: 'Code & Data: https://github.com/SparkJiao/StructTest'." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The GitHub link is labeled 'Code & Data', indicating the benchmark data is released. The paper also uses publicly available datasets (e.g., GSM8K) for some subtasks." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided in the paper." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link exists but the paper itself contains no README-like instructions for replicating experiments." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All tables (1-4) report point estimates only with no confidence intervals, error bars, or ± notation. Results are presented as single accuracy percentages." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper makes comparative claims (e.g., 'DeepSeek-R1 consistently outperform others') based solely on comparing raw accuracy numbers without any statistical significance tests." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "The paper reports Pearson correlations (92.5% with Arena, 96.3% with MMLU) and relative performance differences (e.g., '70% drop in accuracy on Hard tasks compared to Easy ones,' 'error rates as high as 84%'). Full accuracy tables provide baseline context for all comparisons." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The number of test examples per task is not stated in the paper, and no justification for sample sizes is provided. No power analysis is discussed." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No variance, standard deviation, or multi-run spread measures are reported. All results appear to be single-run evaluations." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "The paper evaluates 17 LLMs that serve as mutual baselines. Results are also compared to external benchmarks (ChatBot Arena, MMLU) in Table 4." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Baselines include models current as of late 2024: DeepSeek-R1, GPT-4o, Claude-3.5-sonnet, Gemini-1.5-pro. Appendix A confirms inference was performed Nov 27 – Dec 14, 2024." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": false, 86 "justification": "Figure 5 shows per-domain correlation analysis, but no formal ablation study tests which benchmark components contribute most to discriminative power. They do not remove domains or difficulty levels to measure their individual contribution to overall benchmark validity." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Results include per-task accuracy (exact match, pass rates), aggregate Easy/Hard/All scores, and Pearson correlation with external benchmarks (Arena and MMLU). Error rate analysis by features is also provided (Figures 2-4)." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "No human evaluation is performed. All assessment is rule-based and programmatic, which the paper positions as a feature. However, validating the rule-based evaluator against human judgments would strengthen the benchmark." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "Models are evaluated as-is on StructTest without any tuning. For math tasks, standard GSM8K test splits are used. Section 3.5 mentions plans for a confidential held-out test set for future iterations." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Extensive per-category breakdowns in Tables 1-3: by domain (Summarization, Code, HTML, Math), difficulty (Easy/Hard), and individual subtask (e.g., 7 summarization formats, 4 code tasks). Figure 5 shows per-domain correlation." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Figure 2 analyzes GPT-4o error rates by bullet point count and length. Figure 3 shows tag-count error distributions for HTML. Figure 4 shows format-specific error rates for math. Section 4.1 discusses specific failure patterns (e.g., indentation collapse, long output formatting failures)." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": false, 116 "justification": "While the paper reports that models fail on the benchmark (which is expected by design), it does not report negative results about its own methodology — e.g., task designs that didn't work, evaluation rules that needed revision, or approaches tried and abandoned." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims that StructTest is challenging (supported by Table 1: best model at 74.76%), provides unbiased evaluation (rule-based by design), and serves as a proxy for reasoning (supported by 92.5% correlation with Arena in Table 4) are supported by the results." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": false, 128 "justification": "The paper claims models 'may have overfitted to specific formats and styles' (Section 4.1) based on performance drops when format changes, but doesn't rule out alternative explanations (e.g., novel task difficulty independent of overfitting). Ablation studies or controlled experiments would be needed to support this causal claim." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title claims to benchmark 'LLMs' Reasoning' but the benchmark measures structured output generation and instruction following across 4 domains. The abstract claims it is 'a robust proxy for measuring reasoning capabilities' — this is a broad generalization from format-following to general reasoning that is not adequately bounded." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper does not discuss alternative explanations for its key findings. For the high correlation with Arena/MMLU, it doesn't consider that both may simply measure instruction-following ability rather than reasoning. For format overfitting claims, it doesn't consider that novel/complex tasks are inherently harder regardless of memorization." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper measures structured output compliance and calls it 'reasoning.' While Section 1 argues structured outputs require 'decomposing instructions, understanding and retaining subtle constraints during extended decoding, and executing logical actions,' there is no rigorous analysis distinguishing format-following ability from reasoning ability. The word 'proxy' is used but the gap is not adequately discussed." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Table 5 (Appendix A) provides exact API versions for all closed-source models (e.g., 'gpt-4o-2024-08-06', 'claude-3-5-sonnet-20241022', 'gemini-1.5-pro-002'). Open-source models are identified by specific model names and sizes (e.g., 'Llama-3.1-70B', 'DeepSeek-R1')." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Full prompt examples are provided for every task type in Appendix B (Figures 6-17), including actual prompt text with format instructions, examples, and response templates." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "No generation hyperparameters are reported — temperature, top-p, max tokens, and other sampling settings are not mentioned anywhere in the paper." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. Models are prompted directly with task instructions and generate responses in a single pass." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Sections 3.1-3.4 describe task construction in detail: use of Python ast package for code tasks, uniform sampling of parameters from fixed intervals for HTML/summarization, 7 final answer styles and 5 CoT styles for math, and the construction of Easy/Hard splits." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "There is no dedicated Limitations or Threats to Validity section. Section 4.3 discusses updating the benchmark and Section 3.5 discusses contamination robustness, but neither constitutes a substantive limitations discussion." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No specific threats to validity are discussed. The paper does not address issues like whether format-following correlates with reasoning only for current model families, whether the 14-model correlation analysis is statistically underpowered, or whether the benchmark's difficulty comes from genuine reasoning demands vs. unfamiliar formatting." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "No explicit scope boundaries are stated. The paper claims StructTest is a proxy for 'general reasoning ability' and 'comprehensive model evaluation' without stating what it does not measure — e.g., creative generation, multi-turn dialogue, factual knowledge, or long-context reasoning." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "The benchmark tasks are released via GitHub, but raw model outputs (predictions for each of the 17 LLMs on each task) are not explicitly stated to be available for independent verification." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Sections 3.1-3.4 describe task construction procedures in detail. The sources of underlying data are specified (e.g., GSM8K for math, code snippets for code tasks). Evaluation rules are formalized with mathematical notation (Equations 1-4)." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data sources are standard benchmarks (GSM8K) and programmatically constructed tasks." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": false, 209 "justification": "While task construction rules are described, the paper does not report exact counts of test examples per task, filtering criteria applied to source data, or the full pipeline from data collection to final benchmark composition." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding information or acknowledgments section is present in the paper." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: Nanyang Technological University, Salesforce Research, I2R A*STAR Singapore, and University of Alberta." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "Without funding disclosure, independence cannot be assessed. Multiple authors are from Salesforce Research, which has commercial interests in AI/LLM technology, though Salesforce models are not directly evaluated." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial disclosure statement is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "Training data cutoff dates are not stated for any of the 17 evaluated models. Only the inference date range is provided (Nov 27 – Dec 14, 2024)." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": true, 243 "justification": "Section 3.5 extensively discusses contamination resistance: 'its tasks are designed to be highly unlikely to have been encountered by existing models during training.' The paper also notes that 'reliance on underlying data from existing benchmarks in some subtasks...raises concerns about data memorization' (Section 1)." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": true, 248 "justification": "Section 3.5 addresses contamination directly, arguing that StructTest's compositional design and data-decoupled evaluation mitigate contamination risk. The paper also acknowledges that underlying datasets like GSM8K may be memorized, and argues that format requirements make memorization insufficient for high scores." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in the study." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in the study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in the study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in the study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in the study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in the study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in the study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No API costs, token counts, or latency figures are reported despite evaluating 17 models. The paper claims StructTest is 'cost-effective' and 'cheap to run' but never quantifies the actual cost." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "The paper states inference was performed Nov 27 – Dec 14, 2024 (Appendix A) but does not quantify total compute: no GPU hours, API spend, or hardware specifications." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "All results appear to be single-run evaluations. No multi-seed analysis or seed sensitivity reporting is present." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of experimental runs per model is never stated. It appears each model was evaluated once." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search is described. Generation settings (temperature, sampling) are not even reported, let alone the search budget for finding them." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": false, 319 "justification": "No explanation is provided for how model generation settings were chosen. The paper does not discuss whether default or tuned settings were used." 320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "The paper compares 17 models across multiple domains and difficulty levels without any statistical tests or correction for multiple comparisons." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors designed the benchmark and evaluated all models on it. They do not discuss potential bias in benchmark design that could favor certain model architectures or training approaches." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "Models ranging from 1.5B parameters (DeepSeek-R1-Distill-Qwen-1.5B) to frontier closed-source models are compared without any discussion of compute differences or performance per compute unit." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "The paper claims StructTest measures 'reasoning' but actually measures structured output generation and instruction following. The connection between format compliance and reasoning is asserted ('skills closely aligned with complex reasoning') rather than empirically validated. The Pearson correlation with Arena/MMLU is presented as evidence but could reflect shared instruction-following ability rather than reasoning." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "No scaffolding is involved. Models are prompted directly in a single-turn evaluation." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": true, 351 "justification": "Section 3.5 and Section 1 discuss temporal leakage: 'The reliance on underlying data from existing benchmarks in some subtasks, coupled with notable performance drops, raises concerns about data memorization rather than genuine generalization.' The paper argues StructTest's format requirements mitigate memorization effects." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the evaluation setup leaks information through context or provides hints not available in real usage." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "The paper does not verify whether code snippets or other source data overlap with model training data. While Section 3.5 argues contamination is unlikely by design, no empirical independence verification is performed." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No concrete leakage detection methods (canary strings, membership inference, n-gram overlap analysis) are applied. The contamination argument is purely by design." 367 } 368 } 369 }, 370 "scan_version": 3, 371 "active_modules": ["experimental_rigor", "data_leakage"], 372 "claims": [ 373 { 374 "claim": "StructTest remains challenging even for top-performing models, with DeepSeek-R1 achieving only 74.76% overall and 67.10% on Hard tasks.", 375 "evidence": "Table 1 shows DeepSeek-R1 at 74.76% StructTest-All and 67.10% StructTest-Hard. GPT-4o scores 73.16% overall, 57.29% Hard. Many models collapse below 10% on specific Hard subtasks.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "StructTest achieves Pearson correlation of 92.5% with ChatBot Arena and 96.3% with MMLU, validating it as a proxy for general reasoning.", 380 "evidence": "Table 4 and Section 4.2 report correlations computed over 14 models with both Arena scores and MMLU accuracy.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Existing math reasoning comparisons between models are likely unreliable and unfair unless tested across diverse formats.", 385 "evidence": "Figure 4 shows GPT-4o error rates vary from 0% to 84% across 20 Hard math formats on GSM8K. Section 4.1 notes models scoring 90%+ on standard GSM8K drop to 60-77% on StructTest math.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Open-source DeepSeek-R1 is comparable to top closed-source models on structured output tasks.", 390 "evidence": "Table 1 shows DeepSeek-R1 (74.76%) competitive with GPT-4o (73.16%) and Claude-3.5-sonnet (72.62%), outperforming them on Hard splits (67.10% vs 57.29% and 53.69%).", 391 "supported": "strong" 392 }, 393 { 394 "claim": "StructTest is robust to data contamination due to its compositional design decoupled from underlying data.", 395 "evidence": "Section 3.5 argues by design: tasks are compositional and unlikely to appear in training data, benchmark can be regularly updated. No empirical contamination testing is performed.", 396 "supported": "weak" 397 } 398 ], 399 "methodology_tags": ["benchmark-eval"], 400 "key_findings": "StructTest introduces a rule-based, programmatically verifiable benchmark evaluating 17 LLMs across 4 structured output domains (summarization, code, HTML, math). Even top models like DeepSeek-R1 achieve only 74.76% overall, with many models collapsing on Hard tasks. The benchmark correlates strongly with ChatBot Arena (r=0.925) and MMLU (r=0.963) over 14 models. Performance on format-constrained math tasks drops dramatically compared to standard benchmarks (e.g., Gemini-1.5-pro from 91.7% standard to 77.3% Easy / 73.4% Hard), suggesting models may have overfitted to specific answer formats.", 401 "red_flags": [ 402 { 403 "flag": "No error bars or uncertainty quantification", 404 "detail": "All 17 models are evaluated with single-run results across all tasks. No confidence intervals, standard deviations, or multi-run variance are reported anywhere in the paper." 405 }, 406 { 407 "flag": "Correlation claim on underpowered sample", 408 "detail": "The headline claim of 92.5% Pearson correlation with ChatBot Arena is computed on only 14 data points. With n=14, this correlation has very wide confidence intervals and is sensitive to individual outliers, but no uncertainty estimate is provided." 409 }, 410 { 411 "flag": "Construct validity gap: format-following framed as reasoning", 412 "detail": "The paper measures instruction-following and structured output compliance but frames this as 'reasoning.' The high correlation with Arena (which also measures instruction-following/helpfulness) could reflect shared instruction-following ability rather than reasoning. No experiment isolates the reasoning component from format compliance." 413 }, 414 { 415 "flag": "No limitations section", 416 "detail": "The paper has no dedicated limitations or threats-to-validity section. It does not discuss what StructTest cannot measure, whether the benchmark's difficulty reflects genuine reasoning demands vs. unfamiliar formatting, or the statistical limitations of the 14-model correlation analysis." 417 }, 418 { 419 "flag": "Missing generation hyperparameters", 420 "detail": "Temperature, top-p, max tokens, and other generation settings are not reported for any of the 17 models. These settings significantly affect output quality and could confound comparisons between models." 421 } 422 ], 423 "cited_papers": [ 424 { 425 "title": "SWE-bench: Can language models resolve real-world github issues?", 426 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"], 427 "year": 2024, 428 "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks, directly relevant to LLM code generation evaluation." 429 }, 430 { 431 "title": "Chatbot Arena: An open platform for evaluating llms by human preference", 432 "authors": ["Wei-Lin Chiang", "Lianmin Zheng", "Ying Sheng"], 433 "year": 2024, 434 "arxiv_id": "2403.04132", 435 "relevance": "Human-based LLM evaluation platform used as the primary external validation benchmark for StructTest's correlation analysis." 436 }, 437 { 438 "title": "Measuring massive multitask language understanding", 439 "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart", "Andy Zou", "Mantas Mazeika", "Dawn Song", "Jacob Steinhardt"], 440 "year": 2020, 441 "relevance": "MMLU benchmark, the second external validation target for StructTest and a widely used LLM reasoning evaluation." 442 }, 443 { 444 "title": "How much are LLMs contaminated? A comprehensive survey and the LLMSanitize library", 445 "authors": ["Mathieu Ravaut", "Bosheng Ding", "Fangkai Jiao", "Hailin Chen"], 446 "year": 2024, 447 "relevance": "Survey on LLM benchmark contamination, directly motivating StructTest's contamination-resistant design." 448 }, 449 { 450 "title": "FOFO: A benchmark to evaluate LLMs' format-following capability", 451 "authors": ["Congying Xia", "Chen Xing", "Jiangshu Du"], 452 "year": 2024, 453 "relevance": "Prior work on evaluating LLMs' format-following ability, the most directly related benchmark to StructTest." 454 }, 455 { 456 "title": "Cheating automatic LLM benchmarks: Null models achieve high win rates", 457 "authors": ["Xiaosen Zheng", "Tianyu Pang", "Chao Du"], 458 "year": 2024, 459 "relevance": "Demonstrates vulnerability of model-based evaluation benchmarks, motivating StructTest's rule-based approach." 460 }, 461 { 462 "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena", 463 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 464 "year": 2023, 465 "relevance": "MT-Bench and the LLM-as-judge paradigm, representing model-based evaluation that StructTest aims to complement." 466 }, 467 { 468 "title": "Length-controlled AlpacaEval: A simple way to debias automatic evaluators", 469 "authors": ["Yann Dubois", "Balázs Galambosi", "Percy Liang", "Tatsunori B. Hashimoto"], 470 "year": 2024, 471 "arxiv_id": "2404.04475", 472 "relevance": "Addresses length bias in model-based evaluation, a key limitation that StructTest's rule-based approach avoids." 473 }, 474 { 475 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 476 "authors": ["Naman Jain", "King Han", "Alex Gu"], 477 "year": 2024, 478 "relevance": "Contamination-free code benchmark sharing StructTest's goal of contamination-resistant LLM evaluation." 479 }, 480 { 481 "title": "Training verifiers to solve math word problems", 482 "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"], 483 "year": 2021, 484 "relevance": "GSM8K benchmark used as the underlying data source for StructTest's math reasoning evaluation tasks." 485 }, 486 { 487 "title": "LLMs are biased towards output formats! Systematically evaluating and mitigating output format bias of LLMs", 488 "authors": ["Xuan Long Do", "Hai Nguyen Ngoc", "Tiviatis Sim"], 489 "year": 2024, 490 "relevance": "Studies how format instructions influence LLM task performance, closely related to StructTest's core investigation." 491 }, 492 { 493 "title": "Struc-bench: Are large language models really good at generating complex structured data?", 494 "authors": ["Xiangru Tang", "Yiming Zong", "Jason Phang"], 495 "year": 2023, 496 "relevance": "Prior benchmark for evaluating LLMs on structured generation, including HTML, directly preceding StructTest's approach." 497 } 498 ], 499 "engagement_factors": { 500 "practical_relevance": { 501 "score": 2, 502 "justification": "The benchmark and evaluation code are released and could be used by practitioners to evaluate LLMs, though it requires setup and is primarily a research tool." 503 }, 504 "surprise_contrarian": { 505 "score": 1, 506 "justification": "The finding that models lose up to 70% accuracy with format constraints and may be overfitting to popular answer formats is mildly surprising but aligns with growing intuition about benchmark fragility." 507 }, 508 "fear_safety": { 509 "score": 0, 510 "justification": "No AI safety or security concerns are raised by this benchmark evaluation paper." 511 }, 512 "drama_conflict": { 513 "score": 1, 514 "justification": "Implicit 'current benchmarks are unreliable' angle with format overfitting claims, but no strong controversy or direct challenges to specific parties." 515 }, 516 "demo_ability": { 517 "score": 2, 518 "justification": "Code and data released on GitHub (github.com/SparkJiao/StructTest), enabling users to evaluate their own models." 519 }, 520 "brand_recognition": { 521 "score": 2, 522 "justification": "From Salesforce Research and NTU; evaluates well-known models (GPT-4o, DeepSeek-R1, Claude-3.5-sonnet) but the benchmark itself is not yet widely known." 523 } 524 } 525 }