scan.json (23969B)
1 { 2 "paper": { 3 "title": "ByteSized32Refactored: Towards an Extensible Interactive Text Games Corpus for LLM World Modeling and Evaluation", 4 "authors": [ 5 "Haonan Wang", 6 "Junfeng Sun", 7 "Xingdi Yuan", 8 "Ruoyao Wang", 9 "Ziang Xiao" 10 ], 11 "year": 2025, 12 "venue": "arXiv", 13 "arxiv_id": "2509.23979" 14 }, 15 "checklist": { 16 "artifacts": { 17 "code_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "A GitHub link is provided in the paper: https://github.com/isle-dev/BYTESIZED32-Refactored (footnote 1 on p.1)." 21 }, 22 "data_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The corpus itself (32 refactored text games) is available via the GitHub repository. The original ByteSized32 is also publicly available. The evaluation uses these public corpora." 26 }, 27 "environment_specified": { 28 "applies": true, 29 "answer": false, 30 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. The paper mentions using Python and OpenAI's API but does not specify library versions or dependencies." 31 }, 32 "reproduction_instructions": { 33 "applies": true, 34 "answer": false, 35 "justification": "No step-by-step reproduction instructions are provided in the paper. The experimental procedure is described at a high level (one-shot prompting with GPT-4o, reflection steps) but there are no specific commands or scripts to replicate the experiments." 36 } 37 }, 38 "statistical_methodology": { 39 "confidence_intervals_or_error_bars": { 40 "applies": true, 41 "answer": false, 42 "justification": "All results are reported as single point estimates (e.g., '56.25%', '81.25%') with no confidence intervals or error bars in any table or figure." 43 }, 44 "significance_tests": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper makes comparative claims (e.g., Refactored vs. Original, GPT-4o vs. 
GPT-4) based solely on comparing raw percentages without any statistical significance tests." 48 }, 49 "effect_sizes_reported": { 50 "applies": true, 51 "answer": false, 52 "justification": "While the paper reports raw percentage differences (e.g., '+20.7%' for winnability), no formal effect size measures are provided. The delta columns in Tables 5-8 show raw differences but lack baseline context for most metrics, and no standardized effect sizes are reported." 53 }, 54 "sample_size_justified": { 55 "applies": true, 56 "answer": false, 57 "justification": "The sample size of N=96 generated games (32 games x 3 seeds) is not justified. The paper provides no power analysis or discussion of whether this sample size is sufficient for the claims being made." 58 }, 59 "variance_reported": { 60 "applies": true, 61 "answer": false, 62 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run numbers with no indication of variability." 63 } 64 }, 65 "evaluation_design": { 66 "baselines_included": { 67 "applies": true, 68 "answer": true, 69 "justification": "The original ByteSized32 corpus serves as the baseline, and results are compared between the Refactored and Original corpora across all four evaluation dimensions (Tables 3-8)." 70 }, 71 "baselines_contemporary": { 72 "applies": true, 73 "answer": true, 74 "justification": "The baseline is the original ByteSized32 (Wang et al., 2023), which is the direct predecessor that this work extends. This is the most appropriate baseline for this type of contribution." 75 }, 76 "ablation_study": { 77 "applies": true, 78 "answer": false, 79 "justification": "No ablation study is performed. The paper does not separately evaluate the contribution of GameBasic.py abstraction vs. the code modularity optimizations, making it impossible to attribute improvements to specific components."
80 }, 81 "multiple_metrics": { 82 "applies": true, 83 "answer": true, 84 "justification": "Four evaluation dimensions are used: Technical Validity (three sub-metrics), Specification Compliance (three sub-metrics), Physical Reality Alignment, and Winnability." 85 }, 86 "human_evaluation": { 87 "applies": true, 88 "answer": false, 89 "justification": "All evaluation is automated (Python interpreter checks, GPT-based alignment judgments, automated specification matching). No human evaluation of generated game quality is performed, though the paper makes subjective quality claims about game generation." 90 }, 91 "held_out_test_set": { 92 "applies": true, 93 "answer": true, 94 "justification": "The paper uses 'unseen evaluation' task specifications (Section 4.1) — the model generates games for specifications not used as the one-shot example, maintaining separation between demonstration and test." 95 }, 96 "per_category_breakdown": { 97 "applies": true, 98 "answer": true, 99 "justification": "Results are broken down by evaluation dimension (validity, compliance, alignment, winnability) and by sub-metrics within each dimension (e.g., game initialization, valid actions, runnable game within Technical Validity)." 100 }, 101 "failure_cases_discussed": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper discusses failure cases, particularly GPT-5's output formatting failures (Appendix C, Figure 6), API timeout issues (Appendix D), and the decline in technical validity on the refactored corpus compared to the original." 105 }, 106 "negative_results_reported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper honestly reports that ByteSized32Refactored shows worse Technical Validity than the original (Tables 3-4), and that Physical Reality Alignment initially drops with the refactored code. GPT-5 failures are also reported in the appendix." 
110 } 111 }, 112 "claims_and_evidence": { 113 "abstract_claims_supported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The abstract claims 'a mix of performance' with improvements on two dimensions and decreases on the other two, which is accurately reflected in the results (compliance/winnability improve; validity/initial alignment decrease)." 117 }, 118 "causal_claims_justified": { 119 "applies": true, 120 "answer": false, 121 "justification": "The paper makes causal claims such as 'the hierarchical structure of the refactored code presents new challenges for LLMs' (Section 5) and 'abstraction increases reasoning demands' (Summary of Results). These causal attributions are not adequately justified — the refactored vs. original comparison confounds multiple factors (code length, structure, abstraction level) without controlled manipulation." 122 }, 123 "generalization_bounded": { 124 "applies": true, 125 "answer": false, 126 "justification": "The title claims relevance to 'LLM World Modeling and Evaluation' broadly, but experiments use only GPT-4o (main) and GPT-4/GPT-5 (appendix). Claims about 'LLMs' are not bounded to the tested models. The abstract says 'extensible' and 'scalable' without bounding to the specific domain of common-sense text games." 127 }, 128 "alternative_explanations_discussed": { 129 "applies": true, 130 "answer": true, 131 "justification": "The Limitations section acknowledges that 'observed improvements stem from structural biases rather than genuine advancements in modeling ability' (Limitation 1) and that evaluation metrics have 'limited construct validity' (Limitation 2). These are specific alternative explanations." 132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper refers to 'GPT-4o', 'GPT-4', and 'GPT-5' without specifying snapshot dates or API versions (e.g., 'gpt-4o-2024-05-13'). 
Marketing names without version identifiers are not sufficient." 139 }, 140 "prompts_provided": { 141 "applies": true, 142 "answer": true, 143 "justification": "Full prompt texts are provided in Appendix F for game generation, specification compliance evaluation, physical reality alignment evaluation, reflection, and winnability evaluation. The actual prompt templates with placeholder variables are shown." 144 }, 145 "hyperparameters_reported": { 146 "applies": true, 147 "answer": true, 148 "justification": "Appendix A reports hyperparameters for both GPT-4o (temperature=0.0, top-p=1, frequency-penalty=0.0, presence-penalty=0.0) and GPT-5 (temperature=1, top-p=1, frequency-penalty=0.0, presence-penalty=0.0)." 149 }, 150 "scaffolding_described": { 151 "applies": true, 152 "answer": true, 153 "justification": "The reflection/self-correction scaffolding is described: the model generates code, receives Python interpreter error output, and regenerates (up to 3 reflection steps). The prompt for reflection is provided in Appendix F. Figure 1 shows the overall pipeline." 154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "The paper documents how the refactored corpus was created (Section 3.2), including the abstraction of base classes into GameBasic.py, the code modularity optimizations (action_map, string construction), and the resulting code statistics (Table 1)." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "A dedicated 'Limitations' section is present after the Conclusion, with four numbered limitations covering structural biases, metric construct validity, reflection vs. ability, and efficiency of the reflection process." 
166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": true, 170 "justification": "The limitations are specific to this study: (1) 'observed improvements stem from structural biases rather than genuine advancements in modeling ability'; (2) metrics 'have limited construct validity, being susceptible to framework biases and coverage gaps'; (3) 'reflection enhances measurability' but 'does not directly improve ability'." 171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": false, 175 "justification": "The paper does not explicitly state what the results do NOT show or which settings/models/domains are excluded from its claims. The limitations discuss weaknesses but do not draw explicit scope boundaries around the findings." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": false, 182 "justification": "The generated games, evaluation outputs, and raw experimental data are not made available. Only aggregated results in tables and figures are provided." 183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 4.1 describes the experiment setup: one-shot prompting with randomly selected examples, using task specifications from an unseen evaluation set, with up to 3 rounds of reflection. The evaluation metrics and their measurement procedures are described in Section 4.2." 188 }, 189 "recruitment_methods_described": { 190 "applies": false, 191 "answer": false, 192 "justification": "No human participants involved. The study evaluates LLM-generated text games using automated metrics." 
193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "The pipeline from game generation through evaluation is documented: Figure 1 shows the overview, Section 4.1 describes the generation process, Section 4.2 describes each evaluation metric's measurement procedure, and the reflection loop is documented." 198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": false, 204 "justification": "No funding source or acknowledgments section is present in the paper." 205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are clearly listed: Johns Hopkins University, Liaoning Technical University, Microsoft Research Montréal, and Central University of Finance and Economics. One author (Xingdi Yuan) is from Microsoft Research, and the paper uses OpenAI products, but the paper does not evaluate Microsoft products." 210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding information is disclosed, so independence cannot be assessed. One author is from Microsoft Research, which could have an interest in LLM evaluation frameworks, but no funding relationship is stated." 215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "No competing interests or financial interests statement is present in the paper." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": true, 225 "answer": false, 226 "justification": "The paper uses GPT-4o and GPT-5 to generate code based on the ByteSized32 corpus but does not state the training data cutoff for any model. The original ByteSized32 corpus (2023) could be in GPT-4o's training data." 
227 }, 228 "train_test_overlap_discussed": { 229 "applies": true, 230 "answer": false, 231 "justification": "No discussion of whether the original ByteSized32 code (publicly available on GitHub since 2023) appeared in GPT-4o's training data, despite the model being asked to generate similar code." 232 }, 233 "benchmark_contamination_addressed": { 234 "applies": true, 235 "answer": false, 236 "justification": "The ByteSized32 corpus was published in 2023 and is publicly available on GitHub. GPT-4o could have been trained on this code, which would affect the fairness of evaluating code generation based on one-shot examples from this corpus. This contamination risk is not addressed." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants in this study." 244 }, 245 "irb_or_ethics_approval": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants in this study." 249 }, 250 "demographics_reported": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "attrition_reported": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 
274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": true, 279 "answer": false, 280 "justification": "No API costs, token consumption, or wall-clock time is reported despite extensive use of GPT-4o, GPT-4, and GPT-5 APIs with multiple reflection rounds per generated game." 281 }, 282 "compute_budget_stated": { 283 "applies": true, 284 "answer": false, 285 "justification": "No total computational budget, API spend, or hardware requirements are stated despite the experiments involving generation and evaluation of 96+ games across multiple models with multiple reflection steps." 286 } 287 } 288 }, 289 "claims": [ 290 { 291 "claim": "ByteSized32Refactored reduces the corpus from 20k to 10k total lines of Python code compared to the original ByteSized32.", 292 "evidence": "Table 1 shows average lines per game dropping from 618.1 to 303.19 (-50.9%), and tokens per game from 6792 to 2896 (Section 3.2.2).", 293 "supported": "strong" 294 }, 295 { 296 "claim": "ByteSized32Refactored allows substantially more examples to fit in LLM context windows compared to the original.", 297 "evidence": "Table 2 shows GPT-4o can fit 44 refactored examples vs. 18 original examples in its 128k context window; Qwen3-32B can fit 9 vs. 4; Llama3.2 can fit 2 vs. 1.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "GPT-4o generates games with worse Technical Validity on ByteSized32Refactored compared to the original ByteSized32.", 302 "evidence": "Tables 3-4: After 3 reflections, Runnable Game is 61.46% on Refactored vs. 82.29% on Original; Game Initialization is 85.42% vs. 95.83%; Valid Actions is 70.83% vs. 90.62%.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "GPT-4o generates games with better Specification Compliance and Winnability on ByteSized32Refactored compared to the original.", 307 "evidence": "Tables 5-6: After reflection, Winnability is 54% on Refactored vs. 34% on Original; Task-critical actions 93.75% vs. 90.62%. 
However, Distractors compliance is lower (28.12% vs. 53.12%).", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "The hierarchical structure of the refactored code presents new challenges for LLMs in code generation.", 312 "evidence": "Section 5 Summary: Technical validity drops are attributed to 'abstraction increases reasoning demands and raises the generation barrier.' This is a causal interpretation without controlled experimentation isolating abstraction from other changes.", 313 "supported": "weak" 314 }, 315 { 316 "claim": "GPT-4o consistently outperforms GPT-4 on the original ByteSized32 corpus.", 317 "evidence": "Appendix E.1: Tables 4 and 7 show GPT-4o achieves 82.29% runnable games after reflection vs. GPT-4's 57.3%, with gaps across all technical validity metrics.", 318 "supported": "moderate" 319 } 320 ], 321 "methodology_tags": [ 322 "benchmark-eval" 323 ], 324 "key_findings": "ByteSized32Refactored reduces the original ByteSized32 text game corpus from 20k to 10k lines of Python by abstracting common logic into a GameBasic.py foundation library with 7 base classes. Evaluation with GPT-4o shows mixed results: the refactored corpus leads to worse technical validity (61.46% vs. 82.29% runnable games) but better specification compliance and winnability (54% vs. 34% winnable games after reflection). The paper also reports GPT-5 difficulties with evaluation formatting and API constraints, and finds GPT-4o outperforms GPT-4 across all metrics on the original corpus.", 325 "red_flags": [ 326 { 327 "flag": "No statistical testing on comparative claims", 328 "detail": "All comparisons between Refactored vs. Original and GPT-4o vs. GPT-4 are based on raw percentage comparisons with N=96 games and no significance tests. Differences could be due to random variation." 
329 }, 330 { 331 "flag": "No variance or uncertainty reporting", 332 "detail": "Results appear to be single-run numbers with no standard deviations, confidence intervals, or repeated trials reported. The stability of the results is unknown." 333 }, 334 { 335 "flag": "Benchmark contamination risk unaddressed", 336 "detail": "The original ByteSized32 corpus has been publicly available on GitHub since 2023. GPT-4o may have been trained on this code, which could affect the fairness of evaluating one-shot code generation from this corpus. This is not discussed." 337 }, 338 { 339 "flag": "Causal attribution without controlled experiments", 340 "detail": "The paper attributes performance differences to 'hierarchical structure' and 'abstraction increasing reasoning demands,' but the refactoring changed multiple factors simultaneously (code length, structure, abstraction, string operations) without ablation to isolate contributions." 341 }, 342 { 343 "flag": "GPT-based evaluation of physical reality alignment", 344 "detail": "Physical Reality Alignment is judged by GPT itself, creating a circular evaluation where the same family of models generates and evaluates the outputs. No human validation of these automated judgments is reported." 345 } 346 ], 347 "cited_papers": [ 348 { 349 "title": "Evaluating large language models trained on code", 350 "authors": ["Mark Chen", "Jerry Tworek"], 351 "year": 2021, 352 "arxiv_id": "2107.03374", 353 "relevance": "Introduces HumanEval, a foundational code generation benchmark relevant to evaluating LLM coding capabilities." 354 }, 355 { 356 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 357 "authors": ["Carlos E. Jimenez", "John Yang"], 358 "year": 2024, 359 "arxiv_id": "2310.06770", 360 "relevance": "Major benchmark for evaluating LLMs on realistic software engineering tasks in large codebases." 
361 }, 362 { 363 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 364 "authors": ["Naman Jain", "King Han"], 365 "year": 2024, 366 "arxiv_id": "2403.07974", 367 "relevance": "Addresses benchmark contamination in code evaluation, relevant to methodology quality assessment." 368 }, 369 { 370 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 371 "authors": ["Terry Yue Zhuo"], 372 "year": 2024, 373 "arxiv_id": "2406.15877", 374 "relevance": "Benchmark evaluating compositional reasoning in code generation across libraries." 375 }, 376 { 377 "title": "A survey on code generation with LLM-based agents", 378 "authors": ["Yihong Dong", "Xue Jiang"], 379 "year": 2025, 380 "arxiv_id": "2508.00083", 381 "relevance": "Survey of LLM-based agent approaches to code generation, directly relevant to the survey scope." 382 }, 383 { 384 "title": "ByteSized32: A corpus and challenge task for generating task-specific world models expressed as text games", 385 "authors": ["Ruoyao Wang", "Graham Todd"], 386 "year": 2023, 387 "arxiv_id": "2305.14879", 388 "relevance": "The original corpus this paper extends; foundational work on LLM world modeling via text game generation." 389 }, 390 { 391 "title": "MultiPL-E: A scalable and extensible approach to benchmarking neural code generation", 392 "authors": ["Federico Cassano"], 393 "year": 2022, 394 "arxiv_id": "2208.08227", 395 "relevance": "Multi-language code generation benchmark relevant to evaluating LLM programming capabilities." 396 }, 397 { 398 "title": "Large language models for code generation: A comprehensive survey of challenges, techniques, evaluation, and applications", 399 "authors": ["Nam Huynh", "Beiyu Lin"], 400 "year": 2025, 401 "arxiv_id": "2503.01245", 402 "relevance": "Comprehensive survey of LLM code generation covering evaluation methodology." 
403 }, 404 { 405 "title": "CoCo-Bench: A comprehensive code benchmark for multi-task large language model evaluation", 406 "authors": ["Wenjing Yin"], 407 "year": 2025, 408 "arxiv_id": "2504.20673", 409 "relevance": "Multi-task code benchmark including code understanding and review, relevant to LLM evaluation methodology." 410 }, 411 { 412 "title": "TextWorld: A learning environment for text-based games", 413 "authors": ["Marc-Alexandre Côté"], 414 "year": 2018, 415 "relevance": "Foundational text-based game framework relevant to LLM evaluation in interactive environments." 416 } 417 ] 418 }