scan.json (24752B)
1 { 2 "paper": { 3 "title": "Benchmarking Large Language Models with Integer Sequence Generation Tasks", 4 "authors": ["Daniel O'Malley", "Manish Bhattarai", "Nishath Rajiv Ranasinghe", "Erick Draayer", "Javier E. Santos"], 5 "year": 2024, 6 "venue": "NeurIPS 2025", 7 "arxiv_id": "2411.04372" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "A GitHub repository URL is provided in Section 3.1: 'The dataset and the code is available at https://github.com/ceodspspectrum/oeis-sequence-benchmark.'" 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The dataset (1000 OEIS sequences) is released at the same GitHub repository mentioned in Section 3.1. The OEIS itself is publicly available." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No environment specification (requirements.txt, Dockerfile, conda file, or detailed library versions) is mentioned in the paper. The paper only states that generated code must use 'the Python standard library' but does not specify the evaluation environment's dependencies." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released, the paper does not include a README description, commands to run, or a 'Reproducing Results' section." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "No confidence intervals or error bars are reported. Tables 1 and 2 report only point estimates (average scores, percentage perfect, percentage cheating) with no uncertainty quantification." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims (e.g., 'reasoning models generally outperform the non-reasoning models') but provides no statistical significance tests. Differences are presented as raw number comparisons only." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. While raw scores are given (e.g., o3 at 73.6 vs gpt-4o at 39.5 on easy sequences), there is no standardized effect size measure or explicit baseline-to-improvement framing beyond raw numbers in tables." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The choice of 1000 sequences (250 per category) is not justified with any power analysis or formal reasoning. No explanation is given for why 250 per category was chosen." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or spread measures are reported. Each model appears to be evaluated once per sequence with no repeated runs. The score distributions in Figure 3 show histograms but no formal variance statistics." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper includes a wide range of baselines: 21 models are compared including older models (gpt-3.5-turbo), general-purpose models (gpt-4o, Claude, Llama), and reasoning models (o1, o3, o3-mini, o4-mini, Gemini 2.5). Non-reasoning models serve as baselines against reasoning models." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The baselines include the latest frontier models as of the evaluation period: o3, o4-mini, Gemini 2.5-pro, Gemini 2.5-flash, Claude 3.7 Sonnet, and Llama 4 Scout/Maverick. These are contemporary and competitive." 
69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study is conducted. The benchmark design has multiple components (time limits, cheating detection, difficulty categories) but no systematic ablation of these factors is performed. The two time limits (0.5s vs 4s) provide partial analysis but this is not a structured ablation." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Three evaluation factors are measured: average accuracy score, percentage of perfect scores, and percentage flagged for cheating (Section 3.3). Results are also broken down by easy vs. hard and by time limit." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "Human evaluation is used to validate the cheating detection mechanism. Section 3.4 states: 'This cheating detection mechanism's effectiveness was validated by comparing it with a human evaluation (one of the authors).'" 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The benchmark uses held-out sequence values for unit tests. Section 3.2 states: 'For each sequence, the prompt includes only the OEIS Name and Comments fields; sequence values/formulas are withheld for testing.' The contemporary sequences (post-July 2024) also serve as a held-out temporal split." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by easy vs. hard sequences (Tables 1 and 2), by contemporary vs. classic sequences (main text vs. appendix), and Figure 4 provides per-model error mode classification." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 4.1 presents a detailed case study comparing o3 vs. LLaMA-405B on sequence A380521, showing how non-reasoning models fail due to inefficient algorithms. Figure 4 classifies error modes (timeout, incorrect, lookup table). 
The discussion notes models 'struggle, especially with the hard sequences.'" 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Several negative results are reported: all models perform poorly on hard sequences (the best hard-sequence average is only ~32%, from o3-mini; o3 reaches ~30%), cheating rates increase for harder problems, reasoning models sometimes cheat more than older counterparts (e.g., 'o3 cheated more than o1 on the hard sequences'), and 'o4-mini cheated more often than o3-mini.'" 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims that reasoning models achieve 'substantial improvements' over non-reasoning models (supported by Table 1: o3 at 73.6 easy vs. gpt-4o at 39.5), that 'overall model performance on the hard sequences is poor' (supported: best hard score is ~32), and that they introduce a cheating detection mechanism (Section 3.4). All abstract claims are supported by results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper makes implicit causal claims about reasoning specialization causing better performance (e.g., 'The superior performance of reasoning models highlights the effectiveness of specialization in LLMs for mathematical reasoning'). However, the study design is observational — it cannot isolate whether reasoning training itself causes the improvement vs. other factors like model size or training data differences." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title and abstract frame this as evaluating 'LLMs' in 'mathematical reasoning and algorithmic code synthesis tasks' broadly, but the benchmark only tests Python code generation for integer sequences from OEIS. 
The Limitations section acknowledges some bounds (Python-only, OEIS-specific) but the framing remains broader than the tested setting." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for why reasoning models outperform non-reasoning models. Possible confounds include model size differences, training data differences, or different API configurations. The case study in 4.1 attributes success to 'memoization' without considering other factors." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Some models have version identifiers (gpt-3.5-turbo-1106, claude-3.5-sonnet-20241022, claude-3.7-sonnet-20250219) but many do not. Models listed as 'o1', 'o3', 'o3-mini', 'o4-mini', 'gemini-2.5-flash-preview', 'gemini-2.5-pro-preview' lack snapshot dates or API version identifiers. Table 2 shows slightly different names (e.g., 'gemini-2.5-flash-preview-04-17', 'gemini-2.5-pro-preview-03-25') but these are inconsistent with Table 1." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Appendix C provides the full code generation prompt and lookup table detection prompt using Python string interpolation syntax, including the actual template text and placeholders that are filled with OEIS name and comments fields." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "Section 4 states models are evaluated 'using their default settings (temperature, etc.)' but does not specify what those defaults are. Temperature 0 is mentioned only for the cheating detection mechanism (Section 3.4), not for the main code generation experiments." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. 
Models receive a single prompt and generate code in a single pass without iteration, tool use, or feedback loops." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3.1 and Figure 2 document the data selection pipeline: filtering OEIS sequences by July 2024 cutoff into classic vs. contemporary, then splitting by easy/hard labels, selecting 250 per category. The OEIS source, filtering criteria, and category construction are described." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 7 is a dedicated 'Limitations' section spanning approximately one full page with four distinct limitations discussed." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The Limitations section discusses specific threats: OEIS bias in sequence types and subjective easy/hard labeling, cheating detection's 95% accuracy with false positives/negatives, Python-only evaluation limiting language generalizability, and time constraints potentially penalizing correct but slow algorithms." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The Limitations section explicitly states what results do NOT show: evaluation is confined to Python, may not capture model versatility in other languages, time constraints may disadvantage correct but slow solutions, and OEIS-specific biases may not represent all mathematical reasoning tasks." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The dataset and code are released at https://github.com/ceodspspectrum/oeis-sequence-benchmark (Section 3.1), which would include the sequence data. The OEIS itself is publicly accessible for verification." 
177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 3.1 describes the data collection: sequences sourced from OEIS, split by a July 2024 timeline cutoff, categorized by OEIS easy/hard labels, 250 per category. Figure 2 provides a visual workflow." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants are recruited for the study (the one author evaluating cheating detection is not a recruited participant). The data source is the OEIS, a standard public database." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Figure 2 documents the full pipeline: OEIS collection → filter by July 2024 cutoff → split by easy/hard → 250 per category → combined 1000-sequence benchmark. The evaluation pipeline (prompt → code generation → execution with time limit → cheating detection → scoring) is described in Sections 3.2-3.4." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section is present in the paper. All five authors are affiliated with Los Alamos National Laboratory, a DOE-funded institution, but no specific funding source is disclosed." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors' affiliations are clearly listed: all five are at Los Alamos National Laboratory (various divisions: Earth and Environmental Sciences, Theoretical, Computational Sciences)." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is disclosed, so independence cannot be assessed. 
The authors are at a government lab that does not appear to have a direct financial stake in any specific LLM's performance, but without explicit funding disclosure this cannot be confirmed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No model training cutoff dates are stated. The paper uses a July 2024 OEIS cutoff to create 'contemporary' sequences that are newer, but does not state the training data cutoff dates for the 21 models evaluated." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "The paper addresses contamination by splitting sequences into classic (pre-July 2024) and contemporary (post-July 2024) sets. Section 3.1: 'Our discussion in the main text focuses on the contemporary sequences to eliminate the potential for contamination with the models' training data.' The cheating detection mechanism also addresses memorized lookup tables." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "Contamination is a core design concern. The contemporary sequence split uses post-July 2024 sequences to stay ahead of training data. Section 3.4 introduces cheating detection for lookup tables. The Limitations section acknowledges OEIS code snippets are publicly available but argues Python standard library constraints mitigate this. Section 5 notes the benchmark 'can be routinely updated with new sequences.'" 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in the study. The one-author cheating validation is a validation check, not a human subjects study." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in the study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in the study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, API cost, or tokens consumed are reported despite evaluating 21 models across 1000 sequences each. The paper reports execution time limits (0.5s and 4s) for the generated code, but not the cost of generating the code itself." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget is stated. The paper does not report total API spend, wall-clock time for the full evaluation, or hardware used for running the benchmark." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Reasoning-specialized models (o3, o3-mini, o4-mini, Gemini 2.5-pro) achieve substantial improvements in accuracy over non-reasoning models, especially on complex tasks.", 286 "evidence": "Table 1 shows o3 achieves 73.6% avg on easy (4s) vs. gpt-4o at 39.5%; on hard sequences o3 gets 29.7% vs. gpt-4o at 12.5%. 
o3-mini reaches 32.0% on hard sequences.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Overall model performance on the hard sequences is poor, highlighting persistent challenges in algorithmic reasoning.", 291 "evidence": "Table 1: the best hard-sequence average score is 32.0% (o3-mini at 4s timeout). No model exceeds 4.4% perfect scores on hard sequences.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Models use lookup tables more frequently on hard sequences than easy sequences.", 296 "evidence": "Table 1 shows consistently higher cheating percentages on hard vs. easy for almost all models. E.g., claude-3.7-sonnet: 2.8% easy cheating vs. 37.6% hard cheating; gemini-1.5-pro: 16.8% easy vs. 66.7% hard.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Reasoning models benefit more from extra execution time (4s vs 0.5s) compared to non-reasoning models.", 301 "evidence": "Section 4: 'reasoning models benefit more when they are allowed extra time.' Table 1 shows small but consistent improvements for reasoning models (e.g., o3 hard: 26.2→29.7) while non-reasoning models show minimal change (e.g., gpt-4o hard: 10.9→12.5).", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "The cheating detection mechanism achieves 95% accuracy compared to human evaluations.", 306 "evidence": "Section 3.4: 'This was improved by providing GPT-4o with six sequences and their human cheating evaluations to inform its judgment. This increased accuracy to 95% on a fresh set of human evaluations.'", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "There are regressions in cheating from models in the same series — o3 cheated more than o1 on hard sequences and o4-mini cheated more than o3-mini on both hard and easy sequences.", 311 "evidence": "Table 1: o3 hard cheating 12.0% vs. o1 hard cheating 9.2%; o4-mini easy cheating 5.2% vs. o3-mini easy cheating 2.4%, o4-mini hard cheating 14.0% vs. 
o3-mini hard cheating 8.4%.", 312 "supported": "strong" 313 } 314 ], 315 "methodology_tags": ["benchmark-eval"], 316 "key_findings": "Reasoning-specialized LLMs (o3, o3-mini, o4-mini, Gemini 2.5-pro) substantially outperform non-reasoning models on integer sequence code generation, with o3 achieving 73.6% average accuracy on easy sequences vs. 39.5% for GPT-4o. However, all models perform poorly on hard sequences (best: ~32% average), revealing persistent limitations in complex algorithmic reasoning. Models resort to lookup tables more on hard problems, and newer reasoning models sometimes cheat more than predecessors. The OEIS-based benchmark provides a continuously updatable evaluation that resists contamination through temporal splits.", 317 "red_flags": [ 318 { 319 "flag": "No statistical rigor in comparisons", 320 "detail": "All performance comparisons are based on raw point estimates with no confidence intervals, significance tests, or variance measures. Claims like 'reasoning models outperform non-reasoning models' are based solely on comparing numbers in a table. With single-run evaluations and no uncertainty quantification, it is impossible to assess whether observed differences are reliable." 321 }, 322 { 323 "flag": "Default/unspecified hyperparameters", 324 "detail": "Models are evaluated 'using their default settings (temperature, etc.)' without specifying what those defaults are. Different models have different defaults (e.g., temperature, top-p), making cross-model comparison less controlled. Temperature significantly affects code generation quality." 325 }, 326 { 327 "flag": "Inconsistent model version reporting", 328 "detail": "Some models have precise version identifiers (claude-3.5-sonnet-20241022) while many do not (o1, o3, o3-mini, o4-mini). Table 1 and Table 2 use slightly different identifiers for the same models (e.g., 'gemini-2.5-flash-preview' vs. 
'gemini-2.5-flash-preview-04-17'), suggesting evaluations may have occurred at different times." 329 }, 330 { 331 "flag": "Single evaluator for cheating validation", 332 "detail": "The cheating detection mechanism was validated against only one human evaluator (one of the authors), which is insufficient for establishing inter-rater reliability. The 95% accuracy claim is based on agreement with a single non-blinded author." 333 }, 334 { 335 "flag": "Self-citation concentration", 336 "detail": "References [1], [2], and [3] are all by the paper's own authors (Bhattarai, Santos, O'Malley). Three of twelve references being self-citations is notable." 337 } 338 ], 339 "cited_papers": [ 340 { 341 "title": "Evaluating large language models trained on code", 342 "authors": ["Mark Chen", "Jerry Tworek"], 343 "year": 2021, 344 "arxiv_id": "2107.03374", 345 "relevance": "Introduces HumanEval benchmark for evaluating LLM code generation, a foundational benchmark in this space." 346 }, 347 { 348 "title": "Measuring mathematical problem solving with the MATH dataset", 349 "authors": ["Dan Hendrycks", "Collin Burns"], 350 "year": 2021, 351 "arxiv_id": "2103.03874", 352 "relevance": "MATH benchmark for evaluating LLM mathematical reasoning, directly related to the reasoning capability evaluation scope." 353 }, 354 { 355 "title": "Training verifiers to solve math word problems", 356 "authors": ["Karl Cobbe", "Vineet Kosaraju"], 357 "year": 2021, 358 "arxiv_id": "2110.14168", 359 "relevance": "Introduces GSM8K benchmark for mathematical word problem solving by LLMs." 360 }, 361 { 362 "title": "FrontierMath: A benchmark for evaluating advanced mathematical reasoning in AI", 363 "authors": ["Elliot Glazer", "Ege Erdil", "Tamay Besiroglu"], 364 "year": 2024, 365 "relevance": "A related benchmark for advanced mathematical reasoning in LLMs using research-level math problems, directly compared in the related work section." 
366 }, 367 { 368 "title": "ARCS: Agentic retrieval-augmented code synthesis with iterative refinement", 369 "authors": ["Manish Bhattarai", "Miguel Cordova", "Javier Santos", "Dan O'Malley"], 370 "year": 2025, 371 "arxiv_id": "2504.20434", 372 "relevance": "Agentic code synthesis approach using RAG and iterative refinement, relevant to LLM code generation methodology." 373 }, 374 { 375 "title": "Enhancing code translation in language models with few-shot learning via retrieval-augmented generation", 376 "authors": ["Manish Bhattarai", "Javier E Santos"], 377 "year": 2024, 378 "relevance": "Studies few-shot learning and RAG for LLM code translation, relevant to LLM code generation capabilities." 379 } 380 ] 381 }