scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25459B)
      1 {
      2   "paper": {
      3     "title": "A Comparative Study on Reasoning Patterns of OpenAI's o1 Model",
      4     "authors": [
      5       "Siwei Wu",
      6       "Zhongyuan Peng",
      7       "Xinrun Du",
      8       "Tuney Zheng",
      9       "Minghao Liu",
     10       "Jialong Wu",
     11       "Jiachen Ma",
     12       "Yizhi Li",
     13       "Jian Yang",
     14       "Wangchunshu Zhou",
     15       "Qunshu Lin",
     16       "Junbo Zhao",
     17       "Zhaoxiang Zhang",
     18       "Wenhao Huang",
     19       "Ge Zhang",
     20       "Chenghua Lin",
     21       "J.H. Liu"
     22     ],
     23     "year": 2024,
     24     "venue": "arXiv preprint",
     25     "arxiv_id": "2410.13639"
     26   },
     27   "scan_version": 2,
     28   "active_modules": ["experimental_rigor", "data_leakage"],
     29   "methodology_tags": ["benchmark-eval"],
     30   "key_findings": "OpenAI's o1 model outperforms GPT-4o with various test-time compute methods (BoN, Step-wise BoN, Self-Refine, Agent Workflow) on math, coding, and commonsense reasoning benchmarks. The study identifies six reasoning patterns in o1 (Systematic Analysis, Method Reuse, Divide and Conquer, Self-Refinement, Context Identification, Emphasizing Constraints), with Divide and Conquer and Self-Refinement being most frequent. Reward model quality and search space both limit the upper bound of search-based methods like BoN. Agent Workflow with domain-specific prompts comes closest to o1 performance among the baselines.",
     31   "checklist": {
     32     "artifacts": {
     33       "code_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "GitHub repository URL provided in abstract: https://github.com/Open-Source-O1/o1_Reasoning_Patterns_Study"
     37       },
     38       "data_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper states 'code and dataset are released' at the GitHub link. The benchmarks used (HotpotQA, Collie, USACO, AIME) are publicly available, and the filtered versions are indicated as released."
     42       },
     43       "environment_specified": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No environment specifications, requirements.txt, or dependency details are mentioned in the paper."
     47       },
     48       "reproduction_instructions": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no instructions for running experiments are described."
     52       }
     53     },
     54     "statistical_methodology": {
     55       "confidence_intervals_or_error_bars": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "All results in Table 1 and figures are point estimates with no confidence intervals or error bars."
     59       },
     60       "significance_tests": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper claims o1 'achieves the best results' and various methods 'outperform' others based solely on comparing raw numbers with no statistical tests."
     64       },
     65       "effect_sizes_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Raw percentage scores are reported but no effect sizes (Cohen's d, etc.) are provided. Some percentage differences are mentioned (e.g., '2.55%'), but they are not framed as effect sizes relative to a baseline."
     69       },
     70       "sample_size_justified": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The filtered benchmark sizes (274, 226, 139, 90 samples in Table 3) are stated but no justification for why these sizes are adequate is provided."
     74       },
     75       "variance_reported": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be single-run numbers."
     79       }
     80     },
     81     "evaluation_design": {
     82       "baselines_included": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Four test-time compute baselines (BoN, Step-wise BoN, Self-Refine, Agent Workflow) and direct GPT-4o are compared against o1 in Table 1."
     86       },
     87       "baselines_contemporary": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Baselines include GPT-4o and state-of-the-art agent frameworks (Zhou et al., 2024), which are contemporary to the study period."
     91       },
     92       "ablation_study": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The paper varies N for BoN (N=1,4,8,16 in Fig. 5), tests different reward models (Section 4.5, Fig. 4), and tests different backbone models (GPT-4o, Qwen2.5, Llama3), which serve as ablation-like analyses."
     96       },
     97       "multiple_metrics": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "Only accuracy is used as the evaluation metric across all benchmarks. No secondary metrics are reported."
    101       },
    102       "human_evaluation": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Human evaluation is used as a reward model baseline in Section 4.5 (Fig. 4), where humans judge the most suitable BoN response. Additionally, USACO code is manually run on test cases (Section 3.3)."
    106       },
    107       "held_out_test_set": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The benchmarks (HotpotQA, Collie, USACO, AIME) are used as test sets. The data filtering uses separate models (Llama3, Qwen, Yi, Claude) to select hard samples, and evaluation is done on these filtered sets."
    111       },
    112       "per_category_breakdown": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Results are broken down per benchmark (HotpotQA, Collie, USACO, AIME) across three domains in Table 1, and reasoning pattern analysis is broken down per benchmark in Fig. 1."
    116       },
    117       "failure_cases_discussed": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper discusses failures of Self-Refine (performance decline on Collie, Section 4.1), Step-wise BoN failures on complex tasks (Section 4.3), and BoN performance decline with increasing N."
    121       },
    122       "negative_results_reported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Self-Refine shows performance decline on Collie compared to GPT-4o. Step-wise BoN drops significantly on complex benchmarks. BoN performance stabilizes or declines when N>8 (Section 4.6)."
    126       }
    127     },
    128     "claims_and_evidence": {
    129       "abstract_claims_supported": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Abstract claims about o1 achieving best performance, reward model limitations, Agent Workflow superiority over Step-wise BoN, and six reasoning patterns are all supported by results in Table 1, Figs. 1-6."
    133       },
    134       "causal_claims_justified": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper makes causal claims like 'the domain-specific system prompt is crucial' and reward models 'limit the upper boundary' without controlled experiments isolating these factors. The Agent Workflow comparison conflates multiple differences (domain-specific prompts, tool use, workflow structure)."
    138       },
    139       "generalization_bounded": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The title claims to study 'Reasoning Patterns of OpenAI's o1 Model' generally, but the reasoning pattern analysis is based on manual inspection of only 20-30 samples per benchmark. The paper does not bound its claims to these specific benchmarks and sample sizes."
    143       },
    144       "alternative_explanations_discussed": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No alternative explanations are discussed for the observed results. For example, the paper does not consider whether o1's advantage comes from larger training compute, different training data, or RLHF rather than reasoning patterns."
    148       },
    149       "proxy_outcome_distinction": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper equates benchmark accuracy with 'reasoning capability' without discussing whether these benchmarks actually capture reasoning ability vs. memorization, pattern matching, or other factors."
    153       }
    154     },
    155     "setup_transparency": {
    156       "model_versions_specified": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The paper uses 'o1-preview', 'o1-mini', and 'GPT-4o' without specifying exact API versions or snapshot dates. Open-source models mention sizes (e.g., Llama3-72B, Qwen-72B) but no exact version strings."
    160       },
    161       "prompts_provided": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The paper mentions using 'domain-specific system prompts' for Agent Workflow and refers to external GPTs, but no actual prompt text is provided in the paper."
    165       },
    166       "hyperparameters_reported": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the LLM calls."
    170       },
    171       "scaffolding_described": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The Agent Workflow uses 'state-of-the-art agent framework (Zhou et al., 2024)' and GPTs but the actual scaffolding details (tool descriptions, workflow, retry logic) are not described. The paper defers to external references."
    175       },
    176       "data_preprocessing_documented": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 3.1 describes the data filtering process: using four LLMs to answer each sample, then filtering out samples that more than two models answer correctly (LIME method). Table 3 shows filtered benchmark sizes."
    180       }
    181     },
    182     "limitations_and_scope": {
    183       "limitations_section_present": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    187       },
    188       "threats_to_validity_specific": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No threats to validity are discussed anywhere in the paper."
    192       },
    193       "scope_boundaries_stated": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The paper does not explicitly state what the results do NOT show. No scope boundaries are defined beyond the implicit choice of benchmarks."
    197       }
    198     },
    199     "data_integrity": {
    200       "raw_data_available": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The paper states code and dataset are released at the GitHub repository, which would include the filtered benchmark data and model outputs."
    204       },
    205       "data_collection_described": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 3.1 describes benchmark selection and the LIME-based filtering procedure. Section 3.3 describes evaluation metrics and how correctness is determined."
    209       },
    210       "recruitment_methods_described": {
    211         "applies": false,
    212         "answer": false,
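    213         "justification": "No human participants are recruited as study subjects; humans appear only as judges in the BoN reward model comparison (Section 4.5). The study otherwise uses standard benchmarks and LLM APIs."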
    214       },
    215       "data_pipeline_documented": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The pipeline from raw benchmarks through LIME filtering to final evaluation is documented in Sections 3.1 and 3.3, with filtered sample counts in Table 3."
    219       }
    220     },
    221     "conflicts_of_interest": {
    222       "funding_disclosed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding information or acknowledgments section is present in the paper."
    226       },
    227       "affiliations_disclosed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Author affiliations are listed: M-A-P, University of Manchester, OpenO1 Team, 2077AI, Abaka AI, Zhejiang University, University of Chinese Academy of Sciences."
    231       },
    232       "funder_independent_of_outcome": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No funding is disclosed, so independence cannot be assessed. The OpenO1 Team affiliation suggests potential interest in o1-related research outcomes."
    236       },
    237       "financial_interests_declared": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No competing interests or financial interests statement is present in the paper."
    241       }
    242     },
    243     "contamination": {
    244       "training_cutoff_stated": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No training data cutoff dates are stated for any of the models used (o1, GPT-4o, Llama3, Qwen)."
    248       },
    249       "train_test_overlap_discussed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No discussion of whether benchmark problems appeared in model training data. HotpotQA (2018), AIME problems, and USACO are all publicly available and could be in training data."
    253       },
    254       "benchmark_contamination_addressed": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "The benchmarks (HotpotQA 2018, AIME 2022-2024) were available online before model training. No contamination analysis or discussion is provided."
    258       }
    259     },
    260     "human_studies": {
    261       "pre_registered": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this benchmark evaluation study."
    265       },
    266       "irb_or_ethics_approval": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this benchmark evaluation study."
    270       },
    271       "demographics_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this benchmark evaluation study."
    275       },
    276       "inclusion_exclusion_criteria": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this benchmark evaluation study."
    280       },
    281       "randomization_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this benchmark evaluation study."
    285       },
    286       "blinding_described": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this benchmark evaluation study."
    290       },
    291       "attrition_reported": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants in this benchmark evaluation study."
    295       }
    296     },
    297     "cost_and_practicality": {
    298       "inference_cost_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No API costs, tokens consumed, or wall-clock time reported despite using paid APIs (GPT-4o, o1) with multiple calls per sample (BoN with N up to 16)."
    302       },
    303       "compute_budget_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No total computational budget or API spend is stated. The reasoning token counts for o1 are reported (Fig. 3) but not the total cost."
    307       }
    308     },
    309     "experimental_rigor": {
    310       "seed_sensitivity_reported": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No mention of multiple random seeds or seed sensitivity. Results appear to be from single runs."
    314       },
    315       "number_of_runs_stated": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The number of experimental runs is not stated. It is unclear whether results are from single runs or averaged."
    319       },
    320       "hyperparameter_search_budget": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "No hyperparameter search is described. The choice of N values (1,4,8,16) for BoN is not justified."
    324       },
    325       "best_config_selection_justified": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No justification for why specific N values, reward models, or agent configurations were selected as the reported results."
    329       },
    330       "multiple_comparison_correction": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite comparing many methods across multiple benchmarks."
    334       },
    335       "self_comparison_bias_addressed": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The authors do not acknowledge any bias from evaluating their own framework choices. They use their own implementations of baselines without discussing this limitation."
    339       },
    340       "compute_budget_vs_performance": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "o1 likely uses far more compute than GPT-4o direct inference, and BoN with N=16 uses 16x the compute, but compute-normalized comparisons are not provided."
    344       },
    345       "benchmark_construct_validity": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the benchmarks actually measure 'reasoning capability' as claimed. The paper assumes benchmark accuracy equals reasoning ability without questioning construct validity."
    349       },
    350       "scaffold_confound_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Agent Workflow uses different scaffolding (domain-specific prompts, GPTs, agent framework) compared to other methods, but the paper attributes performance differences to the method rather than controlling for scaffold differences."
    354       }
    355     },
    356     "data_leakage": {
    357       "temporal_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "HotpotQA (2018), AIME problems (2022-2024), and USACO problems predate model training. No discussion of whether models saw these problems during training."
    361       },
    362       "feature_leakage_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether the evaluation setup leaks information. The LIME-based filtering uses model responses which could introduce selection bias favoring certain model behaviors."
    366       },
    367       "non_independence_addressed": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No discussion of whether filtered benchmark samples are independent or share structural similarities."
    371       },
    372       "leakage_detection_method": {
    373         "applies": true,
    374         "answer": false,
    375         "justification": "No concrete leakage detection or prevention method is used despite using well-known public benchmarks with closed-source models."
    376       }
    377     }
    378   },
    379   "claims": [
    380     {
    381       "claim": "o1 achieves the best performance across almost all benchmarks compared to previous test-time compute methods and GPT-4o",
    382       "evidence": "Table 1 shows o1-preview or o1-mini outperform all baselines on most benchmarks. o1-mini achieves 62% on AIME vs 12.22% for GPT-4o; o1-preview achieves 44.6% on USACO vs 5.04% for GPT-4o.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Agent Workflow achieves significant improvements across all benchmarks and is closest to o1's performance",
    387       "evidence": "Table 1 shows Agent Workflow at 24.70% overall vs o1-preview's 34.32%. Agent Workflow reaches 22.22% on USACO (vs 5.04% GPT-4o) and 46.07% on Collie.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Reward model quality limits the upper boundary of search-based methods (BoN)",
    392       "evidence": "Fig. 4 shows human reward model achieving ~33% on HotpotQA vs ~13-15% for automated reward models. Gap between human and automated reward models demonstrates the limitation.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Six reasoning patterns are identified in o1, with Divide and Conquer and Self-Refinement being most frequent",
    397       "evidence": "Fig. 2 shows frequency distribution of six patterns based on manual analysis of 20-30 samples per benchmark. DC and SR appear most frequently.",
    398       "supported": "weak"
    399     },
    400     {
    401       "claim": "Step-wise BoN is limited by long-context inference problems on complex tasks",
    402       "evidence": "Table 2 shows average reasoning tokens >200 for all tasks. Step-wise BoN achieves <12% on Collie and only half of other methods' AIME performance (Section 4.3).",
    403       "supported": "weak"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "No statistical rigor",
    409       "detail": "All comparisons are based on single point estimates with no error bars, confidence intervals, significance tests, or multiple runs. Claims of superiority are made by comparing raw numbers."
    410     },
    411     {
    412       "flag": "No contamination analysis",
    413       "detail": "Closed-source models (o1, GPT-4o) are tested on public benchmarks (HotpotQA from 2018, AIME problems) without any discussion of whether these appeared in training data. o1 may have been specifically trained on math competition problems."
    414     },
    415     {
    416       "flag": "Tiny sample for reasoning pattern analysis",
    417       "detail": "The six reasoning patterns are identified from manual inspection of only 20-30 samples per benchmark. This is too small for reliable pattern categorization and the categorization methodology is not described."
    418     },
    419     {
    420       "flag": "Uncontrolled comparisons",
    421       "detail": "Agent Workflow uses fundamentally different scaffolding (domain-specific prompts, GPTs, specialized agent frameworks) than other baselines, making it impossible to attribute performance differences to any single factor."
    422     },
    423     {
    424       "flag": "No compute-normalized comparison",
    425       "detail": "o1 uses significantly more inference compute than GPT-4o, and BoN with N=16 uses 16x the compute of direct inference. No cost or compute normalization is provided."
    426     },
    427     {
    428       "flag": "No limitations section",
    429       "detail": "The paper has no limitations, threats to validity, or scope boundaries discussion despite significant methodological weaknesses."
    430     },
    431     {
    432       "flag": "Potential selection bias in data filtering",
    433       "detail": "The LIME-based filtering selects problems that multiple models get wrong, which may bias the benchmark toward specific types of difficult problems rather than being representative of general difficulty."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters",
    439       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    440       "year": 2024,
    441       "arxiv_id": "2408.03314",
    442       "relevance": "Core reference on test-time compute scaling, directly relevant to understanding inference-time reasoning strategies."
    443     },
    444     {
    445       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    446       "authors": ["Aman Madaan"],
    447       "year": 2024,
    448       "relevance": "Key baseline method for LLM self-improvement through iterative feedback, relevant to agentic coding workflows."
    449     },
    450     {
    451       "title": "Can Language Models Solve Olympiad Programming?",
    452       "authors": ["Quan Shi", "Michael Tang", "Karthik Narasimhan", "Shunyu Yao"],
    453       "year": 2024,
    454       "arxiv_id": "2404.10952",
    455       "relevance": "USACO benchmark used in the study; evaluates LLM coding capabilities on competitive programming."
    456     },
    457     {
    458       "title": "Agents: An Open-Source Framework for Autonomous Language Agents",
    459       "authors": ["Wangchunshu Zhou"],
    460       "year": 2023,
    461       "arxiv_id": "2309.07870",
    462       "relevance": "Agent framework used as baseline; relevant to agentic AI architecture design."
    463     },
    464     {
    465       "title": "Symbolic Learning Enables Self-Evolving Agents",
    466       "authors": ["Wangchunshu Zhou"],
    467       "year": 2024,
    468       "arxiv_id": "2406.18532",
    469       "relevance": "State-of-the-art agent workflow framework used in evaluations; relevant to agentic AI capabilities."
    470     },
    471     {
    472       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    473       "authors": ["Naman Jain"],
    474       "year": 2024,
    475       "arxiv_id": "2403.07974",
    476       "relevance": "Contamination-aware code evaluation benchmark, relevant to LLM code generation methodology."
    477     },
    478     {
    479       "title": "RewardBench: Evaluating Reward Models for Language Modeling",
    480       "authors": ["Nathan Lambert"],
    481       "year": 2024,
    482       "arxiv_id": "2403.13787",
    483       "relevance": "Benchmark for evaluating reward models, relevant to understanding limitations of search-based inference methods."
    484     },
    485     {
    486       "title": "Think Before You Speak: Training Language Models with Pause Tokens",
    487       "authors": ["Sachin Goyal"],
    488       "year": 2023,
    489       "arxiv_id": "2310.02226",
    490       "relevance": "Early work on enabling models to use additional compute before responding, precursor to test-time compute scaling."
    491     },
    492     {
    493       "title": "Quiet-STaR: Language Models Can Teach Themselves to Think Before Speaking",
    494       "authors": ["Eric Zelikman"],
    495       "year": 2024,
    496       "arxiv_id": "2403.09629",
    497       "relevance": "Unsupervised thought token learning for test-time compute, relevant to understanding reasoning capabilities."
    498     },
    499     {
    500       "title": "PAL: Program-Aided Language Models",
    501       "authors": ["Luyu Gao"],
    502       "year": 2023,
    503       "relevance": "Tool-augmented LLM reasoning approach, relevant to test-time compute and agentic methods."
    504     }
    505   ]
    506 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs