scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27676B)
      1 {
      2   "paper": {
      3     "title": "Atom of Thoughts for Markov LLM Test-Time Scaling",
      4     "authors": ["Fengwei Teng", "Quan Shi", "Zhaoyang Yu", "Jiayi Zhang", "Yuyu Luo", "Chenglin Wu", "Zhijiang Guo"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025",
      7     "arxiv_id": "2502.12018"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "The abstract states 'We submit our code alongside this paper and will make it publicly available to facilitate reproducibility and future research.' This is a promise of future release, not an actual working URL or archive provided in the paper. No repository link is given."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available benchmarks: MATH, GSM8K, MBPP, LiveCodeBench, AIME (with HuggingFace link provided in footnote 1: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024), and LongBench (HotpotQA, MuSiQue, 2WikiMultiHopQA). All datasets are standard public benchmarks."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section listing library versions is provided in the paper or appendices."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While prompt templates and hyperparameters are described in the appendix, there are no step-by-step reproduction instructions, README with commands, or scripts to replicate experiments. The code is promised but not yet released."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Table 1 reports only point estimates (e.g., 83.6 on MATH, 95.0 on GSM8K) with no confidence intervals, error bars, or ± notation. Figure 3 shows cost-performance curves with no uncertainty bands."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims AOT outperforms baselines across multiple benchmarks but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests). Comparisons are based solely on comparing point estimates."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports absolute performance numbers with baseline context, allowing readers to compute effect sizes. For example, Table 1 shows AOT at 83.6 vs CoT at 78.3 on MATH (GPT-4o-mini), and Section 4.4 reports 'on MATH, this full integration achieves 84.9% accuracy compared to ToT's 82.0%.' Baseline and AOT numbers are consistently provided together."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper evaluates on 'the first 1,000 cases from MATH for efficiency' (Appendix A.3.3) but does not justify why 1,000 is sufficient or discuss statistical power. Other benchmarks are evaluated in their entirety but without justification for adequacy."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviation, variance, or spread measures are reported across experimental runs. All results in Table 1 appear to be single-run numbers. There is no mention of averaging over multiple runs or seeds."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 1 includes extensive baselines: CoT, CoT-SC, Self-Refine (SR), Analogical Reasoning (AR), AFlow, ToT, GoT, and FoT. Section 4.1 describes the baseline selection rationale."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include contemporary methods: AFlow (2024), FoT (2024), and the paper tests on recent models including O3-mini (2025) and DeepSeek-R1 (2025). CoT and ToT are foundational methods that remain standard baselines."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 4.3 presents ablation studies examining two variants: (1) without decomposition (directly contracts without DAG), and (2) without DAG-guided contraction (decomposition without structural guidance). Figure 3 shows both ablations degrade performance."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses pass rates for mathematical and coding benchmarks, and F1 scores for multi-hop QA tasks (Section 4.1). Additionally, Table 2 in Appendix B.1 reports Answer Equivalence Maintenance, Test-time Complexity Reduction, and LLM-as-a-Judge Selection Rate."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of AOT's outputs is included. All evaluation is automated through benchmark pass rates, F1 scores, and LLM-as-a-judge assessments. Given claims about interpretability and reasoning quality, human evaluation of output quality would be relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses established benchmark test sets (MATH, GSM8K, MBPP, LiveCodeBench, AIME, LongBench). These are standard held-out test sets from their respective benchmark suites, not used for tuning the method."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by benchmark (MATH, GSM8K, MBPP, LongBench, AIME, LiveCodeBench) and by model category (non-reasoning LLMs vs reasoning LLMs). Table 2 in Appendix B.1 provides per-dataset quality metrics. Figures 5-7 analyze structural properties per problem."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 3.2 discusses failure modes including error propagation from memoryless transitions and semantic divergence. The Limitations section (Appendix C) discusses where the approach may struggle: fixed transition counts, computational overhead, and dependency on LLM capability for DAG generation."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The ablation study (Section 4.3) reports that removing DAG-guided contraction causes 'a more severe drop' than removing decomposition entirely, showing that partial structural cues 'can be more harmful than providing none at all.' The AR baseline performs notably poorly and is excluded from some figures. Appendix B.3 reports that accuracy decreases with increased structural complexity."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims AOT 'consistently outperforms existing baselines as computational budgets increase' — supported by Table 1 and Figure 3. It claims 'seamless integration with existing reasoning frameworks and different LLMs' — supported by Section 4.4 integration experiments with ToT, FoT, and both reasoning and non-reasoning LLMs."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about component contributions (e.g., 'both ablations significantly degrade performance'). These are supported by controlled ablation experiments (Section 4.3) that systematically remove individual components while keeping other variables fixed, which is adequate for such claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract claims a 'general-purpose reasoning framework' but the paper tests only on math, code generation, and multi-hop QA tasks. Section 5 claims AOT enables 'scalable, high-performance inference' without bounding the claim to the tested domains. The Limitations section (Appendix C) does not explicitly state what domains or task types were NOT tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for its results. For example, performance improvements could be partly due to ensembling effects (the judge selects from three candidate solutions), additional LLM calls, or prompt engineering rather than the Markovian structure specifically. No threats-to-validity section or alternative explanations section exists."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'GPT-4o-mini', 'DeepSeek-V3', 'O3-mini', and 'DeepSeek-R1' without specifying exact API versions or snapshot dates. For example, no version like 'gpt-4o-mini-2024-07-18' is provided. Marketing names alone do not constitute specified versions."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Appendix A.1 provides complete prompt templates as Python code for all four prompt types (direct, decompose, contract, judge) across all three task domains (math, code, multi-hop QA). The templates include placeholders with clear variable names that receive the actual question/context values."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix A.2 reports key hyperparameters: temperature T=1.0, maximum Markov chain length of 3, ToT branches of 3, FoT trees of {2, 4, 8}. Section 4.1 also references these settings."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper provides detailed descriptions of the agentic scaffolding: the two-phase transition mechanism (decomposition and contraction), the DAG construction process, the LLM-as-a-judge termination strategy, and integration with tree search and reflective refinement (Sections 3.1-3.2). Figure 2 provides an overview diagram."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix A.3.3 documents dataset-specific processing: 'For the MATH dataset, we filter out questions with non-integer or non-decimal answers to ensure consistent evaluation. We evaluate the first 1,000 cases from MATH for efficiency, while assessing the remaining benchmarks in their entirety.' LongBench preprocessing is referenced via [2]."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix C provides a dedicated Limitations section discussing three specific limitations: fixed maximum transition count, computational overhead of decomposition, and dependency on LLM capability for DAG generation."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The Limitations section (Appendix C) identifies specific threats: 'the current implementation relies on a fixed maximum transition count (set to 3), which may not be optimal for all problem types,' 'the decomposition process adds computational overhead compared to direct inference,' and 'weaker models may struggle to generate valid dependency graphs, potentially degrading performance.' These are specific to this study."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what it does NOT show. The Limitations section discusses weaknesses of the approach but does not bound the scope of claims (e.g., it does not state which task types, languages, or domains are excluded from its claims). The abstract presents it as a general-purpose framework without explicit scope boundaries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data (individual problem-level results, model outputs, DAG structures generated) is made available. Only aggregated results in tables and figures are provided."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The paper clearly describes data sources: standard benchmarks (MATH, GSM8K, MBPP, LiveCodeBench, AIME from HuggingFace, LongBench) with citations and links. Appendix A.3.3 describes dataset-specific filtering and selection."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data sources are standard public benchmarks, making recruitment methods not applicable."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data pipeline is documented: benchmarks are selected (Section 4.1), MATH is filtered to integer/decimal answers and sampled to 1,000 cases (Appendix A.3.3), and evaluation metrics are specified (pass rates for math/code, F1 for QA). The reasoning pipeline (decomposition, contraction, judging) is described with examples in Appendix B.2."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding sources, grants, or acknowledgments section is present in the paper. Authors are affiliated with HKUST(GZ), DeepWisdom, and Renmin University of China, but no funding disclosure is provided."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: HKUST(GZ), DeepWisdom, and Renmin University of China. These are provided on the first page of the paper."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. Authors from DeepWisdom (a company) may have commercial interest in the results, but this is not addressed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper. DeepWisdom is a commercial entity, and potential financial interests related to the work are not disclosed."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates GPT-4o-mini, DeepSeek-V3, O3-mini, and DeepSeek-R1 on benchmarks including MATH and GSM8K. No training data cutoff dates are stated for any of these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of potential train/test overlap. MATH (2021), GSM8K (2021), and MBPP (2021) are all older benchmarks that could plausibly be in the training data of 2024-2025 models. This is not addressed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "MATH, GSM8K, and MBPP were all published before 2022 and are widely used; any model trained after 2022 may have seen them. The paper uses LiveCodeBench, which is designed for contamination-free evaluation, but does not discuss contamination risks for the other benchmarks."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study. It is a benchmark evaluation of a reasoning framework."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Figure 3 plots performance vs cost in USD (log2 scale) for all methods. Figure 4 shows cost in absolute terms for integration experiments. The paper explicitly analyzes cost-performance tradeoffs as a core contribution."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "While per-method costs in USD are plotted in Figures 3 and 4, the total computational budget (total API spend, total number of API calls, total tokens consumed) is not stated. The paper shows cost curves but does not quantify the total resources spent on all experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "AOT consistently outperforms existing baselines across both reasoning and non-reasoning LLMs on multiple benchmarks.",
    286       "evidence": "Table 1 shows AOT achieves highest scores across MATH (83.6 with GPT-4o-mini, 96.5 with DeepSeek-V3), GSM8K (95.0/98.2), MBPP (75.2/79.6), LongBench (68.5/71.0), AIME (83.0/81.7), and LiveCodeBench (32.2/30.9) compared to CoT, CoT-SC, SR, AR, AFlow, ToT, GoT, and FoT baselines (Section 4.2).",
    287       "supported": "moderate",
    288       "reason": "Results consistently favor AOT across all benchmarks and models, but no statistical tests are provided, no variance/error bars are reported, and results appear to be from single runs. The consistency across models and benchmarks provides moderate confidence."
    289     },
    290     {
    291       "claim": "Both ablation variants (without decomposition and without DAG-guided contraction) significantly degrade performance, with partial structural cues being more harmful than none.",
    292       "evidence": "Figure 3 shows both ablation curves fall below AOT across MATH, GSM8K, LongBench, AIME, and LiveCodeBench. The variant without DAG-guided contraction causes 'a more severe drop' (Section 4.3).",
    293       "supported": "moderate",
    294       "reason": "The ablation is well-designed as controlled single-variable manipulation, but without statistical tests or variance reporting, the significance of the degradation cannot be assessed precisely."
    295     },
    296     {
    297       "claim": "The DAG generation process maintains answer equivalence at >99% across all datasets.",
    298       "evidence": "Table 2 (Appendix B.1) reports Answer Equivalence Maintenance rates of 99.2% (MATH), 99.5% (GSM8K), 99.7% (MBPP), and 99.3% (LongBench).",
    299       "supported": "moderate",
    300       "reason": "The metric is assessed by LLM evaluation rather than ground-truth verification, so the 99%+ figure depends on the LLM evaluator's accuracy. The methodology for assessment is described but relies on another model's judgment."
    301     },
    302     {
    303       "claim": "Deeper Markov chains exhibit an emergent atomic reasoning structure where reasoning tokens converge toward minimal forms.",
    304       "evidence": "Figure 4 shows Token Ratio approaching 1.0 as cost increases, indicating convergence toward atomic structure; the text describes this convergence pattern (Sections 3.2 and 4.4).",
    305       "supported": "moderate",
    306       "reason": "The convergence trend is visible in Figure 4 but is demonstrated only on GPT-4o-mini. The 'emergence' claim is primarily observational and the atomic structure is defined by the authors rather than independently validated."
    307     },
    308     {
    309       "claim": "Integrating AOT with tree search and reflective refinement achieves compounding performance gains (e.g., 84.9% on MATH vs ToT's 82.0%, 81.2% on AIME vs ToT's 78.0%).",
    310       "evidence": "Section 4.4 reports specific numbers for the full integration (ToT + Markov chain + Reflective Refinement) achieving 84.9% on MATH and 81.2% on AIME.",
    311       "supported": "moderate",
    312       "reason": "Specific numbers are provided with clear baseline comparisons, but again no statistical tests or variance reporting. The improvements are consistent but of moderate magnitude."
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "AOT (Atom of Thoughts) proposes a Markovian reasoning framework that decomposes complex problems through iterative DAG-based decomposition and contraction, reducing historical dependencies in LLM reasoning. The method achieves consistent improvements over baselines (CoT, ToT, GoT, FoT, AFlow) across math, code, and multi-hop QA benchmarks with both reasoning (O3-mini, DeepSeek-R1) and non-reasoning (GPT-4o-mini, DeepSeek-V3) LLMs. Ablation studies confirm that both the decomposition and DAG-guided contraction components are necessary. The framework demonstrates an emergent 'atomic reasoning' property where deeper Markov chains converge toward irreducible problem representations.",
    317   "red_flags": [
    318     {
    319       "flag": "No variance or uncertainty quantification",
    320       "detail": "All results in Table 1 appear to be single-run point estimates with no error bars, confidence intervals, or standard deviations. For stochastic methods using temperature 1.0, this is a significant omission — results could vary meaningfully across runs."
    321     },
    322     {
    323       "flag": "No statistical significance tests",
    324       "detail": "The paper claims AOT 'outperforms' baselines across all benchmarks based solely on comparing point estimates. With no statistical tests, it is impossible to determine whether the differences are statistically significant or within normal variation."
    325     },
    326     {
    327       "flag": "Benchmark contamination risk unaddressed",
    328       "detail": "MATH (2021), GSM8K (2021), and MBPP (2021) are older benchmarks that 2024-2025 models may have seen during training. The paper uses LiveCodeBench (designed for contamination-free evaluation) for code but does not discuss contamination for the older math and code benchmarks. Since all methods use the same models, this may not affect relative comparisons, but absolute performance levels could be inflated."
    329     },
    330     {
    331       "flag": "Overclaimed generality",
    332       "detail": "The paper claims AOT is a 'general-purpose reasoning framework' but tests only on math, code generation, and multi-hop QA. Many other reasoning domains (logical reasoning, commonsense reasoning, planning, scientific reasoning) are not tested."
    333     },
    334     {
    335       "flag": "LLM-as-evaluator circularity",
    336       "detail": "The answer equivalence and complexity reduction metrics in Table 2 (Appendix B.1) are assessed by LLM evaluation rather than ground-truth verification. Using an LLM to evaluate the quality of an LLM reasoning framework introduces potential circularity."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    342       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao", "Izhak Shafran", "Tom Griffiths", "Yuan Cao", "Karthik Narasimhan"],
    343       "year": 2023,
    344       "relevance": "Foundational test-time reasoning framework that AOT extends and compares against as a baseline."
    345     },
    346     {
    347       "title": "Graph of thoughts: Solving elaborate problems with large language models",
    348       "authors": ["Maciej Besta", "Nils Blach", "Ales Kubicek"],
    349       "year": 2024,
    350       "relevance": "Graph-based reasoning framework that AOT compares against and claims to improve upon in efficiency."
    351     },
    352     {
    353       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    354       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    355       "year": 2022,
    356       "relevance": "Foundational prompting technique that AOT builds upon and benchmarks against."
    357     },
    358     {
    359       "title": "Forest-of-thought: Scaling test-time compute for enhancing LLM reasoning",
    360       "authors": ["Zhenni Bi", "Kai Han", "Chuanjian Liu", "Yehui Tang", "Yunhe Wang"],
    361       "year": 2024,
    362       "arxiv_id": "2412.09078",
    363       "relevance": "Recent test-time scaling framework used as a key baseline and integration partner for AOT."
    364     },
    365     {
    366       "title": "AFlow: Automating agentic workflow generation",
    367       "authors": ["Jiayi Zhang", "Jinyu Xiang", "Zhaoyang Yu", "Fengwei Teng"],
    368       "year": 2024,
    369       "arxiv_id": "2410.10762",
    370       "relevance": "Agentic workflow framework used as a baseline; shares authors with AOT paper."
    371     },
    372     {
    373       "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters",
    374       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    375       "year": 2024,
    376       "arxiv_id": "2408.03314",
    377       "relevance": "Key paper on test-time compute scaling that motivates AOT's approach to efficient inference-time reasoning."
    378     },
    379     {
    380       "title": "Self-consistency improves chain of thought reasoning in language models",
    381       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    382       "year": 2023,
    383       "relevance": "Self-consistency sampling method used as a baseline in AOT's evaluation."
    384     },
    385     {
    386       "title": "Self-refine: Iterative refinement with self-feedback",
    387       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    388       "year": 2023,
    389       "relevance": "Self-refinement framework used as a baseline and integrated into AOT's reflective refinement component."
    390     },
    391     {
    392       "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code",
    393       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    394       "year": 2024,
    395       "relevance": "Contamination-free code benchmark used for evaluating AOT on code generation tasks."
    396     },
    397     {
    398       "title": "Deepseek-r1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    399       "authors": ["DeepSeek-AI"],
    400       "year": 2025,
    401       "arxiv_id": "2501.12948",
    402       "relevance": "State-of-the-art reasoning LLM used as a backbone model in AOT's evaluation experiments."
    403     },
    404     {
    405       "title": "Are more LLM calls all you need? Towards scaling laws of compound inference systems",
    406       "authors": ["Lingjiao Chen", "Jared Quincy Davis", "Boris Hanin"],
    407       "year": 2024,
    408       "arxiv_id": "2403.02419",
    409       "relevance": "Studies scaling laws for compound LLM inference systems, directly relevant to AOT's test-time scaling claims."
    410     },
    411     {
    412       "title": "Language agent tree search unifies reasoning, acting, and planning in language models",
    413       "authors": ["Andy Zhou", "Kai Yan", "Michal Shlapentokh-Rothman"],
    414       "year": 2024,
    415       "relevance": "Tree search approach for LLM agents that AOT's modular integration extends."
    416     }
    417   ]
    418 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs