scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26801B)
      1 {
      2   "paper": {
      3     "title": "AdaptEvolve: Improving Efficiency of Evolutionary AI Agents through Adaptive Model Selection",
      4     "authors": [
      5       "Pretam Ray",
      6       "Pratik Prabhanjan Brahma",
      7       "Zicheng Liu",
      8       "Emad Barsoum"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv preprint",
     12     "arxiv_id": "2602.11931"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'Our code is available at https://github.com/raypretam/adaptive_llm_selection', providing a working GitHub URL."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available benchmarks: LiveCodeBench v5 and MBPP. Both are standard public benchmarks that the authors did not modify."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The schema requires 'requirements.txt, Dockerfile, conda environment file, or a detailed Environment Setup section listing library versions.' The paper lists hyperparameters in Table 5 and mentions hardware (8x AMD MI250 GPUs), but does not provide a requirements.txt, Dockerfile, or library version listing. Citing scikit-learn and River without version numbers is insufficient. Hyperparameters are not the same as environment/dependency specifications."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While code is released and hyperparameters are listed in Table 5, the paper does not provide step-by-step reproduction instructions within the paper itself. No README commands or 'Reproducing Results' section is described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Table 1 and Table 2 are point estimates only (e.g., '73.6%', '35.4'). No confidence intervals, error bars, or uncertainty notation appear anywhere in the paper."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims AdaptEvolve 'strictly outperforms' and 'significantly surpasses' baselines but provides no p-values, t-tests, or any statistical significance test to support these comparisons."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The schema says 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper reports exactly this pattern: 'retains 97.9% of the 32B upper-bound accuracy (73.6% vs 75.2%) while reducing compute cost by 34.4%', '2.4 points (73.6% vs 71.2%)', and '41.5% while maintaining 97.1% of peak accuracy.' These provide percentage improvements with explicit baseline context, which satisfies the schema criterion."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The warm-up set of N=50 is used to train the decision tree router, but no justification or power analysis is provided for why 50 samples is sufficient to bootstrap a reliable classifier."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Table 3 reports standard deviations and skewness for confidence metrics in the calibration set analysis, but main results in Table 1 and Table 2 are single-run point estimates with no variance across seeds or runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against multiple baselines: Pure Small model (4B Iterative), Pure Large model (32B Iterative), Random Routing, Static Decision Tree, and the Cascading baseline (Chen et al., 2023). See Table 1 and Table 2."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The Cascading baseline references Chen et al. (2023), which is recent. Random routing and pure-model bounds are appropriate reference points. The baselines represent reasonable comparisons for the setting."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Appendix A.3 presents a pilot ablation comparing the Decision Tree router against a threshold-based linear switch on N=50 samples (Table 4), justifying the non-linear routing choice. The routing dynamics section also analyzes adaptation vs. static trees."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports Accuracy, Compute Cost, and Efficiency (Acc/Cost) across all configurations in Tables 1 and 2, plus the Small:Large usage ratio — multiple metrics characterizing the cost-accuracy trade-off."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "The paper evaluates code generation quality using automated pass/fail test cases on benchmarks. Human evaluation is clearly irrelevant to claims about routing efficiency in an automated system."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The warm-up calibration set (N=50) is distinct from the full evaluation. The main results in Table 1 use the full LiveCodeBench v5 (880 samples) and MBPP (974 samples), which are separate from the N=50 training set used to bootstrap the decision tree."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Results are reported as single aggregate numbers per benchmark (Table 1). No per-category or per-difficulty breakdown is provided (e.g., easy vs. hard problems within LiveCodeBench)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The limitations section mentions that the method requires access to token-level log-probabilities, but no failure cases or error analysis of the router's incorrect routing decisions are presented."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "All experiments presented show the proposed method matching or improving efficiency relative to baselines. No configurations where AdaptEvolve failed or was worse are discussed."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 37.9% cost reduction and 97.5% accuracy retention. Table 1 and Section 3.6 support these numbers: LiveCodeBench shows a 34.4% cost reduction at 97.9% accuracy retention, and MBPP shows a 41.5% cost reduction at 97.1% accuracy retention, which average to the stated figures."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The schema states 'ablation studies (removing component X reduces performance by Y%) ARE causal claims — check whether the ablation design is adequate (controlled single-variable manipulation counts as YES).' The paper does controlled single-variable comparisons: static tree vs. adaptive tree (Table 1), threshold vs. tree (Table 4), and various routing strategies vs. pure baselines. Each comparison changes one variable while holding the benchmark, models, and other settings constant. This is adequate controlled experimental design for the causal claims made. The lack of statistical power (single run) is captured by other checklist items (significance_tests, variance_reported)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The conclusion states 'intelligent resource allocation is a viable pathway for scalable agentic reasoning' — a broad generalization beyond the tested setting of two Qwen3 models (4B and 32B) on two Python coding benchmarks with OpenEvolve. The paper does not bound this claim to its tested models, framework, or benchmarks."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for observed results, such as whether the efficiency gains could stem from benchmark-specific properties, the particular model family (Qwen3), or the warm-up set selection. No threats-to-validity section addresses confounds."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section 3.2 specifies 'Qwen3-4B (Small/Router Target) and Qwen3-32B (Large/Escalation Target)' from the Qwen3 family (Yang et al., 2025), with Table 5 confirming model names. These are specific enough given the paper's publication date and the Qwen3 technical report citation."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper describes the framework as using few-shot exemplars from previous high-scoring generations, but provides no actual prompt text sent to the models. Section 3.1 mentions the framework but does not show any prompts."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Table 5 (Appendix A.4) provides a comprehensive hyperparameters table: temperature=0.6, top-p=0.95, max tokens=20000, timeout=3600s, decision tree max_depth=5, warm-up N=50, plus all evolutionary framework parameters (population size, archive size, etc.)."
    148       },
    149       "scaffolding_described": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper describes the adaptive routing scaffold in detail: Section 2 explains the decision tree router Φ, confidence metric computation, warm-up phase, Hoeffding Adaptive Tree for online adaptation, and Figure 1 provides a workflow diagram. The escalation logic (Table 6) and confidence computation (Appendix A.1) are fully specified."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper does not document any preprocessing of the benchmark datasets. Section 3.3 states sample counts (880 for LiveCodeBench, 974 for MBPP) but provides no details on filtering, preprocessing, or any transformations applied to benchmark inputs."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 5 is titled 'Limitations' and discusses that the method requires access to token-level log-probabilities, restricting applicability to open-weight models or APIs exposing logprobs."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The limitations section discusses only one specific constraint (logprob access requirement). It does not address threats to validity such as single-run results without confidence intervals, potential benchmark contamination, or the generalizability of findings to other model families or frameworks."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "While the logprob limitation implies some scope boundary, the paper does not explicitly state what the results do NOT show — e.g., that results apply only to Qwen3 models, only to Python coding tasks, only to OpenEvolve-style frameworks, or only to the two tested benchmarks."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Only aggregated results are presented in tables. No raw per-sample predictions, routing decisions, or intermediate outputs are released alongside the code."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 3.3 describes the benchmarks used (LiveCodeBench v5 with 880 samples, MBPP with 974 samples), and the warm-up phase collection (N=50) is described in Section 3.5 with the labeling logic in Table 6."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants are involved. Data comes from standard public benchmarks (LiveCodeBench, MBPP), so recruitment does not apply."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The confidence computation pipeline is described (Appendix A.1), but the full data pipeline from benchmark input to final evaluation output — including how the evolutionary iterations proceed, how outputs are collected, and how accuracy is computed — is only partially described through references to OpenEvolve."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The footnote on the first page states 'Work done during Internship at Advanced Micro Devices, Inc. (AMD).' This explicitly identifies AMD as the institution that supported the work. While there is no formal 'Acknowledgments' section with grant numbers, the funding source (AMD via internship) is clearly disclosed to the reader. The schema asks 'Is the funding source disclosed?' and it is."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are listed on the title page: Pretam Ray (IIT Kharagpur) and three AMD employees. The footnote explicitly states the work was done during an internship at AMD."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "The work is performed at AMD and three of four authors are AMD employees. AMD produces the MI250 GPUs used in the experiments and has commercial interest in demonstrating efficient inference on AMD hardware. The funder is not independent of the outcome."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests statement appears anywhere in the paper. The AMD affiliation is noted but no formal declaration of financial interests or competing interests is made."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper uses Qwen3-4B and Qwen3-32B but does not state the training data cutoff date for either model. The Qwen3 technical report (Yang et al., 2025) is cited but the cutoff date is not mentioned in the paper."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No discussion of potential train/test overlap appears in the paper. MBPP (2021) is particularly at risk of being in Qwen3 training data, but this is not addressed."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "LiveCodeBench is cited as 'contamination free' by design (Jain et al., 2024), but MBPP (Austin et al., 2021) predates Qwen3 by several years and is widely used, making contamination a real concern that is not addressed. The paper does not discuss contamination risk for either benchmark."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this benchmark evaluation study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 3.6 and Table 1 explicitly report inference cost in normalized compute units (one 32B call = 1.0, one 4B call = 0.125), with total costs per configuration reported across both benchmarks."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The schema asks 'Is the total computational budget stated? Look for: GPU hours, total API spend, hardware used, training time.' While Section 3.1 states the hardware (8x AMD MI250 GPUs) and Table 5 lists configuration parameters, the paper never states the TOTAL compute consumed — no GPU-hours, no wall-clock time for experiments, no total training/inference time. Listing hardware is necessary but not sufficient; the total budget (how long the hardware was used) is not quantified."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "AdaptEvolve reduces total inference compute by an average of 37.9% across benchmarks while retaining 97.5% of the upper-bound accuracy of static large-model baselines.",
    291       "evidence": "Table 1 shows LiveCodeBench: 34.4% cost reduction, 97.9% accuracy retention (73.6/75.2); MBPP: 41.5% cost reduction, 97.1% accuracy retention (91.3/94.0). Section 3.6 reports these averages.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "The Adaptive Hoeffding Tree strictly outperforms static decision trees by dynamically adjusting decision boundaries as the population evolves.",
    296       "evidence": "Table 1 shows HAT achieves 73.6% accuracy vs. 71.2% for static Decision Tree on LiveCodeBench (2.4 point improvement). Section 3.6 ('Adaptation to Drift') discusses this result.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Intrinsic confidence metrics (LGC, MC, TC, BWC) provide a reliable signal for predicting model solvability, with statistically separable distributions between solved and unsolved instances.",
    301       "evidence": "Table 3 shows mean LGC of 5.86 for unsolved vs. 7.98 for solved instances, with similar shifts across all four metrics. Section A.2 provides distributional analysis.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "A Decision Tree router outperforms simple linear threshold-based routing in the pilot study.",
    306       "evidence": "Table 4 shows Decision Tree achieves 74.12% accuracy at 2.02 cost (Eff=36.7) vs. Threshold Switch at 70.07% accuracy at 2.08 cost (Eff=33.7) on a stratified N=50 subset.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "AdaptEvolve achieves a superior Pareto frontier compared to the Cascading baseline, with similar accuracy but lower compute cost.",
    311       "evidence": "Table 2 shows HAT: 73.6% accuracy at 2.08 cost (Eff=35.4) vs. Cascading baseline: 73.8% accuracy at 2.81 cost (Eff=26.3).",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": [
    316     "benchmark-eval"
    317   ],
    318   "key_findings": "AdaptEvolve proposes a confidence-driven model selection framework for evolutionary agentic coding systems that routes between a small (4B) and large (32B) Qwen3 model based on token-level entropy signals. Evaluated on LiveCodeBench v5 and MBPP using the OpenEvolve framework, the method reduces inference compute by an average of 37.9% while retaining 97.5% of the large-model accuracy ceiling. A Hoeffding Adaptive Tree (HAT) that handles distribution shift outperforms static decision tree routing. The approach requires access to token-level log-probabilities, limiting applicability to open-weight models or logprob-exposing APIs.",
    319   "red_flags": [
    320     {
    321       "flag": "No variance or error bars on main results",
    322       "detail": "All results in Tables 1 and 2 are single-run point estimates. Because the evolutionary process is stochastic and was run with a single fixed random seed (42), result stability cannot be assessed without additional seeds. The claimed improvements (e.g., 2.4 accuracy points on LiveCodeBench) may not be statistically significant."
    323     },
    324     {
    325       "flag": "AMD affiliation conflict of interest",
    326       "detail": "Three of four authors are AMD employees, experiments run on AMD MI250 GPUs, and the work was performed during an AMD internship. AMD has commercial interest in demonstrating efficient inference on its hardware. No competing interests statement is provided."
    327     },
    328     {
    329       "flag": "MBPP benchmark contamination risk unaddressed",
    330       "detail": "MBPP was published in 2021 and is widely used in LLM training. The paper uses Qwen3 models without stating training data cutoffs or addressing whether MBPP examples appear in Qwen3 training data."
    331     },
    332     {
    333       "flag": "Overgeneralized conclusions",
    334       "detail": "The conclusion claims 'intelligent resource allocation is a viable pathway for scalable agentic reasoning' based solely on two Qwen3 models on two Python coding benchmarks within one framework (OpenEvolve). This generalization substantially outstrips the evidence."
    335     },
    336     {
    337       "flag": "Tiny pilot study used for architectural decisions",
    338       "detail": "The decision to use a Decision Tree over linear thresholding (Appendix A.3) is based on N=50 samples — the same 50 samples used to train the router. This circular validation does not provide independent evidence for the architectural choice."
    339     },
    340     {
    341       "flag": "No prompts provided",
    342       "detail": "The paper describes using few-shot exemplars from prior high-scoring generations but does not provide any actual prompt text sent to the models, making full reproduction dependent entirely on the code repository."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "AlphaEvolve: A coding agent for scientific and algorithmic discovery",
    348       "authors": [
    349         "Alexander Novikov",
    350         "Ngân Vũ",
    351         "Marvin Eisenberger"
    352       ],
    353       "year": 2025,
    354       "arxiv_id": "2506.13131",
    355       "relevance": "Foundational evolutionary coding agent that AdaptEvolve builds upon; directly relevant to agentic AI coding system evaluation."
    356     },
    357     {
    358       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    359       "authors": [
    360         "Naman Jain",
    361         "King Han",
    362         "Alex Gu"
    363       ],
    364       "year": 2024,
    365       "arxiv_id": "2403.07974",
    366       "relevance": "Contamination-aware code generation benchmark used to evaluate AdaptEvolve; relevant to LLM coding capability evaluation methodology."
    367     },
    368     {
    369       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    370       "authors": [
    371         "Lingjiao Chen",
    372         "Matei Zaharia",
    373         "James Zou"
    374       ],
    375       "year": 2023,
    376       "arxiv_id": "2305.05176",
    377       "relevance": "Model cascading baseline that AdaptEvolve compares against; represents a key approach in cost-efficient LLM inference."
    378     },
    379     {
    380       "title": "RouteLLM: Learning to route LLMs with preference data",
    381       "authors": [
    382         "Isaac Ong",
    383         "Amjad Almahairi",
    384         "Vincent Wu"
    385       ],
    386       "year": 2024,
    387       "arxiv_id": "2406.18665",
    388       "relevance": "LLM routing system for cost-quality trade-offs; directly related to the routing methodology studied in AdaptEvolve."
    389     },
    390     {
    391       "title": "Deep think with confidence",
    392       "authors": [
    393         "Yichao Fu",
    394         "Xuewei Wang",
    395         "Yuandong Tian",
    396         "Jiawei Zhao"
    397       ],
    398       "year": 2025,
    399       "arxiv_id": "2508.15260",
    400       "relevance": "Source of the entropy-based confidence metrics (LGC, MC, TC, BWC) adopted by AdaptEvolve for model selection."
    401     },
    402     {
    403       "title": "ShinkaEvolve: Towards open-ended and sample-efficient program evolution",
    404       "authors": [
    405         "Robert Tjarko Lange",
    406         "Yuki Imajuku",
    407         "Edoardo Cetin"
    408       ],
    409       "year": 2025,
    410       "arxiv_id": "2509.19349",
    411       "relevance": "Related evolutionary agentic coding framework; relevant to the survey's scope on agentic AI programming systems."
    412     },
    413     {
    414       "title": "Qwen3 technical report",
    415       "authors": [
    416         "An Yang"
    417       ],
    418       "year": 2025,
    419       "arxiv_id": "2505.09388",
    420       "relevance": "Describes the Qwen3 model family (4B and 32B) used in AdaptEvolve experiments; relevant to LLM capability and efficiency evaluation."
    421     },
    422     {
    423       "title": "Program synthesis with large language models",
    424       "authors": [
    425         "Jacob Austin",
    426         "Augustus Odena",
    427         "Maxwell Nye"
    428       ],
    429       "year": 2021,
    430       "arxiv_id": "2108.07732",
    431       "relevance": "Introduces MBPP benchmark used for evaluation in AdaptEvolve; foundational LLM code generation evaluation paper."
    432     },
    433     {
    434       "title": "Self-consistency improves chain of thought reasoning in language models",
    435       "authors": [
    436         "Xuezhi Wang",
    437         "Jason Wei",
    438         "Dale Schuurmans"
    439       ],
    440       "year": 2022,
    441       "relevance": "Self-consistency mechanism referenced as improving small model capabilities; relevant to LLM reasoning methodology."
    442     },
    443     {
    444       "title": "Speculative decoding with big little decoder",
    445       "authors": [
    446         "Sehoon Kim",
    447         "Karttikeya Mangalam",
    448         "Suhong Moon"
    449       ],
    450       "year": 2023,
    451       "relevance": "Speculative decoding approach for efficient LLM inference; compared conceptually to model cascade approaches in AdaptEvolve."
    452     },
    453     {
    454       "title": "AlphaResearch: Accelerating new algorithm discovery with language models",
    455       "authors": [
    456         "Zhaojian Yu",
    457         "Kaiyue Feng",
    458         "Yilun Zhao"
    459       ],
    460       "year": 2025,
    461       "arxiv_id": "2511.08522",
    462       "relevance": "Related work on accelerating agentic systems with improved planning; cited in context of evolutionary AI agent optimization."
    463     }
    464   ]
    465 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs