scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26862B)
      1 {
      2   "paper": {
      3     "title": "CASTER: Breaking the Cost-Performance Barrier in Multi-Agent Orchestration via Context-Aware Strategy for Task Efficient Routing",
      4     "authors": [
      5       "Shanyv Liu",
      6       "Xuyang Yuan",
      7       "Tao Chen",
      8       "Zijun Zhan",
      9       "Zhu Han",
     10       "Danyang Zheng",
     11       "Weishan Zhang",
     12       "Shaohua Cao"
     13     ],
     14     "year": 2026,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2601.19793"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL or code archive link is provided anywhere in the paper. The paper describes implementation details using PyTorch but does not release the code."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The benchmark tasks and training datasets (Dpre, Dtraj) are described in detail but no download link or public release is provided. The paper constructs its own benchmark across four domains but does not make it available."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "While the paper mentions PyTorch and specific model APIs (GPT-4o, text-embedding-3-small), there is no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. The paper describes the training protocol (Section A.1) and algorithms in pseudocode, but there are no README-style instructions for replicating the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Quality results are reported as point estimates throughout. Tables 1-3 and 6-12 report single numbers with no confidence intervals (Table 8 gives standard deviations, but for cost only). The box plots in Figure 12 show cost distributions but no error bars on quality scores. Table 7 mentions 'implicit high variance (as seen in error bars)', but the reported tables contain no error bars or intervals for quality."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes numerous comparative claims (e.g., CASTER outperforms FrugalGPT, matches Force Strong) but provides no statistical significance tests. Differences of 0.1-1.2 points are treated as meaningful without any hypothesis testing."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Cost reductions are reported as percentages with baselines (e.g., '72.4% cost reduction' from $1.4658 to $0.4052 in Table 1). Quality scores are reported with baselines (e.g., CASTER 95.3 vs Force Strong 95.2 in Science). The reader can assess magnitude from context."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The benchmark consists of 20 tasks per domain (10 easy, 10 hard), and the FrugalGPT comparison uses 10 tasks per domain. No justification is given for these sample sizes, and no power analysis is discussed. Twenty tasks per domain is quite small for the claims being made."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Cost variance is reported in Table 8 (std dev and distribution statistics), but quality scores throughout the paper are single-run numbers with no variance or standard deviation reported. The paper does not mention running experiments multiple times, nor does it report spread measures for quality evaluations."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper includes Force Strong (all GPT-4o), Force Weak (all GPT-4o-mini), and FrugalGPT cascade strategy as baselines (Section 4.1, Tables 1-3)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "FrugalGPT (2023) is the only named routing baseline. RouteLLM (2024) is discussed in related work but not included in experiments. The paper acknowledges RouteLLM and Agentic Supernet as alternatives but does not compare against them. FrugalGPT is roughly three years old, and more recent routing methods exist."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No ablation study is presented. The system has multiple components (dual-branch feature fusion, cold start training, negative feedback learning, meta-features, semantic features) but there is no experiment removing individual components to measure their contribution."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper reports multiple metrics: token cost (cumulative and per-task), cost reduction percentages, quality scores across multiple dimensions (Functional Correctness, Robustness, Engineering Quality, Code Style for Software; similar decompositions for other domains), and multi-model scores."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "All evaluation is automated via LLM-as-a-Judge (GPT-4o). No human evaluation of the generated outputs is performed. The paper acknowledges self-preference bias concern (Wataoka et al., 2024) but relies entirely on automated evaluation."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section A.3 states: 'these benchmark tasks were explicitly held out from both the router's offline pre-training corpus and its online refinement pipeline, preventing any potential data leakage.' The benchmark of 20 tasks per domain is separated from training data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Extensive per-category breakdowns are provided in Figure 13 and Table 9 (e.g., Logic, OOP, Concurrency for Software; Physics_Easy, Quantum_Hard for Science). Capability breakdowns are in Figure 15 and Table 11."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper discusses where weak models fail (e.g., Web Security dropping to 48.0, Concurrency to 67.0 in Section 4.4.2) and discusses 'logic collapse' in Science Robustness. The paper also notes DeepSeek's 'cost inversion' and Claude's timeout issues in Section 4.6.1."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports that random exploration leads to data pollution (Section 3.3), that DeepSeek shows cost inversion where CASTER's cost reduction is negative (-12.4% in Security, Table 1), and that Force Weak occasionally outperforms strong models 'due to stochasticity' (Section 4.6.2). CASTER also underperforms Force Strong in some specific categories (e.g., Data Struct drops from 90 to 70 in Table 9)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims CASTER 'reduces inference cost by up to 72.4%' which is supported in Table 1 (OpenAI Software scenario). The claim of 'matching success rates' is supported by quality scores in Tables 2 and 10. The claim of 'outperforming FrugalGPT across all domains' is supported in Table 3."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes causal claims such as CASTER 'mitigates the over-thinking or overfitting sometimes exhibited by strong models' (Section 4.4.1) and claims the router 'captures latent complexity features' (Section 4.2). These causal interpretations are not adequately supported—the paper does not control for confounds or provide mechanism evidence beyond correlational results."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The conclusion states CASTER is 'inherently domain-agnostic, offering significant potential for broader applications such as legal analysis and creative writing' (Section 5), but this is tested only on four domains with self-constructed tasks. The title and abstract frame this as a general solution for 'Multi-Agent Orchestration' but results are limited to GPT-4o/mini as the primary model pair with LangGraph workflows."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations are discussed. The paper does not consider whether the observed improvements could be due to the specific task distribution, the LLM-as-a-Judge's biases, or whether simpler routing heuristics could achieve similar results. There is no threats-to-validity section."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper uses 'gpt-4o' and 'gpt-4o-mini' without snapshot dates or API versions. Table 5 lists model names (e.g., 'claude-sonnet-4-5', 'gemini-2.5-pro', 'deepseek-reasoner (R1)', 'qwen3-max') but these are marketing names without specific version identifiers or snapshot dates. The embedding model 'text-embedding-3-small' also lacks a version date."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The evaluation rubric prompts are provided in full in Figures 6-9 (Software, Data, Science, Security assessment prompts). The dynamic task generation approach is described in Algorithm 3. However, the actual task prompts sent to agents are not fully provided—only the evaluation judge prompts are shown."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Table 4 provides network architecture hyperparameters (Din=1536, Dsem=128, Dstruct=16, Dfuse=64). Section A.1.2 reports training hyperparameters (200 epochs, BCE loss, lr=1e-3 for cold start; lr=1e-4 for fine-tuning, StepLR gamma=0.5 every 50 epochs, Dropout p=0.2). Table 5 provides model pricing. Temperature tau=1.1 for task generation is stated."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The multi-agent workflows are described in detail in Section B and Figure 3, including the Linear Initialization + Iterative Loop pattern, credit assignment, circuit breaker mechanisms, and state transition logic for each of the four domains."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section A.1.1 describes the synthetic warm-up dataset construction (seed dataset → augmentation by 4-6x with noise injection), and Section C describes the Dynamic Adversarial Task Generation Pipeline with stochastic difficulty stratification (p=0.7 hard mode). The re-labeling logic for trajectory data is specified in Algorithm 2."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section. The paper has a brief 'Impact Statement' (after Section 5) but it only discusses positive societal impacts and states 'We foresee no specific ethical risks beyond those inherent to the underlying LLMs.'"
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed. The paper does not address the small benchmark size (20 tasks per domain), the reliance on LLM-as-a-Judge with potential self-preference bias (beyond a brief mention in Section 4.1), or the synthetic nature of the evaluation tasks."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No explicit scope boundaries are stated. The paper claims domain-agnostic applicability in the conclusion without stating what settings were NOT tested or what limitations apply. The title suggests general 'Multi-Agent Orchestration' but the system is only tested on LangGraph workflows with specific model pairs."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data (task prompts, model outputs, judge scores per task) is released. Only aggregated results in tables and figures are presented."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The data collection procedures are described: synthetic dataset construction via augmentation (Section A.1.1), dynamic trajectory generation via GPT-4o teacher model in sandboxed environments (Section C), and benchmark task curation (Section A.3). The adversarial generation pipeline is described in Algorithm 3."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants are involved; the study is a benchmark evaluation of an automated system."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The data pipeline is documented across multiple sections: seed data → augmentation → cold start training (Section A.1.1), dynamic task generation → sandbox execution → trajectory capture (Section C), and re-labeling logic for fine-tuning (Algorithm 2). The benchmark construction with held-out separation is described in Section A.3."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding sources are disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: China University of Petroleum (East China), University of Houston, Southwest Jiaotong University, and Shandong Key Laboratory of Intelligent Oil & Gas Industrial Software."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so independence cannot be assessed. The absence of any funding disclosure is itself a concern."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper uses GPT-4o, GPT-4o-mini, Claude, Gemini, DeepSeek, and Qwen models but does not state training data cutoff dates for any of them. Since these models are the ones solving the benchmark tasks, prior exposure to similar tasks during training could affect performance."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the evaluation tasks could overlap with the models' training data. Since tasks are generated by GPT-4o itself and then evaluated by GPT-4o-as-judge, there is a concern about circular evaluation that is not addressed."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The benchmark is self-constructed so it would not be in model training data, but the paper does not discuss this advantage or address whether similar task patterns exist in training corpora. The held-out separation described in Section A.3 only pertains to the router's training data, not the LLMs' pre-training."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Detailed inference costs are reported throughout: Tables 1, 2, 3, 5, 6, 7, 8 provide cumulative costs, per-task costs, cost distributions, and cost reductions. Table 5 provides the pricing structure for all models used. For example, CASTER costs $0.018 per task in Software vs $0.039 for Force Strong."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "The total compute budget for the experiments (total API spend across all experiments, training time for the router, GPU hours if applicable) is not stated. Individual task costs are reported but the total experimental budget is not quantified."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "CASTER reduces inference cost by up to 72.4% compared to strong-model baselines while matching their success rates.",
    295       "evidence": "Table 1 shows 72.4% cost reduction in the OpenAI Software scenario ($1.4658 to $0.4052) with quality score of 97.0 vs 95.3 for Force Strong. Table 2 shows 23.4%-54.3% cost reductions across domains.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "CASTER consistently outperforms both heuristic routing and FrugalGPT across all domains.",
    300       "evidence": "Table 3 shows CASTER beats FrugalGPT in all four domains: cost reductions of 20.7%-48.0% and quality gains of +0.7 to +1.2 points. However, the FrugalGPT comparison uses only 10 tasks per domain.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "CASTER matches or exceeds Force Strong quality in Science (95.3 vs 95.2) and Security (86.2 vs 85.5) domains.",
    305       "evidence": "Table 10 provides these exact numbers. However, these differences (0.1 and 0.7 points) are within noise for single-run experiments with 20 tasks and no significance tests are provided.",
    306       "supported": "weak"
    307     },
    308     {
    309       "claim": "Dynamic routing can mitigate 'over-thinking' or overfitting sometimes exhibited by strong models on simpler sub-tasks.",
    310       "evidence": "Section 4.4.1 makes this causal interpretation based on CASTER slightly outperforming Force Strong in Science and Security. This is a single observation without controlled experiments to test the 'over-thinking' mechanism.",
    311       "supported": "weak"
    312     },
    313     {
    314       "claim": "Random exploration strategies introduce significant noise that misleads the router into becoming overly conservative.",
    315       "evidence": "Section 3.3 states this as motivation for on-policy training but provides no experimental comparison between random exploration and on-policy training to quantify the claimed effect.",
    316       "supported": "unsupported"
    317     },
    318     {
    319       "claim": "CASTER generalizes across diverse model providers (Claude, Gemini, DeepSeek, Qwen) beyond the primary GPT-4o pair.",
    320       "evidence": "Table 1 shows results across five providers. CASTER achieves cost reductions for most, though DeepSeek shows cost inversion in Security (-12.4%). The router was trained on GPT-4o/Qwen data, so cross-provider results demonstrate some transfer.",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval"
    326   ],
    327   "key_findings": "CASTER is a lightweight neural router for dynamic model selection in graph-based multi-agent systems that combines semantic embeddings with structural meta-features. On self-constructed benchmarks across Software Engineering, Data Analysis, Scientific Discovery, and Cybersecurity, CASTER reduces token costs by 23-72% compared to using only strong models while maintaining comparable quality scores. The system outperforms FrugalGPT's cascading strategy by 20-48% in cost and 0.7-1.2 points in quality across all domains. Cross-provider evaluation on Claude, Gemini, DeepSeek, and Qwen shows the approach generalizes beyond the primary GPT-4o training pair, though results are less consistent.",
    328   "red_flags": [
    329     {
    330       "flag": "Self-constructed benchmark evaluated by same model family",
    331       "detail": "GPT-4o generates the benchmark tasks AND serves as the LLM-as-a-Judge evaluator. The paper acknowledges self-preference bias (citing Wataoka et al., 2024) but does not adequately mitigate it. This circular setup could inflate quality scores for GPT-4o-routed outputs."
    332     },
    333     {
    334       "flag": "Very small evaluation set without statistical tests",
    335       "detail": "20 tasks per domain (10 easy, 10 hard) with single-run evaluations. Differences of 0.1-1.2 quality points are presented as meaningful without significance tests. The FrugalGPT comparison uses only 10 tasks per domain. These sample sizes are too small for robust conclusions."
    336     },
    337     {
    338       "flag": "No limitations section",
    339       "detail": "The paper contains no limitations, threats to validity, or discussion of what the results do not show. The Impact Statement only discusses positive societal impacts."
    340     },
    341     {
    342       "flag": "No ablation study",
    343       "detail": "The system has multiple components (dual-branch fusion, cold start, negative feedback, meta-features, semantic features) but no ablation study shows which components are actually necessary or how much each contributes."
    344     },
    345     {
    346       "flag": "Overclaimed generalization",
    347       "detail": "The conclusion claims CASTER is 'inherently domain-agnostic' with potential for 'legal analysis and creative writing' based on four tested domains with synthetic tasks. The title frames this as breaking a general 'Cost-Performance Barrier' but the evaluation is quite narrow."
    348     },
    349     {
    350       "flag": "No code or data release",
    351       "detail": "Neither the CASTER implementation nor the benchmark tasks are released, making independent verification impossible despite detailed methodological descriptions."
    352     }
    353   ],
    354   "cited_papers": [
    355     {
    356       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    357       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    358       "year": 2023,
    359       "arxiv_id": "2305.05176",
    360       "relevance": "Key baseline for cost-efficient LLM routing, directly compared against CASTER in experiments."
    361     },
    362     {
    363       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    364       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    365       "year": 2024,
    366       "arxiv_id": "2406.18665",
    367       "relevance": "Alternative LLM routing approach that learns routers from human preference data; discussed in related work but not experimentally compared."
    368     },
    369     {
    370       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    371       "authors": ["Sirui Hong"],
    372       "year": 2023,
    373       "relevance": "Pioneering multi-agent framework that assigns specialized roles to LLMs; foundational to the MAS paradigm CASTER operates within."
    374     },
    375     {
    376       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    377       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"],
    378       "year": 2024,
    379       "relevance": "State-of-the-art agentic software engineering tool; relevant to the software engineering domain CASTER evaluates."
    380     },
    381     {
    382       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    383       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    384       "year": 2024,
    385       "relevance": "Major multi-agent framework enabling complex task solving through communicative agents."
    386     },
    387     {
    388       "title": "MultiAgentBench: Evaluating the Collaboration and Competition of LLM Agents",
    389       "authors": ["Kunlun Zhu"],
    390       "year": 2025,
    391       "relevance": "Recent benchmark for evaluating multi-agent systems, used as reference for milestone scoring methodology adopted by CASTER."
    392     },
    393     {
    394       "title": "Self-Preference Bias in LLM-as-a-Judge",
    395       "authors": ["Koki Wataoka", "Tsubasa Takahashi", "Ryokan Ri"],
    396       "year": 2024,
    397       "arxiv_id": "2410.21819",
    398       "relevance": "Identifies self-preference bias in LLM-as-a-Judge evaluation, directly relevant to CASTER's evaluation methodology concerns."
    399     },
    400     {
    401       "title": "ChatDev: Communicative Agents for Software Development",
    402       "authors": ["Chen Qian"],
    403       "year": 2024,
    404       "relevance": "Multi-agent framework simulating software development companies with role-playing LLMs."
    405     },
    406     {
    407       "title": "LLM-based Multi-Agent Systems for Software Engineering: Literature Review, Vision, and the Road Ahead",
    408       "authors": ["Junda He", "Christoph Treude", "David Lo"],
    409       "year": 2025,
    410       "relevance": "Comprehensive review of LLM-based multi-agent systems in software engineering, providing context for CASTER's application domain."
    411     },
    412     {
    413       "title": "Multi-Agent Architecture Search via Agentic Supernet",
    414       "authors": ["Guibin Zhang"],
    415       "year": 2025,
    416       "arxiv_id": "2502.04180",
    417       "relevance": "Architecture search framework for optimal agent topologies; complementary optimization approach to CASTER's model routing."
    418     },
    419     {
    420       "title": "AgentBoard: An Analytical Evaluation Board of Multi-Turn LLM Agents",
    421       "authors": ["Mingyuan Chang"],
    422       "year": 2024,
    423       "relevance": "Evaluation framework for multi-turn LLM agents with fine-grained progress metrics."
    424     }
    425   ]
    426 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs