scan.json (29249B)
1 { 2 "paper": { 3 "title": "Learning to Configure Agentic AI Systems", 4 "authors": [ 5 "Aditya Taparia", 6 "Som Sagar", 7 "Ransalu Senanayake" 8 ], 9 "year": 2026, 10 "venue": "arXiv", 11 "arxiv_id": "2602.11574" 12 }, 13 "scan_version": 2, 14 "active_modules": ["experimental_rigor", "data_leakage"], 15 "methodology_tags": ["benchmark-eval"], 16 "key_findings": "ARC, a hierarchical RL framework that dynamically selects workflows, tools, budgets, and prompts per query, outperforms fixed architectures and flat RL baselines across five benchmarks. On GSM8k, ARC achieves 88.6% accuracy vs 83.6% for GEPA and 85.2% for flat RL. ARC occupies Pareto-optimal accuracy-cost regions, demonstrating that per-query adaptive configuration reduces computational waste. Policies trained on 7B models transfer zero-shot to 32B and 72B variants, though cross-task transfer is limited by tool-structure overlap.", 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The abstract states 'Codebase: Github' which appears to be a hyperlinked repository reference in the PDF." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "All five benchmarks (GSM8k, DROP, MedQA, HotpotQA, GAIA) are publicly available standard datasets. The paper did not collect proprietary data." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "Appendix B describes high-level requirements ('Python 3.10+ installation and a CUDA-capable machine', GPU class) but provides no requirements.txt, Dockerfile, or specific library versions needed to recreate the environment." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "Algorithm 1 and Appendix F provide detailed hyperparameters and training procedure, but there are no step-by-step reproduction instructions (commands to run, scripts to execute) in the paper itself." 
38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "Table 1 (main results) reports only point estimates for all methods across all benchmarks. No confidence intervals, error bars, or ± notation on evaluation results. The embedding ablation (Table 5) has ± but that is a separate component evaluation." 45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "No statistical significance tests are reported. All claims of 'outperforms' are based on comparing raw accuracy numbers without any p-values, t-tests, or other statistical tests." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "Table 1 reports both absolute accuracy values and delta improvements (e.g., '+31.3', '+33.5') against the base model, providing sufficient context to understand the magnitude of improvements." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "No justification for benchmark sizes or evaluation set sizes. For GAIA, only the first 65 samples are used for training with the rest for evaluation, but no justification for this split. No power analysis." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "Main evaluation results (Table 1) report single numbers with no variance, standard deviation, or spread measures across runs. Figure 9 shows training dynamics variance but not evaluation variance." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "Extensive baselines across four categories: base models with tools, search-based methods (Grid/Greedy Search), optimization frameworks (AutoGen, DSPy, GEPA, LAP), and flat RL baselines (RL Bandits, RL Episodes)." 
72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "Baselines include recent frameworks: AutoGen (2024), DSPy (2023), GEPA (2025). The models evaluated (Qwen 2.5, Gemini 2.5) are current." 77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "Multiple ablation studies: SFT refinement (Table 1, ARC vs ARC w/o SFT), alternative training objectives (Table 4, PPO vs GRPO, SFT vs DPO), embedding models (Appendix D, 19 models evaluated), prompt generators (Appendix E, 12 models), and action masking (Figure 3)." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": true, 86 "justification": "Beyond task accuracy, the paper evaluates cost efficiency (Figure 4, token-based API cost per episode), workflow diversity (Table 2, entropy and Gini coefficient), and transfer performance (Table 3)." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "No human evaluation of system outputs. All evaluation is automated using ground-truth answer matching on benchmarks." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "Section 4.1: 'We employ standard training split for policy learning and evaluate on the test (or validation) set.' GAIA: 'the first 65 samples for training and the rest for evaluation.'" 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Results are broken down by individual benchmark (Table 1), by reasoning vs tool-use capability axes, by workflow type (Figure 12), and errors by category (Figure 6)." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section 4.6 provides error analysis with four error types and distribution across benchmarks (Figure 6). Appendix I includes specific failure case examples for each error category." 
107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "GEPA outperforms ARC on MedQA (87.1% vs 64.6%), which the paper discusses. GRPO and DPO underperform PPO/SFT (Table 4). Transfer to GAIA from HotpotQA fails (2.0% vs 6.0% in-domain). LAP baseline shows no meaningful improvement." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": false, 118 "justification": "The abstract claims 'up to 25% higher task accuracy' but the conclusion states 'up to 16% higher accuracy.' This internal inconsistency on the key magnitude claim undermines trust. The largest gain vs a strong baseline in Table 1 is ARC (63.9%) vs GEPA (39.3%) on DROP = 24.6pp, but most gains are 5-14pp." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": true, 123 "justification": "Ablation studies provide controlled single-variable manipulations: removing SFT (Table 1), replacing PPO with GRPO/DPO (Table 4), comparing hierarchical vs flat RL. The causal claims about component contributions are supported by adequate ablation design." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The title 'Learning to Configure Agentic AI Systems' and the 'Model Agnostic' claim (Section 4.2) suggest broad generalizability, but experiments cover only 5 benchmarks spanning reasoning and simple QA, with 2 model families. The paper does not test on agentic coding, web navigation, or other agent tasks. Transfer analysis (Section 4.4) partially bounds claims but the title overreaches." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper does not discuss alternative explanations for the improvements. Could the gains stem from increased total compute exploration during RL training? 
Could the prompt library quality (GPT-5.2 generated) be the dominant factor rather than the hierarchical structure? The MedQA discussion (Section 4.2) touches on prompt content but no systematic consideration of confounds." 134 }, 135 "proxy_outcome_distinction": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper measures benchmark accuracy and token/dollar cost — claims match the granularity of measurements. It does not overclaim that benchmark accuracy represents broader 'agent capability' or 'intelligence.'" 139 } 140 }, 141 "setup_transparency": { 142 "model_versions_specified": { 143 "applies": true, 144 "answer": true, 145 "justification": "Main evaluation models specified as 'Qwen 2.5 7B Instruct' (Yang et al., 2024a) and 'Gemini 2.5 Flash Lite' (Comanici et al., 2025), with paper references. These identify specific model releases (family, size, variant). Scaling experiments use Qwen 2.5 32B and 72B." 146 }, 147 "prompts_provided": { 148 "applies": true, 149 "answer": false, 150 "justification": "Prompts are described as 'semantic instruction fragments' like 'Decompose the problem' and 'Verify intermediate steps' (Section 3.1.2), but the actual prompt text used in experiments is not provided. The prompt library was generated by GPT-5.2 via meta-prompting but the full prompts are not in the paper or appendix." 151 }, 152 "hyperparameters_reported": { 153 "applies": true, 154 "answer": true, 155 "justification": "Extensive hyperparameters in Appendix F: learning rates (3e-4 structure, 5e-5 prompt), batch size (32), PPO clip epsilon (0.2), discount factor (0.95), entropy coefficient (0.05), all reward shaping coefficients (α=5.0, βs=0.02, βt=0.03, δ1=0.1, δ2=0.2, δ3=0.3), SFT parameters, and GRPO/DPO hyperparameters." 
156 }, 157 "scaffolding_described": { 158 "applies": true, 159 "answer": true, 160 "justification": "All 9 workflow patterns are described in detail in Appendix A with diagrams (Figure 7), including LLM call counts, agent roles, and tool configurations per workflow. The hierarchical policy architecture (Section 3.1), action masking (Section 3.2), and tool allocation are thoroughly described." 161 }, 162 "data_preprocessing_documented": { 163 "applies": true, 164 "answer": true, 165 "justification": "State representation construction is documented: MetaCLIP-H/14 embeddings concatenated with hand-crafted features (query length, word count, numerical density, binary indicators). Standard benchmark splits used, with GAIA partitioning explicitly described (first 65 for training, rest for evaluation)." 166 } 167 }, 168 "limitations_and_scope": { 169 "limitations_section_present": { 170 "applies": true, 171 "answer": false, 172 "justification": "No dedicated limitations section. Section 6 'Impact Statement' is brief and only addresses societal harm. Limitations are scattered throughout results discussion but not consolidated." 173 }, 174 "threats_to_validity_specific": { 175 "applies": true, 176 "answer": false, 177 "justification": "No threats-to-validity discussion. The paper discusses MedQA underperformance and transfer limitations as results observations, not as systematic threats to the study's validity." 178 }, 179 "scope_boundaries_stated": { 180 "applies": true, 181 "answer": false, 182 "justification": "No explicit statements about what the results do NOT show. The paper does not bound its claims to the tested benchmarks, models, or task types. No mention of settings where ARC would not apply." 183 } 184 }, 185 "data_integrity": { 186 "raw_data_available": { 187 "applies": true, 188 "answer": false, 189 "justification": "No raw experimental data (per-episode rewards, individual predictions, RL training logs) is released. 
Only aggregated results in tables and figures." 190 }, 191 "data_collection_described": { 192 "applies": true, 193 "answer": true, 194 "justification": "The RL training data collection is described: episodes are generated by executing configured agent systems on benchmark queries, producing correctness signals and cost statistics. Benchmarks are standard public datasets with sources cited." 195 }, 196 "recruitment_methods_described": { 197 "applies": false, 198 "answer": false, 199 "justification": "No human participants. Data sources are standard public benchmarks (GSM8k, DROP, MedQA, HotpotQA, GAIA)." 200 }, 201 "data_pipeline_documented": { 202 "applies": true, 203 "answer": true, 204 "justification": "Algorithm 1 documents the full pipeline: query encoding → structure policy sampling → action masking → prompt composition → workflow execution → reward computation → buffer storage → PPO update → elite filtering → SFT refinement." 205 } 206 }, 207 "conflicts_of_interest": { 208 "funding_disclosed": { 209 "applies": true, 210 "answer": false, 211 "justification": "No funding sources are disclosed anywhere in the paper. No acknowledgments section listing grants or sponsors." 212 }, 213 "affiliations_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Author affiliations clearly stated: all three authors are from School of Computing and Augmented Intelligence, Arizona State University. No industry affiliations to disclose." 217 }, 218 "funder_independent_of_outcome": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is itself a reporting gap." 222 }, 223 "financial_interests_declared": { 224 "applies": true, 225 "answer": false, 226 "justification": "No competing interests statement or financial disclosures are present in the paper." 
227 } 228 }, 229 "contamination": { 230 "training_cutoff_stated": { 231 "applies": true, 232 "answer": false, 233 "justification": "No mention of training data cutoff dates for Qwen 2.5 7B Instruct or Gemini 2.5 Flash Lite, despite evaluating these models on public benchmarks." 234 }, 235 "train_test_overlap_discussed": { 236 "applies": true, 237 "answer": false, 238 "justification": "No discussion of whether GSM8k (2021), DROP (2019), MedQA (2021), or HotpotQA (2018) questions appeared in the training data of Qwen 2.5 or Gemini 2.5." 239 }, 240 "benchmark_contamination_addressed": { 241 "applies": true, 242 "answer": false, 243 "justification": "All five benchmarks were published years before the models' likely training cutoffs. GSM8k, DROP, MedQA, and HotpotQA are widely known and likely in training data. No contamination analysis is provided." 244 } 245 }, 246 "human_studies": { 247 "pre_registered": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study. All evaluation is automated benchmark testing." 251 }, 252 "irb_or_ethics_approval": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "demographics_reported": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "inclusion_exclusion_criteria": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "randomization_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "blinding_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "attrition_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 
281 } 282 }, 283 "cost_and_practicality": { 284 "inference_cost_reported": { 285 "applies": true, 286 "answer": true, 287 "justification": "Figure 4 shows accuracy vs cost trade-off with 'token-based API cost per episode using OpenRouter rates for Qwen2.5 7B Instruct.' Cost efficiency is a central contribution of the paper." 288 }, 289 "compute_budget_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "Appendix B describes hardware requirements and Appendix F lists '4,000 per dataset minimum' training episodes, but total GPU hours, training wall-clock time, or total API spend are not reported." 293 } 294 }, 295 "experimental_rigor": { 296 "seed_sensitivity_reported": { 297 "applies": true, 298 "answer": false, 299 "justification": "Main results in Table 1 are point estimates with no indication of multiple random seed evaluation. No seed sensitivity analysis is reported." 300 }, 301 "number_of_runs_stated": { 302 "applies": true, 303 "answer": false, 304 "justification": "The number of evaluation runs producing the Table 1 results is not stated. It is unclear whether results are from a single run or averaged." 305 }, 306 "hyperparameter_search_budget": { 307 "applies": true, 308 "answer": false, 309 "justification": "Appendix F lists final hyperparameter values but does not report how they were selected or how many configurations were tried. No search method or budget is described." 310 }, 311 "best_config_selection_justified": { 312 "applies": true, 313 "answer": false, 314 "justification": "No explanation of how final hyperparameters were selected. No mention of validation-based selection or systematic search procedure." 315 }, 316 "multiple_comparison_correction": { 317 "applies": true, 318 "answer": false, 319 "justification": "Multiple comparisons across 5 benchmarks and 10+ baselines with no statistical tests at all, let alone correction for multiple comparisons." 
320 }, 321 "self_comparison_bias_addressed": { 322 "applies": true, 323 "answer": false, 324 "justification": "The authors implement RL Bandits, RL Episodes, and LAP baselines. No acknowledgment that author-implemented baselines may systematically underperform vs the authors' own method." 325 }, 326 "compute_budget_vs_performance": { 327 "applies": true, 328 "answer": true, 329 "justification": "Figure 4 presents a Pareto frontier of accuracy vs cost ($ per episode) on GSM8K, showing ARC achieves better accuracy-cost trade-offs. This explicitly plots performance as a function of compute cost." 330 }, 331 "benchmark_construct_validity": { 332 "applies": true, 333 "answer": false, 334 "justification": "No discussion of whether GSM8k accuracy, DROP accuracy, etc. actually measure the 'agentic configuration' capability the paper claims. The paper uses these benchmarks without questioning their validity for evaluating dynamic agent configuration." 335 }, 336 "scaffold_confound_addressed": { 337 "applies": true, 338 "answer": true, 339 "justification": "Scaffold differences are the explicit research variable. The paper compares different scaffold configurations (9 workflows) and demonstrates that scaffold choice significantly affects results (Figure 12). The comparison against baselines with different scaffolds is acknowledged as comparing configuration strategies, not attributing differences to models alone." 340 } 341 }, 342 "data_leakage": { 343 "temporal_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "Not addressed. GSM8k (2021), DROP (2019), MedQA (2021), and HotpotQA (2018) were all published years before Qwen 2.5 and Gemini 2.5 training cutoffs. Solutions may be in training data." 347 }, 348 "feature_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of whether the evaluation setup leaks answer information through context or other channels." 
352 }, 353 "non_independence_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether training and test examples share structural similarities or come from overlapping distributions." 357 }, 358 "leakage_detection_method": { 359 "applies": true, 360 "answer": false, 361 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference tests, decontamination pipelines, or temporal splits." 362 } 363 } 364 }, 365 "claims": [ 366 { 367 "claim": "ARC achieves up to 25% higher task accuracy over strong baselines while reducing token and runtime costs.", 368 "evidence": "Table 1 shows ARC (88.6% GSM8k, 63.9% DROP, 64.6% MedQA) vs various baselines. Largest improvement over a strong baseline: ARC (63.9%) vs GEPA (39.3%) on DROP = 24.6pp. Figure 4 shows Pareto-optimal cost-accuracy trade-off.", 369 "supported": "moderate" 370 }, 371 { 372 "claim": "Hierarchical RL (ARC) consistently outperforms flat RL baselines (RL Bandits, RL Episodes) across all benchmarks.", 373 "evidence": "Table 1: ARC vs RL Episodes on GSM8k (88.6% vs 85.2%), DROP (63.9% vs 54.3%), MedQA (64.6% vs 57.3%), HotpotQA (34.1% vs 27.8%), GAIA (6.0% vs 2.0%).", 374 "supported": "strong" 375 }, 376 { 377 "claim": "SFT refinement consistently improves average episode reward by ≈5–35% across all datasets and models.", 378 "evidence": "Table 1 shows ARC vs ARC w/o SFT: GSM8k 88.6% vs 87.6% (+1.0pp), DROP 63.9% vs 62.3% (+1.6pp), MedQA 64.6% vs 58.4% (+6.2pp), HotpotQA 34.1% vs 33.7% (+0.4pp), GAIA 6.0% vs 5.0% (+1.0pp). Accuracy gains are 0.4-6.2pp; the 5-35% figure refers to episode reward, not accuracy.", 379 "supported": "weak" 380 }, 381 { 382 "claim": "Policies trained on 7B models transfer zero-shot to 32B and 72B model variants with monotonic performance improvement.", 383 "evidence": "Figure 5 shows performance improving with scale across 4 benchmarks. 'Avg.
Kendall's τ = 1.0 and Pearson's r = 0.94' (Section 4.4).", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Policy configuration errors remain below 10% across all benchmarks.", 388 "evidence": "Figure 6 shows policy configuration errors across all benchmarks, at 8% (GAIA) and 9% (HotpotQA, MedQA). Error categorization methodology in Appendix I.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "ARC is model-agnostic, showing consistent gains across both open-weight (Qwen 2.5) and proprietary (Gemini 2.5) models.", 393 "evidence": "Table 1 shows ARC improvements on both Qwen 2.5 7B (+31.3 avg reasoning, +13.95 avg tool-use) and Gemini 2.5 FL (+33.5 avg reasoning, +8.95 avg tool-use).", 394 "supported": "moderate" 395 } 396 ], 397 "red_flags": [ 398 { 399 "flag": "Abstract-conclusion inconsistency on key claim", 400 "detail": "The abstract claims 'up to 25% higher task accuracy' while the conclusion states 'up to 16% higher accuracy.' This inconsistency on the paper's headline number suggests imprecise or cherry-picked reporting." 401 }, 402 { 403 "flag": "No error bars or statistical tests on main results", 404 "detail": "Table 1 reports only point-estimate accuracy numbers for all methods across all benchmarks, with no stated number of runs and no variance, confidence intervals, or significance tests. Given that RL training is stochastic, the absence of multi-seed evaluation is a significant omission." 405 }, 406 { 407 "flag": "No contamination analysis despite using old benchmarks with new models", 408 "detail": "GSM8k (2021), DROP (2019), MedQA (2021), and HotpotQA (2018) are all old enough to be in the training data of Qwen 2.5 and Gemini 2.5. Base model accuracy improvements from configuration could partially reflect better retrieval of memorized solutions rather than genuine reasoning improvement."
409 }, 410 { 411 "flag": "SFT improvement claim overstated", 412 "detail": "The paper claims SFT 'improves average episode reward by ≈5–35% across all datasets' but Table 1 shows accuracy improvements of only 0.4-6.2pp. The 5-35% figure refers to reward (which includes cost penalties) not accuracy, but this distinction is not made clear." 413 }, 414 { 415 "flag": "Very low GAIA scores make improvements hard to interpret", 416 "detail": "GAIA scores range from 0-6% across all methods. ARC's 6.0% vs 2.0% base model (4pp improvement) is reported as meaningful, but with such low absolute scores on what appears to be a small evaluation set, these differences may not be statistically significant." 417 }, 418 { 419 "flag": "No limitations section", 420 "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries. The Impact Statement (Section 6) addresses only societal harm ('We do not foresee any direct societal harm') without methodological self-critique." 421 } 422 ], 423 "cited_papers": [ 424 { 425 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 426 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R. Narasimhan", "Yuan Cao"], 427 "year": 2022, 428 "arxiv_id": "2210.03629", 429 "relevance": "Foundational work on LLM agents combining reasoning and tool use, directly relevant to the agentic workflow patterns ARC configures." 430 }, 431 { 432 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 433 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"], 434 "year": 2023, 435 "relevance": "Key work on LLM tool use that ARC builds upon, demonstrating autonomous tool selection by language models." 
436 }, 437 { 438 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations", 439 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 440 "year": 2024, 441 "relevance": "Multi-agent framework used as a baseline; represents fixed workflow/prompt optimization approaches ARC claims to improve upon." 442 }, 443 { 444 "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines", 445 "authors": ["Omar Khattab", "Arnav Singhvi", "Paridhi Maheshwari"], 446 "year": 2023, 447 "arxiv_id": "2310.03714", 448 "relevance": "Prompt optimization framework used as a baseline, representing the prompt-as-optimization-variable approach to LLM configuration." 449 }, 450 { 451 "title": "GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning", 452 "authors": ["Lakshya A. Agrawal", "Sherry Tan", "Deniz Soylu"], 453 "year": 2025, 454 "arxiv_id": "2507.19457", 455 "relevance": "Prompt evolution framework that outperforms ARC on MedQA (87.1% vs 64.6%), demonstrating domain-specific prompt content can dominate structural optimization." 456 }, 457 { 458 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 459 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"], 460 "year": 2024, 461 "relevance": "Agentic software engineering system representing the class of agent architectures ARC aims to dynamically configure." 462 }, 463 { 464 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 465 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 466 "year": 2023, 467 "arxiv_id": "2305.05176", 468 "relevance": "Cost-aware LLM routing framework addressing the efficiency dimension ARC optimizes, evolving from model selection to full configuration." 
469 }, 470 { 471 "title": "GAIA: A Benchmark for General AI Assistants", 472 "authors": ["Grégoire Mialon", "Clémentine Fourrier", "Thomas Wolf", "Yann LeCun", "Thomas Scialom"], 473 "year": 2023, 474 "relevance": "Multi-modal tool-use benchmark used for evaluation; challenging benchmark where no method scores above 6%." 475 }, 476 { 477 "title": "Lost in the Middle: How Language Models Use Long Contexts", 478 "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt"], 479 "year": 2024, 480 "relevance": "Demonstrates the lost-in-the-middle phenomenon that motivates ARC's context budget optimization." 481 }, 482 { 483 "title": "AgentBench: Evaluating LLMs as Agents", 484 "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"], 485 "year": 2023, 486 "arxiv_id": "2308.03688", 487 "relevance": "Agent evaluation suite relevant to understanding how agentic systems perform across diverse tasks." 488 }, 489 { 490 "title": "Token-Budget-Aware LLM Reasoning", 491 "authors": ["Tingxu Han", "Zhenting Wang", "Chunrong Fang"], 492 "year": 2025, 493 "relevance": "Token budget management for LLM reasoning, addressing the same compute-efficiency dimension ARC optimizes with learned policies." 494 }, 495 { 496 "title": "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-World APIs", 497 "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"], 498 "year": 2023, 499 "arxiv_id": "2307.16789", 500 "relevance": "Large-scale tool-use framework expanding the action space available to LLM agents." 501 } 502 ] 503 }