scan-v5.json (25754B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Learning to Configure Agentic AI Systems", 6 "authors": [ 7 "Aditya Taparia", 8 "Som Sagar", 9 "Ransalu Senanayake" 10 ], 11 "year": 2026, 12 "venue": "arXiv", 13 "arxiv_id": "2602.11574", 14 "doi": null 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": false, 21 "justification": "The abstract claims 'up to 25% higher task accuracy' but Table 1 shows improvements of up to 31.3 percentage points over the base model on reasoning tasks; the '25%' figure appears to understate or misrepresent the actual reported numbers without a clear mapping to a specific comparison.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Ablation studies compare ARC with and without SFT, against flat RL (RL Bandits, RL Episodes), and alternative training objectives (GRPO, DPO), providing reasonable evidence for causal claims about the hierarchical structure's benefit.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "The conclusion claims ARC is 'a powerful alternative to one-size-fits-all designs' broadly, but results are from 5 benchmarks on 2 models; GAIA performance (6%) is very low and tool-use transfer is explicitly described as 'more weaker,' undermining broad generalization claims.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper notes GEPA outperforms on MedQA due to domain-specific prompts, but does not systematically consider alternative explanations for ARC's gains (e.g., whether benchmark contamination in backbone LLMs drives results).", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "Task accuracy on established benchmarks (GSM8k, DROP, MedQA, HotpotQA, GAIA) is used as the metric, and claims are framed in terms of benchmark accuracy and token cost — measurements and claims are aligned at the same granularity.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper has only a brief 'Impact Statement' stating 'We do not foresee any direct societal harm'; there is no dedicated limitations or threats-to-validity section.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": false, 59 "justification": "No specific threats to validity are discussed — small GAIA test set (unknown N after 65 training samples), potential benchmark contamination in backbone LLMs, and single-run results without variance reporting are not addressed.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper does not explicitly state what the results do NOT show; the conclusion broadly promotes ARC as 'a practical and effective direction for scaling LLM-based systems' without bounding the claim to the tested benchmarks and model families.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding acknowledgment or grant support is mentioned anywhere in the paper.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All three authors are affiliated with the School of Computing and Augmented Intelligence, Arizona State University, which is clearly disclosed.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funding is disclosed, so independence cannot be assessed.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests or financial interests statement is included in the paper.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Key terms are operationally defined: 'workflow' maps to 9 specific patterns (Appendix A), 'configuration' is formally defined as c=(ω,t,b,p) in Section 3, and 'token budget' is given as budget tiers with specific values.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper explicitly lists three contributions: the ARC hierarchical RL framework, a hybrid RL+SFT training pipeline, and empirical demonstrations across benchmarks.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 2 engages substantively with LLM agent frameworks, prompt/workflow optimization (OPRO, DSPy, GEPA, LLMLingua), and hierarchical RL (Options framework, Feudal RL), situating ARC's contributions relative to each thread.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": false, 122 "justification": "The abstract says 'Codebase: Github' but no URL is provided; without a verifiable link, this cannot be confirmed as actually released.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "All five benchmarks used (GSM8k, DROP, MedQA, HotpotQA, GAIA) are standard publicly available datasets used unmodified (aside from GAIA partitioning).", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "Appendix B mentions 'Python 3.10+' and GPU requirements (40GB VRAM) but provides no requirements.txt, Dockerfile, or exact package versions.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "Appendix B describes hardware requirements and Appendix F gives hyperparameters, but without accessible code and without step-by-step instructions, reproduction is not feasible.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "Table 1 (main results) reports only point estimates with no error bars or confidence intervals; Table 5 (embedding ablation) includes standard deviations but this is not the primary result table.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "No statistical significance tests are conducted for any comparative claims made in Table 1 or elsewhere.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Table 1 reports absolute percentage-point improvements (e.g., '+31.3' for reasoning avg) relative to a base model, providing context for the magnitude of effects.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "The number of training episodes (4,000 per dataset minimum) is stated but not justified; GAIA uses only 65 samples for training and an unspecified remainder for evaluation, with no power analysis.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "Main results in Table 1 are single-run point estimates with no variance or standard deviation; the paper does not report variance across training runs or seeds.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Table 1 includes diverse baselines: base model with tools, grid/greedy search, AutoGen, DSPy, GEPA, LAP, RL Bandits, and RL Episodes.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "Baselines include recent systems (AutoGen 2024, DSPy 2023, GEPA 2025) and RL variants; no obviously stale or weak baselines are used.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 4.5 and Table 4 ablate the SFT refinement stage, compare PPO vs. GRPO, and compare SFT vs. DPO post-training, isolating contributions of each component.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Task accuracy, token cost (Figure 4), workflow diversity entropy and Gini coefficient (Table 2), and reward distribution (Figure 13) are all reported.", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": false, 203 "answer": false, 204 "justification": "The paper evaluates automated benchmark accuracy; human evaluation is not applicable to this configuration-learning framework.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": true, 210 "justification": "Standard training/test splits are used for all benchmarks; for GAIA, the first 65 samples are used for training and the remainder for evaluation.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Results are reported per benchmark and organized by capability axis (reasoning vs. tool-use); Appendix G further breaks down accuracy by workflow type per dataset.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Section 4.6 and Appendix I provide detailed error categorization with four failure types and concrete examples for each category.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "The paper explicitly reports that GEPA outperforms ARC on MedQA (87.1% vs 64.6%), that GAIA performance is only 6%, and that tool-use cross-task transfer is 'more weaker.'", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": true, 236 "justification": "Models are named specifically: 'Qwen 2.5 7B Instruct' and 'Gemini 2.5 Flash Lite,' with citations to their technical reports.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": false, 242 "justification": "The paper mentions semantic instruction fragments (e.g., 'Decompose the problem', 'Verify intermediate steps') but the full prompt library is not provided; specific system prompts for agents are not shown.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": true, 248 "justification": "Appendix F provides a comprehensive list of hyperparameters including learning rates, batch size, PPO clip epsilon, discount factor, entropy coefficient, and reward shaping coefficients.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": true, 254 "justification": "All 9 agentic workflows are described and illustrated in Appendix A with figures showing LLM call graphs, tool configurations, and inter-agent routing.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "The query feature vector fq is explicitly defined (query length, word count, numerical density, binary indicators for multi-step reasoning and tool use) and the embedding approach using MetaCLIP-H/14 is described.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": true, 268 "justification": "All five benchmarks (GSM8k, DROP, MedQA, HotpotQA, GAIA) are publicly available standard datasets; RL episode buffer data is not released but the evaluation data is accessible.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Standard benchmarks are used with their established collection procedures; the GAIA partitioning (first 65 for training, rest for evaluation) is explicitly described.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants; all data is from existing public benchmarks.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "The data pipeline from query encoding (Section 3.1), through policy decision (Sections 3.1.1–3.1.2), to reward computation (Section 3.2) is fully described with formal notation.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "The paper does not state the training data cutoffs for Qwen 2.5 7B Instruct or Gemini 2.5 Flash Lite, which is relevant since benchmarks like GSM8k are widely known.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "No discussion of whether backbone LLM training data overlaps with benchmark test sets, which is particularly relevant for well-known benchmarks like GSM8k and DROP.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "Benchmark contamination is not addressed; GSM8k in particular is a high-risk benchmark for contamination in recent LLMs, and the paper's high accuracy (88.6%) on it is not contextualized relative to this risk.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human participants.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human participants.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human participants.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": true, 358 "justification": "Figure 4 shows cost per episode in USD using OpenRouter rates for Qwen 2.5 7B Instruct, and the Pareto frontier explicitly compares accuracy vs. cost across all methods.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": true, 364 "justification": "Appendix B specifies GPU requirements (A100/V100/4090 with 40GB VRAM, or 24GB with 4-bit quantization) and training episodes (4,000 minimum per dataset).", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "ARC achieves up to 25% higher task accuracy compared to fixed architectures while reducing token and runtime costs.", 373 "evidence": "Table 1 shows ARC at 72.4%/72.9% avg on reasoning vs. base model at 41.1%/39.4%; Figure 4 shows ARC on Pareto frontier for cost-accuracy tradeoff on GSM8k.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "Hierarchical RL (ARC) outperforms flat RL baselines (RL Bandits, RL Episodes).", 378 "evidence": "Table 1: ARC 72.4% reasoning avg vs. RL Episodes 65.6% and RL Bandits 63.7% for Qwen 2.5 7B.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "SFT post-training refinement improves average episode reward by 5–35% across datasets.", 383 "evidence": "Section 4.5 and Table 1 compare ARC vs. ARC w/o SFT; accuracy gains of 1–3% are shown, and the 5–35% reward claim is stated in text but not directly tabulated.", 384 "supported": "weak" 385 }, 386 { 387 "claim": "Policy configuration errors remain below 10% across all benchmarks, meaning most errors stem from inherent LLM limitations.", 388 "evidence": "Figure 6 shows policy configuration errors at 8–9% for tool-use tasks and 9% for reasoning tasks.", 389 "supported": "strong" 390 }, 391 { 392 "claim": "Cross-task policy transfer is governed by shared workflow/tool structure rather than semantic similarity.", 393 "evidence": "Table 3 shows HotpotQA→GAIA transfer drops to 2% despite high embedding similarity (Dsim=0.93), while HotpotQA→MedQA transfers well (57%) with lower similarity (0.76).", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "ARC policies trained on 7B models generalize zero-shot to 32B and 72B variants with monotonically improving performance.", 398 "evidence": "Figure 5 shows consistent accuracy gains across Qwen 2.5 7B→32B→72B with Kendall's τ=1.0 and Pearson's r=0.94.", 399 "supported": "moderate" 400 } 401 ], 402 "methodology_tags": [ 403 "benchmark-eval" 404 ], 405 "key_findings": "ARC, a hierarchical RL framework, learns per-query agent configurations (workflow, tools, token budget, prompts) and consistently outperforms fixed architectures, search-based methods, and flat RL baselines on five reasoning and tool-use benchmarks. The two-level policy decomposes the combinatorial configuration space (~62K valid options) into tractable structure and prompt decisions, with a post-RL SFT phase that concentrates the policy on high-reward trajectories. Tool-use benchmark performance remains low (GAIA: 6%), cross-domain transfer depends on structural rather than semantic similarity, and policy configuration errors account for fewer than 10% of failures — the dominant failure modes are LLM reasoning and knowledge gaps.", 406 "red_flags": [ 407 { 408 "flag": "No variance on main results", 409 "detail": "Table 1 reports single-run point estimates with no standard deviations, confidence intervals, or multiple seeds, making it impossible to assess result reliability." 410 }, 411 { 412 "flag": "No statistical significance tests", 413 "detail": "All comparative claims (ARC vs. baselines) are made without statistical tests; observed differences could be within noise given single-run reporting." 414 }, 415 { 416 "flag": "Code inaccessible", 417 "detail": "Abstract says 'Codebase: Github' but no URL is provided; reproducibility cannot be verified." 418 }, 419 { 420 "flag": "Very small GAIA evaluation set", 421 "detail": "GAIA uses 65 samples for training and an unspecified number for test; all GAIA results (e.g., 6.0%) are based on a handful of examples (≤36 given original GAIA test split size), making them statistically unreliable." 422 }, 423 { 424 "flag": "No limitations section", 425 "detail": "The paper lacks a dedicated limitations or threats-to-validity section; only a two-sentence impact statement is provided." 426 }, 427 { 428 "flag": "Benchmark contamination unaddressed", 429 "detail": "GSM8k and DROP are high-risk for contamination in recent LLMs; the paper's 88.6% on GSM8k is not contextualized against this risk, and backbone LLM training cutoffs are not stated." 430 }, 431 { 432 "flag": "Abstract accuracy claim inconsistent with results", 433 "detail": "Abstract claims 'up to 25% higher task accuracy' but Table 1 shows improvements up to 31.3 percentage points over the base model; no clear mapping between the stated '25%' and the reported numbers is provided." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations", 439 "relevance": "Key multi-agent framework baseline that ARC is compared against and builds upon." 440 }, 441 { 442 "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines", 443 "relevance": "Prompt/workflow optimization framework used as a baseline; represents the prior state of the art in structured LLM pipeline optimization." 444 }, 445 { 446 "title": "GAIA: A Benchmark for General AI Assistants", 447 "relevance": "Tool-use benchmark used for evaluation; tests complex multi-modal agentic capabilities." 448 }, 449 { 450 "title": "GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning", 451 "relevance": "Contemporary competing approach to prompt/workflow optimization; outperforms ARC on MedQA despite ARC's hierarchical structure." 452 }, 453 { 454 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 455 "relevance": "Foundational work on combining reasoning and tool use in LLM agents; directly motivates the agentic workflow design space ARC navigates." 456 }, 457 { 458 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 459 "relevance": "Early work on cost-aware LLM routing; ARC extends this to full agentic workflow configuration." 460 }, 461 { 462 "title": "AgentBench: Evaluating LLMs as Agents", 463 "relevance": "General agent evaluation suite; related to the benchmark evaluation methodology used in this paper." 464 }, 465 { 466 "title": "Training Verifiers to Solve Math Word Problems (GSM8k)", 467 "relevance": "Primary reasoning benchmark used for evaluation and training of the ARC policy." 468 }, 469 { 470 "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering", 471 "relevance": "Demonstrates agentic systems in software engineering, part of the broader agent landscape ARC is designed to configure." 472 }, 473 { 474 "title": "Large Language Models as Optimizers (OPRO)", 475 "relevance": "Prompt optimization baseline; represents the class of prompt-as-optimization-variable approaches that ARC unifies into a hierarchical policy." 476 } 477 ], 478 "engagement_factors": { 479 "practical_relevance": { 480 "score": 3, 481 "justification": "Directly addresses the real practitioner problem of choosing agent configurations from a large combinatorial space, with a working framework applicable to existing LLM backends." 482 }, 483 "surprise_contrarian": { 484 "score": 2, 485 "justification": "The finding that a lightweight RL policy trained without updating the backbone LLM can navigate a 62K-option configuration space and outperform hand-tuned templates is non-obvious." 486 }, 487 "fear_safety": { 488 "score": 0, 489 "justification": "No AI safety or risk concerns are raised; the paper focuses on efficiency and accuracy of agent configurations." 490 }, 491 "drama_conflict": { 492 "score": 0, 493 "justification": "No controversy or conflict angle; straightforward systems paper." 494 }, 495 "demo_ability": { 496 "score": 2, 497 "justification": "Framework is described as modular and the abstract references a GitHub codebase, suggesting it could be tried with Qwen 2.5 7B on public benchmarks, though no working URL is provided." 498 }, 499 "brand_recognition": { 500 "score": 0, 501 "justification": "Arizona State University, no famous lab or product affiliation." 502 } 503 }, 504 "hn_data": { 505 "threads": [ 506 { 507 "hn_id": "47023905", 508 "title": "Retrieval-Aware Distillation for Transformer-SSM Hybrids", 509 "points": 2, 510 "comments": 0, 511 "url": "https://news.ycombinator.com/item?id=47023905", 512 "created_at": "2026-02-15T14:24:02Z" 513 } 514 ], 515 "top_points": 2, 516 "total_points": 2, 517 "total_comments": 0 518 } 519 }