scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24280B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "SABER: Small Actions, Big Errors — Safeguarding Mutating Steps in LLM Agents",
      6     "authors": ["Alejandro Cuadron", "Pengfei Yu", "Yang Liu", "Arpit Gupta"],
      7     "year": 2025,
      8     "venue": "arXiv",
      9     "arxiv_id": "2512.07850",
     10     "doi": "10.48550/arXiv.2512.07850"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "An anonymous repository is provided: https://anonymous.4open.science/r/SABER-1E54/ (Appendix B, Reproducibility Statement)."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The τ-Bench Verified corrected datasets are included in the repository per Appendix B. They also use the public SWE-Bench Verified and τ-Bench benchmarks."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No mention of requirements.txt, Dockerfile, or detailed environment specifications in the paper."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Appendix B states the repository contains 'experiment drivers, prompts, and evaluation pipelines for all models' and 'scripts to regenerate figures and tables.'"
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Table 2 reports average scores over three runs but no confidence intervals or error bars are provided."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Table 1 reports p-values from logistic regression for mutating vs non-mutating distance effects (all p < 0.001 for mutating, varying for non-mutating)."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Table 1 reports odds ratios (OR) with coefficients, e.g., OR=0.04 for Claude Sonnet 4 on Airline mutating distance. Percentage point improvements are also reported throughout Section 6.1."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Sample sizes are reported (n=297, n=345, n=690 in Table 1) but no justification for why these sizes are adequate or any power analysis."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Table 2 states results are 'average score over three runs' to 'reduce variance' but no standard deviation or spread measure is reported."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Each model is compared with and without SABER (Table 2), providing clear baselines."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include GPT-5, Claude Sonnet 4, and Qwen3-Thinking-235B — all contemporary state-of-the-art models."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 3 provides ablation removing reflection, verification, and their combination on τ-Bench Verified."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "Only task success rate is reported as a metric across all benchmarks. No secondary metrics (e.g., efficiency, number of turns, latency)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "All evaluation uses automated metrics (task success on benchmarks with simulated users). No human evaluation of system outputs."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "τ-Bench and SWE-Bench Verified are established held-out test sets. The paper also creates τ-Bench Verified as a corrected version."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by benchmark domain (Airline vs Retail vs SWE-Bench) and by original vs Verified variants (Table 2)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 5 provides detailed examples of failure cases in τ-Bench (Figures 1 and 3). The mutating vs non-mutating analysis itself is a form of failure analysis."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Several negative findings: SABER slightly hurts Claude Sonnet 4 on τ-Bench-V Retail (82.3%→81.0%), and SWE-Bench gains are modest since only reflection can be applied. The Qwen3-Thinking self-pairing underperforms the Instruct auxiliary."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims of +28% relative on Airline, +11% on Retail, +7% on SWE-Bench for Qwen3-Thinking are supported by Table 2 data (49.3→63.3 is ~28% relative on Airline)."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims about mutating actions driving failure are supported by logistic regression with controlled variables (Table 1). Ablation study (Table 3) supports causal claims about SABER components through controlled single-variable manipulation."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title and abstract claim broadly about 'LLM Agents' but results are on only three benchmarks (τ-Bench Airline/Retail and SWE-Bench Verified). The paper argues for 'action-level analysis' and 'targeted safeguards' as general 'prerequisites for robust multi-turn agents' without bounding to tested domains."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "No substantive discussion of alternative explanations. For example, the mutating/non-mutating distinction might correlate with task difficulty or action frequency rather than being causal. The ceiling effect on Retail (Table 3) is noted but not explored."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper measures task success rate on benchmarks with simulated users and frames this as evidence about 'robust multi-turn agents' in 'real-world' settings without quantifying the gap between simulated and actual user interactions; the limitations section only notes the dependence on a user/user-simulator for verification."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Exact model versions specified: 'claude-sonnet-4-20250514', 'gpt-5-2025-08-07', 'Qwen3-235B-A22B-Thinking-2507', 'Qwen3-235B-A22B-Instruct-2507' (Section 6.1)."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper describes the reflection and verification prompts conceptually (Section 4) but does not provide actual prompt text. The repository may contain them, but the paper itself does not."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No temperature, top-p, or other sampling hyperparameters reported. Only the block cap N=16 for context cleaning is stated (Table 3 caption)."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "SABER's scaffolding is described in detail in Section 4: mutation-gated verification, targeted reflection injection, block-based context filtering with embedding similarity retrieval. Figure 2 provides a workflow diagram."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 5 and Appendices C-D document the τ-Bench Verified corrections in detail, with specific task-by-task fixes. The data pipeline from original τ-Bench to Verified is fully documented."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 8 'Limitations' provides substantive discussion of two key limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 8 discusses specific threats: SABER is external rather than internalized, and dependence on user/user-simulator for verification is a specific limitation of the approach."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The limitations section discusses design limitations but does not explicitly state what the results do NOT show (e.g., does not bound claims to the specific benchmark domains tested or acknowledge that results may not transfer to other agent architectures)."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Appendix B states the repository contains data and scripts to reproduce all results, including execution traces and corrected datasets."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Execution traces are collected from running models on τ-Bench and SWE-Bench Verified. The benchmarks, models, and evaluation protocols are described in Section 6.1."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants; data comes from benchmark evaluations with simulated users."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline from trajectory collection → mutating/non-mutating classification → logistic regression is described in Section 3. The τ-Bench corrections are documented task-by-task in Appendices C-D."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding statement. Authors are from Amazon AGI Foundations but no explicit funding disclosure."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All four authors are listed as affiliated with 'Amazon AGI Foundations' on the first page."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Amazon has commercial interest in agent reliability (Amazon Q Developer is mentioned in related work). The funder/employer has a stake in demonstrating agent improvements."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial disclosure statement in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates stated for any of the models used (GPT-5, Claude Sonnet 4, Qwen3)."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether τ-Bench or SWE-Bench tasks may have appeared in training data of the evaluated models."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "τ-Bench and SWE-Bench were both publicly available by 2024 — models trained after that date may have seen them. No contamination analysis is provided."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants; all evaluation uses simulated users and automated benchmarks."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference costs, API costs, or latency reported despite SABER adding an auxiliary model call on every turn for mutation classification, plus reflection and verification calls on mutating steps. The overhead is described qualitatively ('marginal') but not quantified."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total computational budget stated. Multiple runs across multiple models and benchmarks with no cost reporting."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Results are averaged over 3 runs (Table 2 caption) but no per-seed results or sensitivity analysis is provided."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Table 2 caption: 'we report the average score over three runs.'"
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No discussion of how the block cap N=16 or other SABER parameters were selected, or what search budget was used."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The N=16 block cap and auxiliary model pairing choices appear tuned but selection rationale is not provided."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Multiple comparisons across models, benchmarks, and configurations in Table 1 with no correction for family-wise error rate."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Authors evaluate their own SABER system against baselines without acknowledging self-evaluation bias."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "SABER adds an auxiliary model call for every turn (classification + potential reflection/verification). This compute overhead is not quantified or compared to baseline compute."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Section 5 extensively discusses construct validity issues in τ-Bench, identifying annotation errors and underspecification that cap performance. This is a central contribution of the paper."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "The paper explicitly evaluates the scaffold (SABER) as the intervention being tested, comparing same models with and without SABER. The auxiliary model pairing is also varied (Table 3, Section 6.1)."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether models were trained on τ-Bench or SWE-Bench data."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information not available in real deployment."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of potential overlap between training data and benchmark problems."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No leakage detection or prevention method applied."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Deviations in mutating actions reduce odds of success by 55-96% per deviation, while non-mutating deviations have little effect (<10% per deviation).",
    363       "evidence": "Table 1: logistic regression across 3 models on τ-Bench Airline/Retail with p-values. OR ranges from 0.04-0.45 for mutating, 0.81-0.99 for non-mutating.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "SABER improves Qwen3-Thinking by +19.7pp on τ-Bench-V Airline (58.5%→78.2%) and +10.8pp on Retail (66.9%→77.7%).",
    368       "evidence": "Table 2, averaged over 3 runs.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "SABER improves Qwen3-Thinking by +2.5pp on SWE-Bench Verified (42.6%→45.1%).",
    373       "evidence": "Table 2, SWE-Bench Verified row. Only reflection can be applied on SWE-Bench.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "τ-Bench contains annotation errors and underspecified instructions that cap model performance (Airline ~70%, Retail ~92%).",
    378       "evidence": "Section 5 with detailed corrections in Appendices C-D documenting 24 Airline and 18 Retail task corrections.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Mutating actions account for only 14-18% of total steps but dominate failure risk.",
    383       "evidence": "Section 6.1 reports percentages (e.g., Qwen3-Airline: 15.5%, Claude-4-Retail: 18.1%).",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Reflection and verification are synergistic: each adds ~10pp on Airline, together yield 78.7%.",
    388       "evidence": "Table 3 ablation study on Qwen3-Thinking for τ-Bench Verified.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "Mutating (state-changing) actions in LLM agent trajectories account for only 14-18% of steps but are the dominant driver of task failure, with each mutating deviation reducing success odds by 55-96%. SABER, a model-agnostic safeguard combining mutation-gated user verification, targeted reflection, and context cleaning, delivers consistent improvements across models and benchmarks (up to +19.7pp). The paper also identifies systematic annotation errors in the widely-used τ-Bench benchmark and releases a corrected τ-Bench Verified version.",
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating agent reliability improvements",
    397       "detail": "All authors are from Amazon AGI Foundations. Amazon sells agent-based developer tools (Amazon Q Developer, referenced in related work). The paper's findings about improving agent reliability serve Amazon's commercial interests, yet no conflict of interest is disclosed."
    398     },
    399     {
    400       "flag": "No variance reporting despite multi-run averaging",
    401       "detail": "Table 2 reports averages over 3 runs but provides no standard deviation, confidence intervals, or range. With only 3 runs, some observed improvements (e.g., Claude Sonnet 4 on Retail: 73.3%→78.3%) may not be statistically significant."
    402     },
    403     {
    404       "flag": "No cost analysis for added auxiliary model",
    405       "detail": "SABER requires an auxiliary model call on every turn for mutation classification, plus reflection and verification for mutating steps. This doubles or triples inference cost but no cost analysis is provided, making practical applicability unclear."
    406     },
    407     {
    408       "flag": "Simulated users as proxy for real human verification",
    409       "detail": "SABER's mutation-gated verification relies on user confirmation, but evaluation uses Claude Sonnet 4 as simulated user. The gap between simulated and real user behavior is acknowledged in limitations but not quantified."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "τ-bench: A benchmark for tool-agent-user interaction in real-world domains",
    415       "authors": ["Shunyu Yao", "Noah Shinn", "Pedram Razavi", "Karthik Narasimhan"],
    416       "year": 2024,
    417       "arxiv_id": "2406.12045",
    418       "relevance": "Primary benchmark used for evaluation; paper identifies and corrects annotation errors in it."
    419     },
    420     {
    421       "title": "SWE-bench: Can language models resolve real-world github issues?",
    422       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    423       "year": 2024,
    424       "relevance": "Key benchmark for evaluating coding agent capabilities on real-world software engineering tasks."
    425     },
    426     {
    427       "title": "AI agents that matter",
    428       "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S. Siegel"],
    429       "year": 2024,
    430       "arxiv_id": "2407.01502",
    431       "relevance": "Discusses evaluation pitfalls for AI agents including construct validity and meaningful measurement."
    432     },
    433     {
    434       "title": "Why do multi-agent LLM systems fail?",
    435       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang"],
    436       "year": 2025,
    437       "arxiv_id": "2503.13657",
    438       "relevance": "Analyzes failure modes in multi-agent LLM systems, complementary to SABER's single-agent failure analysis."
    439     },
    440     {
    441       "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems",
    442       "authors": ["Shaokun Zhang", "Ming Yin", "Jieyu Zhang"],
    443       "year": 2025,
    444       "arxiv_id": "2505.00212",
    445       "relevance": "Failure attribution in multi-agent systems — directly related to understanding where agent errors occur."
    446     },
    447     {
    448       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    449       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    450       "year": 2025,
    451       "arxiv_id": "2407.16741",
    452       "relevance": "Agent framework used for SWE-Bench evaluation in this paper."
    453     },
    454     {
    455       "title": "Lost in the middle: How language models use long contexts",
    456       "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt"],
    457       "year": 2023,
    458       "arxiv_id": "2307.03172",
    459       "relevance": "Key finding on context utilization that motivates SABER's targeted reflection and context cleaning."
    460     },
    461     {
    462       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    463       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"],
    464       "year": 2024,
    465       "arxiv_id": "2405.15793",
    466       "relevance": "Influential agent framework for software engineering tasks."
    467     },
    468     {
    469       "title": "ReAct: Synergizing reasoning and acting in language models",
    470       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    471       "year": 2023,
    472       "arxiv_id": "2210.03629",
    473       "relevance": "Foundational reasoning+acting framework that SABER builds upon for reflection injection."
    474     },
    475     {
    476       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    477       "authors": ["Sirui Hong", "Mingchen Zhuge", "Jiaqi Chen"],
    478       "year": 2024,
    479       "arxiv_id": "2308.00352",
    480       "relevance": "Multi-agent framework; provides context for SABER's single-agent approach comparison."
    481     },
    482     {
    483       "title": "Qwen3 technical report",
    484       "authors": ["An Yang", "Anfeng Li", "Baosong Yang"],
    485       "year": 2025,
    486       "arxiv_id": "2505.09388",
    487       "relevance": "Technical report for one of the primary models evaluated in the paper."
    488     }
    489   ]
    490 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs