scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26917B)
      1 {
      2   "paper": {
      3     "title": "SAGE: Steerable Agentic Data Generation for Deep Search with Execution Feedback",
      4     "authors": ["Fangyuan Xu", "Rujun Han", "Yanfei Chen", "Zifeng Wang", "I-Hung Hsu", "Jun Yan", "Vishy Tirumalashetty", "Eunsol Choi", "Tomas Pfister", "Chen-Yu Lee"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.18202",
      8     "doi": "10.48550/arXiv.2601.18202"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "SAGE proposes a dual-agent pipeline for generating difficulty-controlled deep search QA pairs, using a data generator and a search agent with iterative execution feedback. The framework achieves 87% correctness and 50% pass rate (correct + target difficulty) after 3 feedback rounds, significantly outperforming resampling baselines. Training Qwen-7B on SAGE-generated data yields up to 29% relative improvement on in-domain evaluation and 23% on out-of-domain FRAMES, and enables transfer from Wikipedia-based retrieval to Google Search at inference time.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper states 'The code and data will be released at https://github.com/carriex/sage' — this is a promise of future release, not an actual release."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper only promises a future release ('The code and data will be released at https://github.com/carriex/sage'). While evaluation uses public benchmarks (Musique, FRAMES, NQ, HotpotQA), the SAGE-generated training data (20K examples) is not yet released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section A.3 mentions '8 H100 GPUs' and some training parameters, but no requirements.txt, Dockerfile, or library version specifications are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions. The paper provides algorithmic descriptions (Algorithms 1-2) and prompts (Figures 4-8), but no runnable scripts or reproduction guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 2-5 are point estimates with no confidence intervals, error bars, or ± notation."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes numerous comparative claims (e.g., '27% relative improvement') based solely on comparing numbers without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Relative improvements are reported with baseline context: '27% relative improvement' (28.5% vs 22.4%), '23% relative improvement' on FRAMES (32.3% vs 26.2%). Tables show absolute values enabling magnitude assessment."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Training set size of 20K is matched to baselines for fair comparison, but no justification for why 300 test samples per split is sufficient. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures reported across runs. All results appear to be single-run numbers."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple baselines: NQ+HotpotQA (Search-R1 checkpoints), Musique, prompting-based Gemini agents, and data generator without difficulty control. Both intrinsic and extrinsic evaluations include baselines."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Search-R1 (Jin et al., 2025) is the most recent RL-based search agent framework. Gemini-2.5-flash is contemporary. Baselines are current and competitive."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 2 ablates resampling vs. feedback across rounds. Table 5 ablates number of feedback rounds (0-3) for downstream performance. Figure 2 breaks down by target step."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Intrinsic evaluation uses four metrics: % correct, % pass, Avg@4, and # search steps (Table 2). Downstream evaluation uses accuracy via LLM-as-judge on multiple benchmarks."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All evaluation is automated using LLM-as-judge (gemini-2.0-flash). No human evaluation of generated data quality or agent outputs is conducted."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "In-domain test data (300 per step) is separately generated from training data. Out-of-domain evaluation uses separate benchmarks (Musique test split, FRAMES, GAIA, Browsecomp, HLE-Search)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 3 provides per-hop breakdowns (3-hop through 7-hop). Figure 2 breaks down by target step. Table 4 reports per-benchmark. Figure 3 analyzes reasoning type distribution."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.3 and Table 6 provide detailed error categorization: superficial complexity (13%), multi-query collapse (21%), overly specific questions (31%), information co-location (35%), and four incorrect-data categories."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 5 shows that 3 rounds of feedback does not improve over 2 rounds despite generating harder data, leading to the observation that 'increasing data difficulty alone is insufficient.' QWEN-3B achieves 1.0% on Browsecomp across all training data."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims 'up to 23% relative performance gain' on out-of-domain benchmarks — Table 3 shows QWEN-7B FRAMES: 32.3% vs 26.2% (23.3% relative). '27% relative improvements on in-domain' — 28.5% vs 22.4% for 3B. Claims match reported numbers."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims (e.g., 'execution feedback improves data quality') are supported by controlled ablations in Table 2 comparing resampling vs. feedback holding other variables constant, and Table 5 varying only the number of feedback rounds."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Limitations section explicitly bounds claims: 'only experiment with generating data from a single general domain corpus, Wikipedia,' 'do not explore alternative reinforcement learning algorithms such as GRPO, nor model scales beyond 7B parameters.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for why SAGE-trained agents outperform baselines. Could the improvement be due to data diversity rather than difficulty? Or the filtering criteria? These confounds are not addressed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's intrinsic metrics (correctness, search steps) directly measure what they claim. For downstream evaluation, they measure accuracy on QA benchmarks and frame results as QA performance, matching the granularity of measurement."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Uses 'gemini-2.5-flash' and 'gemini-2.0-flash' without snapshot dates or API versions. 'Qwen-2.5-3B-Instruct' and 'Qwen-2.5-7B-Instruct' are specific model names but Gemini versions are marketing names only."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text is provided in Figures 4, 5, 6, 7, 8, and 10 in the appendix, covering the data generator, search agent, LLM judge, and analysis prompts."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section A.2 reports temperature=1, thinking disabled for Gemini. Section A.3 reports learning rate 1e-6/1e-5, 500 steps, batch size 512, mini-batch 256, micro-batch 64, GAE λ=1, γ=1, max sequence 8192 tokens."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The ReACT-based search agent framework is described (Section 2.1). The dual-agent pipeline is detailed with Algorithms 1-2 showing data generator and search agent interaction, feedback loops, and stopping criteria."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data filtering is documented: 'filter out questions that require less than two search steps,' pass@K=0 questions are removed (Section 3.3). Training data size is 20K. E5 retriever returns top 3 passages per query."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Dedicated 'Limitations' section with two subsections: 'Proposed method' and 'Experimental setting,' providing substantive discussion of the framework's constraints."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats identified: fixed search agent may limit data quality, pass@K=1 may admit incorrect content, single-corpus limitation, no exploration of GRPO, and model scale limited to 7B."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Explicit scope boundaries: 'only focuses on generating high-quality (q,a) pairs for RL training,' 'only experiment with generating data from a single general domain corpus, Wikipedia,' 'do not explore... nor model scales beyond 7B.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Generated data is not yet released. Only aggregated results are reported. No way to independently verify the generated QA pairs."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data generation procedure is described in detail: Algorithm 1 outlines the full pipeline from document sampling to QA pair generation and verification. Corpus is the 2018 Wikipedia dump with E5 retriever."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data is synthetically generated from a standard public corpus (Wikipedia). Standard benchmarks used for evaluation."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Full pipeline documented: random document sampling → initial QA generation with difficulty prompt → search agent verification (K=4 traces) → execution feedback → filtering (pass@K, min steps). Algorithms 1-2 formalize this."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No explicit funding acknowledgment. The paper notes 'Work done at Google Cloud AI Research' and 'This work has no implications of any Google products' but does not disclose funding sources."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly stated: Google Cloud AI Research (8 authors) and New York University (2 authors, one shared). First author's internship at Google is noted."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Most authors are Google employees. The pipeline uses Google's Gemini models as the data generator and search agent. Google has a commercial interest in demonstrating the effectiveness of Gemini for data generation."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates stated for Qwen-2.5 or Gemini models. The 2018 Wikipedia dump is specified for retrieval but pre-training cutoffs are not discussed."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether Qwen or Gemini pre-training data overlaps with Musique, FRAMES, GAIA, or other evaluation benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "HotpotQA (2018), Musique (2022), and other benchmarks were published well before likely Qwen/Gemini training cutoffs. No contamination analysis performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs, latency, or cost-per-example reported for the data generation pipeline or inference. The pipeline makes extensive API calls to Gemini (generator + K search traces × R rounds per example) without quantifying cost."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Training hardware mentioned ('single node with 8 H100 GPUs') but total GPU hours, wall-clock time, and API spend for data generation are not reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or seed sensitivity for either data generation or model training."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "K=4 traces for search agent verification and K=8 for Table 1 difficulty estimation are stated. However, the number of training runs for downstream evaluation is not stated — results appear to be single-run."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget reported. Training uses specific hyperparameters (Section A.3) without stating how they were selected."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Table 5 reports downstream performance for 0-3 feedback rounds and selects 2 rounds based on best downstream performance. All configurations are shown transparently."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Many comparisons across models, datasets, and settings but no statistical tests are performed, let alone multiple comparison corrections."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Authors evaluate their own data generation pipeline against baselines without acknowledging evaluation bias. Search-R1 baselines use public checkpoints, but Musique-trained models are their own reimplementation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "SAGE data generation requires multiple Gemini API calls per example (generator + K×R search traces), making it far more expensive than static datasets. This compute cost difference is not discussed relative to performance gains."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the benchmarks (Musique, FRAMES, GAIA, Browsecomp, HLE-Search) actually measure 'deep search' capability or what construct validity gaps might exist."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "All model comparisons use the same training framework (Search-R1/PPO) and same retrieval setup (E5 + Wikipedia 2018). The scaffold is held constant, isolating the training data as the variable."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. Qwen models may have been trained on data including Musique/HotpotQA solutions. The 2018 Wikipedia dump predates the models but benchmark answers could be in pre-training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information. LLM-as-judge uses gemini-2.0-flash which may have internalized benchmark answers."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No analysis of whether SAGE-generated training data and in-domain test data are sufficiently independent, despite both being generated from the same corpus with the same pipeline."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention methods applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "SAGE generates data that is both more correct (87%) and more difficult than baselines after 3 feedback rounds, with 50% pass rate vs 47% for resampling.",
    365       "evidence": "Table 2 shows progressive improvement: initial generator 71% correct/18% pass → +3 feedback: 87% correct/50% pass vs +3 resample: 84% correct/47% pass.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Training on SAGE data yields 27% relative improvement on in-domain evaluation for QWEN-3B (28.5% vs 22.4% Musique) and 29% for QWEN-7B (38.1% vs 29.6%).",
    370       "evidence": "Table 3 reports per-step and average accuracy for models trained on different data. Improvements consistent across both model sizes.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Up to 23% relative improvement on out-of-domain FRAMES benchmark for QWEN-7B (32.3% vs 26.2% when trained on NQ+HotpotQA).",
    375       "evidence": "Table 3, FRAMES column. However, these are single-run results with no error bars or significance tests, so true improvement magnitude is uncertain.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Agents trained with fixed-corpus retrieval can transfer to Google Search at inference time, showing 50% relative improvement on GAIA for QWEN-7B (24.0% vs 15.6%).",
    380       "evidence": "Table 4 shows improvements on GAIA but mixed results on other benchmarks: marginal on Browsecomp (2.6% vs 2.1%) and a slight drop on HLE-Search (7.0% vs 8.0%).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "SAGE generates questions requiring more diverse reasoning strategies than Musique, including calculation (35% vs 5%) and temporal reasoning (32% vs 8%).",
    385       "evidence": "Figure 3 shows reasoning type distribution based on LLM analysis of 100 trajectories each. However, the analysis uses gemini-2.5-flash to label reasoning types, not human annotation.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Increasing feedback beyond 2 rounds does not improve downstream performance despite generating harder data, suggesting difficulty alone is insufficient.",
    390       "evidence": "Table 5: round 2 achieves 38.1% in-domain, 32.3% FRAMES; round 3 drops to 34.1% and 28.1% respectively, even though the data becomes harder (Avg@4 decreases from 80.4 to 79.5).",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating own product",
    397       "detail": "8 of 10 authors are from Google Cloud AI Research. The pipeline relies on Google's Gemini models for data generation, search agent, and LLM-as-judge evaluation. While the paper states 'This work has no implications of any Google products,' the demonstrations showcase Gemini capabilities."
    398     },
    399     {
    400       "flag": "No error bars or significance tests",
    401       "detail": "All reported results are point estimates from apparently single runs. No statistical significance tests despite claims of improvement across multiple benchmarks and model sizes. With small test sets (300 samples), observed differences could be within noise."
    402     },
    403     {
    404       "flag": "LLM-as-judge circularity",
    405       "detail": "Gemini-2.0-flash is used as the LLM judge for both correctness verification during data generation AND final evaluation. The same model family (Gemini) generates the data, verifies it, and evaluates downstream results. No human validation of judge accuracy."
    406     },
    407     {
    408       "flag": "Unreleased code and data",
    409       "detail": "Code and data are promised for future release but not available. Results cannot be independently verified or reproduced."
    410     },
    411     {
    412       "flag": "Missing cost analysis",
    413       "detail": "The SAGE pipeline requires multiple rounds of Gemini API calls per training example (data generator + K search traces × R rounds), making it substantially more expensive than using existing datasets. This cost is never quantified or discussed relative to performance gains."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning",
    419       "authors": ["B. Jin", "H. Zeng", "Z. Yue"],
    420       "year": 2025,
    421       "relevance": "RL framework for training search agents, used as the training framework in downstream evaluation."
    422     },
    423     {
    424       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    425       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig"],
    426       "year": 2024,
    427       "relevance": "Major code generation benchmark referenced as an example of agentic LLM tasks."
    428     },
    429     {
    430       "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection",
    431       "authors": ["A. Asai", "Z. Wu", "Y. Wang"],
    432       "year": 2024,
    433       "relevance": "Retrieval-augmented generation with self-reflection, relevant to agentic RAG approaches."
    434     },
    435     {
    436       "title": "ReACT: Synergizing Reasoning and Acting in Language Models",
    437       "authors": ["S. Yao", "J. Zhao", "D. Yu"],
    438       "year": 2023,
    439       "relevance": "Foundational framework for interleaving reasoning and tool use that SAGE builds upon."
    440     },
    441     {
    442       "title": "GAIA: A Benchmark for General AI Assistants",
    443       "authors": ["G. Mialon", "C. Fourrier", "C. Swift"],
    444       "year": 2023,
    445       "arxiv_id": "2311.12983",
    446       "relevance": "General AI assistant benchmark used for Google Search transfer evaluation."
    447     },
    448     {
    449       "title": "Browsecomp: A Simple Yet Challenging Benchmark for Browsing Agents",
    450       "authors": ["J. Wei", "Z. Sun", "S. Papay"],
    451       "year": 2025,
    452       "relevance": "Deep search benchmark for browsing agents used for evaluation."
    453     },
    454     {
    455       "title": "Humanity's Last Exam",
    456       "authors": ["L. Phan"],
    457       "year": 2025,
    458       "arxiv_id": "2501.14249",
    459       "relevance": "Challenging evaluation benchmark used for search-subset evaluation."
    460     },
    461     {
    462       "title": "WebGPT: Browser-assisted Question-Answering with Human Feedback",
    463       "authors": ["R. Nakano", "J. Hilton", "S. Balaji"],
    464       "year": 2021,
    465       "relevance": "Foundational work on training search agents with human feedback."
    466     },
    467     {
    468       "title": "Fact, Fetch, and Reason: A Unified Evaluation of Retrieval-Augmented Generation",
    469       "authors": ["S. Krishna", "K. Krishna", "A. Mohananey"],
    470       "year": 2025,
    471       "doi": "10.18653/v1/2025.naacl-long.243",
    472       "relevance": "FRAMES benchmark used for out-of-domain evaluation of deep search agents."
    473     },
    474     {
    475       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    476       "authors": ["S. Zhou", "F. F. Xu", "H. Zhu"],
    477       "year": 2024,
    478       "relevance": "Web environment benchmark for autonomous agents."
    479     },
    480     {
    481       "title": "Qwen2.5 Technical Report",
    482       "authors": ["A. Yang"],
    483       "year": 2024,
    484       "relevance": "Base model (Qwen-2.5-3B/7B-Instruct) used for training search agents in downstream evaluation."
    485     },
    486     {
    487       "title": "Proximal Policy Optimization Algorithms",
    488       "authors": ["J. Schulman", "F. Wolski", "P. Dhariwal"],
    489       "year": 2017,
    490       "relevance": "PPO algorithm used for reinforcement learning training of search agents."
    491     }
    492   ]
    493 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs