scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28188B)
      1 {
      2   "paper": {
      3     "title": "From Task Solving to Robust Real-World Adaptation in LLM Agents",
      4     "authors": [
      5       "Pouya Pezeshkpour",
      6       "Estevam Hruschka"
      7     ],
      8     "year": 2026,
      9     "venue": "arXiv",
     10     "arxiv_id": "2602.02760"
     11   },
     12   "scan_version": 2,
     13   "active_modules": ["experimental_rigor", "data_leakage"],
     14   "methodology_tags": ["benchmark-eval"],
     15   "key_findings": "LLM agents show a consistent gap between nominal task-solving and deployment-like robustness when tested under partial observability, noisy sensing, non-stationarity, and agent-state drift in a grid-based game. Performance degrades with grid size, but model rankings are unstable across regimes—weaker models can outperform stronger ones when strategy matches the uncertainty. Despite no explicit instruction, agents exhibit distinct trade-offs between completion, efficiency, and penalty avoidance, and action-frequency profiles reveal systematic strategy differences (e.g., GPT-5 mini front-loads sensing, Qwen3 uses myopic trial-and-error).",
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "A GitHub repository is provided: https://github.com/megagonlabs/wildgrid (footnote 1 in the abstract)."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The benchmark is procedurally generated from random seeds with documented parameters. The released code repository enables regeneration of all game instances."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is mentioned in the paper. No library versions are specified."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No step-by-step reproduction instructions are provided in the paper. The experimental setup is described (Section 3), but no commands or scripts to reproduce the results are given."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Table 1 reports only point estimates for Accuracy, Score, and Steps. No confidence intervals or error bars are reported anywhere. Ablation figures (Figure 4) also show curves without error bars."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Comparative claims (e.g., 'Gemini-3 Pro attains the highest accuracy on 6×6 and 10×10') are made by comparing raw numbers without any statistical significance tests."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Only raw accuracy, score, and step values are reported in Table 1. No formal effect sizes (Cohen's d, odds ratios, or contextualized percentage improvements) are provided."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "50 instances per grid size for main evaluation and only 5 instances per data point for ablations are used, with no justification for these choices and no power analysis."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No standard deviations, interquartile ranges, or any spread measures are reported. Table 1 shows averages only. The 50-episode results have no variance quantification."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Five state-of-the-art LLMs are compared against each other (GPT-5.2, GPT-5 mini, Gemini-3 Pro, Gemini-3 Flash, Qwen3-235B), providing mutual baselines."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "All five models are current state-of-the-art: GPT-5.2, GPT-5 mini (Singh et al., 2025), Gemini-3 Pro/Flash (Comanici et al., 2025), and Qwen3-235B (Yang et al., 2025)."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Section 4.3 presents single-stressor ablations: 'we deactivate all perturbations and vary only the single factor under study, to isolate its causal impact on performance.' Four factors are swept: Noise, Latent, Hazard-Spread, Teleport-Step."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Three metrics are reported: success rate (Acc), average Score, and Steps per grid size (Table 1). The paper explicitly notes these capture different aspects of performance."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No human evaluation is included. All evaluation is automated through the grid game's built-in metrics (success/failure, score, steps)."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Game instances are randomly generated for each evaluation. No training or tuning is performed on these instances—all models are evaluated zero-shot, so every instance is effectively held-out."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are broken down by grid size (6×6, 8×8, 10×10) in Table 1 and by individual stressor in the ablation studies (Figure 4). Per-model action profiles are also shown (Figure 3)."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 4.2 discusses Qwen3's myopic trial-and-error behavior, GPT-5.2's score degradation from miscalibrated interaction, and specific failure drivers identified through behavioral traces and feature attribution (Section 4.4)."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Qwen3 'largely fails across grid sizes' (Section 4.1). The ablations show counterintuitive results: moderate noise can improve performance (Section 4.3), and teleports can help or hurt depending on frequency. GPT-5.2's score degrades sharply with grid size."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Abstract claims are supported: 'large gaps between nominal task-solving and deployment-like robustness' (Table 1 shows degradation), 'rankings are unstable' (different leaders per grid size), 'agents trade off completion, efficiency, and penalty avoidance' (Score/Step variation in Table 1), 'model-specific sensitivities' (Figures 4-5)."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper explicitly makes causal claims ('to isolate the causal effect of each deployment stressor') and uses controlled single-factor ablations (Section 4.3) where all modifiers are disabled except one. This controlled manipulation design is adequate for causal inference about individual stressors."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The title claims 'Robust Real-World Adaptation' and the paper repeatedly frames findings as indicating 'deployment-like robustness' and 'real-world readiness,' but all evidence comes from a synthetic grid game. No explicit bounding of claims to the grid-game setting is provided."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper does not substantively discuss alternative explanations for observed differences between models. Differences could stem from prompt sensitivity, model size, training data composition, or thinking-mode budget differences rather than 'strategy' differences. None of these alternatives are discussed."
    133       },
    134       "proxy_outcome_distinction": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper measures grid game performance (accuracy, score, steps) but frames results as evidence of 'real-world readiness' and 'deployment-like robustness.' The gap between grid game metrics and actual deployment robustness is never acknowledged."
    138       }
    139     },
    140     "setup_transparency": {
    141       "model_versions_specified": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "Models are listed as 'GPT-5.2, GPT-5 MINI, GEMINI 3 PRO, GEMINI 3 FLASH, and QWEN3-235B-A22B.' These are marketing names; no API versions, snapshot dates, or other version identifiers are provided."
    145       },
    146       "prompts_provided": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Full system and user prompts are provided in Appendix A (Prompts A.1 and A.2). The system prompt describes game mechanics and output format, and the user prompt provides the observation template."
    150       },
    151       "hyperparameters_reported": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Only 'default thinking budget (medium or high)' is mentioned for models supporting thinking mode. Temperature, top-p, max tokens, and other sampling parameters are not reported."
    155       },
    156       "scaffolding_described": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 2.3 describes the player interface in detail: text-only observation, local view with facing direction, state vector, action space, short action history, and event-based execution log. Full prompts are in the appendix."
    160       },
    161       "data_preprocessing_documented": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 3 documents the game instance generation: 50 random instances per grid size, with parameter ranges specified (noise ∼U(0,0.2), move fail ∼U(0,0.1), latent fraction ∼U(0,0.2)), fixed dynamics (5×5 window, shifts every 25 steps, teleports every 50 steps, drift every 100 steps)."
    165       }
    166     },
    167     "limitations_and_scope": {
    168       "limitations_section_present": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "No dedicated Limitations or Threats to Validity section exists. The conclusion mentions future work directions but does not discuss limitations of the current study."
    172       },
    173       "threats_to_validity_specific": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No specific threats to validity are discussed anywhere in the paper."
    177       },
    178       "scope_boundaries_stated": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No explicit scope boundaries are stated. The paper does not state what the results do NOT show or which settings/populations are excluded from claims."
    182       }
    183     },
    184     "data_integrity": {
    185       "raw_data_available": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "No raw trajectory data, episode logs, or per-instance results are made available. Only aggregated results are reported in tables and figures."
    189       },
    190       "data_collection_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The game instance generation procedure is fully described in Sections 2.1 and 3, including tile placement, parameter sampling ranges, and fixed dynamics schedules."
    194       },
    195       "recruitment_methods_described": {
    196         "applies": false,
    197         "answer": false,
    198         "justification": "No human participants. The paper evaluates LLM agents on a procedural benchmark."
    199       },
    200       "data_pipeline_documented": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The pipeline from game generation (random seed → map sampling → parameter assignment) through evaluation (LLM agent plays → metrics computed) is documented in Sections 2 and 3."
    204       }
    205     },
    206     "conflicts_of_interest": {
    207       "funding_disclosed": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "No funding sources are disclosed anywhere in the paper. Both authors are from Megagon Labs (a corporate research lab) but no funding acknowledgment is present."
    211       },
    212       "affiliations_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Both authors list Megagon Labs as their affiliation. They are not evaluating a Megagon product—they test third-party models (GPT, Gemini, Qwen)."
    216       },
    217       "funder_independent_of_outcome": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding is disclosed, so independence cannot be assessed. Megagon Labs (a Recruit Holdings subsidiary) does not appear to have a direct stake in the models evaluated, but without disclosure this cannot be verified."
    221       },
    222       "financial_interests_declared": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No competing interests or financial interests statement is present in the paper."
    226       }
    227     },
    228     "contamination": {
    229       "training_cutoff_stated": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No training data cutoff dates are stated for any of the five evaluated models."
    233       },
    234       "train_test_overlap_discussed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No discussion of whether models could have seen similar grid games or the WildGrid code/description during training. While the benchmark is new, the game mechanics could resemble training data."
    238       },
    239       "benchmark_contamination_addressed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "Although the benchmark is procedurally generated (making direct contamination unlikely), the paper never explicitly discusses this advantage or the contamination question."
    243       }
    244     },
    245     "human_studies": {
    246       "pre_registered": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants in this study. It evaluates LLM agents on a procedural benchmark."
    250       },
    251       "irb_or_ethics_approval": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "demographics_reported": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "inclusion_exclusion_criteria": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "randomization_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "blinding_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "attrition_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       }
    281     },
    282     "cost_and_practicality": {
    283       "inference_cost_reported": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No API costs, token counts, or latency measurements are reported. The study runs 750+ episodes (50 instances × 3 grid sizes × 5 models, each up to 200 steps) but total cost is not quantified."
    287       },
    288       "compute_budget_stated": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No total computational budget, API spend, or hardware specifications are stated."
    292       }
    293     },
    294     "experimental_rigor": {
    295       "seed_sensitivity_reported": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "Results are averaged over 50 random game instances but no seed sensitivity analysis is reported. No variance across instances is shown."
    299       },
    300       "number_of_runs_stated": {
    301         "applies": true,
    302         "answer": true,
    303         "justification": "Section 3: 'we generate 50 random game instances for each grid size' for main evaluation and '5 instances for each data point per condition' for ablations."
    304       },
    305       "hyperparameter_search_budget": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No hyperparameter search budget is reported. The choice of parameter ranges (noise, latent fraction, etc.) and thinking budget settings are not justified."
    309       },
    310       "best_config_selection_justified": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "A single fixed configuration is used for all models with the same parameters. All results for all models are reported—no selection of best configuration occurs."
    314       },
    315       "multiple_comparison_correction": {
    316         "applies": false,
    317         "answer": false,
    318         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    319       },
    320       "self_comparison_bias_addressed": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The authors designed the benchmark and evaluate third-party models on it. They do not acknowledge the potential bias of benchmark designers selecting game mechanics that may favor certain model capabilities."
    324       },
    325       "compute_budget_vs_performance": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Models of vastly different sizes and compute costs are compared (e.g., Qwen3-235B vs GPT-5 mini) without any discussion of compute budget differences or performance normalized by cost."
    329       },
    330       "benchmark_construct_validity": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The paper assumes grid game performance reflects 'real-world deployment robustness' but provides no validation of this construct mapping. Whether performance on a grid puzzle predicts robustness in actual deployment scenarios is not examined."
    334       },
    335       "scaffold_confound_addressed": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "All models use the identical prompt and interaction interface (Section 2.3, Appendix A). The scaffold is controlled across all comparisons."
    339       }
    340     },
    341     "data_leakage": {
    342       "temporal_leakage_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "No discussion of whether models' training data could include similar grid game descriptions or solutions, despite the benchmark being novel."
    346       },
    347       "feature_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether the observation format or prompt structure leaks information that aids performance beyond what a real deployment would provide."
    351       },
    352       "non_independence_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of independence between game instances or whether shared parameter ranges across instances create dependencies."
    356       },
    357       "leakage_detection_method": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No concrete leakage detection or prevention method is used or described."
    361       }
    362     }
    363   },
    364   "claims": [
    365     {
    366       "claim": "Performance generally degrades as grid size and horizon increase, with all frontier models exhibiting decreasing accuracy from 6×6 to 10×10.",
    367       "evidence": "Table 1 shows accuracy declining from 6×6 to 10×10 for GPT-5.2 (48→40→26%), Gemini-3 Pro (50→28→38%, with a partial rebound at 10×10), and Gemini-3 Flash (48→42→32%). Qwen3 fails almost entirely (2→0→0%).",
    368       "supported": "moderate"
    369     },
    370     {
    371       "claim": "Model rankings are unstable across grid sizes and metrics: weaker models can outperform stronger ones when strategy matches the uncertainty regime.",
    372       "evidence": "Table 1: Gemini-3 Pro leads on 6×6 and 10×10 accuracy, but Gemini-3 Flash leads on 8×8. GPT-5 mini has lowest steps and best scores despite lower accuracy. Section 4.1 discusses this instability.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Agents trade off completion, efficiency, and penalty avoidance despite no explicit instruction to do so, suggesting partial objective inference.",
    377       "evidence": "Table 1 shows GPT-5 mini consistently has lowest step counts and best scores despite not leading in accuracy. Section 4.2 shows distinct action-frequency profiles. Section 4.1: 'several models appear to partially infer these objectives.'",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "LLM agents exhibit a clear sense-then-act signature: information gathering is front-loaded early in episodes.",
    382       "evidence": "Figure 3 shows SCAN and MEASURE concentrated in the earliest steps, decaying toward zero, across all frontier models. GPT-5 mini shows the strongest early sensing investment.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Single-stressor sensitivities are strongly non-monotonic and model-specific: moderate noise can improve accuracy, and teleportation can aid exploration.",
    387       "evidence": "Figure 4: GPT-5.2 and Gemini-3 Pro accuracy peaks at mid-range noise. GPT-5.2 achieves near-perfect accuracy with teleports every 10 steps. Section 4.3 discusses these patterns.",
    388       "supported": "weak"
    389     },
    390     {
    391       "claim": "Robustness in realistic environments depends on adaptive strategy selection as much as raw task-solving capability.",
    392       "evidence": "Combination of ranking instability (Table 1), strategy-specific strengths (Figures 3-4), and feature attribution showing model-specific sensitivities (Figure 5). This is an interpretive synthesis rather than a directly tested claim.",
    393       "supported": "weak"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "No variance or uncertainty quantification",
    399       "detail": "The main results use 50 episodes per grid size and the ablations only 5 episodes per data point, yet no standard deviations, confidence intervals, or error bars are reported anywhere. The observed differences between models could be within noise."
    400     },
    401     {
    402       "flag": "No statistical tests despite comparative claims",
    403       "detail": "The paper makes many comparative claims (e.g., 'Gemini-3 Pro attains the highest accuracy') by comparing raw numbers without any significance testing. With 50 episodes and high variance in game outcomes, apparent differences may not be statistically significant."
    404     },
    405     {
    406       "flag": "Construct validity gap: grid game vs. real-world robustness",
    407       "detail": "The paper claims findings about 'real-world readiness' and 'deployment-like robustness' based entirely on a synthetic grid game. No validation that grid game performance predicts actual deployment robustness is provided."
    408     },
    409     {
    410       "flag": "Very small ablation sample size",
    411       "detail": "Single-stressor ablations use only 5 random episodes per data point (Section 3). The non-monotonic patterns in Figure 4 could easily be noise artifacts at this sample size."
    412     },
    413     {
    414       "flag": "No limitations section",
    415       "detail": "The paper lacks any dedicated limitations section or threats to validity discussion, despite significant methodological choices (synthetic setting, small samples, no statistical tests) that warrant explicit acknowledgment."
    416     },
    417     {
    418       "flag": "Overclaiming from synthetic to real-world",
    419       "detail": "The title promises 'Robust Real-World Adaptation' but all evidence comes from a grid game. The four 'deployment stressors' are parameterized game mechanics, not validated proxies for real deployment conditions."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Identifying the risks of LM agents with an LM-emulated sandbox",
    425       "authors": ["Yangjun Ruan", "Honghua Dong", "Andrew Wang", "Silviu Pitis", "Yongchao Zhou", "Jimmy Ba", "Yann Dubois", "Chris J Maddison", "Tatsunori Hashimoto"],
    426       "year": 2023,
    427       "arxiv_id": "2309.15817",
    428       "relevance": "ToolEmu framework for scaling LLM agent safety testing via emulated tools and risk scoring."
    429     },
    430     {
    431       "title": "τ-bench: A benchmark for tool-agent-user interaction in real-world domains",
    432       "authors": ["Shunyu Yao", "Noah Shinn", "Pedram Razavi", "Karthik Narasimhan"],
    433       "year": 2024,
    434       "arxiv_id": "2406.12045",
    435       "relevance": "Benchmark with persistent state and trajectory-level metrics exposing LLM agent brittleness in multi-step tool use."
    436     },
    437     {
    438       "title": "ToolSandbox: A stateful, conversational, interactive evaluation benchmark for LLM tool use capabilities",
    439       "authors": ["Jiarui Lu", "Thomas Holleis", "Yizhe Zhang"],
    440       "year": 2025,
    441       "relevance": "Evaluates LLM tool use with stateful simulation and conversational interaction, revealing failure modes."
    442     },
    443     {
    444       "title": "Tools Fail: Detecting silent errors in faulty tools",
    445       "authors": ["Jimin Sun", "So Yeon Min", "Yingshan Chang", "Yonatan Bisk"],
    446       "year": 2024,
    447       "arxiv_id": "2406.19228",
    448       "relevance": "Studies LLM agent robustness to silent tool failures and recovery strategies."
    449     },
    450     {
    451       "title": "Hell or High Water: Evaluating agentic recovery from external failures",
    452       "authors": ["Andrew Wang", "Sophia Hager", "Adi Asija", "Daniel Khashabi", "Nicholas Andrews"],
    453       "year": 2025,
    454       "arxiv_id": "2508.11027",
    455       "relevance": "Evaluates LLM agent replanning under external failures and ambiguous instructions."
    456     },
    457     {
    458       "title": "OpenAI GPT-5 system card",
    459       "authors": ["Aaditya Singh"],
    460       "year": 2025,
    461       "arxiv_id": "2601.03267",
    462       "relevance": "System card for GPT-5, one of the models evaluated in this benchmark."
    463     },
    464     {
    465       "title": "CodeAgent: Enhancing code generation with tool-integrated agent systems for real-world repo-level coding challenges",
    466       "authors": ["Kechi Zhang", "Jia Li", "Ge Li", "Xianjie Shi", "Zhi Jin"],
    467       "year": 2024,
    468       "arxiv_id": "2401.07339",
    469       "relevance": "Tool-integrated LLM agent for code generation, representing the specialized agent paradigm this paper critiques."
    470     },
    471     {
    472       "title": "AlphaEvolve: A coding agent for scientific and algorithmic discovery",
    473       "authors": ["Alexander Novikov"],
    474       "year": 2025,
    475       "arxiv_id": "2506.13131",
    476       "relevance": "Coding agent for scientific discovery, exemplifying the trend toward specialized LLM agents."
    477     },
    478     {
    479       "title": "BrowserArena: Evaluating LLM agents on real-world web navigation tasks",
    480       "authors": ["Sagnik Anupam", "Davis Brown", "Shuo Li", "Eric Wong", "Hamed Hassani", "Osbert Bastani"],
    481       "year": 2025,
    482       "arxiv_id": "2510.02418",
    483       "relevance": "Real-world web agent evaluation benchmark where different LLMs adopt distinct behavioral strategies."
    484     },
    485     {
    486       "title": "OdysseyBench: Evaluating LLM agents on long-horizon complex office application workflows",
    487       "authors": ["Weixuan Wang", "Dongge Han"],
    488       "year": 2025,
    489       "arxiv_id": "2508.09124",
    490       "relevance": "Long-horizon agent evaluation showing performance decline with increasing step count, consistent with this paper's findings."
    491     },
    492     {
    493       "title": "Large language model agent: A survey on methodology, applications and challenges",
    494       "authors": ["Junyu Luo", "Weizhi Zhang"],
    495       "year": 2025,
    496       "arxiv_id": "2503.21460",
    497       "relevance": "Survey of LLM agent approaches covering planning, tool use, and multi-step reasoning."
    498     },
    499     {
    500       "title": "HAZARD challenge: Embodied decision making in dynamically changing environments",
    501       "authors": ["Qinhong Zhou", "Sunli Chen", "Yisong Wang"],
    502       "year": 2024,
    503       "arxiv_id": "2401.12975",
    504       "relevance": "Embodied agent benchmark targeting dynamically changing environments, a related robustness evaluation approach."
    505     }
    506   ]
    507 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs