scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25213B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "From Task Solving to Robust Real-World Adaptation in LLM Agents",
      6     "authors": [
      7       "Pouya Pezeshkpour",
      8       "Estevam Hruschka"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2602.02760",
     13     "doi": null
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All core abstract claims (performance degradation with grid size, ranking instability, partial objective inference, ablation findings) are directly supported by Table 1, Figures 3-5, and the experimental sections.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Section 4.3 explicitly runs single-stressor ablations that deactivate all other modifiers to isolate 'the causal effect of each deployment stressor', which is appropriate for causal inference in this controlled simulation.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper repeatedly invokes 'real-world readiness' and 'deployment-like robustness' based solely on a synthetic grid game with four stressors, without bounding these conclusions to the specific game design or discussing the gap to actual deployment contexts.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Ranking instability and non-monotonic noise effects could reflect specific game design choices rather than generalizable deployment properties, but no alternative interpretations are considered.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Grid game accuracy, score, and step counts are used as proxies for 'real-world deployment readiness' without discussing whether these synthetic metrics predict actual deployment performance.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion briefly mentions future work directions but does not enumerate limitations of the current study.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No specific threats to validity are discussed, such as whether the synthetic grid game generalizes to real LLM agent tasks, or whether 50 and 5 episodes per condition are sufficient sample sizes.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper does not explicitly state what its results do not show—for example, that grid game robustness findings do not necessarily transfer to code agents, web agents, or other real-world agent settings.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding acknowledgment or grant numbers appear anywhere in the paper.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors list 'Megagon Labs' as their affiliation with contact emails, clearly disclosed in the header.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No external funder is disclosed; Megagon Labs does not produce any of the five evaluated models, so institutional conflict is not a concern here.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears in the paper.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms are operationally defined: 'robustness' is defined via four stressors (partial observability, noisy signals, dynamic environments, dynamic agent state), and 'agent' is defined as an LLM player with a specified text interface.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper clearly states it introduces WildGrid, a controllable grid benchmark for stress-testing LLM agents under deployment-like conditions, explicitly differentiating it from prior clean-interface evaluations.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 5 systematically positions WildGrid against τ-bench, ToolSandbox, MiniGrid, TextWorld, ToolEmu, and POMDP literature, explaining how this work jointly combines stressors that prior work addresses separately.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Footnote 1 links to https://github.com/megagonlabs/wildgrid, providing a concrete GitHub repository for the benchmark.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Episodes are procedurally generated with specified random seeds and documented parameters; with the released code, evaluation instances can be regenerated.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No requirements.txt, Dockerfile, or explicit dependency specification is mentioned in the paper; only a GitHub link is provided without setup documentation.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No step-by-step instructions for reproducing the experiments are provided in the paper beyond the GitHub link.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "Table 1 reports accuracy, score, and steps as point estimates with no confidence intervals or error bars; ablation figures also lack spread measures despite stochastic environments.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No statistical significance tests are applied to any of the comparative performance results across models or conditions.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Raw percentage accuracy differences and score differences between models are reported in Table 1 (e.g., Gemini-3 Pro 50% vs. GPT-5 mini 34% on 6x6), providing magnitude of effects without significance testing.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "50 episodes per grid size for main results and only 5 instances per data point for ablations are used with no power analysis or justification for these choices.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No standard deviation or variance is reported for any metric in Table 1 or the ablation figures; only point estimates are shown.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Five contemporary LLMs serve as mutual baselines for each other; ablations additionally compare single-stressor conditions against the full-modifier setting.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "All five models (GPT-5.2, GPT-5 mini, Gemini-3 Pro/Flash, Qwen3-235B-A22B) are 2025-2026 frontier models cited from recent system card papers.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Section 4.3 runs controlled single-stressor ablations, deactivating all other modifiers and sweeping one factor at a time across noise, latent fraction, hazard spread, and teleport schedule.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Results are reported across accuracy (completion rate), score (cumulative reward), step count, action-frequency profiles over time, and logistic regression feature attributions.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": false,
    202           "answer": false,
    203           "justification": "The evaluation is fully automated against a programmatic game environment; no human evaluation of agent outputs is needed or performed.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "Not applicable: evaluation uses procedurally generated game instances rather than a fixed prediction task requiring train/test splits.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results are broken down by grid size (6x6, 8x8, 10x10), by individual stressor type in ablation figures, and by model across all analyses.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "QWEN3's failure mode (myopic INTERACT-heavy behavior leading to energy depletion) is analyzed in detail, and model-specific failure drivers are identified through action profiles and logistic regression coefficients.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "QWEN3's near-zero accuracy (2% on 6x6, 0% on larger grids) is fully reported, and performance degradation including negative scores across conditions is shown throughout.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Each model is cited with a specific arXiv system card (Singh et al. 2025 for GPT-5, Comanici et al. 2025 for Gemini-3, Yang et al. 2025 for Qwen3-235B-A22B), providing traceable version references.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Appendix A provides both the system prompt and user prompt verbatim with all template variables shown, allowing exact prompt reproduction.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "Temperature, top-p, and other API parameters are not reported; only 'default thinking budget (medium or high)' is mentioned for thinking-enabled models.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Section 2.3 describes the text-only interface, structured state summary format, action history passing, and event logging mechanism in sufficient detail to understand the scaffolding.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Game generation parameters are documented: density hyperparameters, uniform sampling of noise/move-fail/latent fraction from specified ranges (e.g., noise~U(0,0.2)), and fixed counts for unique objects.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "Raw episode trajectories and outcomes are not explicitly released or archived; the code is available but the specific random seeds used in the paper are not documented.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Section 3 documents the collection procedure: 50 random game instances per grid size for main results, 5 per data point for ablations, with uniform sampling of key parameters from specified distributions.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants; programmatic game instance generation is used.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "The pipeline from game generation to LLM evaluation to metric computation (accuracy, score, steps) is described in sufficient detail across Sections 2-4.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "No training data cutoffs are stated for any of the five evaluated models.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The paper does not discuss whether LLMs may have been trained on similar grid-world tasks (MiniGrid, TextWorld) that could inflate performance estimates on the WildGrid benchmark.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "WildGrid appears to be novel but similar grid environments exist in pretraining corpora; the paper does not address whether exposure to analogous environments during training affects results.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "No inference cost, latency, or API cost for running 50+ episodes per model per grid size across five models is reported.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "Total computational budget (API calls, wall time, dollar cost) for the full evaluation is not stated anywhere in the paper.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Performance degrades as grid size and horizon increase for all frontier models",
    372       "evidence": "Table 1 shows accuracy declining from 6x6 to 10x10 for GPT-5.2 (48%→26%), Gemini-3 Pro (50%→38%), Gemini-3 Flash (48%→32%), GPT-5 mini (34%→30%)",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Model rankings are unstable across grid sizes and stressor regimes",
    377       "evidence": "Gemini-3 Pro leads 6x6 and 10x10 accuracy but Gemini-3 Flash leads 8x8; GPT-5 mini has low raw accuracy but best step efficiency and scores on 6x6 and 10x10",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Agents exhibit partial implicit objective inference (efficiency/score trade-offs) without explicit instruction",
    382       "evidence": "GPT-5 mini shows lowest step counts and competitive scores without being instructed to optimize them; action profiles in Figure 3 show model-specific sensing-then-act strategies",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Moderate noise can improve accuracy for some models (non-monotonic noise effect)",
    387       "evidence": "Figure 4 shows GPT-5.2 and Gemini-3 Pro peak at mid-range noise levels in the single-stressor ablation, consistent with Findling & Wyart (2024) cited by the paper",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "QWEN3 largely fails the task due to a myopic trial-and-error INTERACT strategy",
    392       "evidence": "QWEN3 achieves 2% accuracy on 6x6 and 0% on larger grids; Figure 3e shows unusually high INTERACT probability with minimal SCAN/MEASURE, leading to early energy depletion",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Logistic regression on 9 environment features identifies interpretable model-specific failure drivers",
    397       "evidence": "Figure 5 heatmap shows per-model coefficients; however, the logistic (linear) predictor achieves only ~60% win/loss prediction accuracy on average",
    398       "supported": "weak"
    399     }
    400   ],
    401   "methodology_tags": [
    402     "benchmark-eval",
    403     "ablation"
    404   ],
    405   "key_findings": "Five frontier LLMs show substantial gaps between nominal task-solving and deployment-like robustness on WildGrid, a controllable grid benchmark with four stressors (partial observability, noisy sensing, non-stationarity, agent-state drift), with accuracy declining across grid sizes. Model rankings are unstable across regimes, with sensing strategy diversity (information-gathering vs. reactive) driving outcome differences independent of raw capability. Single-stressor ablations reveal non-monotonic sensitivities where moderate noise or teleportation can improve performance for some models. QWEN3 fails almost completely due to a myopic INTERACT-heavy strategy that depletes energy early, while GPT-5 mini's front-loaded sensing strategy yields better efficiency despite lower raw accuracy.",
    406   "red_flags": [
    407     {
    408       "flag": "Tiny ablation sample",
    409       "detail": "Only 5 episodes per data point in ablations — far too small to draw reliable conclusions about non-monotonic stressor effects without any confidence intervals or significance testing."
    410     },
    411     {
    412       "flag": "No variance reporting",
    413       "detail": "All metrics in Table 1 and ablation figures are point estimates with no standard deviation, error bars, or confidence intervals despite stochastic game environments and limited replications."
    414     },
    415     {
    416       "flag": "Weak logistic predictor as evidence",
    417       "detail": "The feature attribution analysis (Section 4.4) achieves only ~60% win/loss prediction accuracy on average with a 9-feature logistic model, yet is used to draw specific conclusions about failure drivers."
    418     },
    419     {
    420       "flag": "Synthetic-to-real gap unaddressed",
    421       "detail": "Claims about 'real-world readiness' and 'deployment-like robustness' are based entirely on a synthetic grid game (6x6 to 10x10 grids); no validation against or comparison to actual deployment contexts is provided."
    422     },
    423     {
    424       "flag": "Inference hyperparameters omitted",
    425       "detail": "Temperature, top-p, and other API parameters are not reported for any of the five models, and thinking-enabled models are run with vague 'default' settings, making exact reproduction impossible."
    426     },
    427     {
    428       "flag": "Contamination from similar environments unaddressed",
    429       "detail": "Training cutoffs are not stated and potential contamination from similar grid-world environments (MiniGrid, TextWorld) in model pretraining data is not discussed."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    435       "relevance": "Key prior benchmark for tool-using LLM agents with persistent state and trajectory-level metrics, directly compared in related work"
    436     },
    437     {
    438       "title": "ToolSandbox: A Stateful, Conversational, Interactive Evaluation Benchmark for LLM Tool Use Capabilities",
    439       "relevance": "Prior stateful evaluation benchmark exposing agent brittleness, part of the benchmark landscape WildGrid positions against"
    440     },
    441     {
    442       "title": "Minigrid & Miniworld: Modular & Customizable Reinforcement Learning Environments for Goal-Oriented Tasks",
    443       "relevance": "Foundational grid-based testbed for agent evaluation that WildGrid builds on conceptually and extends with LLM-specific deployment stressors"
    444     },
    445     {
    446       "title": "TextWorld: A Learning Environment for Text-Based Games",
    447       "relevance": "Procedural text-game environment with modular generation closely related to WildGrid's design philosophy"
    448     },
    449     {
    450       "title": "ALFWorld: Aligning Text and Embodied Environments for Interactive Learning",
    451       "relevance": "Long-horizon embodied agent benchmark in the same evaluation tradition as this paper"
    452     },
    453     {
    454       "title": "Identifying the Risks of LM Agents with an LM-Emulated Sandbox (ToolEmu)",
    455       "relevance": "Prior approach to scaling LLM agent safety testing via emulation, positioned as a complement to WildGrid's controlled perturbation approach"
    456     },
    457     {
    458       "title": "Hell or High Water: Evaluating Agentic Recovery from External Failures",
    459       "relevance": "Related benchmark for agent robustness to external failures addressing similar deployment resilience questions"
    460     },
    461     {
    462       "title": "Cooperative Inverse Reinforcement Learning",
    463       "relevance": "Foundational work on implicit objective inference that the paper's 'partial objective inference' finding relates to theoretically"
    464     }
    465   ],
    466   "engagement_factors": {
    467     "practical_relevance": {
    468       "score": 2,
    469       "justification": "Findings about deployment robustness gaps and strategy-vs-capability tradeoffs are actionable for practitioners choosing and tuning LLM agents for real deployments."
    470     },
    471     "surprise_contrarian": {
    472       "score": 2,
    473       "justification": "The finding that weaker models can outperform stronger ones under specific regimes, and that moderate noise can improve performance, challenges conventional capability-centric evaluation thinking."
    474     },
    475     "fear_safety": {
    476       "score": 1,
    477       "justification": "Touches on deployment readiness concerns and agent failure modes but stops short of catastrophic risk framing; no safety-critical real-world scenarios examined."
    478     },
    479     "drama_conflict": {
    480       "score": 1,
    481       "justification": "No significant controversy or adversarial narrative; the paper's tone is constructive and diagnostic rather than confrontational."
    482     },
    483     "demo_ability": {
    484       "score": 2,
    485       "justification": "Code is released at github.com/megagonlabs/wildgrid and the grid game interface is simple enough to run locally with standard LLM API access."
    486     },
    487     "brand_recognition": {
    488       "score": 1,
    489       "justification": "Megagon Labs is not a widely recognized AI lab; the evaluated models (GPT-5, Gemini-3) carry brand recognition but this is not a paper from those labs."
    490     }
    491   },
    492   "hn_data": {
    493     "threads": [],
    494     "top_points": 0,
    495     "total_points": 0,
    496     "total_comments": 0
    497   }
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs