scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19440B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "EcoGym: Evaluating LLMs for Long-Horizon Plan-and-Execute in Interactive Economies",
      6     "authors": [
      7       "Xavier Hu",
      8       "Jinxiang Xia",
      9       "Shengze Xu",
     10       "Kangqi Song",
     11       "Yishuo Yuan"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2602.09514",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The main abstract claims—no single model dominates across three scenarios, and models show suboptimality in strategy or execution—are directly supported by Table 2 results and the failure mode analysis in Section 4.2.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper makes causal claims (e.g., 'thinking mode catalyzes universal performance elevation', memory modules improve performance) tested via ablations on only 2 models with single runs, insufficient for causal inference given the high variance demonstrated in the stochastic stability analysis.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion states 'current SOTA LLMs have achieved super-human performance in specific long-horizon economic planning scenarios' based on a single human experiment in one environment (Operation), then broadly frames this as showing 'immense potential for complex economic decision-making' without adequate bounding.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper observes phenomena like inverse scaling (GPT-5-Mini outperforming GPT-5.2 in Freelance) and non-monotonic context window effects without discussing alternative explanations; the failure mode analysis attributes differences to 'strategic prioritization' vs. 'execution efficiency' without considering confounders.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Net Worth, Income, and DAU are used as proxies for 'long-horizon planning capability' without interrogating whether these economic outcomes measure planning specifically versus instruction-following, domain knowledge heuristics, or reactive behavior.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion briefly notes models 'struggle to maintain strategic coherence' but does not systematically address benchmark limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity are discussed; stochastic variance is noted but not framed as a validity threat, and issues like single-run evaluation for Freelance/Operation or limited human sample size are not addressed as threats.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not state explicit scope boundaries about what conclusions cannot be drawn from EcoGym results; no discussion of what EcoGym does NOT measure or settings where results would not transfer.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding disclosure is present; there is no acknowledgments section and no mention of funding sources, despite this being a corporate research paper from OPPO.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper is bylined as 'OPPO AI Agent Team' and the corresponding author emails end in @oppo.com, making the corporate affiliation clear.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "OPPO does not appear to evaluate its own proprietary model; all evaluated models are from OpenAI, Google, Anthropic, xAI, Moonshot, MiniMax, and open-weight providers.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "'Long-horizon planning' and 'plan-and-execute' are used throughout without precise operational definitions; the formal POMDP task formulation describes mechanics but does not define what constitutes 'planning' as distinct from reactive behavior.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly lists three contributions: an infinite-horizon planning evaluation framework, a utility-guided economic assessment paradigm, and a multi-dimensional empirical analysis of 11 LLMs.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The related work section explicitly positions EcoGym against Vending Bench v1/v2, HeroBench, GDPval, and broader planning benchmarks, explaining how EcoGym differs (multi-scenario, unified framework, fully open-source).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper argues economic environments require sustained decisions (Principle 2) but does not formally argue why Net Worth/Income/DAU specifically measure 'long-horizon planning capability' versus domain knowledge, memorized heuristics, or short-horizon greedy behavior.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Table 4 tests three inventory-size tiers for Vending (Small/Medium/Large), but no systematic difficulty distribution across benchmark items is characterized and no empirical difficulty calibration is performed.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Three of eleven models (DeepSeek-v3.2, Grok-4.1-Fast, Kimi-k2) scored exactly 0.00 income in Freelance—a severe floor effect affecting 27% of evaluated models—which is noted via truncated lines in Figure 1 but not analyzed as a discriminability problem.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Human baselines are provided only for the Operation environment (average DAU 1,404) due to time constraints; Vending and Freelance have no human baselines, leaving the benchmark incompletely grounded, and the human sample size is not reported.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Net Worth, Income, and DAU are defined with formulas in Appendix B but their choice over alternative metrics (e.g., strategic consistency, recovery rate, action efficiency) is not justified, and edge cases in scoring are not addressed.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "The Freelance environment implements 'Logic Mutation' (refactoring numerical values and variables) and 'Scenario Injection' to prevent memorization, documented in Section 3.2.2; this is a concrete, described anti-contamination measure.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper describes the benchmark as 'open and extensible' but provides no discussion of whether it will be gamed or obsoleted as models improve, nor any versioning or update plan.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Section 4.2 discusses failure modes of agents within the benchmark but does not discuss failure modes of the benchmark itself—e.g., how agents could game economic metrics without genuine planning, or what capabilities EcoGym fails to measure.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Code is released at https://github.com/OPPO-PersonalAI/EcoGym and Table 2 provides numerical results for 11 baseline models with specific version identifiers in Appendix A, enabling reproduction.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "Data collection is described at a high level (Perplexity for Vending product data, 8 aggregated datasets for Freelance) but no formal data cards, complete preprocessing documentation, or collection methodology for all environments are provided.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The code is on GitHub and described as 'open,' but no specific license is stated in the paper, and terms of use for the benchmark are not specified.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "The benchmark is framed for evaluating 'long-horizon plan-and-execute in interactive economies' but the paper does not specify what conclusions should NOT be drawn from EcoGym scores or what use cases are out of scope.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "No single model consistently achieves superior performance across all three EcoGym scenarios",
    203       "evidence": "Table 2 shows Gemini-3-Pro leads in Vending (11274.73), GPT-5-Mini leads in Freelance (2990.72), and Claude-Sonnet-4.5 leads in Operation (1572.49)",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "Models exhibit significant suboptimality in either high-level strategy or efficient action execution",
    208       "evidence": "Failure mode analysis in Section 4.2 identifies strategic prioritization gaps in Operation and execution inefficiency in Vending/Freelance via differential trajectory analysis",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "Top-tier LLMs have achieved super-human performance in specific long-horizon economic planning scenarios",
    213       "evidence": "Human experts averaged DAU 1,404 in Operation, compared with Claude-Sonnet-4.5 (1572.49), Gemini-3-Pro (1280.75), and others; the human sample size is not reported and testing was limited to ~45 minutes",
    214       "supported": "weak"
    215     },
    216     {
    217       "claim": "Extending context window beyond 128k does not yield consistent performance gains",
    218       "evidence": "Figure 4 shows Gemini-3-Pro performance degrading monotonically from k=128 to k=1024, while Gemini-3-Flash shows volatile non-monotonic behavior across the same range",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "Thinking mode universally improves performance across models in long-horizon tasks",
    223       "evidence": "Figure 6 shows DAU improvements for both Gemini-3-Flash (1196→1398) and Gemini-3-Pro (1398→1511) with thinking enabled in Operation, but tested on only 2 models in 1 environment",
    224       "supported": "weak"
    225     },
    226     {
    227       "claim": "No single memory architecture dominates across models and environments",
    228       "evidence": "Table 3 shows episodic memory best for Gemini-3-Pro (18939 vs. 11274 baseline) but working memory best for Gemini-3-Flash (10099) in Vending",
    229       "supported": "moderate"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval"
    234   ],
    235   "key_findings": "EcoGym introduces three long-horizon economic simulation environments (Vending, Freelance, Operation) for benchmarking LLM agent planning across 11 models, finding that no single model dominates across all scenarios. Floor effects are severe in Freelance (3/11 models score zero income), raising serious discriminability concerns that go unaddressed. Selected top-tier models marginally exceed a limited human baseline in the Operation environment only. Diagnostic studies on context length, thinking mode, and memory modules yield inconsistent and model-dependent results, suggesting no universal architectural solution for long-horizon planning emerges from the study.",
    236   "red_flags": [
    237     {
    238       "flag": "Severe floor effects in Freelance",
    239       "detail": "3 of 11 models (DeepSeek-v3.2, Grok-4.1-Fast, Kimi-k2) scored exactly 0.00 income in Freelance—27% complete failure rate—which is a serious discriminability problem noted via truncated lines in Figure 1 but never analyzed as such."
    240     },
    241     {
    242       "flag": "Single-run evaluation for two of three environments",
    243       "detail": "Freelance and Operation main results are reported from a single run each, despite the stochastic stability analysis showing high variance; single-run results in stochastic environments are unreliable as a benchmark basis."
    244     },
    245     {
    246       "flag": "Incomplete and under-reported human baseline",
    247       "detail": "Human baselines are collected only for Operation (excluded from Vending/Freelance due to time constraints); the Operation human sample size is never reported, and the 'super-human performance' claim rests on this thin evidence."
    248     },
    249     {
    250       "flag": "No limitations section",
    251       "detail": "The paper contains no dedicated limitations or threats-to-validity section despite multiple methodological choices (single runs, limited human sample, uncharacterized floor effects) that warrant systematic discussion."
    252     },
    253     {
    254       "flag": "Overbroad super-human performance claim",
    255       "detail": "The conclusion claims LLMs demonstrate 'super-human performance in specific long-horizon economic planning scenarios' based on a single environment and an undisclosed number of human participants tested for ~45 minutes, with no variance reporting."
    256     },
    257     {
    258       "flag": "No funding disclosure from corporate authors",
    259       "detail": "This is a corporate paper from OPPO AI Agent Team with no funding disclosure, no acknowledgments section, and no competing interests statement."
    260     }
    261   ],
    262   "cited_papers": [
    263     {
    264       "title": "Vending-bench: A benchmark for long-term coherence of autonomous agents",
    265       "relevance": "Direct methodological predecessor; EcoGym explicitly builds on and extends Vending Bench's evaluation approach and failure mode analysis methodology"
    266     },
    267     {
    268       "title": "GDPval: Evaluating AI model performance on real-world economically valuable tasks",
    269       "relevance": "Related economic evaluation benchmark; EcoGym positions itself as complementary with interactive simulated environments vs. GDPval's real-world task approach"
    270     },
    271     {
    272       "title": "HeroBench: A benchmark for long-horizon planning and structured reasoning in virtual worlds",
    273       "relevance": "Related long-horizon planning benchmark with competitive economic dynamics used for comparison"
    274     },
    275     {
    276       "title": "RE-Bench: Evaluating frontier AI R&D capabilities of language model agents against human experts",
    277       "relevance": "Long-horizon expert task benchmark with human baselines; cited as representative effort to quantify macroeconomic potential of agents"
    278     },
    279     {
    280       "title": "Generative Agents: Interactive simulacra of human behavior",
    281       "relevance": "Early work on persistent memory and planning in long-horizon social/economic interactions; foundational to micro-economic execution agent taxonomy"
    282     },
    283     {
    284       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    285       "relevance": "Source dataset used to construct Freelance software development tasks"
    286     },
    287     {
    288       "title": "LiveCodeBench: Holistic and contamination-free evaluation of large language models for code",
    289       "relevance": "Source dataset for Freelance coding tasks; contamination-free methodology directly relevant to benchmark design choices"
    290     },
    291     {
    292       "title": "Remote Labor Index: Measuring AI automation of remote work",
    293       "relevance": "Related economic capability benchmark cited to motivate economic grounding of agent evaluation"
    294     }
    295   ],
    296   "engagement_factors": {
    297     "practical_relevance": {
    298       "score": 2,
    299       "justification": "Open-source benchmark with runnable code and economic planning scenarios (vending, freelancing, platform ops) that map to real business use cases for LLM agents."
    300     },
    301     "surprise_contrarian": {
    302       "score": 1,
    303       "justification": "The inverse scaling finding (smaller GPT-5-Mini beating larger GPT-5.2 in Freelance) and non-monotonic context window effects are mildly surprising, but 'no universal winner' is expected given task diversity."
    304     },
    305     "fear_safety": {
    306       "score": 1,
    307       "justification": "The paper invokes economic impact of autonomous agents and the 'super-human' framing could attract concern, but safety implications are not engaged with in any depth."
    308     },
    309     "drama_conflict": {
    310       "score": 1,
    311       "justification": "The claim that LLMs surpass human performance in economic planning has mild drama value, but evidence is limited and the paper's framing is measured."
    312     },
    313     "demo_ability": {
    314       "score": 2,
    315       "justification": "Code released on GitHub with three runnable environments; practitioners could evaluate their own models against the 11 reported baselines."
    316     },
    317     "brand_recognition": {
    318       "score": 1,
    319       "justification": "OPPO is a major consumer electronics brand but not a leading AI research institution; the evaluated models include recognizable names but OPPO itself has low research brand recognition."
    320     }
    321   },
    322   "hn_data": {
    323     "threads": [
    324       {
    325         "hn_id": "47149111",
    326         "title": "Security Risks of AI Agents Hiring Humans: An Empirical Marketplace Study",
    327         "points": 1,
    328         "comments": 1,
    329         "url": "https://news.ycombinator.com/item?id=47149111",
    330         "created_at": "2026-02-25T09:00:39Z"
    331       }
    332     ],
    333     "top_points": 1,
    334     "total_points": 1,
    335     "total_comments": 1
    336   }
    337 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs