scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30821B)
      1 {
      2   "paper": {
      3     "title": "EcoGym: Evaluating LLMs for Long-Horizon Plan-and-Execute in Interactive Economies",
      4     "authors": [
      5       "Xavier Hu",
      6       "Jinxiang Xia",
      7       "Shengze Xu",
      8       "Kangqi Song",
      9       "Yishuo Yuan"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2602.09514"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "EcoGym introduces three economic simulation environments (Vending, Freelance, Operation) for evaluating LLM long-horizon planning over 365-day horizons. Evaluating 11 LLMs reveals no single model dominates across all scenarios, with performance gaps stemming from trade-offs between strategic prioritization and execution efficiency. Context window expansion does not consistently improve performance, memory module effectiveness is highly model- and task-dependent, and top-tier models surpass a sparsely-described human baseline in the Operation environment.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "GitHub repository URL provided on page 1: https://github.com/OPPO-PersonalAI/EcoGym."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The benchmark environments (600+ SKUs for Vending, 5k+ tasks for Freelance derived from public datasets) are released as part of the open-source repository. Freelance uses publicly available datasets (LiveCodeBench, SWE-bench, GSM8K, etc.) with mutation."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, conda environment file, or library version information provided in the paper. Only model API versions are listed in Appendix A."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions in the paper. A GitHub repository is linked but the paper contains no commands, scripts, or README-like instructions to replicate experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The main results table (Table 2) reports point estimates only for all three environments. Freelance and Operation results are single-run. Table 4 reports mean ± std for one secondary experiment, but the primary evaluation has no confidence intervals or error bars."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No statistical significance tests are used anywhere. All comparative claims (e.g., 'Gemini-3 series demonstrates dominant asset appreciation') are based solely on raw numerical comparisons."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Only raw performance metrics are reported (Net Worth, Income, DAU). No standardized effect sizes, percentage improvements with baseline context, or magnitude-of-difference measures are provided."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for using 5 runs for Vending and only 1 run for Freelance/Operation. No power analysis or discussion of whether the number of trials is sufficient for the claims made."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Main results (Table 2) for Freelance and Operation are explicitly single-run with no variance. Vending is averaged over 5 runs but Table 2 shows no standard deviation. Variance is shown only in appendix figures (Figures 3, 16-18) and one secondary experiment (Table 4 with mean ± std)."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Eleven LLMs are compared against each other across all three environments (Table 2), and a human expert baseline is included for the Operation environment."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Models include GPT-5.2 (2025-12-11), Gemini-3-Pro, Claude-Sonnet-4.5 (2025-09-29), DeepSeek-v3.2, and other recent models with specific version dates from 2025 (Appendix A, Tables 5-6)."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple diagnostic studies: memory module ablation (Table 3, four memory types), context window length sweep (Figure 4, k=32 to 1024), thinking mode on/off (Figure 6), and environment complexity tiers (Table 4)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Three distinct primary metrics across environments: Net Worth (Vending), Income (Freelance), and DAU (Operation). Additional behavioral pattern analysis and failure mode analysis supplement the quantitative metrics."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Human experts were recruited to perform the Operation task via a dedicated GUI (Appendix H), achieving average DAU of 1,404 over ~45 minutes, serving as a human performance baseline."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The benchmark environments are novel evaluation settings. No models were trained or tuned on these environments, so all results are on unseen test conditions with no dev/test contamination."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are reported per-environment (Table 2), per-model behavioral patterns over time (Figure 5, Appendix F), per-complexity tier (Table 4), and per-memory-type (Table 3)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 4.2 'Failure Modes Analysis' provides detailed comparison of top-2 models per scenario, identifying specific failure modes: passive waiting (Gemini-3-Flash in Vending), redundant loops (Gemini-3-Pro in Freelance), and strategic misalignment (runner-up in Operation)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Three models score 0 income in Freelance (Table 2: DeepSeek-v3.2, Grok-4.1-Fast, Kimi-k2). Context window expansion degrades Gemini-3-Pro (Figure 4). Working memory hurts Gemini-3-Pro in Freelance. Gemini-3-Pro stagnates at higher complexity (Table 4)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claim that 'no single model dominates across all three scenarios' is supported by Table 2 showing different leaders per environment (Gemini-3-Pro in Vending, GPT-5-Mini in Freelance, Claude-Sonnet-4.5 in Operation). The 'significant suboptimality' claim is supported by failure mode analysis."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims from ablation-style experiments use controlled single-variable manipulation: thinking on/off (Figure 6), memory module addition (Table 3), context window length variation (Figure 4). Each varies one factor while holding others constant."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims 'Interactive Economies' broadly but tests only 3 specific simulated environments. The conclusion generalizes to 'frontier models struggle to maintain strategic coherence over long-time decisions' from 3 simulated games. The abstract claims a 'generalizable benchmark' without bounding the generalization."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No alternative explanations discussed for observed performance differences. The paper does not consider API configuration effects, model training data composition, whether benchmark design may favor certain architectures, or whether the stochastic environments introduce systematic biases."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures Net Worth, Income, and DAU in simulated games and frames this as evaluating 'long-horizon plan-and-execute' capability in 'realistic economic settings.' The gap between simulated economic game metrics and actual economic decision-making capability is not acknowledged."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Appendix A Tables 5-6 provide specific version identifiers with dates: 'GPT-5.2-2025-12-11', 'GPT-5-Mini-2025-08-07', 'Claude-Sonnet-4-5-20250929', 'Kimi-K2-0905-Preview', plus model repository links for open-weights models."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Full system prompts provided in Appendix C: shared core protocol (C.1), Vending agent + market physics synthesis (C.2), Freelance agent + LLM auditor (C.3), and Operation agent (C.4)."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 4.1: 'Generation parameters are standardized (Temperature=1.0, Top-p=0.95) across all trials.' Context window restricted to 128 steps. Maximum 365 days."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Agent scaffolding described: structured tool-calling mechanism (Appendix C.1), sliding context window of 128 steps, per-environment action schemas (Appendix D, Tables 7-9), and optional memory module architectures with formal definitions (Appendix E)."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Freelance data pipeline described in Section 3.2.2: dataset aggregation → difficulty filtering → strategy router & mutation (scenario injection for code, logic mutation for quantitative) → solvability check via LLM-as-Judge. Vending data synthesis via Perplexity queries + LLM-generated market physics (Appendix C.2)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No dedicated limitations section. The conclusion mentions model capabilities generally but does not substantively discuss limitations of the benchmark design, evaluation methodology, or scope of the findings."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No threats to validity discussed. No analysis of how simulated environments might not reflect real economic dynamics, no discussion of single-run reliability for Freelance/Operation, and no consideration of benchmark design biases."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No explicit scope boundaries stated. The paper does not specify what the results do NOT show, what settings or populations are excluded, or what claims the authors are not making."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw experimental data (agent trajectories, daily metric logs, run outputs) is made available. Only aggregated results in tables and figures are provided. The GitHub repo contains benchmark code but not experimental outputs."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Data collection described per environment: Vending uses Perplexity queries with LLM-synthesized market physics (Section 3.2.1), Freelance aggregates from 8 public datasets with mutation pipeline (Section 3.2.2), Operation uses parametric continuous state space (Section 3.2.3)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "For human baseline testing, the paper states only 'we recruited human experts' with no details about recruitment channels, number of participants, qualifications, or selection criteria."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Freelance data pipeline is well-documented with explicit stages and filtering (Section 3.2.2). The Vending market physics synthesis pipeline is provided with the full prompt (Appendix C.2). Environment state transitions are formally specified (Appendix B)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding information disclosed anywhere in the paper. Authors are from OPPO AI Agent Team, suggesting corporate funding, but no explicit funding statement is provided."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Authors are identified as 'OPPO AI Agent Team' on the title page. Correspondence emails use @oppo.com domain."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "OPPO is a technology company with commercial interests in AI agent capabilities. The benchmark evaluates LLM planning abilities relevant to OPPO's AI agent products. No discussion of funder independence."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement, no patent disclosures, and no financial interest declarations are present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates stated for any of the 11 evaluated models. Appendix A lists API version dates (e.g., GPT-5.2-2025-12-11) but these are release/snapshot dates, not training cutoffs."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No systematic discussion of train/test overlap. The Freelance tasks are derived from widely-used public datasets (SWE-bench, GSM8K, LiveCodeBench) that are likely in model training data. While mutation is applied, overlap is not explicitly discussed."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "Contamination prevention is partially implemented for Freelance tasks via 'Logic Mutation, refactoring numerical values and variables to prevent data contamination or memorization' (Section 3.2.2), but no systematic contamination analysis is performed for the benchmark as a whole. Vending and Operation contamination risks are not discussed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No pre-registration mentioned for the human baseline evaluation."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No IRB or ethics board approval mentioned for the human expert recruitment and evaluation."
    258       },
    259       "demographics_reported": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No demographics reported for human experts. Only described as 'human experts' with no characterization of number of participants, expertise domain, experience level, or background."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No inclusion or exclusion criteria stated for human expert recruitment."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "The human evaluation is a performance measurement task, not a randomized experiment with treatment and control conditions."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "Not applicable — the human evaluation is a performance baseline task, not a comparative experiment requiring blinding."
    278       },
    279       "attrition_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No attrition information provided. The paper notes 'maintaining consistent human attention proved to be a significant challenge' but does not report how many experts started vs. completed the evaluation or any dropout information."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No API costs, token consumption, or per-evaluation costs are reported despite evaluating 11 models across 3 environments with 365-day horizons and multiple runs for some conditions."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget stated. No mention of total API spend, GPU hours, or hardware used for the evaluation campaign."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Five independent trials were conducted for a representative subset of models across all three scenarios. Stochastic stability analysis is shown in Figures 3 and 16-18, revealing that Vending has high variance while Freelance and Operation are more stable."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 4.2: 'we conducted five independent trials for a representative subset of models across all three scenarios.' Main results report 5-run averages for Vending and single runs for Freelance and Operation."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Fixed generation parameters (Temperature=1.0, Top-p=0.95) used without justification for these specific values. No exploration of alternative settings or discussion of why these were chosen."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Standardized generation parameters (Temperature=1.0, Top-p=0.95) used identically across all models and trials, avoiding cherry-picking. The context window default of 128 is explored via ablation (Figure 4)."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No acknowledgment of benchmark designer bias. OPPO designed EcoGym and evaluates all 11 models on it. The potential for benchmark design choices to systematically favor or disfavor certain model capabilities is not discussed."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No analysis of compute budget vs performance. Models with vastly different inference costs (e.g., Qwen3-235B-A22B vs GPT-5-Mini) are compared on raw performance without controlling for or reporting compute differences."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No discussion of whether the three simulated environments actually measure 'long-horizon planning capability' as claimed. No construct validity analysis, no comparison with alternative measures of planning ability, and no questioning of whether simulated economic games proxy real economic decision-making."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "All models use the same unified interface with standardized action schemas (Tables 7-9), identical context window settings (128 steps), the same tool-calling protocol (Appendix C.1), and the same generation parameters, controlling for scaffolding confounds."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. The Freelance tasks derive from datasets (LiveCodeBench, SWE-bench, GSM8K) that predate the evaluated models, and the temporal relationship between source data and model training is not analyzed."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks information through prompts, action schemas, or feedback mechanisms that could advantage models familiar with similar patterns."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of non-independence. The underlying datasets (SWE-bench, GSM8K, FinQA, etc.) used for Freelance tasks are widely used in LLM training and evaluation, and potential overlap is not addressed beyond the mutation step."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": true,
    363         "justification": "Section 3.2.2 describes concrete prevention: 'Logic Mutation, refactoring numerical values and variables to prevent data contamination or memorization' for quantitative tasks, plus 'Scenario Injection' for coding tasks and a 'solvability check via an LLM-as-a-Judge' before task inclusion."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "No single model consistently achieves superior performance across all three EcoGym scenarios.",
    370       "evidence": "Table 2 shows different leaders: Gemini-3-Pro dominates Vending (11274.73), GPT-5-Mini leads Freelance (2990.72), and Claude-Sonnet-4.5 tops Operation (1572.49).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Models exhibit significant suboptimality in either high-level strategies or efficient action execution.",
    375       "evidence": "Failure mode analysis (Section 4.2): Claude-Sonnet-4.5 won Operation via strategic prioritization (quantity over quality), while GPT-5-Mini won Freelance via execution efficiency (precise state tracking). Gemini-3-Pro suffered redundant loops in Freelance.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Expanding the context window does not yield consistent performance gains.",
    380       "evidence": "Figure 4 shows Gemini-3-Pro peaks at k=128 and degrades progressively to k=1024, while Gemini-3-Flash shows volatile performance across window sizes. Only tested on Operation environment with 2 models.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Memory integration generally enhances performance but is not universally beneficial, and optimal memory type is model- and task-dependent.",
    385       "evidence": "Table 3 shows all four memory types improve both models in Vending, but the paper notes performance regression for Gemini-3-Pro with working memory in Freelance. Gemini-3-Flash favors working memory while Gemini-3-Pro benefits from episodic memory.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Thinking mode catalyzes universal performance elevation across model variants.",
    390       "evidence": "Figure 6 shows DAU improvement from 1196.71 to 1398.20 for Gemini-3-Flash and from 1280.75 to 1511.08 for Gemini-3-Pro. Tested only on 2 models in 1 environment (Operation).",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "Current SOTA LLMs have achieved super-human performance in specific long-horizon economic planning scenarios.",
    395       "evidence": "Human experts averaged DAU 1,404 in Operation; Claude-Sonnet-4.5 (1572.49) surpassed this, while Grok-4.1-Fast (1372.77) approached but did not exceed it. Only tested in Operation, against a vaguely described human baseline (unspecified number of experts, no demographics).",
    396       "supported": "weak"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Single-run main results for 2/3 environments",
    402       "detail": "Freelance and Operation main results (Table 2) are from single runs despite the paper acknowledging 'inherent instability' in long-horizon environments. Variance analysis in appendix is only for a subset of models."
    403     },
    404     {
    405       "flag": "Vague human baseline with super-human claims",
    406       "detail": "Human expert comparison lacks critical details: number of participants, expertise qualifications, recruitment method, and practice time are all unreported. A single average DAU number (1,404) from uncharacterized 'human experts' supports the claim of 'super-human performance.'"
    407     },
    408     {
    409       "flag": "No statistical tests for comparative claims",
    410       "detail": "All claims of model superiority are based on raw number comparisons without any significance testing, despite the paper demonstrating high variance in at least one environment (Vending)."
    411     },
    412     {
    413       "flag": "Conflict of interest: company-designed benchmark",
    414       "detail": "OPPO AI Agent Team designed EcoGym, selected the evaluation environments, and evaluates all models. No competing interests statement is provided and no benchmark designer bias is acknowledged."
    415     },
    416     {
    417       "flag": "No limitations section",
    418       "detail": "The paper has no limitations, threats to validity, or scope boundaries section. Simulated economic environments are presented as 'realistic economic settings' without discussing the gap between simulation and real-world economic dynamics."
    419     },
    420     {
    421       "flag": "No cost reporting for large-scale evaluation",
    422       "detail": "Evaluating 11 models across 3 environments with 365-day horizons, multiple ablations, and multiple runs requires substantial API costs, but no cost data is reported."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Vending-bench: A benchmark for long-term coherence of autonomous agents",
    428       "authors": ["Axel Backlund", "Lukas Petersson"],
    429       "year": 2025,
    430       "arxiv_id": "2502.15840",
    431       "relevance": "Direct predecessor benchmark for long-term agent coherence evaluation in economic settings; EcoGym builds upon its methodology."
    432     },
    433     {
    434       "title": "HeroBench: A benchmark for long-horizon planning and structured reasoning in virtual worlds",
    435       "authors": ["Petr Anokhin", "Roman Khalikov", "Stefan Rebrikov"],
    436       "year": 2025,
    437       "arxiv_id": "2508.12782",
    438       "relevance": "Benchmark for long-horizon planning with competitive resource management dynamics."
    439     },
    440     {
    441       "title": "GDPval: Evaluating AI model performance on real-world economically valuable tasks",
    442       "authors": ["Tejal Patwardhan", "Rachel Dias", "Elizabeth Proehl"],
    443       "year": 2025,
    444       "relevance": "Evaluates AI agents on macroeconomically meaningful real-world tasks spanning major economic sectors."
    445     },
    446     {
    447       "title": "RE-Bench: Evaluating frontier AI R&D capabilities of language model agents against human experts",
    448       "authors": ["Hjalmar Wijk", "Tao Lin", "Joel Becker"],
    449       "year": 2024,
    450       "arxiv_id": "2411.15114",
    451       "relevance": "Evaluates frontier AI capabilities on expert-level tasks with human comparison, relevant to agent capability assessment."
    452     },
    453     {
    454       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    455       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    456       "year": 2023,
    457       "arxiv_id": "2310.06770",
    458       "relevance": "Major LLM code generation benchmark; used as task source for EcoGym's Freelance environment."
    459     },
    460     {
    461       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    462       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    463       "year": 2024,
    464       "arxiv_id": "2403.07974",
    465       "relevance": "Contamination-free code evaluation benchmark; used as task source for EcoGym's Freelance environment."
    466     },
    467     {
    468       "title": "WebArena: A realistic web environment for building autonomous agents",
    469       "authors": ["Shuyan Zhou", "Frank F. Xu", "Hao Zhu"],
    470       "year": 2024,
    471       "arxiv_id": "2307.13854",
    472       "relevance": "Realistic web agent evaluation environment relevant to agentic AI capability assessment."
    473     },
    474     {
    475       "title": "Generative agents: Interactive simulacra of human behavior",
    476       "authors": ["Joon Sung Park", "Joseph O'Brien", "Carrie Jun Cai"],
    477       "year": 2023,
    478       "relevance": "Foundational work on LLM-based agents with persistent memory and planning in social and economic interactions."
    479     },
    480     {
    481       "title": "AgentBoard: An analytical evaluation board of multi-turn LLM agents",
    482       "authors": ["Chang Ma", "Junlei Zhang", "Zhihao Zhu"],
    483       "year": 2024,
    484       "arxiv_id": "2401.13178",
    485       "relevance": "Multi-turn LLM agent evaluation framework with fine-grained analytical capabilities."
    486     },
    487     {
    488       "title": "The OpenHands software agent SDK: A composable and extensible foundation for production agents",
    489       "authors": ["Xingyao Wang", "Simon Rosenberg", "Juan Michelini"],
    490       "year": 2025,
    491       "arxiv_id": "2511.03690",
    492       "relevance": "Agent scaffold framework for production AI agents, relevant to agentic workflow evaluation."
    493     },
    494     {
    495       "title": "xBench: Tracking agents productivity scaling with profession-aligned real-world evaluations",
    496       "authors": ["Kaiyuan Chen", "Yixin Ren", "Yang Liu"],
    497       "year": 2025,
    498       "arxiv_id": "2506.13651",
    499       "relevance": "Benchmarks AI agent productivity scaling across professional domains."
    500     },
    501     {
    502       "title": "Remote Labor Index: Measuring AI automation of remote work",
    503       "authors": ["Mantas Mazeika", "Alice Gatti", "Cristina Menghini"],
    504       "year": 2025,
    505       "relevance": "Measures AI potential to automate remote labor, relevant to economic impact evaluation of AI agents."
    506     }
    507   ]
    508 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs