scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28880B)
      1 {
      2   "paper": {
      3     "title": "Magentic Marketplace: An Open-Source Environment for Studying Agentic Markets",
      4     "authors": [
      5       "Gagan Bansal",
      6       "Wenyue Hua",
      7       "Zezhou Huang",
      8       "Adam Fourney",
      9       "Amanda Swearngin",
     10       "Will Epperson",
     11       "Tyler Payne",
     12       "Jake M. Hofman",
     13       "Brendan Lucier",
     14       "Chinmay Singh",
     15       "Markus Mobius",
     16       "Akshay Nambi",
     17       "Archana Yadav",
     18       "Kevin Gao",
     19       "David M. Rothschild",
     20       "Aleksandrs Slivkins",
     21       "Daniel G. Goldstein",
     22       "Hussein Mozannar",
     23       "Nicole Immorlica",
     24       "Maya Murad",
     25       "Matthew Vogel",
     26       "Subbarao Kambhampati",
     27       "Eric Horvitz",
     28       "Saleema Amershi"
     29     ],
     30     "year": 2025,
     31     "venue": "arXiv preprint",
     32     "arxiv_id": "2510.25779"
     33   },
     34   "checklist": {
     35     "artifacts": {
     36       "code_released": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper states 'Available open-source at https://github.com/microsoft/multi-agent-marketplace' (Section 1, footnote 1) and Contribution 3 explicitly states they 'open-source Magentic Marketplace.'"
     40       },
     41       "data_released": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper uses fully synthetic data generated through a pipeline described in Section 4.1, but there is no explicit statement that the generated datasets (the 33-customer/99-business and 100-customer/300-business scenarios) are included in the repository. The paper says 'See our repository for additional synthetic domain data and experiments' but does not confirm whether the exact experimental datasets are released."
     45       },
     46       "environment_specified": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned in the paper. The paper mentions vLLM for open-source models but does not provide a comprehensive environment setup with library versions."
     50       },
     51       "reproduction_instructions": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No step-by-step reproduction instructions are described in the paper. While the methodology is explained at a high level, there are no specific commands, scripts, or a 'Reproducing Results' section."
     55       }
     56     },
     57     "statistical_methodology": {
     58       "confidence_intervals_or_error_bars": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper reports results with error bars in Figures 4, 5, 7, 8, and 9. Section 4.2 states 'All experiments were conducted with 5 independent runs, with mean and standard deviation reported.'"
     62       },
     63       "significance_tests": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper makes comparative claims (e.g., 'GPT-4.1 and Gemini-2.5-Flash come very close to the optimal outcome') but does not report any statistical significance tests (no p-values, t-tests, or other formal comparisons)."
     67       },
     68       "effect_sizes_reported": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper reports concrete effect sizes with context: 'consumer welfare declines by 4.3% when providing one hundred search results versus three' (Section 5.2), 'first proposals achieving selection rates between 60-100% compared to near-zero selection for third proposals. This represents a 10-30 fold advantage' (Section 5.4), and 'Sonnet-4 - 65.4%, GPT-5 - 44%' decline figures."
     72       },
     73       "sample_size_justified": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The paper uses 5 independent runs per condition and two market scales (33/99 and 100/300) but provides no justification for why these specific sizes were chosen, nor any power analysis."
     77       },
     78       "variance_reported": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Section 4.2 explicitly states 'All experiments were conducted with 5 independent runs, with mean and standard deviation reported.' Standard deviation is visible in the box plots in the figures."
     82       }
     83     },
     84     "evaluation_design": {
     85       "baselines_included": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 2 defines four baselines: Random w/ items only, Cheapest w/ items & prices, Random w/ items & amenities, and Optimal. These are compared against the agentic conditions in Figure 4."
     89       },
     90       "baselines_contemporary": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The baselines represent different information conditions in the marketplace (random selection, price-optimized, amenity-aware, and theoretical optimal). These are appropriate structural baselines for a novel environment since there are no prior systems to compare against. The paper also compares multiple contemporary frontier models (GPT-4.1, GPT-5, Sonnet-4, Sonnet-4.5, Gemini-2.5-Flash)."
     94       },
     95       "ablation_study": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The experimental design systematically varies components: search condition (perfect vs. lexical), consideration set size (3 to 100), and manipulation conditions (6 strategies). The welfare experiments with and without perfect search effectively ablate the discovery layer. The consideration set experiments ablate the impact of information overload."
     99       },
    100       "multiple_metrics": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper uses multiple metrics: consumer welfare/utility (Section 5.1), mean payments to manipulated businesses (Section 5.3), position selection rates (Section 5.4), proposal selection rates (Section 5.4), and number of businesses contacted (Figure 6)."
    104       },
    105       "human_evaluation": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "The paper mentions 'Our manual evaluation of Qwen3-14b revealed significant performance limitations' and 'manual analysis of seven trials' (Section 5.1), but this is only an informal diagnostic for one model's failure modes, not a systematic human evaluation of the system's outputs or agent decisions."
    109       },
    110       "held_out_test_set": {
    111         "applies": false,
    112         "answer": false,
    113         "justification": "This is a simulation-based study where agents operate in a synthetic marketplace environment. There is no train/test split paradigm; the evaluation is based on agent performance in the simulated market."
    114       },
    115       "per_category_breakdown": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Results are broken down per model (GPT-4o, GPT-4.1, GPT-5, Gemini-2.5-Flash, etc.), per domain (Mexican restaurants, Contractors), per experimental condition (perfect search vs. lexical search), and per manipulation strategy (control, authority, social proof, loss aversion, prompt injection basic/strong)."
    119       },
    120       "failure_cases_discussed": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 5.1 discusses Qwen3-14b failure modes in detail: 'premature termination without completing payment, role confusion where it critiqued its own wrong actions while simultaneously executing them, and excessive purchasing without selection criteria.' Section 5.4 discusses the universal first-proposal bias as a systematic failure."
    124       },
    125       "negative_results_reported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Several negative results are reported: the paradox of choice effect where more options reduced welfare (Section 5.2), severe first-proposal bias across all models (Section 5.4), manipulation vulnerability in smaller/older models (Section 5.3), and Qwen3-14b's fundamental inability to navigate the marketplace."
    129       }
    130     },
    131     "claims_and_evidence": {
    132       "abstract_claims_supported": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The abstract claims that 'frontier models can approach optimal welfare—but only under ideal search conditions' (supported by Figure 4), 'Performance degrades sharply with scale' (supported by consideration set experiments in Figure 5), and 'all models exhibit severe first-proposal bias, creating 10-30x advantages for response speed over quality' (supported by Figure 9 in Section 5.4). All claims are substantiated in the results."
    136       },
    137       "causal_claims_justified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper makes causal claims about the effect of consideration set size on welfare and of manipulation strategies on agent behavior. These are supported by controlled experiments where individual variables are manipulated while others are held constant. The experimental design (Table 2) systematically isolates factors. The ablation-style comparison between perfect search and lexical search is a controlled single-variable manipulation."
    141       },
    142       "generalization_bounded": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "While the paper notes that it uses fully synthetic data and focuses on restaurants and contractors domains, the title 'An Open-Source Environment for Studying Agentic Markets' and broad claims like 'These findings reveal how behaviors emerge across market conditions, informing the design of fair and efficient agentic marketplaces' extend beyond what was tested. The paper does not systematically bound its claims to the tested setting (synthetic data, two specific domains, static markets)."
    146       },
    147       "alternative_explanations_discussed": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 5.2 discusses alternative explanations for the paradox of choice: 'We hypothesize that this effect may not be inherent to agentic markets; rather, it could arise from an interplay between limitations of agents and our specific design decisions.' Section 5.1 discusses prompt-model misalignment as an alternative explanation for Qwen3-14b failures but then provides evidence for more fundamental issues."
    151       }
    152     },
    153     "setup_transparency": {
    154       "model_versions_specified": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper uses marketing names without specific version identifiers: 'GPT-4o', 'GPT-4.1', 'GPT-5', 'Gemini-2.5-Flash', 'Sonnet-4', 'Sonnet-4.5'. No API snapshot dates or specific model version strings (e.g., 'gpt-4o-2024-08-06') are provided. The citations point to model cards but the actual versions used in experiments are not specified."
    158       },
    159       "prompts_provided": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper describes agent roles and behaviors at a high level but does not provide the actual system prompts or instructions given to the LLM agents. Section 4.2 mentions agents are 'instructed to maximize utility' but the actual prompt text is not provided in the paper or appendix."
    163       },
    164       "hyperparameters_reported": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "No temperature, top-p, max tokens, or other hyperparameter settings are reported for any of the models used. The paper mentions using vLLM for open-source models and YaRN for context extension but does not specify sampling parameters."
    168       },
    169       "scaffolding_described": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The paper provides detailed descriptions of the agentic scaffolding: the three-endpoint architecture (register, protocol, action), the five core actions (search, send text, send order proposals, send payments, receive), the action-observation loop, and the REST API specifications in Table 1. Figure 2 and Figure 3 show detailed architecture diagrams. The ReAct-style agent loop is mentioned in Section 5.1."
    173       },
    174       "data_preprocessing_documented": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 4.1 describes the three-stage synthetic data generation pipeline in detail: (1) Item/Service Universe Construction with price distributions, (2) Customer Synthesis with item sampling and constraints, (3) Business Synthesis with controlled matching properties. The generation process is fully documented with parameter choices (e.g., 1-3 items, 1-2 amenities, α=2)."
    178       }
    179     },
    180     "limitations_and_scope": {
    181       "limitations_section_present": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion section (Section 6) mentions some limitations in passing (e.g., 'our experiments focused on static markets') but there is no substantive dedicated section."
    185       },
    186       "threats_to_validity_specific": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No specific threats to validity are systematically discussed. The paper does not address issues such as the generalizability of synthetic data, the limited number of runs (5), the potential impact of prompt design on results, or the representativeness of the restaurant/contractor domains."
    190       },
    191       "scope_boundaries_stated": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "While the Discussion mentions 'our experiments focused on static markets' and notes future extensions to dynamic markets and human-in-the-loop designs, the paper does not explicitly state what the results do NOT show. There are no systematic scope boundaries like METR's Table 2 ('What the evidence does not show')."
    195       }
    196     },
    197     "data_integrity": {
    198       "raw_data_available": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No raw experimental data (agent conversation logs, transaction records, per-trial results) is made available. Only aggregated results are shown in figures."
    202       },
    203       "data_collection_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 4.1 describes the synthetic data generation pipeline in detail, and Section 4.2 describes the experimental protocol including how agents interact with the marketplace, the utility function (Equation 1), and the evaluation conditions (Table 2)."
    207       },
    208       "recruitment_methods_described": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No human participants were involved. The study uses synthetic customer and business data with LLM agents. Data source is a fully synthetic generation pipeline, not a standard benchmark."
    212       },
    213       "data_pipeline_documented": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The data generation pipeline is documented in Section 4.1 with three explicit stages and parameters. The experimental pipeline (how agents interact, make decisions, and complete transactions) is documented in Sections 3 and 4."
    217       }
    218     },
    219     "conflicts_of_interest": {
    220       "funding_disclosed": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding disclosure or acknowledgment of financial support is present. The Acknowledgements section (Section 8) thanks individuals for conversations and feedback but does not mention funding sources."
    224       },
    225       "affiliations_disclosed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Author affiliations are clearly stated: Microsoft for most authors, Arizona State University for Subbarao Kambhampati. The roles are also specified (Core Contributors, Contributors, Technical Program Managers, Advisors, Principal Investigator)."
    229       },
    230       "funder_independent_of_outcome": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper is authored primarily by Microsoft employees and evaluates models that include the Microsoft-affiliated (OpenAI) GPT-family models (GPT-4o, GPT-4.1, GPT-5, GPT-OSS-20b) via their API. Microsoft has a direct financial interest in the perception of these models' capabilities. No acknowledgment of this conflict is made."
    234       },
    235       "financial_interests_declared": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No competing interests or financial interests statement is present in the paper. Given that the authors are Microsoft employees evaluating Microsoft-affiliated models and proposing an environment that could influence how agentic markets are built, a financial interests declaration would be expected."
    239       }
    240     },
    241     "contamination": {
    242       "training_cutoff_stated": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The paper does not evaluate pre-trained model capabilities on a benchmark. It evaluates agent behavior in a novel simulated marketplace environment with fully synthetic data. The agents' performance depends on reasoning and tool-use capabilities, not on memorized benchmark answers."
    246       },
    247       "train_test_overlap_discussed": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "Not applicable. The marketplace environment is novel and the data is fully synthetic, generated for this study. There is no risk of train/test overlap in the traditional sense."
    251       },
    252       "benchmark_contamination_addressed": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "Not applicable. The evaluation uses a novel simulated environment with synthetic data, not an established benchmark that could have appeared in training data."
    256       }
    257     },
    258     "human_studies": {
    259       "pre_registered": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants were involved in the study. All agents are LLM-based and interact with synthetic data."
    263       },
    264       "irb_or_ethics_approval": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants were involved. The study uses fully synthetic data and LLM agents."
    268       },
    269       "demographics_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants were involved."
    273       },
    274       "inclusion_exclusion_criteria": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants were involved."
    278       },
    279       "randomization_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants were involved."
    283       },
    284       "blinding_described": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants were involved."
    288       },
    289       "attrition_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants were involved."
    293       }
    294     },
    295     "cost_and_practicality": {
    296       "inference_cost_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No inference costs, API costs, or latency figures are reported. The paper runs experiments with multiple frontier models (including GPT-5, Sonnet-4.5) across many conditions with 5 runs each, but the total cost is not disclosed."
    300       },
    301       "compute_budget_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No total computational budget, GPU hours, API spend, or hardware specifications are provided. The paper mentions using vLLM for open-source models but does not specify what hardware was used or the total compute requirements."
    305       }
    306     }
    307   },
    308   "claims": [
    309     {
    310       "claim": "Frontier models can approach optimal welfare outcomes under ideal (perfect) search conditions in two-sided agentic markets.",
    311       "evidence": "Figure 4 shows GPT-4.1 and Gemini-2.5-Flash approaching the optimal dashed line under the Agentic: Perfect search condition in both Mexican restaurant and Contractor domains (Section 5.1).",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "Performance degrades sharply with increased consideration set size, exhibiting a 'paradox of choice' effect.",
    316       "evidence": "Figure 5 shows consumer welfare declining as consideration set size increases from 3 to 100. GPT-4o shows 4.3% decline; Sonnet-4 shows 65.4% decline; GPT-5 shows 44% decline on Mexican 100-300 (Section 5.2).",
    317       "supported": "strong"
    318     },
    319     {
    320       "claim": "All models exhibit severe first-proposal bias, creating 10-30x advantages for response speed over quality.",
    321       "evidence": "Figure 9 shows first-proposal selection rates between 60-100% across all models, with near-zero rates for third proposals. GPT-4o and Sonnet-4.5 showed 100% first-proposal selection in some conditions (Section 5.4).",
    322       "supported": "strong"
    323     },
    324     {
    325       "claim": "Frontier models demonstrate robust manipulation resistance while smaller/older models are significantly vulnerable.",
    326       "evidence": "Figure 7 shows GPT-4.1, Sonnet-4.5, and Gemini-2.5-Flash maintaining stable performance across manipulation conditions, while GPT-4o, GPT-OSS-20b, and Qwen3-4b-2507 showed significant vulnerability (Section 5.3).",
    327       "supported": "strong"
    328     },
    329     {
    330       "claim": "Two-sided agentic markets can achieve reasonable welfare outcomes by reducing information asymmetries through agent-mediated communication.",
    331       "evidence": "Section 5.1 shows that under lexical search, proprietary models outperform two of three baselines (random selection and cheapest-by-price), demonstrating agents can navigate noisy discovery and make quality decisions.",
    332       "supported": "moderate"
    333     },
    334     {
    335       "claim": "Open-source models (GPT-OSS-20b, Qwen3-4b) can approach proprietary model performance under perfect search but degrade notably under lexical search.",
    336       "evidence": "Section 5.1 and Figure 4 show GPT-OSS-20b outperforming GPT-4o under both search conditions in the Mexican dataset, but the paper notes 'an overall notable drop under lexical search' for open-source models generally.",
    337       "supported": "moderate"
    338     }
    339   ],
    340   "methodology_tags": [
    341     "benchmark-eval",
    342     "case-study"
    343   ],
    344   "key_findings": "Magentic Marketplace is an open-source simulated environment for studying LLM agent behavior in two-sided economic markets. Experiments across multiple frontier and open-source models reveal that agents can approach optimal welfare under ideal search conditions, but performance degrades with larger consideration sets (a 'paradox of choice'). All tested models exhibit severe first-proposal bias (60-100% selection of first offers), creating 10-30x advantages for response speed over quality. Frontier models show robust manipulation resistance, but smaller models remain vulnerable to both psychological tactics and prompt injection attacks.",
    345   "red_flags": [
    346     {
    347       "flag": "Company evaluating its own models",
    348       "detail": "Microsoft employees evaluate GPT-family models (GPT-4o, GPT-4.1, GPT-5, GPT-OSS-20b) which are Microsoft/OpenAI products, alongside competitors. No conflict of interest statement is provided. The environment design itself could favor certain model architectures."
    349     },
    350     {
    351       "flag": "No significance testing",
    352       "detail": "Despite comparative claims across models and conditions, no statistical significance tests are reported. With only 5 runs per condition, the observed differences may not be statistically significant, particularly for close comparisons."
    353     },
    354     {
    355       "flag": "Missing model versions and hyperparameters",
    356       "detail": "No specific model version strings (API snapshots) or sampling hyperparameters (temperature, top-p) are reported. Different temperature settings could substantially affect agent behavior, making results difficult to reproduce."
    357     },
    358     {
    359       "flag": "No limitations section",
    360       "detail": "The paper lacks a dedicated limitations or threats-to-validity section despite making broad claims about agentic market design. Key limitations such as synthetic-only data, limited domains, static markets, and prompt sensitivity are not systematically acknowledged."
    361     },
    362     {
    363       "flag": "Missing prompts",
    364       "detail": "The actual system prompts given to LLM agents are not provided, despite being critical to agent behavior. The paper describes agent roles at a high level but readers cannot reproduce the exact experimental setup without knowing the prompt text."
    365     }
    366   ],
    367   "cited_papers": [
    368     {
    369       "title": "Cooperation, competition, and maliciousness: LLM-stakeholders interactive negotiation",
    370       "authors": ["Sahar Abdelnabi", "Amr Gomaa", "Sarath Sivaprasad", "Lea Schönherr", "Mario Fritz"],
    371       "year": 2024,
    372       "relevance": "Studies LLM agent behavior in negotiation games, directly relevant to understanding agent capabilities in economic interactions."
    373     },
    374     {
    375       "title": "Multi-agent risks from advanced AI",
    376       "authors": ["Lewis Hammond", "Alan Chan", "Jesse Clifton"],
    377       "year": 2025,
    378       "arxiv_id": "2502.14143",
    379       "relevance": "Surveys risks from multi-agent AI systems, relevant to safety and alignment concerns in agentic markets."
    380     },
    381     {
    382       "title": "Generative agents: Interactive simulacra of human behavior",
    383       "authors": ["Joon Sung Park", "Joseph C. O'Brien", "Carrie J. Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S. Bernstein"],
    384       "year": 2023,
    385       "arxiv_id": "2304.03442",
    386       "relevance": "Foundational work on LLM-based agents simulating human behavior, relevant to understanding agent capabilities in social and economic settings."
    387     },
    388     {
    389       "title": "The agentic economy",
    390       "authors": ["David M. Rothschild", "Markus Mobius", "Jake M. Hofman"],
    391       "year": 2025,
    392       "arxiv_id": "2505.15799",
    393       "relevance": "Conceptual framework for agentic economies and two-sided agentic markets, directly motivates the Magentic Marketplace work."
    394     },
    395     {
    396       "title": "Algorithmic collusion by large language models",
    397       "authors": ["Yannai A. Gonczarowski", "Ran I. Shorrer", "Sara Fish"],
    398       "year": 2024,
    399       "relevance": "Studies LLM collusion in economic settings, relevant to understanding risks in agentic markets."
    400     },
    401     {
    402       "title": "What is your AI agent buying? Evaluation, implications, and emerging questions for agentic e-commerce",
    403       "authors": ["Amine Allouah", "Omar Besbes", "Josue D Figueroa", "Yash Kanoria", "Akshit Kumar"],
    404       "year": 2025,
    405       "arxiv_id": "2508.02630",
    406       "relevance": "Evaluates AI agent behavior in e-commerce, directly relevant to understanding agentic purchasing decisions."
    407     },
    408     {
    409       "title": "From LLM reasoning to autonomous AI agents: A comprehensive review",
    410       "authors": ["Mohamed Amine Ferrag", "Norbert Tihanyi", "Merouane Debbah"],
    411       "year": 2025,
    412       "arxiv_id": "2504.19678",
    413       "relevance": "Comprehensive survey of autonomous AI agents, relevant to understanding the landscape of agentic AI capabilities."
    414     },
    415     {
    416       "title": "A survey of AI agent protocols",
    417       "authors": ["Yingxuan Yang", "Huacan Chai", "Yuanyi Song"],
    418       "year": 2025,
    419       "arxiv_id": "2504.16736",
    420       "relevance": "Surveys communication protocols for AI agents, relevant to the protocol design aspects of agentic markets."
    421     },
    422     {
    423       "title": "CRMArena: Understanding the capacity of LLM agents to perform professional CRM tasks in realistic environments",
    424       "authors": ["Kung-Hsiang Huang", "Akshara Prabhakar", "Sidharth Dhawan"],
    425       "year": 2024,
    426       "arxiv_id": "2411.02305",
    427       "relevance": "Evaluates LLM agents in professional CRM tasks, relevant to understanding agent capability in business interactions."
    428     },
    429     {
    430       "title": "The AI economist: Improving equality and productivity with AI-driven tax policies",
    431       "authors": ["Stephan Zheng", "Alexander Trott", "Sunil Srinivasa", "Nikhil Naik"],
    432       "year": 2020,
    433       "arxiv_id": "2004.13332",
    434       "relevance": "Foundational work on AI agents in economic simulation environments, directly relevant to the economic simulation paradigm."
    435     },
    436     {
    437       "title": "Virtual agent economies",
    438       "authors": ["Nenad Tomasev", "Matija Franklin", "Joel Z. Leibo"],
    439       "year": 2025,
    440       "arxiv_id": "2509.10147",
    441       "relevance": "Studies virtual economies with AI agents, directly relevant to understanding market dynamics in agentic settings."
    442     },
    443     {
    444       "title": "STRIDE: A tool-assisted LLM agent framework for strategic and interactive decision-making",
    445       "authors": ["Chuanhao Li", "Runhan Yang", "Tiankai Li"],
    446       "year": 2024,
    447       "arxiv_id": "2405.16376",
    448       "relevance": "LLM agent framework for strategic decision-making, relevant to agent architecture for economic interactions."
    449     }
    450   ]
    451 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs