scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28529B)
      1 {
      2   "paper": {
      3     "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks",
      4     "authors": [
      5       "Adam Fourney",
      6       "Gagan Bansal",
      7       "Hussein Mozannar",
      8       "Cheng Tan",
      9       "Eduardo Salinas",
     10       "Erkang (Eric) Zhu",
     11       "Friederike Niedtner",
     12       "Grace Proebsting",
     13       "Griffin Bassman",
     14       "Jack Gerrits",
     15       "Jacob Alber",
     16       "Peter Chang",
     17       "Ricky Loynd",
     18       "Robert West",
     19       "Victor Dibia",
     20       "Ahmed Awadallah",
     21       "Ece Kamar",
     22       "Rafah Hosn",
     23       "Saleema Amershi"
     24     ],
     25     "year": 2024,
     26     "venue": "arXiv",
     27     "arxiv_id": "2411.04468"
     28   },
     29   "checklist": {
     30     "artifacts": {
     31       "code_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper states 'we provide an open-source implementation of Magentic-One' and links to https://aka.ms/magentic-one. AutoGenBench is also released at https://aka.ms/agbench. The AutoGen framework is at https://github.com/microsoft/autogen."
     35       },
     36       "data_released": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The benchmarks used (GAIA, AssistantBench, WebArena) are all publicly available datasets. The paper uses standard public benchmarks without modification (aside from a train/test split for WebArena, which is described reproducibly via MD5 hashing)."
     40       },
     41       "environment_specified": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper mentions Docker containers for isolation and AutoGen v0.4, but does not provide a requirements.txt, Dockerfile, conda environment, or detailed dependency listing with library versions."
     45       },
     46       "reproduction_instructions": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper refers readers to the open-source release for 'detailed empirical performance evaluations of Magentic-One, including ablations and error analysis' but the paper itself does not contain step-by-step reproduction instructions. The reader is directed to the URL https://aka.ms/magentic-one."
     50       }
     51     },
     52     "statistical_methodology": {
     53       "confidence_intervals_or_error_bars": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Table 1 includes '95% error bars as ± using the Wald interval method' for all reported results. Appendix A discusses the statistical methodology including Wald and Wilson intervals."
     57       },
     58       "significance_tests": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper uses z-tests for proportions (α=0.05) to compare Magentic-One to baselines. Appendix A describes the methodology: 'We used the z-test to compare the accuracy of Magentic-One to each baseline in Table 1.' Results that are statistically comparable or statistically exceed are marked in Table 1."
     62       },
     63       "effect_sizes_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Raw performance percentages with baselines are reported (e.g., '38% on GAIA', '32.8% on WebArena', '27.7% on AssistantBench'), along with baseline numbers providing full context for the magnitude of differences. For example, Table 1 shows Magentic-One at 38.00% vs. omne at 40.53% on GAIA."
     67       },
     68       "sample_size_justified": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper uses the benchmark sizes as given (GAIA test: 300, AssistantBench test: 181, WebArena: 812) without discussing whether these sizes provide adequate statistical power for the comparisons being made. The z-test discussion in Appendix A acknowledges limitations but does not justify sample sizes."
     72       },
     73       "variance_reported": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The paper does not report variance or standard deviation across repeated experimental runs. Although AutoGenBench is described as supporting 'repetition' for computing variance, the main results in Table 1 appear to be single-run numbers with Wald intervals (which are derived from the proportion, not from repeated experiments). No multi-run variance is reported."
     77       }
     78     },
     79     "evaluation_design": {
     80       "baselines_included": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Table 1 includes multiple baselines per benchmark: for GAIA (omne, Trase Agent, Multi Agent, das agent, Sibyl, HF Agents, FRIDAY, GPT-4 + plugins), for AssistantBench (SPA→CB, Infogent), and for WebArena (Jace.AI, WebPilot, AWM, SteP, BrowserGym). Human performance is also included where available."
     84       },
     85       "baselines_contemporary": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Baselines are drawn from leaderboards 'as of October 21, 2024' and include contemporary systems like omne v0.1, Trase Agent v0.2, WebPilot, and Jace.AI. The baselines are state-of-the-art entries from the same period."
     89       },
     90       "ablation_study": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 5.3 presents ablation experiments on the GAIA validation set: replacing the Orchestrator with a simple GroupChat mechanism, and removing individual agents (WebSurfer, FileSurfer, Coder+Terminal). Results are broken down by difficulty level and required capabilities (Figure 3)."
     94       },
     95       "multiple_metrics": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "For AssistantBench, both exact match (EM) and a softer accuracy metric are reported. For GAIA and WebArena, task completion rate is the standard metric for those benchmarks. Additionally, per-category breakdowns (Table 2) and per-difficulty breakdowns provide multiple perspectives."
     99       },
    100       "human_evaluation": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No human evaluation of the system's outputs was performed. The error analysis in Section 5.4 uses GPT-4o for automated log analysis ('we opted to automate log analysis using LLMs') rather than human judges. Human performance is referenced as a baseline from prior work but no humans evaluated Magentic-One's outputs."
    104       },
    105       "held_out_test_set": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "For GAIA and AssistantBench, results are reported on hidden test sets with answers withheld. For WebArena, the authors created their own validation/test split using MD5 hashing of template IDs and note that 'the test set was evaluated only once.' They also report the validation-to-test gap (35.1% vs 30.5%)."
    109       },
    110       "per_category_breakdown": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 2 provides per-category breakdowns: GAIA by difficulty level (1-3), AssistantBench by difficulty (Easy/Medium/Hard), and WebArena by domain (Reddit, Shopping, CMS, Gitlab, Maps, Cross Site). Figure 3 shows ablation results broken down by difficulty level and required capabilities."
    114       },
    115       "failure_cases_discussed": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 5.4 presents a detailed error analysis with automatically discovered error codes (persistent-inefficient-actions, insufficient-verification-steps, etc.). Appendix C provides a full codebook with definitions and examples. Section 6.3 discusses specific failure modes including risky agent behaviors."
    119       },
    120       "negative_results_reported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper reports negative findings: the validation-to-test performance drop on WebArena suggesting mild overfitting (35.1% to 30.5%), o1-preview refusing 26% of WebArena Gitlab tasks making fair comparison impossible, and the observation that Magentic-One is less competitive against baselines on easy tasks than on hard tasks. The limitations section (6.2) also enumerates several weaknesses."
    124       }
    125     },
    126     "claims_and_evidence": {
    127       "abstract_claims_supported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The abstract claims Magentic-One 'achieves statistically competitive performance to the state-of-the-art on three diverse and challenging agentic benchmarks.' Table 1 with z-tests supports this claim — results are statistically comparable to SOTA on GAIA and AssistantBench, though on WebArena WebPilot and Jace.AI statistically exceed Magentic-One's performance. The abstract appropriately uses 'statistically competitive' rather than 'state-of-the-art.'"
    131       },
    132       "causal_claims_justified": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Causal claims are primarily made through ablation studies (Section 5.3), which show the impact of removing specific agents. These are controlled single-variable manipulations. For example, 'removing any single agent reduces performance by between 21% to 39%.' The ablation design is adequate for these claims."
    136       },
    137       "generalization_bounded": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper appropriately bounds its claims. The title says 'generalist' but the paper demonstrates this across three specific benchmarks. Section 6.2 Limitations explicitly lists accuracy-focused evaluation, limited modalities, limited action space, limited coding capabilities, and fixed team membership. The WebArena overfitting observation (Section 5.2) shows awareness of generalization limits."
    141       },
    142       "alternative_explanations_discussed": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper discusses several alternative explanations: that WebArena leaderboard results may benefit from overfitting since there is no hidden test set (Section 5.2), that Magentic-One's stronger showing relative to baselines on hard vs. easy tasks may be due to fixed overhead (Section 5.2), and that the Orchestrator's chain-of-thought prompting (rather than the multi-agent design per se) could contribute to performance. Appendix A discusses limitations of the z-test."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.1 states 'the default multimodal LLM we use for all agents (except the ComputerTerminal) is gpt-4o-2024-05-13.' The o1-preview model is specified by name with a footnote linking to its announcement. The snapshot date '2024-05-13' provides a specific version."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper describes the prompts in natural language (e.g., describing the task ledger questions and progress ledger questions in Section 4.1) and includes a brief WebArena-specific prompt snippet. However, the full prompt text for the Orchestrator, WebSurfer, FileSurfer, and Coder agents is not provided in the paper or appendix. The reader is referred to the open-source code."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No temperature, top-p, max_tokens, or other LLM API hyperparameters are reported. The stall counter threshold (≤2) is mentioned, and time budgets (e.g., 25 minutes) are referenced, but core LLM generation parameters are not specified."
    163       },
    164       "scaffolding_described": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 4 provides a detailed description of the multi-agent scaffolding: the Orchestrator's outer and inner loops, task and progress ledgers, stall counter mechanism, agent selection, and error recovery. Section 4.2 describes each agent's capabilities and action spaces. Figure 2 provides a workflow diagram."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The WebArena train/test split procedure is documented (MD5 hash of template ID, digits 0-7 to validation, rest to test). The GAIA and AssistantBench splits are standard. Setup code modifications for each benchmark (answer formatting, login instructions, Postmill/Reddit confusion) are described in Section 5.1."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6.2 is explicitly titled 'Limitations' and provides a detailed enumeration of seven specific limitations: accuracy-focused evaluation, high cost/latency, limited modalities, limited action space, limited coding capabilities, fixed team membership, and limited learning."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The limitations are specific to this system. For example: 'the Coder is ill-suited to operate over existing complex, or multi-file, code bases' (Section 6.2), WebArena lacking a hidden test set enabling potential overfitting (Section 5.2), o1-preview refusing 26% of GitLab tasks, and the WebArena validation-to-test gap. These are concrete, study-specific threats."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 6.2 explicitly states what the system does not handle: cannot process videos, cannot hover/drag web elements, cannot handle all document types, cannot use API keys/databases, coding is limited to standalone Python scripts, and agents do not learn across tasks. The paper is clear about what it does not claim."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The paper does not release raw experimental logs, per-task results, or the automated error analysis data. Test set results for GAIA and AssistantBench are hidden by the benchmark providers. The open-source release includes code but not the raw experimental data."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The benchmarks used are well-described public datasets (GAIA: 465 QA pairs, AssistantBench: 214 QA pairs, WebArena: 812 tasks). The paper describes what each benchmark contains, how tasks are structured, and how evaluation works. Section 5.1 explains the experimental setup including Docker containers and AutoGenBench."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants were recruited for this study. The benchmarks are standard public datasets; human performance numbers are cited from prior work."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The evaluation pipeline is documented: AutoGenBench initializes Docker containers for each task, runs the agent system, logs results to a central location, and applies benchmark-specific evaluation functions. The WebArena split procedure is documented. The error analysis pipeline (postmortem generation, code assignment, iterative clustering) is described in Section 5.4."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding or acknowledgments section is present in the paper. All authors are listed as 'Microsoft Research AI Frontiers' but no explicit funding disclosure is provided."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "All authors are listed as affiliated with 'Microsoft Research AI Frontiers.' This is clearly stated on the first page."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper is authored entirely by Microsoft Research employees. Microsoft has a financial interest in the AutoGen framework and related AI products. The system is evaluated using OpenAI's GPT-4o (Microsoft has a major investment in OpenAI). The funder (Microsoft) is not independent of the outcome."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is present in the paper. Given that all authors are Microsoft employees evaluating a Microsoft Research product built on Microsoft-invested OpenAI models, a conflict-of-interest declaration would be expected."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper uses GPT-4o (gpt-4o-2024-05-13) and o1-preview for benchmark evaluation but does not state the training data cutoff dates for these models. This is relevant because GAIA and WebArena tasks could potentially overlap with training data."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether GAIA, AssistantBench, or WebArena tasks could have appeared in GPT-4o's training data. The paper does not address potential memorization of benchmark questions."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "GAIA was published in 2023, before GPT-4o's training cutoff. WebArena was also published in 2023. The paper does not discuss whether these benchmark examples could have been seen during model training."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study. The paper evaluates an automated system on standardized benchmarks."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Section 6.2 acknowledges 'incurring perhaps several US dollars, and tens of minutes per task' but this is a qualitative estimate in the limitations section, not a quantified measurement. No per-task cost data, token counts, or API spend are reported."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No total computational budget, API spend, or hardware specifications are stated. The experiments were 'conducted between August and October 2024' but no compute costs are quantified."
    300       }
    301     }
    302   },
    303   "claims": [
    304     {
    305       "claim": "Magentic-One achieves statistically competitive performance to the state-of-the-art on GAIA, AssistantBench, and WebArena benchmarks.",
    306       "evidence": "Table 1 shows Magentic-One (GPT-4o, o1) at 38.00% on GAIA (vs. omne at 40.53%), 13.3% EM / 27.7% accuracy on AssistantBench (vs. SPA→CB at 13.8%/26.4%), and 32.8% on WebArena (GPT-4o only, vs. WebPilot at 37.2%). Z-tests with α=0.05 confirm statistical comparability for GAIA and AssistantBench. On WebArena, WebPilot and Jace.AI statistically exceed Magentic-One.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Magentic-One's modular design allows agents to be added or removed without additional prompt tuning or training.",
    311       "evidence": "The ablation study (Section 5.3, Figure 3) demonstrates the system functions with agents removed, and the paper notes agents compensated creatively for missing capabilities. However, no experiment shows adding new agents without prompt changes.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "All agents in Magentic-One contribute to performance, with the Orchestrator's ledgers being critical.",
    316       "evidence": "Section 5.3 shows that replacing the Orchestrator with a simple GroupChat drops performance by 31%. Removing individual agents reduces performance by 21-39%. Figure 3 shows ablation results by difficulty level and capabilities.",
    317       "supported": "strong"
    318     },
    319     {
    320       "claim": "Magentic-One demonstrates generalization by achieving strong performance across all three benchmarks without modification to core agent capabilities.",
    321       "evidence": "Section 5.1 states 'An identical configuration of Magentic-One was used for all three benchmarks' though benchmark-specific prompts for answer formatting and site descriptions were added. Performance is competitive across benchmarks per Table 1. No prior system had been evaluated across all three.",
    322       "supported": "moderate"
    323     },
    324     {
    325       "claim": "Magentic-One introduces some fixed overhead that disproportionately helps with hard tasks while introducing more errors on easy tasks.",
    326       "evidence": "Table 2 shows Magentic-One competing better on hard categories (Level 3 GAIA, Hard AssistantBench) compared to easy categories. This is presented as a hypothesis in Section 5.2, not a proven finding.",
    327       "supported": "weak"
    328     },
    329     {
    330       "claim": "The validation-to-test gap on WebArena (35.1% to 30.5%) suggests mild overfitting from extra attention paid to validation tasks.",
    331       "evidence": "Section 5.2 reports the gap and attributes it to validation set debugging. This is an honest disclosure but the 4.6-percentage-point difference is within the margin one might expect from the split alone, so 'mild overfitting' is plausible but not conclusive.",
    332       "supported": "moderate"
    333     }
    334   ],
    335   "methodology_tags": [
    336     "benchmark-eval"
    337   ],
    338   "key_findings": "Magentic-One is a multi-agent system with five specialized agents (Orchestrator, WebSurfer, FileSurfer, Coder, ComputerTerminal) that achieves statistically competitive performance to state-of-the-art on GAIA (38%), AssistantBench (27.7% accuracy), and WebArena (32.8%) without benchmark-specific modifications to core agent capabilities. Ablation studies show all agents contribute to performance, with the Orchestrator's ledger-based planning being critical (31% drop when replaced with a simple GroupChat). The paper also introduces AutoGenBench for rigorous agentic evaluation with Docker isolation and identifies systematic failure modes through automated log analysis.",
    339   "red_flags": [
    340     {
    341       "flag": "Vendor evaluating own product",
    342       "detail": "All 19 authors are Microsoft Research employees. Magentic-One is built on Microsoft's AutoGen framework, evaluated using OpenAI's GPT-4o (Microsoft is OpenAI's largest investor). No conflict-of-interest statement is provided. No independent evaluation exists."
    343     },
    344     {
    345       "flag": "No multi-run variance reported",
    346       "detail": "Despite emphasizing that AutoGenBench supports repetition to compute variance from stochastic LLM calls, the paper does not report variance across repeated runs. The error bars in Table 1 are Wald intervals derived from the proportion itself, not from repeated experiments. This understates true uncertainty."
    347     },
    348     {
    349       "flag": "Cost not quantified",
    350       "detail": "The paper acknowledges 'several US dollars, and tens of minutes per task' but does not quantify actual costs. For a system requiring dozens of LLM calls per task across hundreds of tasks, the total evaluation cost is likely substantial and unreported."
    351     },
    352     {
    353       "flag": "Benchmark contamination not addressed",
    354       "detail": "GAIA and WebArena were published in 2023, before GPT-4o's training cutoff. The paper does not discuss whether benchmark questions could have been memorized during model training."
    355     },
    356     {
    357       "flag": "WebArena comparison fairness concerns",
    358       "detail": "WebArena has no hidden test set and uses self-reported results. The authors note their own mild overfitting and encourage other teams to take similar precautions, but cannot verify whether competing systems overfit more."
    359     }
    360   ],
    361   "cited_papers": [
    362     {
    363       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    364       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    365       "year": 2024,
    366       "relevance": "Core multi-agent framework upon which Magentic-One is built; seminal work in agentic AI architecture."
    367     },
    368     {
    369       "title": "GAIA: A Benchmark for General AI Assistants",
    370       "authors": ["Grégoire Mialon", "Clémentine Fourrier"],
    371       "year": 2023,
    372       "arxiv_id": "2311.12983",
    373       "relevance": "Primary benchmark for evaluating general-purpose agentic systems on multi-step reasoning and tool use tasks."
    374     },
    375     {
    376       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    377       "authors": ["Shuyan Zhou", "Frank F. Xu"],
    378       "year": 2024,
    379       "relevance": "Major benchmark for evaluating agents in synthetic web environments with multi-step planning requirements."
    380     },
    381     {
    382       "title": "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?",
    383       "authors": ["Ori Yoran", "Samuel Joseph Amouyal"],
    384       "year": 2024,
    385       "relevance": "Benchmark for realistic web-based tasks requiring multi-step reasoning, used to evaluate agentic system generalization."
    386     },
    387     {
    388       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    389       "authors": ["Carlos E. Jimenez", "John Yang"],
    390       "year": 2024,
    391       "relevance": "Key benchmark for AI-assisted software engineering, referenced as a target for future Magentic-One extensions."
    392     },
    393     {
    394       "title": "AI Agents That Matter",
    395       "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S. Siegel"],
    396       "year": 2024,
    397       "relevance": "Critical work on agentic evaluation methodology — argues for considering cost, latency, and value beyond accuracy."
    398     },
    399     {
    400       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    401       "authors": ["John Yang", "Carlos E. Jimenez"],
    402       "year": 2024,
    403       "relevance": "Key single-agent approach to software engineering tasks, representative of the agentic coding paradigm."
    404     },
    405     {
    406       "title": "Sibyl: Simple yet Effective Agent Framework for Complex Real-World Reasoning",
    407       "authors": ["Yuqing Wang", "Tao Shen"],
    408       "year": 2024,
    409       "relevance": "Multi-agent baseline using debate-based jury mechanism with tools; directly compared to Magentic-One on GAIA."
    410     },
    411     {
    412       "title": "WebPilot: A Versatile and Autonomous Multi-Agent System for Web Task Execution with Strategic Exploration",
    413       "authors": ["Yao Zhang", "Zijian Ma"],
    414       "year": 2024,
    415       "relevance": "Multi-agent web navigation system that statistically outperforms Magentic-One on WebArena."
    416     },
    417     {
    418       "title": "Agentless: Demystifying LLM-Based Software Engineering Agents",
    419       "authors": ["Chunqiu Steven Xia", "Yinlin Deng"],
    420       "year": 2024,
    421       "relevance": "Challenges the agentic paradigm by showing non-agentic approaches can match agentic ones in software engineering."
    422     },
    423     {
    424       "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery",
    425       "authors": ["Chris Lu", "Cong Lu"],
    426       "year": 2024,
    427       "arxiv_id": "2408.06292",
    428       "relevance": "Multi-agent system for scientific research, representing the extension of agentic AI into scientific discovery."
    429     },
    430     {
    431       "title": "OpenDevin: An Open Platform for AI Software Developers as Generalist Agents",
    432       "authors": ["Xingyao Wang", "Boxuan Li"],
    433       "year": 2024,
    434       "relevance": "Open-source agentic software development platform; relevant as a comparison point for generalist agentic systems."
    435     }
    436   ]
    437 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs