ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26055B)


      1 {
      2   "paper": {
      3     "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
      4     "authors": [
      5       "Xingyao Wang",
      6       "Boxuan Li",
      7       "Yufan Song",
      8       "Frank F. Xu",
      9       "Xiangru Tang",
     10       "Mingchen Zhuge",
     11       "Jiayi Pan",
     12       "Yueqi Song",
     13       "Bowen Li",
     14       "Jaskirat Singh",
     15       "Hoang H. Tran",
     16       "Fuqiang Li",
     17       "Ren Ma",
     18       "Mingzhang Zheng",
     19       "Bill Qian",
     20       "Yanjun Shao",
     21       "Niklas Muennighoff",
     22       "Yizhe Zhang",
     23       "Binyuan Hui",
     24       "Junyang Lin",
     25       "Robert Brennan",
     26       "Hao Peng",
     27       "Heng Ji",
     28       "Graham Neubig"
     29     ],
     30     "year": 2024,
     31     "venue": "ICLR 2025",
     32     "arxiv_id": "2407.16741"
     33   },
     34   "scan_version": 2,
     35   "active_modules": ["experimental_rigor", "data_leakage"],
     36   "checklist": {
     37     "artifacts": {
     38       "code_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "GitHub repository provided: https://github.com/All-Hands-AI/OpenHands. Released under MIT license."
     42       },
     43       "data_released": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "All 15 evaluation benchmarks used are publicly available standard benchmarks (SWE-Bench, WebArena, GAIA, etc.). The paper uses no proprietary data."
     47       },
     48       "environment_specified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper describes a Docker-based sandbox environment in detail (§2.2, §F), including the runtime image build system with Dockerfiles. The open-source repository contains full environment specifications."
     52       },
     53       "reproduction_instructions": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The evaluation framework is integrated into the open-source platform (§4), and the paper describes benchmark integration in detail. The repository contains evaluation scripts for all 15 benchmarks."
     57       }
     58     },
     59     "statistical_methodology": {
     60       "confidence_intervals_or_error_bars": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No confidence intervals or error bars reported in any of the results tables (Tab. 3-6). All results are point estimates."
     64       },
     65       "significance_tests": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No statistical significance tests used despite numerous comparative claims (e.g., 'competitive performance', 'significantly better'). Comparisons are based solely on numeric differences."
     69       },
     70       "effect_sizes_reported": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Results are reported as success rates with baseline context, e.g., CodeActAgent at 26.0% vs SWE-Agent at 18.0% on SWE-Bench Lite (Tab. 4), with absolute numbers allowing effect size computation."
     74       },
     75       "sample_size_justified": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No justification for benchmark subset sizes. SWE-Bench Lite uses 300 instances (cost-saving noted but not statistically justified), BIRD uses 300 samples from the dev set, and ML-Bench uses a quarter subset — none justified statistically."
     79       },
     80       "variance_reported": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No variance or standard deviation reported. All results appear to be single-run numbers with no indication of run-to-run variability."
     84       }
     85     },
     86     "evaluation_design": {
     87       "baselines_included": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Extensive baseline comparisons included across all 15 benchmarks. Tab. 3-6 compare against SWE-Agent, AutoCodeRover, Aider, Agentless, WebArena Agent, AutoGPT, and many others."
     91       },
     92       "baselines_contemporary": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Baselines are contemporary: SWE-Agent (2024), Agentless (2024), AutoWebGLM (2024), Moatless Tools (2024). Most baselines are from 2023-2024."
     96       },
     97       "ablation_study": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No ablation study. The paper tests different agents and models but does not systematically remove or modify components to measure their individual contribution to performance."
    101       },
    102       "multiple_metrics": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Multiple benchmarks spanning different capabilities are used (software engineering, web browsing, misc. assistance). Within benchmarks, metrics like pass@k (HumanEvalFix) and success rate are used. Tab. 4 includes cost alongside accuracy."
    106       },
    107       "human_evaluation": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "No human evaluation of the system's outputs. All evaluations are fully automated (test suites, exact match, execution accuracy)."
    111       },
    112       "held_out_test_set": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Standard benchmark test sets are used. SWE-Bench Lite is a canonical subset, GPQA uses the diamond set, GAIA uses L1 validation set. These are established held-out sets."
    116       },
    117       "per_category_breakdown": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Results broken down by benchmark category (software engineering, web browsing, misc. assistance) in separate tables (Tab. 4-6). GPQA results broken down by subset (diamond, main, extended) in Tab. 7."
    121       },
    122       "failure_cases_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section A (Limitations and Future Work) discusses failure modes: agents struggle with complex tasks, long file editing, and web browsing. Specific weaknesses acknowledged."
    126       },
    127       "negative_results_reported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Several negative results reported: CodeActAgent underperforms Gorilla fine-tuned model on APIBench (36.4% vs 75.0%), underperforms CC-NET on MiniWoB++ (40.8% vs 91.1%), and underperforms SWE-Agent on HumanEvalFix (79.3% vs 87.7%)."
    131       }
    132     },
    133     "claims_and_evidence": {
    134       "abstract_claims_supported": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Abstract claims are appropriately hedged: 'perform an evaluation of agents over 15 challenging tasks' (supported by §4), platform features described match implementation. No overclaimed SOTA."
    138       },
    139       "causal_claims_justified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper claims CodeAct agent is 'significantly better' than non-agentic approaches on HumanEvalFix (§4.2.1) and attributes performance differences to the framework's design choices, but no controlled experiments isolate specific causal factors."
    143       },
    144       "generalization_bounded": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper frames results per-benchmark and per-model, acknowledging that the same agent demonstrates 'competitive performance' rather than claiming superiority. §4.1 explicitly notes agents are 'designed with generality in mind' rather than claiming they are the best."
    148       },
    149       "alternative_explanations_discussed": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No discussion of alternative explanations for performance differences. For example, the HumanEvalFix comparison to SWE-Agent (0-shot vs 1-shot) is noted but not systematically analyzed as a confound."
    153       },
    154       "proxy_outcome_distinction": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper uses benchmark pass rates as proxies for agent capability (e.g., SWE-Bench for 'software engineering ability', WebArena for 'web browsing ability') without discussing whether these benchmarks capture the full scope of the claimed capabilities."
    158       }
    159     },
    160     "setup_transparency": {
    161       "model_versions_specified": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific model versions provided throughout: 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620', 'gpt-4o-mini-2024-07-18', 'gpt-4-1106-preview' (Tab. 3-6). Version dates included."
    165       },
    166       "prompts_provided": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Full agent prompts provided: Browsing Agent prompt in §K with complete text, CodeAct SWE Agent prompt referenced at GitHub URL (§H). Agent implementation code in Fig. 3."
    170       },
    171       "hyperparameters_reported": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No hyperparameters reported for LLM API calls (temperature, top-p, max tokens). No mention of sampling settings used across experiments."
    175       },
    176       "scaffolding_described": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Agent scaffolding described in detail: event stream architecture (§2.1), runtime environment with bash/IPython/browser (§2.2), agent skills library (§2.3), multi-agent delegation (§2.4). Fig. 2 provides architecture overview."
    180       },
    181       "data_preprocessing_documented": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Benchmark setup procedures described per-benchmark. E.g., SWE-Bench: no hint text used, BIRD: 300 samples from dev set, ML-Bench: quarter subset, BioCoder: context removed from prompts. HumanEvalFix follows Muennighoff et al. setup."
    185       }
    186     },
    187     "limitations_and_scope": {
    188       "limitations_section_present": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section A 'Limitations and Future Work' provides substantive discussion of multiple limitations across several paragraphs."
    192       },
    193       "threats_to_validity_specific": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Specific limitations discussed: agents struggle with complex tasks, long file editing performance is poor, web browsing needs improvement. These are specific to the system being evaluated."
    197       },
    198       "scope_boundaries_stated": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No explicit statement of what the results do NOT show. Although results are framed per-benchmark and per-model, the paper does not explicitly state which settings/populations are excluded from its conclusions."
    202       }
    203     },
    204     "data_integrity": {
    205       "raw_data_available": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No raw experimental logs or per-instance results released. Only aggregate success rates reported in tables. Individual agent trajectories not available for verification."
    209       },
    210       "data_collection_described": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Each benchmark's data collection is described by reference to the original papers, with any modifications documented (e.g., BioCoder context removal, BIRD 300-sample selection, ML-Bench quarter subset)."
    214       },
    215       "recruitment_methods_described": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No human participants. All evaluations use standard automated benchmarks."
    219       },
    220       "data_pipeline_documented": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "The evaluation pipeline is documented: Docker sandbox setup (§F), action execution API (§2.2), benchmark integration process. The runtime workflow is diagrammed in Fig. 4."
    224       }
    225     },
    226     "conflicts_of_interest": {
    227       "funding_disclosed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding or acknowledgments section found in the paper. Multiple institutional affiliations listed but no funding disclosure."
    231       },
    232       "affiliations_disclosed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Author affiliations clearly listed: UIUC, CMU, Yale, UC Berkeley, Contextual AI, KAUST, ANU, HCMUT, Alibaba, and All Hands AI. All affiliations are transparent."
    236       },
    237       "funder_independent_of_outcome": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Authors include affiliates of 'All Hands AI' (the commercial entity behind OpenHands) and Alibaba (whose models could be evaluated). No discussion of whether funding sources have a stake in the platform's success."
    241       },
    242       "financial_interests_declared": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No competing interests statement. Authors from 'All Hands AI' (a company built around the OpenHands platform) have clear financial interest in the platform's success, but this is not declared."
    246       }
    247     },
    248     "contamination": {
    249       "training_cutoff_stated": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No training cutoff dates stated for any of the LLMs used (GPT-4o, Claude 3.5 Sonnet, etc.) despite using benchmarks that may overlap with training data."
    253       },
    254       "train_test_overlap_discussed": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No discussion of potential train/test overlap for any benchmark. HumanEval (published 2021) and other benchmarks could be in training data of models used."
    258       },
    259       "benchmark_contamination_addressed": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No contamination analysis despite using multiple public benchmarks (HumanEval, BIRD, SWE-Bench) with models whose training data likely includes these benchmarks."
    263       }
    264     },
    265     "human_studies": {
    266       "pre_registered": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study. All evaluations are automated benchmark evaluations."
    270       },
    271       "irb_or_ethics_approval": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "demographics_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "inclusion_exclusion_criteria": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       },
    286       "randomization_described": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       },
    291       "blinding_described": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants."
    295       },
    296       "attrition_reported": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants."
    300       }
    301     },
    302     "cost_and_practicality": {
    303       "inference_cost_reported": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Average cost per instance reported for most experiments in the '$ Avg. Cost' column across Tables 4-6. E.g., SWE-Bench Lite with claude-3-5-sonnet costs $1.10 per instance."
    307       },
    308       "compute_budget_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Total cost estimate provided: 'Running the complete set of 2294 instances costs $6.9k, using a conservative estimate of $3 per instance' (§4.2 footnote). Per-instance costs reported across benchmarks."
    312       }
    313     },
    314     "experimental_rigor": {
    315       "seed_sensitivity_reported": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No multi-seed results reported. All results appear to be single-run evaluations."
    319       },
    320       "number_of_runs_stated": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Number of runs not stated for any experiment. Results presented without indicating whether they are from single or multiple runs."
    324       },
    325       "hyperparameter_search_budget": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No hyperparameter search budget reported. Agent configurations appear selected without documenting the search process."
    329       },
    330       "best_config_selection_justified": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "No justification for why specific agent versions (e.g., v1.5 vs v1.8) or configurations were selected for different benchmarks. Different versions used across benchmarks without explanation."
    334       },
    335       "multiple_comparison_correction": {
    336         "applies": false,
    337         "answer": false,
    338         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    339       },
    340       "self_comparison_bias_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Authors evaluate their own platform against competitors without acknowledging potential self-comparison bias. Baseline implementations may differ from original setups."
    344       },
    345       "compute_budget_vs_performance": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "Cost per instance reported alongside performance for most benchmarks (Tab. 4-6), enabling compute-normalized comparisons. E.g., gpt-4o-mini at $0.01 vs gpt-4o at $1.72 per SWE-Bench instance."
    349       },
    350       "benchmark_construct_validity": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether any of the 15 benchmarks actually measure the claimed capabilities. SWE-Bench is used as proxy for 'software engineering' ability without validity analysis."
    354       },
    355       "scaffold_confound_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Different scaffolds compared (SWE-Agent vs CodeActAgent vs Agentless) without isolating model vs scaffold effects. Tab. 3-4 compare systems with different scaffolding and different models simultaneously."
    359       }
    360     },
    361     "data_leakage": {
    362       "temporal_leakage_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of temporal leakage. Many benchmarks (HumanEval 2021, BIRD 2023) predate the models used, creating potential for training data overlap."
    366       },
    367       "feature_leakage_addressed": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No discussion of feature leakage in benchmark evaluations."
    371       },
    372       "non_independence_addressed": {
    373         "applies": true,
    374         "answer": false,
    375         "justification": "No discussion of independence between training and test data for any benchmark."
    376       },
    377       "leakage_detection_method": {
    378         "applies": true,
    379         "answer": false,
    380         "justification": "No leakage detection or prevention methods applied to any benchmark evaluation."
    381       }
    382     }
    383   },
    384   "claims": [
    385     {
    386       "claim": "CodeActAgent v1.8 with claude-3-5-sonnet achieves 26.0% on SWE-Bench Lite, competitive with specialized SWE agents.",
    387       "evidence": "Tab. 4 shows 26.0% vs SWE-Agent 18.0%, AutoCodeRover 19.0%, Aider 26.3%, Agentless 27.3%.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "The same CodeAct agent demonstrates competitive performance across three major task categories without task-specific modifications.",
    392       "evidence": "Tab. 3 shows CodeActAgent evaluated on SWE-Bench Lite (26.0%), WebArena (15.3%), and GPQA (52.0%) without system prompt changes. §4.1 notes this generality.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "CodeActAgent fixes 79.3% of bugs in HumanEvalFix, significantly better than all non-agentic approaches.",
    397       "evidence": "Tab. 4: 79.3% vs StarCoder2-15B at 48.6% and DeepSeekCoder-33B at 47.5%. However, SWE-Agent (1-shot) achieves 87.7%.",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "CodeActAgent achieves 76.5% on ML-Bench, outperforming SWE-Agent (42.6%) and Aider (64.4%).",
    402       "evidence": "Tab. 4 shows these numbers. Uses gpt-4o with cost of $0.25 per instance.",
    403       "supported": "moderate"
    404     },
    405     {
    406       "claim": "OpenHands is the only framework with all of the features compared in Tab. 1 (GUI, standardized tools, sandbox, browser, multi-agent, human-AI collaboration, AgentHub, evaluation, QC).",
    407       "evidence": "Tab. 1 comparison matrix shows OpenHands with checkmarks in all columns while competitors lack at least one.",
    408       "supported": "moderate"
    409     }
    410   ],
    411   "methodology_tags": ["benchmark-eval"],
    412   "key_findings": "OpenHands provides an open-source platform for AI software agents with a Docker-sandboxed runtime, event stream architecture, and evaluation across 15 benchmarks. The CodeActAgent achieves competitive performance as a generalist agent: 26% on SWE-Bench Lite, 15.3% on WebArena, and 52% on GPQA, all without task-specific prompt engineering. The platform reports inference costs per benchmark, showing practical deployment economics. The project has significant community traction with 32K GitHub stars and 188+ contributors.",
    413   "red_flags": [
    414     {
    415       "flag": "Self-evaluation by platform creators",
    416       "detail": "Authors from All Hands AI (the commercial entity behind OpenHands) evaluate their own platform. No independent evaluation or acknowledgment of this potential bias."
    417     },
    418     {
    419       "flag": "No variance or uncertainty quantification",
    420       "detail": "All results are single-run point estimates with no error bars, standard deviations, or confidence intervals despite LLM outputs being stochastic."
    421     },
    422     {
    423       "flag": "Inconsistent agent versions across benchmarks",
    424       "detail": "Some benchmarks use CodeActAgent v1.5, others v1.8, with no explanation for why different versions were used or how they differ."
    425     },
    426     {
    427       "flag": "No contamination analysis",
    428       "detail": "Uses 15 public benchmarks with commercial LLMs without any analysis of whether benchmark data appears in model training sets."
    429     },
    430     {
    431       "flag": "Scaffold-model confound in comparisons",
    432       "detail": "Comparisons in Tab. 3-4 mix scaffold differences and model differences (e.g., SWE-Agent+gpt-4-1106-preview vs CodeActAgent+claude-3-5-sonnet), making it impossible to attribute performance differences to either the scaffold or the model."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    438       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. Narasimhan"],
    439       "year": 2024,
    440       "relevance": "Primary software engineering benchmark used to evaluate OpenHands agents."
    441     },
    442     {
    443       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    444       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    445       "year": 2024,
    446       "relevance": "Key baseline for software engineering agent evaluation; introduced ACI concept used in OpenHands."
    447     },
    448     {
    449       "title": "Agentless: Demystifying LLM-based software engineering agents",
    450       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    451       "year": 2024,
    452       "relevance": "Baseline agent approach that achieves competitive SWE-Bench performance without agentic scaffolding."
    453     },
    454     {
    455       "title": "Executable Code Actions Elicit Better LLM Agents",
    456       "authors": ["Xingyao Wang", "Yangyi Chen", "Lifan Yuan", "Yizhe Zhang", "Yunzhu Li", "Hao Peng", "Heng Ji"],
    457       "year": 2024,
    458       "relevance": "CodeAct framework underlying OpenHands' default agent architecture."
    459     },
    460     {
    461       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    462       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    463       "year": 2023,
    464       "arxiv_id": "2308.08155",
    465       "relevance": "Multi-agent framework compared in Tab. 1; key related work for agent framework design."
    466     },
    467     {
    468       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    469       "authors": ["Sirui Hong", "Mingchen Zhuge"],
    470       "year": 2023,
    471       "relevance": "Multi-agent framework compared in Tab. 1 emphasizing standardized operating procedures."
    472     },
    473     {
    474       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    475       "authors": ["Shuyan Zhou", "Frank F. Xu", "Hao Zhu"],
    476       "year": 2023,
    477       "relevance": "Primary web browsing benchmark and baseline agent for OpenHands evaluation."
    478     },
    479     {
    480       "title": "GAIA: A Benchmark for General AI Assistants",
    481       "authors": ["Grégoire Mialon", "Clémentine Fourrier", "Craig Swift", "Thomas Wolf", "Yann LeCun", "Thomas Scialom"],
    482       "year": 2023,
    483       "arxiv_id": "2311.12983",
    484       "relevance": "General assistant benchmark requiring reasoning, browsing, and coding capabilities."
    485     },
    486     {
    487       "title": "AgentBench: Evaluating LLMs as Agents",
    488       "authors": ["Xiao Liu", "Hao Yu"],
    489       "year": 2023,
    490       "relevance": "Multi-domain agent benchmark; OS/bash subset used for OpenHands evaluation."
    491     },
    492     {
    493       "title": "MINT: Evaluating LLMs in Multi-turn Interaction with Tools and Language Feedback",
    494       "authors": ["Xingyao Wang", "Zihan Wang", "Jiateng Liu"],
    495       "year": 2024,
    496       "relevance": "Multi-turn tool-use benchmark evaluating agent interaction with tools and feedback."
    497     },
    498     {
    499       "title": "AutoCodeRover: Autonomous program improvement",
    500       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    501       "year": 2024,
    502       "relevance": "Software engineering agent baseline using code search and AST manipulation."
    503     },
    504     {
    505       "title": "Language agents as optimizable graphs",
    506       "authors": ["Mingchen Zhuge", "Wenyi Wang", "Louis Kirsch", "Francesco Faccio", "Dmitrii Khizbullin", "Jurgen Schmidhuber"],
    507       "year": 2024,
    508       "relevance": "GPTSwarm framework integrated into OpenHands AgentHub for graph-based agent optimization."
    509     }
    510   ]
    511 }

Impressum · Datenschutz