scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28986B)
      1 {
      2   "paper": {
      3     "title": "AgentBench: Evaluating LLMs as Agents",
      4     "authors": [
      5       "Xiao Liu",
      6       "Hao Yu",
      7       "Hanchen Zhang",
      8       "Yifan Xu",
      9       "Xuanyu Lei",
     10       "Hanyu Lai",
     11       "Yu Gu",
     12       "Hangliang Ding",
     13       "Kaiwen Men",
     14       "Kejuan Yang",
     15       "Shudan Zhang",
     16       "Xiang Deng",
     17       "Aohan Zeng",
     18       "Zhengxiao Du",
     19       "Chenhui Zhang",
     20       "Sheng Shen",
     21       "Tianjun Zhang",
     22       "Yu Su",
     23       "Huan Sun",
     24       "Minlie Huang",
     25       "Yuxiao Dong",
     26       "Jie Tang"
     27     ],
     28     "year": 2023,
     29     "venue": "ICLR 2024",
     30     "arxiv_id": "2308.03688",
     31     "doi": "10.48550/arXiv.2308.03688"
     32   },
     33   "checklist": {
     34     "artifacts": {
     35       "code_released": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The paper states 'Datasets, environments, and an integrated evaluation package for AGENTBENCH are released at https://github.com/THUDM/AgentBench' in the abstract. A working URL is provided."
     39       },
     40       "data_released": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper states 'All datasets are publicly available' (Section 4.1) and datasets are released as part of the GitHub repository linked in the abstract."
     44       },
     45       "environment_specified": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper mentions Docker images (Ubuntu Docker for OS) and the Server-Client architecture, but does not provide a requirements.txt, Dockerfile definition, conda environment, or detailed library version listings. Mentioning that tasks run inside Docker is not the same as specifying the environment dependencies. The schema requires 'enough detail to recreate the environment.'"
     49       },
     50       "reproduction_instructions": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The GitHub repository is provided and the paper describes the evaluation toolkit with a Server-Client architecture (Appendix A) and per-task evaluation setup (Appendices B-I), including that researchers need to 'set up a model server accessible via the HTTP protocol' and use Docker images."
     54       }
     55     },
     56     "statistical_methodology": {
     57       "confidence_intervals_or_error_bars": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "All results in Table 3 are reported as single point estimates with no confidence intervals or error bars. No uncertainty quantification is provided for any of the benchmark scores."
     61       },
     62       "significance_tests": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper makes comparative claims (e.g., 'gpt-4 presents the best performance on 6 out of 8 datasets', 'codellama-34b...still presents a clear performance gap to gpt-3.5-turbo') without any statistical significance tests. No p-values, t-tests, or other tests are reported."
     66       },
     67       "effect_sizes_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "While raw numbers are reported (e.g., overall scores 4.01 for gpt-4 vs 0.51 average for OSS LLMs), no formal effect sizes (Cohen's d, etc.) are computed. The comparison lacks a baseline-relative effect size framework."
     71       },
     72       "sample_size_justified": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The paper explains the benchmark size was set 'to consider non-chat models' and for efficiency (roughly matching MMLU call counts), but does not provide a power analysis or formal justification for why 1,014 test examples is sufficient to reliably distinguish model performance."
     76       },
     77       "variance_reported": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "The paper states temperature=0 (greedy decoding) was used for all evaluations, meaning only single deterministic runs were performed. No variance, standard deviation, or multiple-run results are reported."
     81       }
     82     },
     83     "evaluation_design": {
     84       "baselines_included": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The benchmark compares 29 LLMs including both API-based commercial models and open-source models. For the Digital Card Game task, two naive strategies (random and greedy) are used as baselines (Section E.1)."
     88       },
     89       "baselines_contemporary": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The evaluation includes the most recent models available at the time of publication, including gpt-4 (0613), claude-2, claude-3 (opus added later), llama-2 (70b), and codellama-34b. The baselines are contemporary."
     93       },
     94       "ablation_study": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "AgentBench is a benchmark, not a proposed system with removable components. The paper evaluates models on fixed tasks; there are no system components to ablate. Comparing model families (CodeLlama vs Llama-2) is observational model comparison, not ablation."
     98       },
     99       "multiple_metrics": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Multiple metrics are used across tasks: Success Rate (OS, DB, HH), F1 (KG), Reward (DCG, WS), Game Progress (LTP), and Step SR (WB). The paper also reports execution outcome categories (Completed, Invalid Format, Invalid Action, TLE, CLE)."
    103       },
    104       "human_evaluation": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "The paper uses entirely automated evaluation. Appendix F.2 acknowledges that for LTP, automatic evaluation sometimes differs from human evaluation, suggesting human evaluation would be relevant. No human evaluation of LLM agent outputs was conducted despite its relevance to claims about reasoning and decision-making quality."
    108       },
    109       "held_out_test_set": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper explicitly separates Dev and Test sets for all 8 tasks (Table 2, e.g., OS: 26 dev, 144 test). Main results are reported on the test set (Table 3, titled 'Test set (standard) results')."
    113       },
    114       "per_category_breakdown": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Table 3 provides per-environment scores for all 29 LLMs across 8 environments. Additionally, Table 4 provides per-task breakdown of execution outcome categories, and Appendix J provides validity analysis per model."
    118       },
    119       "failure_cases_discussed": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Appendix J.2 provides detailed failure case analysis including qualitative examples of gpt-3.5-turbo getting stuck in a loop (J.2.2), analysis of repetition as primary cause of TLE (J.2.4), and analysis of invalid format errors with a gpt-4 example (J.2.1)."
    123       },
    124       "negative_results_reported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper explicitly reports that code tuning has 'ambivalent impacts' — improving some tasks (Web Shopping) while hurting others (Digital Card Game). Section 4.3 states 'training on code present ambivalent impacts on different agent tasks.'"
    128       }
    129     },
    130     "claims_and_evidence": {
    131       "abstract_claims_supported": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The abstract's claim of a significant performance disparity between top commercial LLMs and OSS models ≤70B is supported by Table 3 (average 2.32 vs. 0.51). The claim about 'poor long-term reasoning, decision-making, and instruction following' is supported by Table 4 and Appendix J analysis."
    135       },
    136       "causal_claims_justified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper makes causal claims such as 'training on code...improves some agent tasks' and 'high-quality alignment training...could also help improve LLM agents' based on observational comparisons between model families (e.g., CodeLlama vs. Llama-2) that differ on multiple dimensions, not just the factor of interest. These are treated as causal findings without controlled experiments."
    140       },
    141       "generalization_bounded": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The conclusion states AGENTBENCH will 'serve as a cornerstone for subsequent LLM agent research' and Section 4.3 generalizes about code tuning's impact on agents broadly. The paper does not sufficiently bound claims to the specific 8 tasks, the tested models, or the text-only, English-language evaluation setting."
    145       },
    146       "alternative_explanations_discussed": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper does not systematically discuss alternative explanations for the observed performance gaps. For example, the lower OSS performance could reflect RLHF differences, data distributions, or parameter count effects — the paper acknowledges some of these in passing but does not systematically consider alternatives to its interpretations."
    150       }
    151     },
    152     "setup_transparency": {
    153       "model_versions_specified": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Table 1 lists versions for some models (gpt-4: 0613, gpt-3.5-turbo: 0613, claude-instant: v1.1, vicuna-13b: v1.5) but several key models lack version/snapshot information: claude-2 has '-', glm-4 has '-', chat-bison-001 has '-'. Per the schema, marketing names without snapshot dates do not count as specified versions. The evaluation is inconsistent in version specification."
    157       },
    158       "prompts_provided": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Full prompt templates with actual content are provided in Appendices B through I for all 8 environments. These are not just templates with placeholders — they include the actual instructions sent to models (e.g., the complete OS instruction prompt in Appendix B.3, the full KG prompt with API descriptions in Appendix D.2)."
    162       },
    163       "hyperparameters_reported": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.1 states 'To ensure reproducible results, we set temperature=0 (i.e., greedy decoding) in the inference on all tasks.' The paper also describes the context window management strategy (selecting minimum r such that token count ≤ 3500)."
    167       },
    168       "scaffolding_described": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The evaluation scaffolding is described in detail: CoT prompting strategy (Section 2), the Server-Client framework with max-flow algorithm (Appendix A), per-round interaction management, context window handling with omission strategy, and task-specific interaction loops (Appendices B-I)."
    172       },
    173       "data_preprocessing_documented": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Each task appendix (B-I) describes dataset construction, including filtering criteria. For OS: gpt-4 generated problems 'strictly filtered by passing the unit tests'; 6000 Stack Overflow problems filtered by annotators. For DB: augmented data 'filtered and sampled into final dataset'. Data augmentation bias study in Appendix C.4."
    177       }
    178     },
    179     "limitations_and_scope": {
    180       "limitations_section_present": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion mentions that 'even the strongest gpt-4 is not qualified as a practically usable agent' but this is a finding, not a limitations discussion. No substantive limitations section exists."
    184       },
    185       "threats_to_validity_specific": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as benchmark contamination, the limitation of using greedy decoding only, the lack of multi-run variance, or the restriction to English-only text-only models."
    189       },
    190       "scope_boundaries_stated": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The paper does not explicitly state what AGENTBENCH does NOT show. There is no discussion of scope boundaries such as: results apply only to text-only LLMs, only to English-language tasks, only to the specific 8 task types tested, or what agent capabilities fall outside the benchmark's coverage."
    194       }
    195     },
    196     "data_integrity": {
    197       "raw_data_available": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "All datasets are released at https://github.com/THUDM/AgentBench. The interactive environments, evaluation scripts, and checking pipelines are also released, enabling independent verification of results."
    201       },
    202       "data_collection_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Each appendix (B-I) describes in detail how the dataset was collected. For OS: problems sourced from Stack Overflow and filtered by human annotators, plus gpt-4 generated problems filtered by unit tests. For KG: sourced from GrailQA, ComplexWebQuestions, GraphQuestions with criteria that questions require 5+ tool invocations."
    206       },
    207       "recruitment_methods_described": {
    208         "applies": false,
    209         "answer": false,
    210         "justification": "The benchmark does not involve human participants. The data collection uses automated environments, existing public datasets, and LLM-based augmentation. The human recruitment criterion is therefore not applicable."
    211       },
    212       "data_pipeline_documented": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The full pipeline from data collection to final evaluation is documented. For DB: source datasets → gpt-3.5-turbo augmentation → validity filtering → final 300 entries by category. The checking pipelines (scripts) for automated evaluation are also described in detail."
    216       }
    217     },
    218     "conflicts_of_interest": {
    219       "funding_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "The acknowledgement section states funding from NSFC (62276148, 61825602), Ministry of Science and Technology of China (2022ZD0118600), Tsinghua University, and New Cornerstone Science Foundation. Zhipu AI is noted as covering 'all GPU and API cost consumed in this study'."
    223       },
    224       "affiliations_disclosed": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Author affiliations are listed: Tsinghua University (majority of authors), Ohio State University (Yu Gu, Yu Su, Huan Sun, Xiang Deng), and UC Berkeley (Sheng Shen, Tianjun Zhang). The connection to Zhipu AI (GLM-4 developer) is disclosed via funding acknowledgment."
    228       },
    229       "funder_independent_of_outcome": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "Zhipu AI covered 'all GPU and API cost' and has a 'research fund' supporting this work, yet Zhipu AI's glm-4 model is evaluated in the benchmark (ranked 3rd with score 2.89). Zhipu AI has a direct financial interest in glm-4 appearing competitive, which is a non-independent funding relationship."
    233       },
    234       "financial_interests_declared": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "There is no competing interests statement or declaration of financial interests in the paper. The Zhipu AI funding relationship and the evaluation of Zhipu AI's glm-4 model in the benchmark is not explicitly acknowledged as a potential conflict of interest."
    238       }
    239     },
    240     "contamination": {
    241       "training_cutoff_stated": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The paper does not state training data cutoff dates for any of the 29 evaluated models. The models are described with release versions but no training data cutoff information is provided, making it impossible to assess whether benchmark data could have appeared in training."
    245       },
    246       "train_test_overlap_discussed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "The paper does not discuss potential train/test overlap. The DB task notes data augmentation was used partly 'to avoid leakage from the dataset' (Section C.1), suggesting awareness of the issue, but no systematic contamination analysis is performed for any task."
    250       },
    251       "benchmark_contamination_addressed": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "Three tasks are adapted from pre-existing publicly available benchmarks (Web Shopping from WebShop, Web Browsing from Mind2Web, and House Holding from ALFWorld) that were available before most models' training cutoffs. The paper does not address whether these benchmarks may have been in the training data of the evaluated models."
    255       }
    256     },
    257     "human_studies": {
    258       "pre_registered": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "This is a benchmark evaluation paper with no human participants. Pre-registration is not applicable."
    262       },
    263       "irb_or_ethics_approval": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "This is a benchmark evaluation paper with no human participants. IRB approval is not applicable."
    267       },
    268       "demographics_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "This is a benchmark evaluation paper with no human participants. Demographics are not applicable."
    272       },
    273       "inclusion_exclusion_criteria": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "This is a benchmark evaluation paper with no human participants. Inclusion/exclusion criteria for participants are not applicable."
    277       },
    278       "randomization_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "This is a benchmark evaluation paper with no human participants. Randomization of participant assignment is not applicable."
    282       },
    283       "blinding_described": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "This is a benchmark evaluation paper with no human participants. Blinding is not applicable."
    287       },
    288       "attrition_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "This is a benchmark evaluation paper with no human participants. Attrition reporting is not applicable."
    292       }
    293     },
    294     "cost_and_practicality": {
    295       "inference_cost_reported": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The paper does not report API costs or inference latency for running the benchmark. It notes that Zhipu AI covered 'all GPU and API cost' but does not quantify these costs. The paper acknowledges that evaluations are 'time-consuming' but provides no estimates."
    299       },
    300       "compute_budget_stated": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No total computational budget is stated. The paper mentions GPU costs were covered by Zhipu AI but does not quantify GPU hours, API spend, or hardware used. Running 29 models across 8 environments on 1,014 test examples each represents substantial compute with no quantification."
    304       }
    305     }
    306   },
    307   "claims": [
    308     {
    309       "claim": "There is a significant performance gap between top commercial LLMs (average overall score 2.32) and open-source LLMs ≤70B (average overall score 0.51) on AgentBench.",
    310       "evidence": "Table 3 and Figure 1(b) show overall AgentBench scores. gpt-4 achieves 4.01, while the best OSS model (codellama-34b) achieves 0.96, and the OSS average is 0.51 vs. API average of 2.32.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "Poor long-term reasoning, decision-making, and instruction following are the main obstacles for developing usable LLM agents.",
    315       "evidence": "Table 4 shows Task Limit Exceeded (weak reasoning/decision-making) and Invalid Format/Invalid Action (weak instruction following) as dominant failure modes across 8 tasks. Appendix J.2 provides detailed qualitative analysis.",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "Code training has ambivalent impacts on LLM agent performance, improving some tasks while hurting others.",
    320       "evidence": "Section 4.3 compares CodeLlama vs. Llama-2 series across tasks: CodeLlama outperforms Llama-2 on Web Shopping but underperforms on Digital Card Game. Figure 10 and Table 3 show these differences.",
    321       "supported": "moderate"
    322     },
    323     {
    324       "claim": "High-quality alignment data training (e.g., on GPT-4 generated data) improves LLM agent performance.",
    325       "evidence": "Section 4.3 compares vicuna-13b (trained on ShareGPT/GPT-4 data) vs. llama-2-13b: vicuna-13b achieves 0.93 vs. 0.77 overall score, 'even performs comparably to 3 times larger codellama-34b'.",
    326       "supported": "weak"
    327     },
    328     {
    329       "claim": "GPT-4 is not yet qualified as a practically usable agent.",
    330       "evidence": "Section 2 and conclusion state this explicitly. Table 3 shows GPT-4 achieves 78% SR on House Holding but only 16.6% on Lateral Thinking Puzzles and overall score of 4.01 on a normalized scale where 'practically usable' would require much higher performance.",
    331       "supported": "moderate"
    332     }
    333   ],
    334   "methodology_tags": [
    335     "benchmark-eval"
    336   ],
    337   "key_findings": "AgentBench is a multi-dimensional benchmark evaluating 29 LLMs across 8 interactive environments (OS, Database, Knowledge Graph, Digital Card Game, Lateral Thinking Puzzles, House Holding, Web Shopping, Web Browsing). Top commercial LLMs like GPT-4 significantly outperform open-source models ≤70B, with average overall scores of 2.32 vs. 0.51 respectively. The primary failure modes are Task Limit Exceeded (weak reasoning) and Invalid Format/Action (weak instruction following). Code training shows mixed effects: improving procedural tasks like web shopping while degrading performance on tasks requiring general reasoning like digital card games.",
    338   "red_flags": [
    339     {
    340       "flag": "Conflict of interest: funder evaluates own model",
    341       "detail": "Zhipu AI covered 'all GPU and API cost consumed in this study' and provided a research fund, yet Zhipu AI's glm-4 model is evaluated in the benchmark and ranked 3rd overall (score 2.89, above claude-2). This is a direct conflict of interest that is not explicitly acknowledged as such."
    342     },
    343     {
    344       "flag": "No statistical uncertainty quantification",
    345       "detail": "All benchmark scores are reported as single point estimates with greedy (temperature=0) decoding and no error bars, confidence intervals, or multi-run variance. The paper cannot distinguish whether small performance differences between models are meaningful or within the noise floor."
    346     },
    347     {
    348       "flag": "Contamination not addressed for adapted benchmarks",
    349       "detail": "Three of eight tasks use adapted versions of existing publicly available benchmarks (WebShop from 2022, Mind2Web from 2023, ALFWorld from 2020). These were available before most evaluated models' training cutoffs, but the paper does not discuss potential contamination."
    350     },
    351     {
    352       "flag": "Causal claims from observational model comparisons",
    353       "detail": "Claims that code training 'improves' some tasks and 'harms' others are made by comparing model families (CodeLlama vs. Llama-2) that differ on multiple dimensions beyond just code training. No controlled ablation is performed, making causal attribution unreliable."
    354     },
    355     {
    356       "flag": "No limitations section",
    357       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. Key limitations such as single-run evaluation, English-only tasks, text-only models, restricted action spaces, and potential benchmark contamination are not discussed."
    358     }
    359   ],
    360   "cited_papers": [
    361     {
    362       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    363       "authors": [
    364         "Shunyu Yao",
    365         "Jeffrey Zhao",
    366         "Dian Yu",
    367         "Nan Du",
    368         "Izhak Shafran",
    369         "Karthik R Narasimhan",
    370         "Yuan Cao"
    371       ],
    372       "year": 2023,
    373       "arxiv_id": "2210.03629",
    374       "relevance": "Pioneer work combining chain-of-thought reasoning with actions in agent tasks; the CoT+action format adopted in AgentBench is based on ReAct."
    375     },
    376     {
    377       "title": "WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents",
    378       "authors": [
    379         "Shunyu Yao",
    380         "Howard Chen",
    381         "John Yang",
    382         "Karthik Narasimhan"
    383       ],
    384       "year": 2022,
    385       "relevance": "Simulated online shopping environment adapted as one of AgentBench's 8 evaluation tasks; foundational work on web agent evaluation."
    386     },
    387     {
    388       "title": "Mind2Web: Towards a Generalist Agent for the Web",
    389       "authors": [
    390         "Xiang Deng",
    391         "Yu Gu",
    392         "Boyuan Zheng",
    393         "Shijie Chen",
    394         "Samuel Stevens",
    395         "Boshi Wang",
    396         "Huan Sun",
    397         "Yu Su"
    398       ],
    399       "year": 2023,
    400       "arxiv_id": "2306.06070",
    401       "relevance": "General web benchmark adapted as one of AgentBench's 8 tasks; evaluates LLMs as web browsing agents."
    402     },
    403     {
    404       "title": "ALFWorld: Aligning Text and Embodied Environments for Interactive Learning",
    405       "authors": [
    406         "Mohit Shridhar",
    407         "Xingdi Yuan",
    408         "Marc-Alexandre Cote",
    409         "Yonatan Bisk",
    410         "Adam Trischler",
    411         "Matthew Hausknecht"
    412       ],
    413       "year": 2020,
    414       "relevance": "Text-based household environment used as the House-Holding task in AgentBench; foundational agent evaluation benchmark."
    415     },
    416     {
    417       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    418       "authors": [
    419         "Jason Wei",
    420         "Xuezhi Wang",
    421         "Dale Schuurmans",
    422         "Maarten Bosma",
    423         "Fei Xia",
    424         "Ed Chi",
    425         "Quoc V Le",
    426         "Denny Zhou"
    427       ],
    428       "year": 2022,
    429       "relevance": "Chain-of-thought prompting is the core evaluation strategy adopted throughout AgentBench for all 8 tasks."
    430     },
    431     {
    432       "title": "Reflexion: an autonomous agent with dynamic memory and self-reflection",
    433       "authors": [
    434         "Noah Shinn",
    435         "Beck Labash",
    436         "Ashwin Gopinath"
    437       ],
    438       "year": 2023,
    439       "arxiv_id": "2303.11366",
    440       "relevance": "Advanced reasoning strategy for LLM agents; discussed as a relevant extension beyond AgentBench's primitive CoT evaluation."
    441     },
    442     {
    443       "title": "InterCode: Standardizing and Benchmarking Interactive Coding with Execution Feedback",
    444       "authors": [
    445         "John Yang",
    446         "Akshara Prabhakar",
    447         "Karthik Narasimhan",
    448         "Shunyu Yao"
    449       ],
    450       "year": 2023,
    451       "arxiv_id": "2306.14898",
    452       "relevance": "Concurrent work providing an interactive coding benchmark for bash and SQL environments, overlapping with AgentBench's OS and DB tasks."
    453     },
    454     {
    455       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    456       "authors": [
    457         "Joon Sung Park",
    458         "Joseph C. O'Brien",
    459         "Carrie J. Cai",
    460         "Meredith Ringel Morris",
    461         "Percy Liang",
    462         "Michael S. Bernstein"
    463       ],
    464       "year": 2023,
    465       "arxiv_id": "2304.03442",
    466       "relevance": "LLM agent work in social simulation contexts; relevant to the survey scope on LLM agent capabilities and evaluation."
    467     },
    468     {
    469       "title": "Evaluating Large Language Models Trained on Code",
    470       "authors": [
    471         "Mark Chen",
    472         "Jerry Tworek",
    473         "Heewoo Jun"
    474       ],
    475       "year": 2021,
    476       "arxiv_id": "2107.03374",
    477       "relevance": "HumanEval benchmark for code generation; foundational code evaluation benchmark that influenced the code-grounded tasks in AgentBench."
    478     },
    479     {
    480       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    481       "authors": [
    482         "Sirui Hong",
    483         "Xiawu Zheng",
    484         "Jonathan P. Chen"
    485       ],
    486       "year": 2023,
    487       "arxiv_id": "2308.00352",
    488       "relevance": "Multi-agent LLM framework; relevant to the LLM-as-Agent research landscape that AgentBench aims to evaluate."
    489     },
    490     {
    491       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    492       "authors": [
    493         "Qingyun Wu",
    494         "Gagan Bansal",
    495         "Jieyu Zhang"
    496       ],
    497       "year": 2023,
    498       "arxiv_id": "2308.08155",
    499       "relevance": "Multi-agent conversation framework for LLM applications; part of the LLM agent ecosystem being evaluated."
    500     },
    501     {
    502       "title": "Holistic Evaluation of Language Models",
    503       "authors": [
    504         "Percy Liang",
    505         "Rishi Bommasani",
    506         "Tony Lee"
    507       ],
    508       "year": 2022,
    509       "arxiv_id": "2211.09110",
    510       "relevance": "HELM benchmark for comprehensive LLM evaluation; cited as prior work on broad-spectrum LLM benchmarking that AgentBench extends to agent settings."
    511     }
    512   ]
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs