scan.json (28956B)
1 { 2 "paper": { 3 "title": "Agent-as-a-Judge: Evaluate Agents with Agents", 4 "authors": [ 5 "Mingchen Zhuge", 6 "Changsheng Zhao", 7 "Dylan R. Ashley", 8 "Wenyi Wang", 9 "Dmitrii Khizbullin", 10 "Yunyang Xiong", 11 "Zechun Liu", 12 "Ernie Chang", 13 "Raghuraman Krishnamoorthi", 14 "Yuandong Tian", 15 "Yangyang Shi", 16 "Vikas Chandra", 17 "Jurgen Schmidhuber" 18 ], 19 "year": 2024, 20 "venue": "arXiv preprint", 21 "arxiv_id": "2410.10934" 22 }, 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper provides a GitHub link: https://github.com/metauto-ai/agent-as-a-judge listed in the header. This is a working URL to the project code." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The DevAI dataset is released on HuggingFace: https://huggingface.co/devai-benchmark, as stated in the paper header. The dataset is publicly available." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No requirements.txt, Dockerfile, or conda environment file is described in the paper. The trajectory samples show Python dependencies (openhands runtime) but no formal environment specification for reproducing the Agent-as-a-Judge experiments is provided." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper provides a GitHub link but does not include step-by-step reproduction instructions within the paper itself. No section describing how to run the benchmarks or replicate the results is present." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Results in Tables 2 and 3 are reported as point estimates (e.g., '90.44% alignment rate') with no confidence intervals or error bars. No uncertainty quantification is provided for the main results." 
51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper makes comparative claims (Agent-as-a-Judge outperforms LLM-as-a-Judge) but no statistical significance tests (e.g., t-tests, bootstrap tests) are applied. Differences are compared as raw numbers only." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "While absolute differences are noted (e.g., Agent-as-a-Judge at 90.44% vs. LLM-as-a-Judge at 70.76%), no standardized effect sizes (Cohen's d, odds ratios) are reported." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "The DevAI benchmark has only 55 tasks and 365 requirements. No power analysis or justification for why 55 tasks is sufficient to support the claims made is provided." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "All results appear to be single-run point estimates. No standard deviation, variance across runs, or confidence measures are reported for alignment rates or other metrics." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "The paper compares Agent-as-a-Judge against LLM-as-a-Judge (Table 3) as a direct baseline and against Human-as-a-Judge as the gold standard. Multiple baselines are included." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "LLM-as-a-Judge (Zheng et al., 2024) is a contemporary baseline from 2024, and the human evaluation is the relevant gold standard. The three developer agents (MetaGPT, GPT-Pilot, OpenHands) are current open-source frameworks." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Section 4.3 and Appendix K present extensive component ablation studies showing the impact of adding ask, graph, read, locate, retrieve, search, planning, and memory modules. 
Table 4 and Tables 5-7 show results." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper uses multiple metrics including Alignment Rate, Judge Shift, PR Curves (precision-recall), Requirements Met (independent and dependency-aware), Task Solve Rate, and Self-Termination rate." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": true, 97 "justification": "Three expert human evaluators performed extensive manual evaluation (86.5 hours total) of the developer agents' outputs, with two rounds including a consensus-building round (Section 3). This is the ground truth the automated judges are compared against." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": false, 102 "justification": "The paper benchmarks on the full DevAI dataset of 55 tasks with no train/test split. All 55 tasks are used for evaluation, and the ablation studies for Agent-as-a-Judge components appear to be on the same data used to select the final configuration." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": false, 107 "justification": "Results are reported per developer agent (MetaGPT, GPT-Pilot, OpenHands) but not broken down by task category (supervised learning, reinforcement learning, computer vision, NLP, etc.) despite these categories being defined in the benchmark." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Appendix F discusses qualitative failure modes of the developer agents (e.g., MetaGPT not saving files, OpenHands skipping code execution). Section 4.1 discusses failure modes of the Agent-as-a-Judge components (planning instability, memory chain errors)."
113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Ablation studies in Section 4.3 and Appendix K show that adding the search, planning, and memory components actually hurt performance relative to the baseline configuration. These negative ablation results are reported and analyzed." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims Agent-as-a-Judge 'dramatically outperforms LLM-as-a-Judge and is as reliable as our human evaluation baseline.' Table 3 shows alignment rates of ~90% for Agent-as-a-Judge vs. ~68-70% for LLM-as-a-Judge, with human individual evaluators averaging 85-89%. The claim is supported by the results." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper makes causal claims through ablation studies (e.g., 'adding the graph component increases performance to 75.95%'). The ablation design is sequential addition rather than factorial, so component interactions are not fully isolated. Additionally, ablations are run on OpenHands only, not validated across all three developer agents." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper frames Agent-as-a-Judge as a general framework for evaluating agentic systems but validates it only on 55 code-generation tasks in AI development. The abstract and conclusion make broad statements ('concrete step forward for modern agentic systems') without adequately bounding results to this narrow setting." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not discuss alternative explanations for why Agent-as-a-Judge outperforms LLM-as-a-Judge. 
For example: whether the improvement is due to the agent's access to workspace files (not available to vanilla LLM judges), or whether a more capable LLM without agentic features would achieve similar gains. The confound of access level is partially noted (gray-box vs. black-box) but not systematically disentangled." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper specifies 'gpt-4o-2024-05-13' as the backend LLM for all three developer agents and Agent-as-a-Judge (Section 2.3, Table 1, and sample trajectories). This is a specific model version with a date identifier." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Appendix L provides the actual system prompts for the Agent-as-a-Judge system, including the system prompt (L.1), locate module prompt (L.2), retrieve module prompt (L.3), ask module prompt (L.4), and locate module user prompt (L.5). Full prompt text is provided." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for the LLM calls made by Agent-as-a-Judge or the developer agents. These settings are not disclosed." 157 }, 158 "scaffolding_described": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 4.1 and Appendix C describe the Agent-as-a-Judge scaffold in detail, including all eight modular components (graph, locate, read, search, retrieve, ask, memory, planning) with their roles. Figures 6 and 9 show the pipeline diagrams."
162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Appendix E documents the DevAI dataset creation pipeline in detail: manually drafted queries (E.1), setting judging criteria (E.2), building dependency DAGs (E.3), refinement rounds (E.4), dataset analysis (E.5), auxiliary information (E.6). The filtering and construction pipeline is described with criteria." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "There is no dedicated limitations or threats-to-validity section. The discussion touches on certain limitations (e.g., 'a perfect Agent-as-a-Judge is not the focus') but there is no labeled section with substantive limitation discussion." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No dedicated threats-to-validity discussion is present. The paper briefly notes limitations like the benchmark being small-scale and current agentic systems being limited, but does not systematically enumerate specific threats to the validity of its conclusions." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The scope is not explicitly bounded. The paper evaluates only 55 AI development tasks in Python but makes general claims about Agent-as-a-Judge as a framework for evaluating agentic systems. No explicit statements about what domains, tasks, or agent types the results do NOT generalize to." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "The DevAI dataset is released on HuggingFace (https://huggingface.co/devai-benchmark). The paper states the dataset includes tasks, requirements, and preferences in JSON format (Appendix E.7 shows sample). Raw benchmark data is publicly available." 
191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Appendix E documents the data collection process: expert annotators manually crafted queries targeting AI development tasks, set binary requirements, built dependency DAGs, and performed two rounds of review and refinement. The process is described in reasonable detail." 196 }, 197 "recruitment_methods_described": { 198 "applies": true, 199 "answer": false, 200 "justification": "Appendix H states 'We recruited three AI experts from the authors to perform human evaluation.' The evaluators are co-authors, which introduces a conflict of interest. Their selection process and qualifications beyond 'AI experts' are not described. This is a notable gap." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The pipeline from task creation to final evaluation is documented: draft queries → set requirements → build dependency DAGs → refinement rounds → execute with developer agents (with 1800-second time limit and instrumentation) → collect workspaces and trajectories. The flow is traceable." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "The acknowledgements section discloses funding from 'King Abdullah University of Science and Technology (KAUST) - Center of Excellence for Generative AI under award number 5940 and the SDAIA-KAUST Center of Excellence in Data Science and Artificial Intelligence.'" 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly stated on the first page: Meta AI (Yunyang Xiong, Zechun Liu, Ernie Chang, Raghuraman Krishnamoorthi, Yuandong Tian, Yangyang Shi, Vikas Chandra, Changsheng Zhao) and KAUST (Mingchen Zhuge, Dylan Ashley, Wenyi Wang, Dmitrii Khizbullin, Jurgen Schmidhuber). 
The paper uses GPT-4o (OpenAI), not Meta or KAUST products." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "KAUST funding is independent of the outcome — neither KAUST nor the SDAIA-KAUST center has a financial stake in Agent-as-a-Judge outperforming LLM-as-a-Judge or in the specific developer agents evaluated. The funder is an academic institution." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests statement or declaration of financial interests (patents, equity, startups) is present in the paper. Absence of disclosure is not the same as absence of conflict." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "The paper uses gpt-4o-2024-05-13 as the backend but does not state its training data cutoff. The DevAI benchmark is newly created in 2024 and likely post-dates training cutoffs, but this is not explicitly confirmed." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "The paper does not discuss whether gpt-4o's training data could overlap with tasks similar to those in DevAI. The benchmark is novel and hand-crafted, which reduces risk, but this is not formally addressed." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "DevAI is a newly created benchmark, likely created after gpt-4o's training cutoff, but the paper does not explicitly confirm this or discuss contamination risk. No analysis of whether any DevAI tasks appear online is provided." 
245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "The three human evaluators are co-authors performing annotation work as part of their research contribution, not external human participants in a human subjects study. Pre-registration requirements are structurally inapplicable to co-author annotation work." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "The evaluators are co-author researchers, not recruited human participants. IRB review applies to research conducted on human subjects, not to co-authors performing their research roles." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "Demographics reporting applies to recruited human participants whose characteristics could affect generalizability. Co-author annotators performing a methodological role do not constitute human participants requiring demographic characterization." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "Inclusion/exclusion criteria apply to studies recruiting participants from a population. The evaluators are self-selected co-authors contributing to their own paper, not recruited from a participant pool." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "This is a repeated-measures observational study where all evaluators assess all outputs. Randomization is not applicable to this evaluation design." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "Co-author annotators performing a research methodology role do not constitute human participants in a controlled experiment requiring blinding. While blinding would have strengthened the evaluation, the human_studies blinding criterion is structurally inapplicable to co-author annotation work." 
277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "All three evaluators completed both rounds of evaluation. The paper reports final hours for each evaluator (16.5, 19.5, 22.0 for round 1; 9.5 together for round 2), so attrition is not an issue — all evaluators participated fully." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": true, 288 "justification": "Section 4.4 explicitly reports that Agent-as-a-Judge cost $30.58 in API calls and took 118.43 minutes, while Human-as-a-Judge cost ~$1,297.50 and took 86.5 hours. Table 1 also reports per-agent average costs ($1.19 for MetaGPT, $3.92 for GPT-Pilot, $6.38 for OpenHands)." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": true, 293 "justification": "Section 2.3 states 'a full evaluation on DevAI with one of these three took around 210.65 USD and 14 hours to perform.' Section 4.4 reports LLM-as-a-Judge took 10.99 minutes and cost $29.63. Compute budgets are stated for all major experimental components." 294 } 295 } 296 }, 297 "claims": [ 298 { 299 "claim": "Agent-as-a-Judge achieves approximately 90% alignment with human consensus evaluation, outperforming LLM-as-a-Judge which achieves approximately 70% alignment.", 300 "evidence": "Table 3 shows alignment rates of 88.52%, 83.88%, 90.44% for Agent-as-a-Judge (black-box) vs. 84.15%, 65.30%, 60.38% for LLM-as-a-Judge (black-box) across the three developer agents, evaluated against human consensus.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "Agent-as-a-Judge performs comparably to or better than individual human evaluators.", 305 "evidence": "Table 3 shows the average alignment rate of individual human evaluators is 89.34%, 84.88%, 85.70% across agents, while Agent-as-a-Judge achieves 88.52%, 83.88%, 90.44% (black-box). 
PR curves in Figure 7 show Agent-as-a-Judge outperforms any single human evaluator on OpenHands.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "Agent-as-a-Judge saves 97.72% of time and 97.64% of cost compared to human evaluation.", 310 "evidence": "Section 4.4: Human-as-a-Judge cost ~$1,297.50 and 86.5 hours; Agent-as-a-Judge cost $30.58 and 118.43 minutes. The savings are computed from these reported figures.", 311 "supported": "strong" 312 }, 313 { 314 "claim": "GPT-Pilot and OpenHands each satisfy approximately 29% of DevAI requirements, with only one task fully solved by each.", 315 "evidence": "Table 2: Requirements Met (D) are 28.96% and 28.68% for GPT-Pilot and OpenHands respectively; Task Solve Rate is 1.81% for each (1 out of 55 tasks).", 316 "supported": "strong" 317 }, 318 { 319 "claim": "Human evaluators disagree with each other at rates ranging from 10-30%, illustrating the unreliability of single-human evaluation.", 320 "evidence": "Section 3.2 and Figure 4 report disagreement rates between pairs of evaluators ranging from ~10% to ~30%, and individual evaluator error rates up to 23.77% (cn9o on GPT-Pilot).", 321 "supported": "strong" 322 } 323 ], 324 "methodology_tags": [ 325 "benchmark-eval", 326 "case-study" 327 ], 328 "key_findings": "The paper introduces Agent-as-a-Judge, a framework where agentic systems evaluate other agentic systems, and validates it on DevAI, a new benchmark of 55 AI development tasks. Agent-as-a-Judge achieves approximately 90% alignment with human expert consensus evaluation, substantially outperforming LLM-as-a-Judge (approximately 70% alignment) while reducing evaluation cost by 97.64% and time by 97.72%. Ablation studies show that the graph, read, and locate modules are the most important components, while search, planning, and memory modules did not improve performance in this setting. 
DevAI reveals that current leading agentic frameworks (MetaGPT, GPT-Pilot, OpenHands) satisfy only 22-45% of task requirements and complete only 0-2% of tasks end-to-end.", 329 "red_flags": [ 330 { 331 "flag": "Evaluator conflict of interest", 332 "detail": "The three human evaluators are recruited 'from the authors,' making them co-authors evaluating work they have a stake in. This introduces a potential conflict of interest in the ground truth used to validate Agent-as-a-Judge. Their evaluations form the gold standard against which the automated judges are compared." 333 }, 334 { 335 "flag": "No train/test split for ablations", 336 "detail": "The ablation studies in Section 4.3 use the same 55 DevAI tasks to select Agent-as-a-Judge components and then report performance. The configuration is optimized and evaluated on the same data without a held-out test set, risking overfitting of the component selection to this specific benchmark." 337 }, 338 { 339 "flag": "Small benchmark with narrow scope", 340 "detail": "The benchmark has only 55 tasks focused specifically on AI development in Python. The paper makes broad claims about Agent-as-a-Judge as a general framework for agentic evaluation, but the proof-of-concept is validated on a single narrow domain with a small task count. No statistical power justification is provided." 341 }, 342 { 343 "flag": "No statistical significance testing", 344 "detail": "The paper's core claim that Agent-as-a-Judge outperforms LLM-as-a-Judge is based on comparing alignment rates (e.g., 90.44% vs. 70.76%) without any statistical tests. With 365 binary requirements, some of these differences may be statistically significant, but this is never verified." 345 }, 346 { 347 "flag": "Confounded comparison between judge types", 348 "detail": "Agent-as-a-Judge can access the workspace files and execute code to verify requirements, while the baseline LLM-as-a-Judge only has trajectory text or limited context. 
The performance difference may be explained primarily by information access rather than the agentic architecture. The gray-box vs. black-box comparison partially addresses this but doesn't fully disentangle the confound." 349 }, 350 { 351 "flag": "Missing limitations section", 352 "detail": "The paper has no dedicated limitations or threats-to-validity section. A method claiming to replace human evaluation for agentic systems should clearly bound the conditions under which this claim holds." 353 } 354 ], 355 "cited_papers": [ 356 { 357 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 358 "authors": [ 359 "Lianmin Zheng", 360 "Wei-Lin Chiang", 361 "Ying Sheng" 362 ], 363 "year": 2024, 364 "relevance": "Foundational LLM-as-a-Judge framework that Agent-as-a-Judge extends; directly relevant to evaluation methodology for LLM-based systems." 365 }, 366 { 367 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 368 "authors": [ 369 "Carlos E Jimenez", 370 "John Yang", 371 "Alexander Wettig" 372 ], 373 "year": 2023, 374 "arxiv_id": "2310.06770", 375 "relevance": "Major agentic code generation benchmark that Agent-as-a-Judge is positioned as an alternative or complement to; widely used evaluation target in the field." 376 }, 377 { 378 "title": "OpenDevin: An Open Platform for AI Software Developers as Generalist Agents", 379 "authors": [ 380 "Xingyao Wang", 381 "Boxuan Li", 382 "Yufan Song" 383 ], 384 "year": 2024, 385 "arxiv_id": "2407.16741", 386 "relevance": "One of three developer agents evaluated on DevAI; represents state-of-the-art agentic code generation." 387 }, 388 { 389 "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework", 390 "authors": [ 391 "Sirui Hong", 392 "Mingchen Zhuge", 393 "Jonathan Chen" 394 ], 395 "year": 2024, 396 "relevance": "One of three developer agents evaluated on DevAI; leading multi-agent code generation framework." 
397 }, 398 { 399 "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation", 400 "authors": [ 401 "Qian Huang", 402 "Jian Vora", 403 "Percy Liang", 404 "Jure Leskovec" 405 ], 406 "year": 2024, 407 "relevance": "Related benchmark for evaluating agentic systems on ML tasks; compared against DevAI in the paper." 408 }, 409 { 410 "title": "MLE-Bench: Evaluating Machine Learning Agents on Machine Learning Engineering", 411 "authors": [ 412 "Jun Shern Chan", 413 "Neil Chowdhury", 414 "Oliver Jaffe" 415 ], 416 "year": 2024, 417 "relevance": "Related benchmark for ML engineering agent evaluation; discussed as having similar limitations to other existing benchmarks." 418 }, 419 { 420 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 421 "authors": [ 422 "John Yang", 423 "Carlos E Jimenez", 424 "Alexander Wettig" 425 ], 426 "year": 2024, 427 "arxiv_id": "2405.15793", 428 "relevance": "State-of-the-art software engineering agent; represents the class of systems that Agent-as-a-Judge is designed to evaluate." 429 }, 430 { 431 "title": "HumanEval: Evaluating Large Language Models Trained on Code", 432 "authors": [ 433 "Mark Chen", 434 "Jerry Tworek", 435 "Heewoo Jun" 436 ], 437 "year": 2021, 438 "arxiv_id": "2107.03374", 439 "relevance": "Classic code generation benchmark discussed as inadequate for evaluating modern agentic systems; motivation for DevAI." 440 }, 441 { 442 "title": "Data Interpreter: An LLM Agent for Data Science", 443 "authors": [ 444 "Sirui Hong", 445 "Yizhang Lin", 446 "Bangbang Liu" 447 ], 448 "year": 2024, 449 "arxiv_id": "2402.18679", 450 "relevance": "The MetaGPT variant used in the DevAI evaluation; represents agentic data science capability." 
451 }, 452 { 453 "title": "ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate", 454 "authors": [ 455 "Chi-Min Chan", 456 "Weize Chen", 457 "Yusheng Su" 458 ], 459 "year": 2023, 460 "arxiv_id": "2308.07201", 461 "relevance": "Related multi-LLM evaluation framework; compared against as an alternative approach to automated agent evaluation." 462 }, 463 { 464 "title": "Let's Verify Step by Step", 465 "authors": [ 466 "Hunter Lightman", 467 "Vineet Kosaraju", 468 "Yura Burda" 469 ], 470 "year": 2023, 471 "arxiv_id": "2305.20050", 472 "relevance": "Process-supervised reward models (PRMs) discussed as a key application of Agent-as-a-Judge intermediate feedback." 473 }, 474 { 475 "title": "Language Agents as Optimizable Graphs", 476 "authors": [ 477 "Mingchen Zhuge", 478 "Wenyi Wang", 479 "Louis Kirsch" 480 ], 481 "year": 2024, 482 "arxiv_id": "2402.16823", 483 "relevance": "Graph-based agent optimization framework (GPTSwarm); closely related to the graph module in Agent-as-a-Judge design." 484 }, 485 { 486 "title": "Agentless: Demystifying LLM-based Software Engineering Agents", 487 "authors": [ 488 "Chunqiu Steven Xia", 489 "Yinlin Deng", 490 "Soren Dunn", 491 "Lingming Zhang" 492 ], 493 "year": 2024, 494 "arxiv_id": "2407.01489", 495 "relevance": "Demonstrates LLMs solving SWE-Bench tasks without advanced agents; cited as evidence that SWE-Bench has been overfitted." 496 } 497 ] 498 }