scan.json (27473B)
1 { 2 "paper": { 3 "title": "Automatically Benchmarking LLM Code Agents through Agent-driven Annotation and Evaluation", 4 "authors": [ 5 "Lingyue Fu", 6 "Bolun Zhang", 7 "Hao Guan", 8 "Yaoming Zhu", 9 "Lin Qiu", 10 "Weiwen Liu", 11 "Xuezhi Cao", 12 "Xunliang Cai", 13 "Weinan Zhang", 14 "Yong Yu" 15 ], 16 "year": 2025, 17 "venue": "arXiv", 18 "arxiv_id": "2510.24358" 19 }, 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The paper provides a GitHub URL for the PRDBench evaluation code: https://github.com/AGI-Eval-Official/PRDBench (Section 3.1 footnote). A second URL is provided for the minimal code agent: https://github.com/AGI-Eval-Official/Minimal-CodeAgent (Section 4.2 footnote)." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper states 'PRDBench data will be available soon' (Section 3.1 footnote), which is a promise of future release. Only the evaluation code is currently available, not the benchmark data itself." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. Section 4.3 mentions 'a Python virtual environment that contains necessary and useful packages' but does not list specific package versions or provide an environment file." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are provided in the paper. While the GitHub repository is referenced, the paper itself does not contain a 'Reproducing Results' section or specific commands to replicate the experiments." 
41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All main results in Table 3 are reported as single point estimates (e.g., '55.81%', '45.50%') with no confidence intervals, error bars, or uncertainty measures." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper makes comparative claims (e.g., 'commercial agents typically outperform minimal agents') but does not use any statistical significance tests. Comparisons are based solely on comparing raw numbers." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper reports raw pass rates and differences (e.g., the 'Enhance' column in Table 3 shows percentage point differences), but no standardized effect sizes like Cohen's d or odds ratios are reported." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The benchmark consists of 50 tasks and the human alignment study uses 16 code submissions, but no justification for these sample sizes is provided and no power analysis is discussed." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Results are single-run numbers. No variance, standard deviation, or spread across multiple experimental runs is reported for the main results in Table 3. Variance is mentioned once for alignment scores (Section 4.6.1) but not for the core benchmark results." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper evaluates 8 code agents (4 minimal, 4 commercial) and compares PRDBench against prior benchmarks in Table 1 (SWE-Bench, MLEBench, DevAI, PaperBench)." 
75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The evaluated agents and models are all current: GPT-5, Claude-3.7-Sonnet, Gemini-2.5-Pro, Qwen3-Coder, Claude Code, CodeX, Gemini CLI, and Qwen Code. These are state-of-the-art systems as of 2025." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 4.7 (Free Development) serves as an ablation-like experiment, comparing performance with and without fixed interfaces/scaffolding. The comparison of minimal vs. commercial agents using the same backbone LLM (Section 4.4, finding 2) also functions as a component analysis." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "PRDBench evaluates across three test types (unit test, shell interaction, file comparison) with separate pass rates reported (Figure 5). The paper also reports development (DEV) and debugging (DEBUG) scores, plus enhancement delta (Table 3), and cost metrics (Table 4)." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": true, 94 "justification": "Section 4.6.1 presents a human alignment study where two annotators scored 282 test cases from 16 code submissions to validate EvalAgent's reliability, achieving 81.56% alignment." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": false, 99 "justification": "There is no mention of a held-out test set or dev/test split. All 50 tasks appear to be used for evaluation directly. Since the benchmark is newly constructed, there is no concern about tuning on dev data, but no explicit separation is described." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Figure 5 provides per-test-type error rates (unit test, shell interaction, file comparison) for each code agent. Figure 4(b) shows domain distribution. Table 5 breaks down human alignment by test type." 
105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 4.5.2 discusses error analysis including specific failure patterns. Section 4.4 finding (3) discusses CodeX's 'dramatic drop in performance in the DEBUG phase due to interface inconsistencies introduced during modification.' Section 4.6.1 discusses EvalAgent failure modes including 'propagation chains.'" 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper reports several negative findings: CodeX experiences performance drops during debugging (Table 3, -6.99 and -3.90 enhancement), and Claude Code scores lower than the minimal ADK-based Claude agent (Section 4.4, finding 2). EvalAgent's instability across projects is also reported honestly." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims about reduced annotation cost, diverse evaluation, and Agent-as-a-Judge effectiveness are supported. Table 1 compares benchmarks, Table 3 shows code agent results, and Table 5 validates EvalAgent alignment. The claim of '50 real-world Python projects across 20 domains' is confirmed in Section 4.1." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": false, 126 "justification": "Section 4.4 makes causal claims like 'The coding ability of the underlying LLM significantly impacts the development performance of the code agent' based on observational correlation between minimal and commercial agent rankings. The claim 'CodeX experiences a dramatic drop in performance... due to interface inconsistencies' is a causal attribution without controlled analysis isolating this factor." 
127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title says 'Automatically Benchmarking LLM Code Agents' broadly, but PRDBench is limited to Python projects only (50 tasks). Section 3.1 acknowledges Python was chosen for 'versatility,' but the generalization to all code agents and languages is not bounded. The abstract claims a 'scalable and robust framework' without caveats." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper does not discuss alternative explanations for its findings. For example, the correlation between minimal and commercial agent rankings could be explained by factors other than LLM capability (e.g., similar prompting strategies). No threats-to-validity section addresses confounds." 137 } 138 }, 139 "setup_transparency": { 140 "model_versions_specified": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper uses marketing names without specific version identifiers: 'Claude-3.7-Sonnet', 'Gemini-2.5-Pro', 'GPT-5', 'GPT-4.1', 'GPT-4o', 'Qwen3-Coder-480B-A35B'. No snapshot dates or API version identifiers are provided for any of these models." 144 }, 145 "prompts_provided": { 146 "applies": true, 147 "answer": true, 148 "justification": "Appendix B provides full prompts for code agents (Round 1, Round 2, and free development) and for EvalAgent, including the complete prompt text with placeholders and expected output format." 149 }, 150 "hyperparameters_reported": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 4.3 states: 'we set temperature to be 0.1, max token identical to their official APIs' setting. We set top-p to 1.0, Top-k to be 100, and presence penalty to be default to the API.'" 154 }, 155 "scaffolding_described": { 156 "applies": true, 157 "answer": true, 158 "justification": "The paper describes the agent architecture in detail. 
Section 3.3 describes EvalAgent's six core tools (file reading/writing, command-line execution, image handling, judge tool). Section 4.2 describes minimal agents using ADK with 'essential tools for file manipulation, bash scripting, and Python execution.' The commercial agents are treated as black boxes, which is appropriate." 159 }, 160 "data_preprocessing_documented": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section 3.1 describes the seed task filtering criteria: '(1) the task can be fully implemented in Python; and (2) all datasets required for the task are publicly accessible.' Section 3.2 describes the full 5-step data production workflow with human inspection and iterative refinement. Section 4.1 describes how 50 tasks were selected from filtered candidates across 20 domains." 164 } 165 }, 166 "limitations_and_scope": { 167 "limitations_section_present": { 168 "applies": true, 169 "answer": false, 170 "justification": "There is no dedicated Limitations or Threats to Validity section in the paper. The conclusion (Section 5) is brief and does not discuss any limitations." 171 }, 172 "threats_to_validity_specific": { 173 "applies": true, 174 "answer": false, 175 "justification": "No specific threats to validity are discussed anywhere in the paper. There is no analysis of how the choice of Python-only tasks, the specific annotator pool, or the use of agent-generated scaffolding might bias results." 176 }, 177 "scope_boundaries_stated": { 178 "applies": true, 179 "answer": false, 180 "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of what populations, languages, or settings are excluded from the claims. The conclusion presents PRDBench as a general 'scalable and realistic foundation' without caveats." 
181 } 182 }, 183 "data_integrity": { 184 "raw_data_available": { 185 "applies": true, 186 "answer": false, 187 "justification": "The benchmark data is not yet available ('PRDBench data will be available soon'). Only the evaluation code is released. Raw experimental outputs (agent logs, generated code) are not available for independent verification." 188 }, 189 "data_collection_described": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 3.1 describes seed task sources ('real-world project requirements, including user requests from end-to-end AI product development platforms, academic theses and projects') and Section 3.2 provides the full 5-step agent-driven data production workflow." 193 }, 194 "recruitment_methods_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Appendix A describes the 8 annotators (5 full-time, 3 part-time), listing their educational backgrounds, majors, degree levels, and years of developer experience in Table 7. Compensation method is also mentioned: 'All annotators receive daily wages (part-time) or monthly salaries (full-time) according to local labor regulations.'" 198 }, 199 "data_pipeline_documented": { 200 "applies": true, 201 "answer": true, 202 "justification": "The 5-step pipeline (Section 3.2) is documented with clear stages: seed task initialization, scaffold and criteria generation, human inspection, agent-based fix and refinement, and scaffold removal. Section 4.1 provides final statistics (50 tasks, 1,262 scoring points broken down by type)." 203 } 204 }, 205 "conflicts_of_interest": { 206 "funding_disclosed": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, corporate sponsors, or funding agencies." 
210 }, 211 "affiliations_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Author affiliations are clearly listed: Shanghai Jiao Tong University, Meituan, and AGI-Eval. Meituan is a technology company, and this affiliation is disclosed." 215 }, 216 "funder_independent_of_outcome": { 217 "applies": true, 218 "answer": false, 219 "justification": "No funding source is disclosed, so independence cannot be assessed. Authors from Meituan (a tech company) could have a commercial interest in benchmark construction tools, but no conflict of interest statement addresses this." 220 }, 221 "financial_interests_declared": { 222 "applies": true, 223 "answer": false, 224 "justification": "No competing interests statement or financial interest declaration is present in the paper." 225 } 226 }, 227 "contamination": { 228 "training_cutoff_stated": { 229 "applies": true, 230 "answer": false, 231 "justification": "The paper evaluates multiple LLMs (GPT-5, Claude-3.7-Sonnet, Gemini-2.5-Pro, Qwen3-Coder) on the PRDBench benchmark but does not state the training data cutoff dates for any of these models." 232 }, 233 "train_test_overlap_discussed": { 234 "applies": true, 235 "answer": false, 236 "justification": "Although PRDBench is newly constructed (reducing contamination risk), the paper does not discuss whether any of the seed tasks, PRDs, or evaluation criteria could have appeared in model training data. No contamination analysis is provided." 237 }, 238 "benchmark_contamination_addressed": { 239 "applies": true, 240 "answer": false, 241 "justification": "PRDBench is a new benchmark, which reduces contamination risk, but the paper does not explicitly discuss this advantage or verify that benchmark materials were not available before model training cutoffs. The seed tasks come from 'real-world project requirements' and 'academic theses' which may have been publicly available." 
242 } 243 }, 244 "human_studies": { 245 "pre_registered": { 246 "applies": false, 247 "answer": false, 248 "justification": "The human alignment study in Section 4.6.1 involves annotators evaluating EvalAgent outputs, not a human subjects study in the traditional sense. The annotators are performing quality assurance work, not serving as experimental participants." 249 }, 250 "irb_or_ethics_approval": { 251 "applies": false, 252 "answer": false, 253 "justification": "The annotators are hired workers performing annotation tasks, not experimental participants. IRB approval is not typically required for this type of work." 254 }, 255 "demographics_reported": { 256 "applies": false, 257 "answer": false, 258 "justification": "The annotators are workers performing annotation tasks, not study participants. Their backgrounds are reported in Appendix A for transparency but this is not a human subjects study." 259 }, 260 "inclusion_exclusion_criteria": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in a research study sense. The annotators are hired workers, not experimental subjects." 264 }, 265 "randomization_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants study requiring randomization. The paper evaluates code agents, not human behavior." 269 }, 270 "blinding_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants study requiring blinding. The annotators are validating agent outputs, not serving as experimental subjects." 274 }, 275 "attrition_reported": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants study with potential attrition. The annotators are hired workers completing assigned tasks." 
279 } 280 }, 281 "cost_and_practicality": { 282 "inference_cost_reported": { 283 "applies": true, 284 "answer": true, 285 "justification": "Table 4 reports detailed cost statistics including time (seconds), input/output token counts, and lines of code for each agent in both rounds. Table 6 reports EvalAgent costs: average 425.62 seconds, 1,242,440 input tokens, 8,825 output tokens, and $2.68 API cost per problem." 286 }, 287 "compute_budget_stated": { 288 "applies": true, 289 "answer": true, 290 "justification": "Table 4 provides comprehensive compute statistics for all agents including wall-clock time and token consumption for both development and debugging phases. Table 6 quantifies EvalAgent's compute costs. Section 4.6.2 also notes '0.5 to 1 hour for human annotators' as a comparison point." 291 } 292 } 293 }, 294 "claims": [ 295 { 296 "claim": "PRDBench reduces annotation complexity so that annotators with undergraduate-level knowledge can complete annotation, averaging only 8 hours per project.", 297 "evidence": "Section 1 states annotators need only 'undergraduate-level knowledge in software engineering related fields' and 'an average of only eight hours needed to finish the scaffolding and metrics for each project.' 
Appendix A shows annotator backgrounds including Bachelor's degree holders.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "The coding ability of the underlying LLM significantly impacts code agent development performance, as minimal and commercial agent rankings are consistent.", 302 "evidence": "Table 3 shows that the relative ranking of minimal agents (GPT-5 > Claude > Qwen3-Coder > Gemini) is broadly consistent with commercial agents, though with notable exceptions (Claude Code scores lower than minimal Claude agent).", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "Debugging and initial development require distinct capabilities from code agents.", 307 "evidence": "Table 3 shows CodeX drops from 56.23% (DEV) to 50.24% (DEBUG), while most minimal agents improve. Section 4.4 finding (3) attributes this to 'interface inconsistencies introduced during modification.'", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "EvalAgent achieves 81.56% alignment with human annotators across 282 test cases.", 312 "evidence": "Table 5 reports alignment rates: 81.56% total, 79.44% unit test, 82.55% shell interaction, 84.62% file comparison, based on 16 randomly selected code submissions scored by two annotators.", 313 "supported": "moderate" 314 }, 315 { 316 "claim": "PRDBench effectively differentiates code agents in both fixed inference and free development modes.", 317 "evidence": "Figure 6 shows performance variance of 0.028 for fixed inference vs 0.011 for free development, with 'relative ranking of code agents remains fairly stable between the two settings' (Section 4.7).", 318 "supported": "moderate" 319 }, 320 { 321 "claim": "GPT-5 is the top-performing minimal agent with 55.81% pass rate in development and 60.15% in debugging.", 322 "evidence": "Table 3 shows the GPT-5 minimal agent achieves the highest scores among the minimal agents in both the DEV (55.81%) and DEBUG (60.15%) phases.", 323 "supported": "strong" 324 } 325 ], 326 "methodology_tags": [ 327
"benchmark-eval" 328 ], 329 "key_findings": "PRDBench is a new benchmark of 50 Python projects across 20 domains that uses agent-driven annotation to reduce benchmark construction costs, requiring only undergraduate-level annotators averaging 8 hours per project. Eight code agents are evaluated; among the minimal agents, GPT-5 achieves the highest pass rate (55.81% DEV, 60.15% DEBUG). Debugging and development require distinct agent capabilities, with some agents (CodeX) experiencing performance drops during debugging. The Agent-as-a-Judge (EvalAgent) achieves 81.56% alignment with human scoring across 282 test cases at $2.68 per evaluation, though with substantial variance across projects.", 330 "red_flags": [ 331 { 332 "flag": "No limitations section", 333 "detail": "The paper has no dedicated limitations or threats-to-validity section. Significant limitations such as Python-only scope, small benchmark size (50 tasks), and annotator selection are not discussed." 334 }, 335 { 336 "flag": "No statistical rigor in comparisons", 337 "detail": "All benchmark results are single-run point estimates with no confidence intervals, error bars, significance tests, or variance across runs. Claims of one agent outperforming another are based solely on comparing two numbers." 338 }, 339 { 340 "flag": "Data not yet released", 341 "detail": "The benchmark data is described as 'will be available soon' — the paper's core contribution (the benchmark itself) is not independently verifiable at publication time." 342 }, 343 { 344 "flag": "Potential conflict of interest", 345 "detail": "Authors from Meituan (a major tech company) and AGI-Eval, with no funding disclosure or competing interests statement. The paper evaluates commercial products without disclosing potential conflicts." 346 }, 347 { 348 "flag": "Small human alignment sample", 349 "detail": "The EvalAgent validation study uses only 16 code submissions (282 test cases) with only 2 annotators.
The high variance reported (std dev 27.83%, range 0-100%) undermines the 81.56% alignment headline, yet no discussion of this limitation appears." 350 }, 351 { 352 "flag": "Unbounded generalization claims", 353 "detail": "The paper presents PRDBench as a 'scalable and robust framework' for code agent evaluation generally, but tests only Python projects. The title claims to benchmark 'LLM Code Agents' without language qualification." 354 } 355 ], 356 "cited_papers": [ 357 { 358 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 359 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 360 "year": 2023, 361 "arxiv_id": "2310.06770", 362 "relevance": "Foundational benchmark for evaluating code agents on real GitHub issues, directly compared against PRDBench." 363 }, 364 { 365 "title": "MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering", 366 "authors": ["Jun Shern Chan", "Neil Chowdhury", "Oliver Jaffe"], 367 "year": 2024, 368 "arxiv_id": "2410.07095", 369 "relevance": "Benchmark for evaluating ML engineering agents, compared against PRDBench in Table 1." 370 }, 371 { 372 "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research", 373 "authors": ["Giulio Starace", "Oliver Jaffe", "Dane Sherburn"], 374 "year": 2025, 375 "arxiv_id": "2504.01848", 376 "relevance": "High-cost benchmark using PhD-level annotators, directly motivated PRDBench's lower-cost annotation approach." 377 }, 378 { 379 "title": "Agent-as-a-Judge: Evaluate Agents with Agents", 380 "authors": ["Mingchen Zhuge", "Changsheng Zhao", "Dylan Ashley"], 381 "year": 2024, 382 "arxiv_id": "2410.10934", 383 "relevance": "Introduces the Agent-as-a-Judge paradigm that PRDBench adopts for evaluation, as well as the DevAI benchmark that PRDBench is compared against in Table 1."
384 }, 385 { 386 "title": "CoreCodeBench: A Configurable Multi-Scenario Repository-Level Benchmark", 387 "authors": ["Lingyue Fu", "Hao Guan", "Bolun Zhang"], 388 "year": 2025, 389 "arxiv_id": "2507.05281", 390 "relevance": "Repository-level code benchmark from same research group, related benchmark construction approach." 391 }, 392 { 393 "title": "Automated Benchmark Generation for Repository-Level Coding Tasks", 394 "authors": ["Konstantinos Vergopoulos", "Mark Niklas Müller", "Martin Vechev"], 395 "year": 2025, 396 "arxiv_id": "2503.07701", 397 "relevance": "Automated benchmark generation approach for repository-level coding, directly related methodology." 398 }, 399 { 400 "title": "CodeVisionary: An Agent-based Framework for Evaluating Large Language Models in Code Generation", 401 "authors": ["Xinchen Wang", "Pengfei Gao", "Chao Peng"], 402 "year": 2025, 403 "arxiv_id": "2504.13472", 404 "relevance": "Agent-based LLM code evaluation framework, explores LLM-as-judge for code generation assessment." 405 }, 406 { 407 "title": "Deep-Bench: Deep Learning Benchmark Dataset for Code Generation", 408 "authors": ["Alireza Daghighfarsoodeh", "Chung-Yu Wang"], 409 "year": 2025, 410 "arxiv_id": "2502.18726", 411 "relevance": "Benchmark for code generation in deep learning tasks, complementary evaluation of LLM coding capabilities." 412 }, 413 { 414 "title": "ProjectEval: A Benchmark for Programming Agents Automated Evaluation on Project-Level Code Generation", 415 "authors": ["Kaiyuan Liu", "Youcheng Pan"], 416 "year": 2025, 417 "arxiv_id": "2503.07010", 418 "relevance": "Project-level code generation benchmark that simulates user interaction, alternative evaluation approach for code agents." 
419 }, 420 { 421 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 422 "authors": ["Terry Yue Zhuo"], 423 "year": 2025, 424 "arxiv_id": "2406.15877", 425 "relevance": "Benchmark for code generation with diverse function calls, assessing LLM coding capabilities." 426 }, 427 { 428 "title": "SciReplicate-Bench: Benchmarking LLMs in Agent-driven Algorithmic Reproduction from Research Papers", 429 "authors": ["Yanzheng Xiang", "Hanqi Yan"], 430 "year": 2025, 431 "arxiv_id": "2504.00255", 432 "relevance": "Benchmark for agent-driven research paper reproduction, related evaluation of code agent capabilities." 433 }, 434 { 435 "title": "LLM-based Agents Suffer from Hallucinations: A Survey of Taxonomy, Methods, and Directions", 436 "authors": ["Xixun Lin", "Yucheng Ning"], 437 "year": 2025, 438 "arxiv_id": "2509.18970", 439 "relevance": "Survey on LLM agent hallucinations, cited for error propagation chains observed in EvalAgent." 440 } 441 ] 442 }