scan.json (26807B)
1 { 2 "paper": { 3 "title": "CATArena: Evaluating Evolutionary Capabilities of Code Agents via Iterative Tournaments", 4 "authors": [ 5 "Lingyue Fu", 6 "Xin Ding", 7 "Linyue Pan", 8 "Yaoming Zhu", 9 "Shao Zhang", 10 "Lin Qiu", 11 "Weiwen Liu", 12 "Weinan Zhang", 13 "Xuezhi Cao", 14 "Xunliang Cai", 15 "Jiaxin Ding", 16 "Yong Yu" 17 ], 18 "year": 2026, 19 "venue": "arXiv preprint", 20 "arxiv_id": "2510.26852" 21 }, 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper provides the official website https://catarena.ai and references the Google ADK framework at https://github.com/google/adk-python. The framework is presented as a released artifact." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "The paper does not release the tournament logs, match data, or the raw scoring matrices. The six task environments reference existing benchmarks (OIBench, SWE-Perf) but the specific tournament data generated by CATArena is not made publicly available." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper describes Docker environments for SWE-Perf and sandboxed evaluation arenas, but does not provide requirements.txt, Dockerfile, conda environment, or detailed dependency specifications for reproducing the framework." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions are provided. There is no README with commands to run, no 'Reproducing Results' section. The paper describes the framework at a high level but lacks specific instructions for replicating the experiments." 
43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "Results in Tables 2, 11, 12, and 13 report only point estimates (Sbase and Sevo values) with no confidence intervals or error bars. The stability analysis in Appendix E reports standard deviations of rankings across 4 repetitions but does not provide CIs for the main metrics." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper makes comparative claims (e.g., evolutionary capability is 'orthogonal' to static performance, commercial agents do not have decisive advantage in Sevo) but provides no statistical significance tests. SWE-Perf uses Mann-Whitney U internally for speedup calculation, but no significance tests are applied to the paper's own comparative claims." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Effect sizes are reported in context: e.g., Gemini 'low-start, high-growth trajectory in Gomoku (0.25 → +0.156)', Claude Chess regression (-0.061), Qwen OIBench advance from G1=0.10 to G4=0.43. The Sevo metric itself quantifies rate of improvement with baselines provided." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The number of agents (6 minimal + 5 commercial) and number of rounds (N=4 main, N=7 extended) are not justified with any formal reasoning. The four-round window is justified post-hoc by observing diminishing returns, but there is no power analysis or formal justification for the number of agents or repetitions." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": true, 69 "justification": "Appendix E (Table 8) reports standard deviations of agent rankings across 4 independent repetitions of Rounds 1 and 2. The stability analysis explicitly reports ranking variance. DISstd in Table 3 quantifies population score dispersion." 
70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "The paper includes baselines: minimal agents (isolating LLM capabilities) vs. commercial agents (state-of-the-art systems). The Sbase metric serves as a static baseline against which Sevo is compared. Ablation in Table 4 compares full feedback vs. self-reflection-only." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The evaluated agents include contemporary state-of-the-art: Claude-4-Sonnet, GPT-5, Gemini-2.5-Pro, DeepSeek-3.1, Qwen3-Coder-480B, and commercial tools like Claude Code, Codex, Gemini CLI, and Qwen Code. These are current as of 2025-2026." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Table 4 presents an ablation study isolating peer-learning from self-reflection by comparing full feedback (G2) vs. self-reflection-only (G'2). Section 4.4 explicitly labels this as an ablation study." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "The paper uses two core metrics: Sbase (static base performance) and Sevo (evolutionary capability), plus the global performance score Gn. Additional metrics include DISstd, Trendmean, code similarity measures, and HTTP error rates." 92 }, 93 "human_evaluation": { 94 "applies": false, 95 "answer": false, 96 "justification": "This is a benchmark evaluation paper measuring code agent performance through automated tournament outcomes. Human evaluation of agent outputs is not relevant to the claims about evolutionary capability metrics." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "The variant tasks (Section 4.5, Table 11) serve as held-out evaluation: rule perturbations (inverted hand rankings, Chess960, modified OIBench objectives) that invalidate memorized strategies and test generalization to unseen settings." 
102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Results are broken down across all six tasks (Gomoku, Hold'em, Bridge, Chess, OIBench, SWE-Perf) in Tables 2 and 11, and per-agent evolutionary trajectories are shown in Figures 2 and 3. Additional breakdowns by programming language (Table 12) and ML track (Table 13)." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 4.4 discusses three failure modes: Blind Copying, Stubborn Stagnation, and Chaotic Reconstruction. Appendix M discusses the Pommerman failure case where all agents failed to generate functional strategies. HTTP error rates are reported in Appendix F." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Multiple negative results are reported: Chess shows negative Trendmean (-0.74) indicating performance collapse; most agents show negative Sevo in Chess and SWE-Perf; Pommerman is an outright failure for all agents; the ML track shows limited technical depth. The paper explicitly states agents 'struggle to concurrently leverage both peer-learning and self-reflection.'" 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims: (1) evolutionary potential is not strictly correlated with initial proficiency — supported by Table 2 showing Gemini low Sbase/high Sevo; (2) agents struggle to leverage both peer-learning and self-reflection — supported by Table 4; (3) high extensibility and resistance to variance — supported by Tables 11 and 12." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The ablation study in Table 4 provides controlled manipulation to isolate self-reflection vs. peer-learning contributions. The variant task experiments control for memorization effects. 
Most causal language ('driven by feedback', 'enables evolution') is grounded in ablation or controlled comparison." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper claims to evaluate 'evolutionary capabilities of code agents' broadly but tests on only 6 tasks (4 competitive games + 2 optimization benchmarks) with 11 agents. The title and abstract frame this as a general evaluation standard, but the paper does not explicitly bound claims to these specific task types. Claims like 'evolutionary capability is a distinct dimension of intelligence' are not bounded to the tested settings." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper does not discuss alternative explanations for its results. For instance, the weak correlation between Sbase and Sevo could be due to task difficulty confounds, prompt sensitivity, or API non-determinism rather than a genuine orthogonal capability dimension. No threats-to-validity section or consideration of confounds is present." 139 } 140 }, 141 "setup_transparency": { 142 "model_versions_specified": { 143 "applies": true, 144 "answer": false, 145 "justification": "Table 7 lists model families (e.g., 'Claude-4-Sonnet', 'GPT-5', 'Gemini-2.5-Pro', 'DeepSeek-3.1') but does not provide specific API snapshot dates or version identifiers (e.g., no 'gpt-5-0613' equivalent). Marketing names without snapshot dates do not count as specified versions per the schema criteria." 146 }, 147 "prompts_provided": { 148 "applies": true, 149 "answer": true, 150 "justification": "Appendix O states 'Full prompt details are in Tables 18 for the main leaderboard, 19 for ML track and 20 for multilingual track.' The paper references providing the full prompts used for agents in the appendix tables." 
151 }, 152 "hyperparameters_reported": { 153 "applies": true, 154 "answer": true, 155 "justification": "Appendix D states: 'For all LLMs used in our work, we set the temperature to 0.1 and the max tokens to the official API defaults. We set top-p to 1.0, top-k to 100, and the presence penalty to the API default.'" 156 }, 157 "scaffolding_described": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 4.1 describes the minimal agent framework using Google ADK with 'only core tools (e.g., file I/O and scripting).' Commercial agents are described as 'CLI-based commercial agents' with 'proprietary toolchains.' Table 7 details agent types and frameworks. The case study in Appendix I shows concrete tool usage patterns." 161 }, 162 "data_preprocessing_documented": { 163 "applies": true, 164 "answer": true, 165 "justification": "The tournament configurations are detailed in Table 6 (Appendix B) with specific settings for each task: board sizes, number of matches, time limits, repetition counts, scoring rules. The performance matrix construction (Section 3.3) documents how raw match results are transformed into scores." 166 } 167 }, 168 "limitations_and_scope": { 169 "limitations_section_present": { 170 "applies": true, 171 "answer": false, 172 "justification": "There is no dedicated limitations or threats-to-validity section. The Impact Statement is a single paragraph that states 'There are many potential societal consequences of our work, none of which we feel must be specifically highlighted here' which does not address methodological limitations." 173 }, 174 "threats_to_validity_specific": { 175 "applies": true, 176 "answer": false, 177 "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as: the single-chain evolutionary experiments, prompt sensitivity, API non-determinism effects beyond the limited stability analysis, or whether results generalize beyond these specific tasks." 
178 }, 179 "scope_boundaries_stated": { 180 "applies": true, 181 "answer": false, 182 "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges difficulty in SWE-Perf and Chess but frames these as agent limitations rather than scope boundaries of the evaluation framework. No explicit statements about what populations, settings, or claims are excluded." 183 } 184 }, 185 "data_integrity": { 186 "raw_data_available": { 187 "applies": true, 188 "answer": false, 189 "justification": "The raw tournament logs, match-level data, and full scoring matrices are not made available. Only aggregated results (Sbase, Sevo, G scores) are reported in the paper. Independent verification of the underlying data is not possible." 190 }, 191 "data_collection_described": { 192 "applies": true, 193 "answer": true, 194 "justification": "The data collection procedure is well described: tournament configurations in Table 6, scoring rules in Table 5, repetition protocols in Appendix B, and the full evaluation protocol in Section 4.1. The process from code submission to score computation is documented." 195 }, 196 "recruitment_methods_described": { 197 "applies": false, 198 "answer": false, 199 "justification": "No human participants are involved. The evaluated agents are selected models and commercial tools, which are standard benchmarks rather than recruited participants." 200 }, 201 "data_pipeline_documented": { 202 "applies": true, 203 "answer": true, 204 "justification": "The full pipeline is documented: initial development → execution feedback → iterative evolution (Section 3.1), with scoring rules (Table 5), tournament formats (Table 6), and metric calculation (Section 3.3 and Appendix A). The transformation from raw match results to Sbase and Sevo is formalized." 
205 } 206 }, 207 "conflicts_of_interest": { 208 "funding_disclosed": { 209 "applies": true, 210 "answer": false, 211 "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section listing grants, corporate sponsors, or funding agencies." 212 }, 213 "affiliations_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Author affiliations are listed: Shanghai Jiao Tong University, Tsinghua University, and Meituan. The Meituan affiliation (a major Chinese technology company) is disclosed." 217 }, 218 "funder_independent_of_outcome": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding source is disclosed, so independence cannot be assessed. Authors from Meituan (a tech company) are involved, and the paper evaluates multiple commercial LLM products, but there is no statement about whether any evaluated companies funded the research." 222 }, 223 "financial_interests_declared": { 224 "applies": true, 225 "answer": false, 226 "justification": "No competing interests or financial interests statement is provided. Some authors are affiliated with Meituan, a technology company, and the paper ranks third-party commercial LLM products (including Doubao-Seed, which is a ByteDance model rather than a Meituan one); any interests arising from that industry affiliation are not declared." 227 } 228 }, 229 "contamination": { 230 "training_cutoff_stated": { 231 "applies": true, 232 "answer": false, 233 "justification": "No training data cutoff dates are stated for any of the evaluated models. This is relevant because competitive game strategies and OIBench problems may exist in training data." 234 }, 235 "train_test_overlap_discussed": { 236 "applies": true, 237 "answer": true, 238 "justification": "Section 4.5 directly addresses contamination through variant tasks: 'To verify that CATArena evaluates genuine adaptive capabilities rather than the retrieval of memorized solutions, we conducted experiments on unseen variants... 
that invalidate standard strategies present in the agents' pre-training data.'" 239 }, 240 "benchmark_contamination_addressed": { 241 "applies": true, 242 "answer": true, 243 "justification": "The variant task experiments (Section 4.5, Table 11) are explicitly designed to address benchmark contamination by introducing rule perturbations that invalidate memorized strategies. The paper states results confirm evaluation is 'independent of data contamination or specific task familiarity.'" 244 } 245 }, 246 "human_studies": { 247 "pre_registered": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants are involved in this study." 251 }, 252 "irb_or_ethics_approval": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants are involved in this study." 256 }, 257 "demographics_reported": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants are involved in this study." 261 }, 262 "inclusion_exclusion_criteria": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants are involved in this study." 266 }, 267 "randomization_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants are involved in this study." 271 }, 272 "blinding_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants are involved in this study." 276 }, 277 "attrition_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants are involved in this study." 281 } 282 }, 283 "cost_and_practicality": { 284 "inference_cost_reported": { 285 "applies": true, 286 "answer": true, 287 "justification": "Appendix N (Tables 14-17) provides detailed cost statistics including total input tokens, output tokens, total time in seconds, tools used, valid lines of code, and average thinking time for each agent across all tasks and rounds." 
288 }, 289 "compute_budget_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "While per-agent token costs and execution times are reported (Appendix N), the total computational budget (total API spend, total GPU hours for tournament execution, total cost of the study) is not stated." 293 } 294 } 295 }, 296 "claims": [ 297 { 298 "claim": "Evolutionary capability (Sevo) is not strictly correlated with static base performance (Sbase) — agents with high initial pass rates do not necessarily possess strong evolving abilities.", 299 "evidence": "Table 2 shows Claude-4 has Sbase=0.90 in Chess but Sevo=-0.061, while Gemini-2.5-Pro has Sbase=0.25 in Gomoku but Sevo=+0.156. The paper states 'We observe a weak correlation between Sbase and Sevo across all tasks' (Section 4.2).", 300 "supported": "moderate" 301 }, 302 { 303 "claim": "Current agents struggle to concurrently leverage both peer-learning and self-reflection for effective performance gains.", 304 "evidence": "Table 4 ablation study shows most models rely predominantly on a single mechanism. Only Claude and Qwen demonstrate effective peer-learning in specific tasks. The paper identifies failure patterns: 'agents typically exhibit a dependency on a single source' (Section 4.4).", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "Commercial code agent frameworks primarily enhance static generation capability rather than evolutionary ability.", 309 "evidence": "Table 2 shows commercial agents frequently outperform Best ADK Agent in Sbase but do not exhibit decisive advantage in Sevo. For example, Claude Code has Sbase=0.78 but Sevo=-0.035 in Gomoku (Section 4.2).", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "CATArena is robust to task variants and maintains discriminative power for evolutionary capability evaluation.", 314 "evidence": "Table 11 shows variant task results where Sevo maintains discriminative power despite altered baselines. 
Section 4.5 states 'the evolutionary metric retains its effectiveness in distinguishing agent capabilities.'", 315 "supported": "moderate" 316 }, 317 { 318 "claim": "Competitive tasks foster population convergence while objective tasks drive divergence.", 319 "evidence": "Table 3 shows DISstd is negative for most competitive tasks (Gomoku -0.05, Hold'em -0.81, Bridge -0.82) and positive for objective tasks (OIBench +0.42, SWE-Perf +0.20), with Chess as an exception (+0.55).", 320 "supported": "moderate" 321 } 322 ], 323 "methodology_tags": [ 324 "benchmark-eval" 325 ], 326 "key_findings": "CATArena introduces a tournament-based framework for evaluating code agents' evolutionary capabilities through iterative multi-turn development, using dual metrics (Sbase for static proficiency and Sevo for evolutionary potential). The framework reveals that evolutionary capability is largely orthogonal to single-turn generation performance, with agents like Gemini showing low-start/high-growth trajectories. Current agents predominantly rely on either self-reflection or peer-learning but struggle to synergize both mechanisms. The framework demonstrates robustness to variant tasks and extensibility to new domains, though complex environments like Chess and Pommerman remain challenging for all evaluated agents.", 327 "red_flags": [ 328 { 329 "flag": "Single-chain main experiments", 330 "detail": "The main evolutionary experiments (N=4 rounds) are conducted as a single chain due to computational cost. While a stability analysis is done on Rounds 1 and 2 (4 repetitions), the actual evolutionary trajectories and Sevo metrics are based on single runs, making it impossible to assess the reliability of the evolutionary conclusions." 331 }, 332 { 333 "flag": "No limitations section", 334 "detail": "The paper lacks any dedicated limitations or threats-to-validity section. 
For a paper proposing a new evaluation framework that makes broad claims about 'evolutionary capability as a distinct dimension of intelligence,' the absence of discussed limitations is a significant omission." 335 }, 336 { 337 "flag": "Undeclared potential conflict of interest (Meituan affiliation)", 338 "detail": "Three authors are affiliated with Meituan, a technology company, while the paper ranks commercial LLM products from other vendors (the evaluated Doubao-Seed model is developed by ByteDance, not Meituan). The affiliation itself is listed, but any potential conflict of interest is not acknowledged, and no competing interests statement is provided." 339 }, 340 { 341 "flag": "No significance testing for comparative claims", 342 "detail": "Claims about weak correlation between Sbase and Sevo, and about commercial agents not having evolutionary advantages, are stated based on visual inspection of point estimates in Table 2 without any correlation coefficients, confidence intervals, or significance tests across the full set of results." 343 }, 344 { 345 "flag": "Broad claims from narrow evaluation", 346 "detail": "The paper evaluates 6 minimal agents and 5 commercial agents across 6 tasks (primarily games and algorithm optimization) but makes general claims about 'evolutionary capability' as a dimension of intelligence. The task diversity is limited to games and code optimization, and generalizing to broader software engineering evolution is not justified." 347 } 348 ], 349 "cited_papers": [ 350 { 351 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 352 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 353 "year": 2024, 354 "arxiv_id": "2405.15793", 355 "relevance": "Core baseline for evaluating code agents on software engineering benchmarks." 356 }, 357 { 358 "title": "Evaluating Large Language Models Trained on Code", 359 "authors": ["Mark Chen"], 360 "year": 2021, 361 "arxiv_id": "2107.03374", 362 "relevance": "Introduced HumanEval and Pass@k metric that CATArena aims to extend beyond." 
363 }, 364 { 365 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 366 "authors": ["Aman Madaan"], 367 "year": 2023, 368 "relevance": "Key prior work on self-reflection mechanisms for LLM output refinement." 369 }, 370 { 371 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 372 "authors": ["Noah Shinn"], 373 "year": 2023, 374 "relevance": "Foundational work on iterative self-correction in LLM agents." 375 }, 376 { 377 "title": "GameBench: Evaluating Strategic Reasoning Abilities of LLM Agents", 378 "authors": ["Anthony Costarelli"], 379 "year": 2024, 380 "arxiv_id": "2406.06613", 381 "relevance": "Prior benchmark for evaluating LLM agents in game-playing scenarios." 382 }, 383 { 384 "title": "OIBench: Benchmarking Strong Reasoning Models with Olympiad in Informatics", 385 "authors": ["Yaoming Zhu"], 386 "year": 2025, 387 "arxiv_id": "2506.10481", 388 "relevance": "One of the two objective optimization tasks used within CATArena framework." 389 }, 390 { 391 "title": "SWE-Perf: Can Language Models Optimize Code Performance on Real-World Repositories?", 392 "authors": ["Xin He"], 393 "year": 2025, 394 "arxiv_id": "2507.12415", 395 "relevance": "Second objective optimization task integrated into CATArena for repository-level performance optimization." 396 }, 397 { 398 "title": "RedCode: Risky Code Execution and Generation Benchmark for Code Agents", 399 "authors": ["Cheng Guo"], 400 "year": 2024, 401 "arxiv_id": "2411.07781", 402 "relevance": "Specialized code security evaluation benchmark for code agents." 403 }, 404 { 405 "title": "A Survey on Code Generation with LLM-based Agents", 406 "authors": ["Yifan Dong"], 407 "year": 2025, 408 "arxiv_id": "2508.00083", 409 "relevance": "Survey covering LLM-based code agent capabilities and evaluation." 
410 }, 411 { 412 "title": "SWE-Compass: Towards Unified Evaluation of Agentic Coding Abilities for Large Language Models", 413 "authors": ["Jianghao Xu"], 414 "year": 2025, 415 "arxiv_id": "2511.05459", 416 "relevance": "Unified framework for evaluating agentic coding capabilities." 417 }, 418 { 419 "title": "Evolutionary Perspectives on the Evaluation of LLM-based AI Agents: A Comprehensive Survey", 420 "authors": ["Jiahao Zhu"], 421 "year": 2025, 422 "arxiv_id": "2506.11102", 423 "relevance": "Survey on evolutionary evaluation perspectives for LLM agents." 424 }, 425 { 426 "title": "SWE-PolyBench: A Multi-Language Benchmark for Repository Level Evaluation of Coding Agents", 427 "authors": ["Md Saidur Rashid"], 428 "year": 2025, 429 "arxiv_id": "2504.08703", 430 "relevance": "Multi-language benchmark for repository-level code agent evaluation." 431 } 432 ] 433 }