scan.json (27855B)
1 { 2 "paper": { 3 "title": "Evolving Excellence: Automated Optimization of LLM-based Agents", 4 "authors": [ 5 "Paul Brookes", 6 "Vardan Voskanyan", 7 "Rafail Giavrimis", 8 "Matthew Truscott", 9 "Mina Ilieva", 10 "Chrystalla Pavlou", 11 "Alexandru Staicu", 12 "Manal Adham", 13 "Will Evers-Hood", 14 "Jingzhi Gong", 15 "Kejia Zhang", 16 "Matvey Fedoseev", 17 "Vishal Sharma", 18 "Roman Bauer", 19 "Zheng Wang", 20 "Hema Nair", 21 "Wei Jie", 22 "Tianhua Xu", 23 "Aurora Constantin", 24 "Carmine Ventre", 25 "Leslie Kanthan", 26 "Michail Basios" 27 ], 28 "year": 2025, 29 "venue": "arXiv", 30 "arxiv_id": "2512.09108", 31 "doi": "10.48550/arXiv.2512.09108" 32 }, 33 "scan_version": 2, 34 "active_modules": ["experimental_rigor", "data_leakage"], 35 "methodology_tags": ["benchmark-eval"], 36 "key_findings": "Artemis, an evolutionary optimization platform, achieves 13.6% improvement on competitive programming (ALE Agent), 10.1% on code optimization (Mini-SWE), 36.9% token cost reduction on math reasoning (CrewAI), and 22% accuracy improvement on primary math (MathTales-Teacher with Qwen2.5-7B). Results suggest automated agent configuration tuning is most effective for under-optimized agents with clear metrics, while already well-tuned systems show limited accuracy gains but may benefit from cost optimization.", 37 "checklist": { 38 "artifacts": { 39 "code_released": { 40 "applies": true, 41 "answer": false, 42 "justification": "Section 7 states 'we are going to open source the code for all four case study agents' as future intent. No repository URL is provided. The Artemis platform itself is proprietary." 43 }, 44 "data_released": { 45 "applies": true, 46 "answer": true, 47 "justification": "The benchmarks used (AtCoder Heuristic Contest, SWE-Perf, Math Odyssey, GSM8K) are publicly available. The paper references public benchmark sources." 
48 }, 49 "environment_specified": { 50 "applies": true, 51 "answer": false, 52 "justification": "No environment specifications, requirements files, or dependency details are provided. The paper does not describe hardware or software environment beyond mentioning Claude 3.5 Sonnet and Qwen2.5-7B as base models." 53 }, 54 "reproduction_instructions": { 55 "applies": true, 56 "answer": false, 57 "justification": "No step-by-step reproduction instructions are provided. Section 7 acknowledges 'the complete Artemis platform setup cannot be shared.'" 58 } 59 }, 60 "statistical_methodology": { 61 "confidence_intervals_or_error_bars": { 62 "applies": true, 63 "answer": true, 64 "justification": "Section 6.1 reports 95% confidence intervals for ALE Agent results: baseline 0.660 (95% CI: [0.594, 0.726]), prompt optimized 0.750 (95% CI: [0.689, 0.811])." 65 }, 66 "significance_tests": { 67 "applies": true, 68 "answer": true, 69 "justification": "Multiple significance tests are reported: Mann-Whitney U test for Mini-SWE (p < 0.005), p-values for ALE (p = 0.10), CrewAI accuracy (p = 0.478), CrewAI tokens (p < 10^-6), MathTales (p < 0.001)." 70 }, 71 "effect_sizes_reported": { 72 "applies": true, 73 "answer": true, 74 "justification": "Percentage improvements with baseline context are reported throughout: 13.6% relative improvement from 66.0% to 75.0% (ALE), 10.1% gain (Mini-SWE), 36.9% token reduction (CrewAI), 22 percentage-point accuracy improvement from 0.59 to 0.81, reported as 22% (MathTales)." 75 }, 76 "sample_size_justified": { 77 "applies": true, 78 "answer": false, 79 "justification": "No justification is given for the sample sizes used. The ALE benchmark has 40 problems, CrewAI uses stratified samples of 30 problems, MathTales uses 50 validation and 300 evaluation problems, but no power analysis or justification for these sizes is provided."
80 }, 81 "variance_reported": { 82 "applies": true, 83 "answer": false, 84 "justification": "While confidence intervals are provided for ALE, no standard deviations or variance across experimental runs are reported. MathTales reports averages over 3 runs but without spread measures. The paper does not report variance across optimization runs or seeds." 85 } 86 }, 87 "evaluation_design": { 88 "baselines_included": { 89 "applies": true, 90 "answer": true, 91 "justification": "Each agent system is compared against its unoptimized baseline configuration. Table 3 summarizes baseline vs. optimized results for all four agents." 92 }, 93 "baselines_contemporary": { 94 "applies": true, 95 "answer": false, 96 "justification": "The only baselines are the agents' own unoptimized configurations. No comparison against other optimization methods (DSPy, ADAS, AFlow, PromptBreeder) is included despite Table 1 listing them as related work." 97 }, 98 "ablation_study": { 99 "applies": true, 100 "answer": false, 101 "justification": "No ablation study is conducted to determine which components of Artemis (semantic mutations, crossover, hierarchical evaluation, Bayesian optimization) contribute to the improvements. The ALE Agent compares prompt vs. search optimization strategies, but this is not a component ablation." 102 }, 103 "multiple_metrics": { 104 "applies": true, 105 "answer": true, 106 "justification": "Multiple metrics are reported: ALE uses acceptance rate, average performance, and average rank. Mini-SWE reports apply rate, correctness, and performance score. CrewAI reports accuracy and token cost. MathTales reports accuracy and completeness." 107 }, 108 "human_evaluation": { 109 "applies": true, 110 "answer": false, 111 "justification": "No human evaluation of the optimized configurations or outputs is included. All evaluation is automated through benchmarks." 
112 }, 113 "held_out_test_set": { 114 "applies": true, 115 "answer": true, 116 "justification": "MathTales uses a separate validation set (50 problems) for optimization and a larger evaluation set (300 problems) for final testing. CrewAI tests both 30×10 and 50×6 configurations against the full 387-problem corpus." 117 }, 118 "per_category_breakdown": { 119 "applies": true, 120 "answer": true, 121 "justification": "Mini-SWE provides per-project breakdowns (Figure 8) showing performance across 9 Python projects. CrewAI provides per-run breakdowns." 122 }, 123 "failure_cases_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 6.6 discusses when optimization does not work. Section 7 notes Mini-SWE had -0.1% for pylint, and CrewAI showed a slight accuracy decrease. The paper discusses conditions under which optimization provides limited benefit." 127 }, 128 "negative_results_reported": { 129 "applies": true, 130 "answer": true, 131 "justification": "CrewAI showed a non-significant 3.7% accuracy decrease (p = 0.478). ALE results did not reach statistical significance (p = 0.10). Mini-SWE showed minimal/negative gains for some projects (pylint -0.1%)." 132 } 133 }, 134 "claims_and_evidence": { 135 "abstract_claims_supported": { 136 "applies": true, 137 "answer": true, 138 "justification": "Abstract claims of 13.6% ALE improvement, 10.1% Mini-SWE gain, 36.9% CrewAI cost reduction, and 22% MathTales accuracy improvement are all supported by results in Section 6 and Table 3." 139 }, 140 "causal_claims_justified": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper claims Artemis 'delivers substantial improvements' and optimization 'can substantially enhance under-optimized systems,' but the study design does not control for confounds. Improvements could be due to simply trying more configurations rather than Artemis's specific evolutionary approach. No comparison with random search or other optimization baselines." 
144 }, 145 "generalization_bounded": { 146 "applies": true, 147 "answer": false, 148 "justification": "The title claims 'Automated Optimization of LLM-based Agents' broadly, but results are on only 4 specific agents. Section 7 discusses limitations but the abstract and introduction present results as generalizable ('making sophisticated optimization accessible to practitioners')." 149 }, 150 "alternative_explanations_discussed": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper does not discuss whether the improvements could be achieved by simpler methods (random search, manual prompt engineering by a domain expert). No consideration of whether the evolutionary approach specifically contributes versus simply trying many configurations." 154 }, 155 "proxy_outcome_distinction": { 156 "applies": true, 157 "answer": true, 158 "justification": "The paper measures specific metrics (acceptance rate, performance score, token cost, accuracy) and frames claims at the same granularity. It does not inflate benchmark scores into broader claims about 'agent intelligence' or similar." 159 } 160 }, 161 "setup_transparency": { 162 "model_versions_specified": { 163 "applies": true, 164 "answer": false, 165 "justification": "Mini-SWE states 'Claude 3.5 Sonnet' without a snapshot date or API version. MathTales uses 'Qwen2.5-7B.' ALE Agent and CrewAI do not specify which LLM they use. The LLMs used in Artemis's mutation/crossover operators are not specified." 166 }, 167 "prompts_provided": { 168 "applies": true, 169 "answer": true, 170 "justification": "Before/after prompt text is provided for all four agents in Figures 5, 7, 13, and 14, showing the actual prompt content used in experiments." 171 }, 172 "hyperparameters_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "No temperature, top-p, or other LLM sampling parameters are reported. 
Evolutionary algorithm parameters (population size, generations) are mentioned only for MathTales (2 generations, population size 3). Other agents lack these details." 176 }, 177 "scaffolding_described": { 178 "applies": true, 179 "answer": true, 180 "justification": "The Artemis platform's workflow is described in Section 4 with three stages (project setup, component discovery, optimization strategies). The hierarchical evaluation strategy is described. Agent pipelines are described at a high level for each case study." 181 }, 182 "data_preprocessing_documented": { 183 "applies": true, 184 "answer": false, 185 "justification": "For CrewAI, the stratified sampling method for selecting 30 problems from 387 is mentioned but not described in detail. No documentation of how benchmark data was prepared or filtered for any agent." 186 } 187 }, 188 "limitations_and_scope": { 189 "limitations_section_present": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 8 'Conclusion, Limitations, and Future Work' includes a dedicated paragraph discussing limitations including optimization effectiveness varying with initial configuration quality, generalizability concerns, and computational costs." 193 }, 194 "threats_to_validity_specific": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 8 identifies specific threats: optimization effectiveness varies with initial configuration quality, ALE did not reach statistical significance, CrewAI showed accuracy decrease, and overfitting to benchmarks is acknowledged as a risk." 198 }, 199 "scope_boundaries_stated": { 200 "applies": true, 201 "answer": false, 202 "justification": "While the paper discusses that well-tuned agents may not benefit, it does not explicitly state what types of agents, tasks, or domains the results do NOT apply to. The broad framing of the title and introduction exceeds the tested scope." 
203 } 204 }, 205 "data_integrity": { 206 "raw_data_available": { 207 "applies": true, 208 "answer": false, 209 "justification": "No raw experimental data (per-problem results, full optimization logs, individual run outputs) is made available." 210 }, 211 "data_collection_described": { 212 "applies": true, 213 "answer": true, 214 "justification": "Each experiment describes the benchmark source, number of problems, and evaluation procedure. Section 5 details the experimental setup for all four agents." 215 }, 216 "recruitment_methods_described": { 217 "applies": false, 218 "answer": false, 219 "justification": "No human participants. All evaluations use automated benchmarks." 220 }, 221 "data_pipeline_documented": { 222 "applies": true, 223 "answer": false, 224 "justification": "The pipeline from optimization runs to final reported results is not documented. For CrewAI, it's unclear how the 30-problem stratified samples were drawn. The relationship between optimization (validation) and final evaluation runs is not fully described." 225 } 226 }, 227 "conflicts_of_interest": { 228 "funding_disclosed": { 229 "applies": true, 230 "answer": true, 231 "justification": "Acknowledgment section states: 'This work was supported by EU Horizon 2020 Grant 101008280 (DIOR).'" 232 }, 233 "affiliations_disclosed": { 234 "applies": true, 235 "answer": true, 236 "justification": "Author affiliations are listed. Most authors are affiliated with TurinTech AI, the company that developed Artemis." 237 }, 238 "funder_independent_of_outcome": { 239 "applies": true, 240 "answer": false, 241 "justification": "TurinTech AI, the company that developed and commercially offers Artemis, employs most of the authors. The EU grant supports the project but the primary conflict is that the authors are evaluating their own commercial product." 
242 }, 243 "financial_interests_declared": { 244 "applies": true, 245 "answer": false, 246 "justification": "No competing interests or financial interests statement is present. Multiple authors work for TurinTech AI which commercially sells Artemis, but this conflict is not explicitly declared." 247 } 248 }, 249 "contamination": { 250 "training_cutoff_stated": { 251 "applies": true, 252 "answer": false, 253 "justification": "No training cutoff dates are stated for Claude 3.5 Sonnet or Qwen2.5-7B. The LLMs used in Artemis's operators are not specified at all." 254 }, 255 "train_test_overlap_discussed": { 256 "applies": true, 257 "answer": false, 258 "justification": "No discussion of whether the benchmark problems (GSM8K published 2021, AtCoder problems, SWE-Perf) could have appeared in the training data of the LLMs used." 259 }, 260 "benchmark_contamination_addressed": { 261 "applies": true, 262 "answer": false, 263 "justification": "GSM8K (2021) is widely known to be contaminated in many models. AtCoder problems are public. No contamination analysis is conducted for any benchmark." 264 } 265 }, 266 "human_studies": { 267 "pre_registered": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "irb_or_ethics_approval": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "demographics_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "inclusion_exclusion_criteria": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 }, 287 "randomization_described": { 288 "applies": false, 289 "answer": false, 290 "justification": "No human participants in this study." 291 }, 292 "blinding_described": { 293 "applies": false, 294 "answer": false, 295 "justification": "No human participants in this study." 
296 }, 297 "attrition_reported": { 298 "applies": false, 299 "answer": false, 300 "justification": "No human participants in this study." 301 } 302 }, 303 "cost_and_practicality": { 304 "inference_cost_reported": { 305 "applies": true, 306 "answer": true, 307 "justification": "Section 5.1 reports $24-26 per evaluation run for ALE, Section 5.2 reports $30-60 for Mini-SWE. Total optimization time is reported: 411.2 hours for ALE prompt optimization, 260.5 hours for search, 9 hours for Mini-SWE." 308 }, 309 "compute_budget_stated": { 310 "applies": true, 311 "answer": true, 312 "justification": "Total optimization hours are reported: 671.7 hours for ALE Agent, 9 hours for Mini-SWE. Section 5.2 mentions '20-30 hours evaluation time for a full benchmarking.'" 313 } 314 }, 315 "experimental_rigor": { 316 "seed_sensitivity_reported": { 317 "applies": true, 318 "answer": false, 319 "justification": "No seed sensitivity analysis is reported. Results are not shown across multiple random seeds for any agent." 320 }, 321 "number_of_runs_stated": { 322 "applies": true, 323 "answer": true, 324 "justification": "CrewAI states 12 evaluation runs and tests 30×10 and 50×6 configurations. MathTales reports 3 evaluation runs. ALE evaluates on all 40 problems per run." 325 }, 326 "hyperparameter_search_budget": { 327 "applies": true, 328 "answer": false, 329 "justification": "Only MathTales states its evolutionary parameters (2 generations, population size 3). Other agents do not specify the number of configurations explored, generations, or total evolutionary search budget." 330 }, 331 "best_config_selection_justified": { 332 "applies": true, 333 "answer": true, 334 "justification": "MathTales explicitly selects the best configuration from validation set performance and evaluates on a separate 300-problem evaluation set. CrewAI tests generalization with stratified sampling." 
335 }, 336 "multiple_comparison_correction": { 337 "applies": true, 338 "answer": false, 339 "justification": "Multiple statistical tests are performed across 4 agents and multiple metrics, but no multiple comparison correction (Bonferroni, etc.) is applied." 340 }, 341 "self_comparison_bias_addressed": { 342 "applies": true, 343 "answer": false, 344 "justification": "The authors evaluate their own commercial product (Artemis) against unoptimized baselines without acknowledging self-evaluation bias. No independent evaluation is conducted." 345 }, 346 "compute_budget_vs_performance": { 347 "applies": true, 348 "answer": false, 349 "justification": "No performance curves as a function of compute budget are shown. It is unclear how performance scales with optimization time. ALE required 671.7 hours vs Mini-SWE's 9 hours, but no analysis of diminishing returns." 350 }, 351 "benchmark_construct_validity": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the benchmarks (AtCoder, SWE-Perf, Math Odyssey, GSM8K) validly measure the capabilities the paper claims to optimize. GSM8K in particular is known to have saturation and contamination issues." 355 }, 356 "scaffold_confound_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "The paper varies agent configurations through Artemis while keeping the scaffold constant per agent, but does not discuss whether scaffold choice confounds results. Different agents use different scaffolds, making cross-agent comparisons unreliable." 360 } 361 }, 362 "data_leakage": { 363 "temporal_leakage_addressed": { 364 "applies": true, 365 "answer": false, 366 "justification": "No discussion of temporal leakage. GSM8K (2021) predates all models used. AtCoder problems are public. No analysis of whether models have seen benchmark solutions." 
367 }, 368 "feature_leakage_addressed": { 369 "applies": true, 370 "answer": false, 371 "justification": "No discussion of whether the evaluation setup leaks information. For example, whether optimized prompts effectively encode benchmark-specific patterns rather than general strategies." 372 }, 373 "non_independence_addressed": { 374 "applies": true, 375 "answer": false, 376 "justification": "No discussion of whether validation and evaluation sets share structural similarities that could inflate generalization claims." 377 }, 378 "leakage_detection_method": { 379 "applies": true, 380 "answer": false, 381 "justification": "No concrete leakage detection or prevention method is applied to any benchmark." 382 } 383 } 384 }, 385 "claims": [ 386 { 387 "claim": "Artemis achieves 13.6% improvement in acceptance rate for the ALE Agent on AtCoder Heuristic Contest through prompt optimization.", 388 "evidence": "Section 6.1, Table 3: acceptance rate rose from 0.660 (95% CI: [0.594, 0.726]) to 0.750 (95% CI: [0.689, 0.811]). p = 0.10.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "Mini-SWE Agent shows a statistically significant 10.1% performance improvement on SWE-Perf.", 393 "evidence": "Section 6.2, Table 3: performance score improved from 0.891 to 0.981, p < 0.005 via Mann-Whitney U test. Apply rate and correctness maintained at 92.1% and 87.9%.", 394 "supported": "strong" 395 }, 396 { 397 "claim": "CrewAI Agent achieves 36.9% reduction in token cost with only a non-significant 3.7% accuracy decrease.", 398 "evidence": "Section 6.3, Table 3: tokens reduced from 12033 to 7329 (p < 10^-6), accuracy dropped from 0.82 to 0.78 (p = 0.478).", 399 "supported": "strong" 400 }, 401 { 402 "claim": "MathTales-Teacher Agent achieves 22% accuracy improvement on GSM8K primary-level mathematics.", 403 "evidence": "Section 6.4, Table 3: accuracy improved from 0.59 to 0.81 (p < 0.001), completeness from 0.796 to 0.917 (p < 0.001). 
Results evaluated on 300 problems across 3 runs.", 404 "supported": "strong" 405 }, 406 { 407 "claim": "Artemis is a general-purpose framework that works across agent architectures and task types.", 408 "evidence": "Results on 4 agents spanning competitive programming, code optimization, and math reasoning. However, only 4 agents tested, all by the same team, and the platform is proprietary.", 409 "supported": "weak" 410 } 411 ], 412 "red_flags": [ 413 { 414 "flag": "Company evaluating its own product", 415 "detail": "Most authors are affiliated with TurinTech AI, which commercially develops and sells the Artemis platform. The paper evaluates Artemis without independent validation or comparison against competing optimization methods. No competing interests statement is included." 416 }, 417 { 418 "flag": "No comparison with alternative optimization methods", 419 "detail": "Despite Table 1 listing APE, PromptBreeder, ADAS, AFlow, GEPA, and ShinkaEvolve as related work, no empirical comparison against any of these methods is conducted. The only baseline is the unoptimized agent configuration, making it impossible to assess whether Artemis's evolutionary approach adds value over simpler alternatives like random search." 420 }, 421 { 422 "flag": "Benchmark contamination risk unaddressed", 423 "detail": "GSM8K (published 2021) is widely known to be contaminated in many LLMs. AtCoder problems are publicly available. No contamination analysis is performed, which is especially concerning because optimized prompts could be encoding benchmark-specific patterns." 424 }, 425 { 426 "flag": "Proprietary platform limits reproducibility", 427 "detail": "The Artemis platform is proprietary and 'the complete Artemis platform setup cannot be shared' (Section 7). Agent code is promised for future release but not yet available. Results cannot be independently verified." 
428 }, 429 { 430 "flag": "Statistical significance not reached for headline result", 431 "detail": "The ALE Agent's 13.6% relative improvement in acceptance rate (0.660 to 0.750, featured prominently in the abstract) did not reach statistical significance (p = 0.10). This is acknowledged but the result is still presented as a headline finding." 432 } 433 ], 434 "cited_papers": [ 435 { 436 "title": "Large language models are human-level prompt engineers", 437 "authors": ["Yongchao Zhou", "Andrei Ioan Muresanu", "Ziwen Han"], 438 "year": 2023, 439 "arxiv_id": "2211.01910", 440 "relevance": "APE framework for automated prompt optimization, baseline approach for LLM-based prompt engineering." 441 }, 442 { 443 "title": "PromptBreeder: Self-referential self-improvement via prompt evolution", 444 "authors": ["Chrisantha Fernando"], 445 "year": 2023, 446 "arxiv_id": "2309.16797", 447 "relevance": "Evolutionary algorithms applied to prompt optimization, direct predecessor approach." 448 }, 449 { 450 "title": "SWE-bench: Can language models resolve real-world github issues?", 451 "authors": ["Carlos E Jimenez", "John Yang"], 452 "year": 2024, 453 "relevance": "Major benchmark for evaluating code generation agents on real-world software engineering tasks." 454 }, 455 { 456 "title": "Automated design of agentic systems", 457 "authors": ["Shengran Hu", "Cong Lu", "Jeff Clune"], 458 "year": 2024, 459 "arxiv_id": "2408.08435", 460 "relevance": "ADAS framework for code-based agent workflow optimization." 461 }, 462 { 463 "title": "AFlow: Automating agentic workflow generation", 464 "authors": ["Jiayi Chen"], 465 "year": 2024, 466 "arxiv_id": "2410.10762", 467 "relevance": "MCTS-based approach to automating agentic workflow design."
468 }, 469 { 470 "title": "Code generation with AlphaCodium: From prompt engineering to flow engineering", 471 "authors": ["Tal Ridnik", "Dedy Kredo", "Itamar Friedman"], 472 "year": 2024, 473 "arxiv_id": "2401.08500", 474 "relevance": "Flow engineering approach to code generation, demonstrating workflow-level optimization over prompt-only methods." 475 }, 476 { 477 "title": "Why do multi-agent LLM systems fail? MAST: Multi-agent system failure taxonomy", 478 "authors": ["Weize Chen"], 479 "year": 2025, 480 "arxiv_id": "2503.13657", 481 "relevance": "Taxonomy of failure modes in multi-agent LLM systems, motivating need for systematic agent optimization." 482 }, 483 { 484 "title": "ReAct: Synergizing reasoning and acting in language models", 485 "authors": ["Shunyu Yao"], 486 "year": 2022, 487 "arxiv_id": "2210.03629", 488 "relevance": "Foundational agent reasoning framework combining reasoning traces with actions." 489 }, 490 { 491 "title": "Reflexion: Language agents with verbal reinforcement learning", 492 "authors": ["Noah Shinn"], 493 "year": 2023, 494 "relevance": "Memory-based self-feedback framework for LLM agents without weight updates." 495 }, 496 { 497 "title": "SWE-Perf: can language models optimize code performance on real-world repositories?", 498 "authors": ["Xinyi He"], 499 "year": 2025, 500 "relevance": "Benchmark used in this paper for evaluating code performance optimization agents." 501 }, 502 { 503 "title": "AlphaEvolve: A coding agent for scientific and algorithmic discovery", 504 "authors": ["Alexander Novikov"], 505 "year": 2025, 506 "arxiv_id": "2506.13131", 507 "relevance": "Closed-loop LLM generation and verification pipeline for algorithmic improvement using evolutionary methods."
508 }, 509 { 510 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 511 "authors": ["John Yang", "Carlos E Jimenez"], 512 "year": 2024, 513 "relevance": "Agent framework for autonomous software engineering tasks, context for agent evaluation methods." 514 } 515 ] 516 }