scan.json (28790B)
1 { 2 "paper": { 3 "title": "Let the Barbarians In: How AI Can Accelerate Systems Performance Research", 4 "authors": [ 5 "Audrey Cheng", 6 "Shu Liu", 7 "Melissa Pan", 8 "Zhifei Li", 9 "Shubham Agarwal", 10 "Mert Cemri", 11 "Bowen Wang", 12 "Alexander Krentsel", 13 "Tian Xia", 14 "Jongseok Park", 15 "Shuo Yang", 16 "Jeff Chen", 17 "Lakshya Agrawal", 18 "Ashwin Naren", 19 "Shulu Li", 20 "Ruiying Ma", 21 "Aditya Desai", 22 "Jiarong Xing", 23 "Koushik Sen", 24 "Matei Zaharia", 25 "Ion Stoica" 26 ], 27 "year": 2025, 28 "venue": "arXiv", 29 "arxiv_id": "2512.14806", 30 "doi": "10.48550/arXiv.2512.14806" 31 }, 32 "scan_version": 2, 33 "active_modules": ["experimental_rigor", "data_leakage"], 34 "methodology_tags": ["benchmark-eval", "case-study"], 35 "key_findings": "Three open-source ADRS frameworks (OpenEvolve, GEPA, ShinkaEvolve) can generate algorithms that match or exceed human state-of-the-art for systems performance problems across ten case studies, including 13x faster MoE load balancing and 35% greater spot instance savings. The paper identifies best practices along three axes (specification, evaluation, feedback) and finds that LLMs can discover cross-domain techniques (e.g., Hamilton's Apportionment from political science for GPU load balancing). OpenEvolve showed the most consistent performance across models, while GEPA and ShinkaEvolve showed stronger model-specific preferences.", 36 "checklist": { 37 "artifacts": { 38 "code_released": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper references open-source ADRS frameworks (OpenEvolve, GEPA, ShinkaEvolve) with GitHub links, but does not release the code for its own case studies, simulators, evaluation setups, or configuration files beyond what is shown in the appendix." 42 }, 43 "data_released": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper uses simulators and workload traces from prior work but does not release a unified dataset or artifact package. 
No download link for the evaluation data is provided." 47 }, 48 "environment_specified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions using specific models (GPT-5, Gemini-3.0-Pro) and frameworks but does not specify library versions or environment details." 52 }, 53 "reproduction_instructions": { 54 "applies": true, 55 "answer": false, 56 "justification": "No step-by-step reproduction instructions are provided. Configuration files are shown in Appendix D but without instructions on how to run the experiments end-to-end." 57 } 58 }, 59 "statistical_methodology": { 60 "confidence_intervals_or_error_bars": { 61 "applies": true, 62 "answer": false, 63 "justification": "Table 3 reports mean ± standard deviation but no confidence intervals or error bars. Standard deviation is reported but CIs are not." 64 }, 65 "significance_tests": { 66 "applies": true, 67 "answer": false, 68 "justification": "The paper claims frameworks outperform SOTA baselines but uses no statistical significance tests. Comparisons are based solely on mean values from three runs." 69 }, 70 "effect_sizes_reported": { 71 "applies": true, 72 "answer": true, 73 "justification": "Effect sizes are reported with baseline context throughout: '13× faster', '35% greater savings', '6% higher average cost savings', '17% cost savings', '3× runtime speedup', '60% better than greedy' — all with baseline reference points." 74 }, 75 "sample_size_justified": { 76 "applies": true, 77 "answer": false, 78 "justification": "Each experiment is repeated three times with no justification for why three runs is sufficient. No power analysis or discussion of whether three runs provides adequate statistical power." 79 }, 80 "variance_reported": { 81 "applies": true, 82 "answer": true, 83 "justification": "Table 3 reports mean ± standard deviation across three runs for each framework-model combination on all tasks." 
84 } 85 }, 86 "evaluation_design": { 87 "baselines_included": { 88 "applies": true, 89 "answer": true, 90 "justification": "Each case study includes baselines: SOTA human-designed solutions and initial/greedy programs. Table 2 lists SOTA baselines for all ten tasks. Table 3 includes Human SOTA row." 91 }, 92 "baselines_contemporary": { 93 "applies": true, 94 "answer": true, 95 "justification": "Baselines are from recent, top-venue publications: NSDI '24 (CBL), VLDB '24 (TXN), MLSys '25 (LLM-SQL), and state-of-the-art implementations from DeepSeek." 96 }, 97 "ablation_study": { 98 "applies": true, 99 "answer": true, 100 "justification": "Multiple ablation studies are conducted: seed diversity (Section 5.1, LLM-SQL), feedback granularity (Section 5.3, CBL with minimal/moderate/detailed), training set coverage (Appendix C.1), and hint refinement effects." 101 }, 102 "multiple_metrics": { 103 "applies": true, 104 "answer": true, 105 "justification": "Multiple metrics are used: performance score, runtime, cost, LOC. For EPLB: balance factor and rearrangement runtime. For LLM-SQL: prefix hit rate and algorithm runtime. Combined scores are also defined." 106 }, 107 "human_evaluation": { 108 "applies": true, 109 "answer": false, 110 "justification": "No human evaluation of the generated solutions' quality is reported. The authors mention they 'readily understood the generated solutions' (Section 2) but no systematic human evaluation was conducted." 111 }, 112 "held_out_test_set": { 113 "applies": true, 114 "answer": true, 115 "justification": "For CBL, 30% of traces are sampled for feedback during search, with final results reported on the full evaluation set (Section 4.1). Appendix C.1 discusses training/test split effects." 116 }, 117 "per_category_breakdown": { 118 "applies": true, 119 "answer": true, 120 "justification": "Table 3 provides per-task breakdowns across all ten tasks for all framework-model combinations. 
Per-workload results are shown in figures (e.g., Figure 8 for CBL)." 121 }, 122 "failure_cases_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Failure cases are discussed extensively: Cloudcast shows no improvement (Table 2), reward hacking examples (MAS, EPLB, Prism in Section 5.2), and Appendix C.3 categorizes failure patterns from 420 LLM-judged traces." 126 }, 127 "negative_results_reported": { 128 "applies": true, 129 "answer": true, 130 "justification": "Cloudcast failed to improve over baseline. TXN online setting couldn't surpass SMF. MAS v3 degraded performance when verification was removed. Section 6.1 discusses problems ADRS is not suited for." 131 } 132 }, 133 "claims_and_evidence": { 134 "abstract_claims_supported": { 135 "applies": true, 136 "answer": true, 137 "justification": "Abstract claims of '13× faster load balancing' and '35% greater savings' are supported by Table 2 and Table 3 results. The claim that ADRS-generated solutions 'can match or even outperform human state-of-the-art' is supported across multiple case studies." 138 }, 139 "causal_claims_justified": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper's causal claims are mostly about ablation effects (e.g., 'providing diverse seeds improves scores', 'moderate feedback outperforms minimal and detailed'). These are supported by controlled single-variable ablation experiments described in Section 5." 143 }, 144 "generalization_bounded": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 6.1 explicitly discusses which problems are best suited (isolated changes, reliable evaluations, efficient evaluations) and which are not. The paper states 'we do not yet have a universal recipe' in the abstract and limits claims to systems performance problems." 
148 }, 149 "alternative_explanations_discussed": { 150 "applies": true, 151 "answer": true, 152 "justification": "The paper discusses training data contamination as an alternative explanation for TXN online results (Section 4.5: 'likely due to training data contamination from the SMF paper'). Section 6 discusses limitations and alternative framings." 153 }, 154 "proxy_outcome_distinction": { 155 "applies": true, 156 "answer": true, 157 "justification": "The paper measures specific system metrics (cost savings, runtime, load balance factor, makespan) and claims improvements in those specific metrics rather than making broader proxy claims. The metrics directly measure what is claimed." 158 } 159 }, 160 "setup_transparency": { 161 "model_versions_specified": { 162 "applies": true, 163 "answer": false, 164 "justification": "The paper refers to 'GPT-5', 'Gemini-3.0-Pro-Preview', and 'Gemini-3.0' without snapshot dates or API versions. The ShinkaEvolve config lists 'gpt-4.1-mini', 'gpt-4.1-nano', 'o4-mini' without versions. No model version identifiers beyond marketing names are given." 165 }, 166 "prompts_provided": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper describes prompts in natural language (three-part template: problem, evaluation criteria, context) but does not provide the actual prompt text used in experiments. Section 5.1 discusses prompt design principles without showing prompts." 170 }, 171 "hyperparameters_reported": { 172 "applies": true, 173 "answer": true, 174 "justification": "Configuration files are provided in Appendix D for all three frameworks, including temperature (0.7), top_p (0.95), max_tokens (32000), population sizes, migration rates, island counts, and iteration limits." 
175 }, 176 "scaffolding_described": { 177 "applies": true, 178 "answer": true, 179 "justification": "The ADRS architecture is described in detail in Section 3.2 and Figure 1b, with five components (Prompt Generator, Solution Generator, Evaluator, Storage, Solution Selector). Table 1 compares framework designs. Each framework's parent selection, evolution context, and evaluation mechanisms are documented." 180 }, 181 "data_preprocessing_documented": { 182 "applies": true, 183 "answer": true, 184 "justification": "For each case study, the evaluator setup, trace sampling, and data pipeline are described. For CBL: '30% of the traces as a feedback subset' (Section 4.1). Simulator configurations are specified per case study." 185 } 186 }, 187 "limitations_and_scope": { 188 "limitations_section_present": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 6 'Limitations and Open Challenges' is a dedicated section discussing which problems ADRS is suited for (Section 6.1) and open challenges (Section 6.2)." 192 }, 193 "threats_to_validity_specific": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 6.1 identifies specific limitations: ADRS struggles with problems requiring coordinated multi-module changes, problems where semantic equivalence is undecidable, and problems requiring expensive evaluations. Training data contamination is discussed for TXN." 197 }, 198 "scope_boundaries_stated": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 6.1 explicitly states what ADRS does NOT handle well: multi-module changes, expensive evaluations, and undecidable correctness verification. 
The abstract notes 'we do not yet have a universal recipe for applying ADRS across all of systems research.'" 202 } 203 }, 204 "data_integrity": { 205 "raw_data_available": { 206 "applies": true, 207 "answer": false, 208 "justification": "No raw experimental data (evolution logs, per-iteration scores, generated programs) is released for independent verification." 209 }, 210 "data_collection_described": { 211 "applies": true, 212 "answer": true, 213 "justification": "Each case study describes the data source: existing simulators from published papers (CBL from NSDI '24, TXN from VLDB '24), public benchmarks (ShareGPT, GSM8K), and specific trace configurations." 214 }, 215 "recruitment_methods_described": { 216 "applies": true, 217 "answer": false, 218 "justification": "The pilot survey (Appendix A, Figure 7) of 31 PhD students does not describe how participants were recruited, what their specific research areas are, or whether recruitment could introduce bias." 219 }, 220 "data_pipeline_documented": { 221 "applies": true, 222 "answer": true, 223 "justification": "The evaluation pipeline is documented: each framework runs 100 iterations, solutions are evaluated against predefined workloads, scores are collected with mean ± std over 3 runs. The flow from generation to evaluation is described in Section 3.2." 224 } 225 }, 226 "conflicts_of_interest": { 227 "funding_disclosed": { 228 "applies": true, 229 "answer": false, 230 "justification": "No funding source or acknowledgments section is present in the paper." 231 }, 232 "affiliations_disclosed": { 233 "applies": true, 234 "answer": true, 235 "justification": "All authors are listed as affiliated with UC Berkeley. The affiliation is clearly stated." 236 }, 237 "funder_independent_of_outcome": { 238 "applies": true, 239 "answer": false, 240 "justification": "No funding information is disclosed, making independence impossible to assess." 
241 }, 242 "financial_interests_declared": { 243 "applies": true, 244 "answer": false, 245 "justification": "No competing interests statement is present. Several authors may have connections to companies whose products are referenced (e.g., Ion Stoica co-founded Databricks and Anyscale), but no declaration is made." 246 } 247 }, 248 "contamination": { 249 "training_cutoff_stated": { 250 "applies": true, 251 "answer": false, 252 "justification": "The paper uses GPT-5 and Gemini-3.0-Pro but does not state training data cutoff dates for these models. This is relevant because the paper's own case study baselines (NSDI '24, VLDB '24) could be in the training data." 253 }, 254 "train_test_overlap_discussed": { 255 "applies": true, 256 "answer": true, 257 "justification": "The paper explicitly acknowledges contamination risk for TXN: 'OpenEvolve rediscovered this algorithm from a random baseline, likely due to training data contamination from the SMF paper' (Section 4.5). However, this is only discussed for one case study." 258 }, 259 "benchmark_contamination_addressed": { 260 "applies": true, 261 "answer": false, 262 "justification": "Beyond the TXN contamination note, the paper does not systematically address whether the LLMs' training data includes the baseline algorithms, simulator code, or published solutions being compared against. The fact that LLMs may have memorized SOTA solutions could confound results." 263 } 264 }, 265 "human_studies": { 266 "pre_registered": { 267 "applies": false, 268 "answer": false, 269 "justification": "The paper's main contribution is benchmark evaluation of ADRS frameworks, not a human subjects study. The pilot survey in Appendix A is minor." 270 }, 271 "irb_or_ethics_approval": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human subjects study; the pilot survey of PhD students is a minor supplementary element." 
275 }, 276 "demographics_reported": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human subjects study in the main contribution." 280 }, 281 "inclusion_exclusion_criteria": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human subjects study in the main contribution." 285 }, 286 "randomization_described": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human subjects study." 290 }, 291 "blinding_described": { 292 "applies": false, 293 "answer": false, 294 "justification": "No human subjects study." 295 }, 296 "attrition_reported": { 297 "applies": false, 298 "answer": false, 299 "justification": "No human subjects study." 300 } 301 }, 302 "cost_and_practicality": { 303 "inference_cost_reported": { 304 "applies": true, 305 "answer": true, 306 "justification": "Table 2 reports cost per case study: most fall within stated upper bounds such as '≤$15' or '≤$30'. Time is also reported (1-8 hours for 100 iterations). Section 4 states 'most of our case studies required only a few hours and cost less than several tens of dollars.'" 307 }, 308 "compute_budget_stated": { 309 "applies": true, 310 "answer": true, 311 "justification": "Table 2 reports time and cost for each case study. Iteration counts (100) are stated. Total API spend is bounded (e.g., '≤$15', '≤$30'). Number of LLM calls per iteration is stated per framework in Table 1." 312 } 313 }, 314 "experimental_rigor": { 315 "seed_sensitivity_reported": { 316 "applies": true, 317 "answer": true, 318 "justification": "Table 3 reports mean ± standard deviation across three runs for each framework-model combination, showing result variation across runs." 319 }, 320 "number_of_runs_stated": { 321 "applies": true, 322 "answer": true, 323 "justification": "Section 4 states 'We repeat each experiment three times.' Table 3 reports results over three runs." 
324 }, 325 "hyperparameter_search_budget": { 326 "applies": true, 327 "answer": false, 328 "justification": "The paper uses default configurations for each framework (Section 4: 'using their default configurations') but does not report any hyperparameter search budget for tuning these defaults or the evaluator scoring functions." 329 }, 330 "best_config_selection_justified": { 331 "applies": true, 332 "answer": true, 333 "justification": "The paper reports results for all three frameworks with both models (Table 3), not just the best configuration. Best framework per task is identified but all results are shown." 334 }, 335 "multiple_comparison_correction": { 336 "applies": true, 337 "answer": false, 338 "justification": "The paper compares three frameworks × two models × ten tasks with no statistical tests at all, let alone multiple comparison correction." 339 }, 340 "self_comparison_bias_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The authors design the case studies, evaluators, and scoring functions, then evaluate their own ADRS approach. This author-evaluation bias is not acknowledged." 344 }, 345 "compute_budget_vs_performance": { 346 "applies": true, 347 "answer": false, 348 "justification": "All experiments are capped at 100 iterations, but no performance-vs-compute curves are shown. The paper does not report how performance scales with iteration budget." 349 }, 350 "benchmark_construct_validity": { 351 "applies": true, 352 "answer": true, 353 "justification": "Section 2 discusses why systems performance problems are well-suited for ADRS (verifiable solutions, preserving correctness, small algorithmic code, simulator-based evaluation). Section 6.1 discusses which problems are and aren't well-suited." 
354 }, 355 "scaffold_confound_addressed": { 356 "applies": true, 357 "answer": true, 358 "justification": "The paper evaluates three different frameworks (scaffolds) and two different models, separating the framework effect from the model effect. Table 3 shows results by framework × model. Table 1 describes design differences between frameworks." 359 } 360 }, 361 "data_leakage": { 362 "temporal_leakage_addressed": { 363 "applies": true, 364 "answer": false, 365 "justification": "The paper does not discuss whether GPT-5 or Gemini-3.0 were trained on the baseline algorithms' source code or published papers. The TXN contamination note is an exception but is not systematic." 366 }, 367 "feature_leakage_addressed": { 368 "applies": true, 369 "answer": false, 370 "justification": "No discussion of whether the evaluation setup provides information that would not be available in a real research scenario (e.g., the ADRS prompts may implicitly encode knowledge of the SOTA solution)." 371 }, 372 "non_independence_addressed": { 373 "applies": true, 374 "answer": false, 375 "justification": "No discussion of whether the LLMs' training data includes the exact simulators, baseline implementations, or published papers being used in the evaluation." 376 }, 377 "leakage_detection_method": { 378 "applies": true, 379 "answer": false, 380 "justification": "No concrete leakage detection or prevention method is used. The TXN contamination is noted post-hoc as a suspicion but no detection method was applied." 381 } 382 } 383 }, 384 "claims": [ 385 { 386 "claim": "ADRS-generated solutions can match or outperform human state-of-the-art designs across ten systems performance case studies.", 387 "evidence": "Table 2 shows improvements over SOTA in 8 of 10 tasks. Table 3 provides quantitative results with mean ± std over 3 runs. 
Specific improvements: 13× faster EPLB, 35% better CBL savings, 60% better offline TXN.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "OpenEvolve achieves the highest success rate across frameworks, delivering the best solution in 9 out of 20 total cases.", 392 "evidence": "Table 3 aggregates results across all tasks. Section 4 reports the framework comparison by number of top results: OpenEvolve 9, ShinkaEvolve 8, GEPA 6 (these counts sum to 23 across 20 cases, so ties are presumably credited to each tied framework).", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "ADRS frameworks can discover cross-domain techniques that human experts might overlook.", 397 "evidence": "Table 4 shows five examples: Hamilton's Apportionment for EPLB, Borda Count for TXN, Kirchhoff's Current Law for telemetry, UCB for CBL-Multi, gradient-based power control for NS3.", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "Moderate feedback granularity outperforms both minimal and detailed feedback for ADRS.", 402 "evidence": "Section 5.3 ablation on CBL: moderate feedback achieves 13.0% cost reduction vs minimal (7.7%) and detailed (10.2%).", 403 "supported": "weak" 404 }, 405 { 406 "claim": "Diverse seed programs improve ADRS evolution outcomes compared to uniform seeding.", 407 "evidence": "Section 5.1 ablation on LLM-SQL: uniform seeding caps at 0.74, diverse seeds reach 0.7755. Only diverse-seed runs exceeded 0.74.", 408 "supported": "moderate" 409 } 410 ], 411 "red_flags": [ 412 { 413 "flag": "Only 3 runs per experiment", 414 "detail": "Each experiment is repeated only three times, which is insufficient for reliable statistical inference. Standard deviations in Table 3 are sometimes very large relative to differences between methods (e.g., NS3 ShinkaEvolve GPT-5: 89.50 ± 18.73), suggesting high variance that 3 runs cannot adequately characterize." 415 }, 416 { 417 "flag": "No statistical significance tests", 418 "detail": "Despite claiming one framework or model outperforms another, no significance tests are reported. 
Given the high variance in some tasks, many claimed differences may not be statistically significant." 419 }, 420 { 421 "flag": "Contamination risk largely unaddressed", 422 "detail": "The LLMs used (GPT-5, Gemini-3.0) were almost certainly trained on the published SOTA algorithms being compared against. The paper acknowledges this for TXN ('likely due to training data contamination') but does not systematically address it for other case studies. The cross-domain 'discoveries' could be memorized techniques." 423 }, 424 { 425 "flag": "Self-evaluation bias", 426 "detail": "The authors designed both the evaluation frameworks and the scoring functions, then reported results of their own approach. No independent evaluation was conducted. The evaluator design choices (scoring weights, test set composition) could favor ADRS outcomes." 427 }, 428 { 429 "flag": "Selective case study presentation", 430 "detail": "Cloudcast, where ADRS failed to improve over baseline, receives much less attention than successful cases. The paper presents 5 detailed case studies in the main text, all showing positive results, with the failure case relegated to the appendix." 431 }, 432 { 433 "flag": "Draft comments left in paper", 434 "detail": "Internal comments ('/* accheng: @Ion, do we want to keep this still? */', '/* shu: do we still need this table? */') are left in the paper, suggesting incomplete revision and potentially premature conclusions." 435 } 436 ], 437 "cited_papers": [ 438 { 439 "title": "OpenEvolve: an open-source evolutionary coding agent", 440 "authors": ["Asankhaya Sharma"], 441 "year": 2025, 442 "relevance": "Open-source implementation of AlphaEvolve's evolutionary LLM-based code generation approach, central to this paper's evaluation." 
443 }, 444 { 445 "title": "GEPA: Reflective prompt evolution can outperform reinforcement learning", 446 "authors": ["Lakshya A Agrawal", "Shangyin Tan"], 447 "year": 2025, 448 "arxiv_id": "2507.19457", 449 "relevance": "One of three ADRS frameworks evaluated; uses natural-language reflection to mutate prompts with Pareto filtering." 450 }, 451 { 452 "title": "ShinkaEvolve: Towards open-ended and sample-efficient program evolution", 453 "authors": ["Robert Tjarko Lange", "Yuki Imajuku", "Edoardo Cetin"], 454 "year": 2025, 455 "arxiv_id": "2509.19349", 456 "relevance": "Third ADRS framework evaluated; emphasizes structured introspection and correctness-gated evolution." 457 }, 458 { 459 "title": "AlphaEvolve: A coding agent for scientific and algorithmic discovery", 460 "authors": ["Alexander Novikov"], 461 "year": 2025, 462 "arxiv_id": "2506.13131", 463 "relevance": "Proprietary Google DeepMind ADRS framework using MAP-Elites and island models for LLM-based algorithm discovery." 464 }, 465 { 466 "title": "ChatDev: Communicative agents for software development", 467 "authors": ["Chen Qian"], 468 "year": 2024, 469 "arxiv_id": "2307.07924", 470 "relevance": "Multi-agent LLM system for software development, used as baseline in the MAS case study." 471 }, 472 { 473 "title": "MetaGPT: Meta programming for a multi-agent collaborative framework", 474 "authors": ["Sirui Hong"], 475 "year": 2024, 476 "arxiv_id": "2308.00352", 477 "relevance": "Multi-agent framework evaluated in the MAS optimization case study." 478 }, 479 { 480 "title": "Why do multi-agent LLM systems fail?", 481 "authors": ["Mert Cemri", "Melissa Z Pan"], 482 "year": 2025, 483 "arxiv_id": "2503.13657", 484 "relevance": "Proposes MAST taxonomy of multi-agent system failure modes, used in the MAS case study evaluation." 
485 }, 486 { 487 "title": "Glia: A human-inspired AI for automated systems design and optimization", 488 "authors": ["Pouya Hamadanian"], 489 "year": 2025, 490 "arxiv_id": "2510.27176", 491 "relevance": "Agentic workflow for systems design with strong results on distributed LLM inference." 492 }, 493 { 494 "title": "Darwin Godel Machine: Open-ended evolution of self-improving agents", 495 "authors": ["Jenny Zhang", "Shengran Hu", "Cong Lu", "Robert Lange", "Jeff Clune"], 496 "year": 2025, 497 "arxiv_id": "2505.22954", 498 "relevance": "Demonstrates open-ended self-referential code improvement and self-evolving AI agents." 499 }, 500 { 501 "title": "MLGym: A new framework and benchmark for advancing AI research agents", 502 "authors": ["Deepak Nathani"], 503 "year": 2025, 504 "arxiv_id": "2502.14499", 505 "relevance": "Benchmark for AI research agents, related to automated research evaluation." 506 }, 507 { 508 "title": "DAPO: An open-source LLM reinforcement learning system at scale", 509 "authors": ["Qiying Yu"], 510 "year": 2025, 511 "arxiv_id": "2503.14476", 512 "relevance": "Open-source LLM RL system; paper's finding that excessive length signals unreliability is cited." 513 }, 514 { 515 "title": "AlgoTune: Can language models speed up general-purpose numerical programs?", 516 "authors": ["Ori Press"], 517 "year": 2025, 518 "arxiv_id": "2507.15887", 519 "relevance": "Benchmark evaluating LLM program optimization abilities, directly related to AI-driven code generation." 520 } 521 ] 522 }