scan-v5.json (26431B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Live-SWE-agent: Can Software Engineering Agents Self-Evolve on the Fly?", 6 "authors": [ 7 "Chunqiu Steven Xia", 8 "Zhe Wang", 9 "Yan Yang", 10 "Yuxiang Wei", 11 "Lingming Zhang" 12 ], 13 "year": 2025, 14 "venue": "arXiv", 15 "arxiv_id": "2511.13646", 16 "doi": null 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The 77.4% SWE-bench Verified and 45.8% SWE-Bench Pro solve rates claimed in the abstract are directly supported by Tables 1 and 3. The SOTA claim is substantiated by Figure 1's leaderboard comparison.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The causal claim that on-the-fly tool creation improves performance is supported by the ablation in Table 4, which isolates tool creation and reflection on 50 controlled problems with the same LLM backend.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "Section 4.4 extrapolates results to test generation, vulnerability detection, and synthesis tasks not evaluated; the multilingual generalizability claim rests on only 50 problems, yet is presented as broad evidence of generalization.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper does not discuss alternative explanations for performance gains, such as whether the reflection prompt alone (absent actual tool synthesis) or increased context length from tool discussions is the true driver.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "Resolve rate on SWE-bench directly measures whether the agent successfully resolves a software issue via automated tests, matching the stated claim 
of effective software engineering performance.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no dedicated limitations or threats-to-validity section; Section 4.4 is labeled 'Discussion and Future Work' and mentions constraints only in passing.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No specific threats are discussed — not benchmark contamination, the small ablation sample (50 problems), nor the confound that Figure 1 comparisons use different underlying LLMs across systems.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper mentions focusing on tool creation as a 'first step' but does not explicitly state what the results do NOT show (e.g., that multilingual results from 50 problems cannot be considered representative).", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding acknowledgment appears anywhere in the paper text.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations with the University of Illinois Urbana-Champaign are disclosed in the paper header alongside email addresses.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding disclosed; this appears to be unfunded academic work from UIUC.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement is included anywhere in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 
"applies": true, 100 "answer": true, 101 "justification": "Key terms are explicitly defined: 'live software agent' (evolves on-the-fly during runtime without offline training), 'custom tools' (scripts executable in the environment), and 'on-the-fly self-evolution' is described mechanically in Section 2.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Five explicit contributions are enumerated in Section 1: first live software agent, minimal implementation, SOTA performance, comprehensive analysis, and unified leaderboard.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 5 explicitly positions LIVE-SWE-AGENT against static agents, offline self-improving agents (DGM, SICA, HGM), and workflow-based approaches (Agentless, Moatless), explaining how it differs from each.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Source code is publicly available at https://github.com/OpenAutoCoder/live-swe-agent as stated in the abstract and contributions section.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "SWE-bench Verified, SWE-Bench Pro, and SWE-bench Multilingual are standard public benchmarks used unmodified.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper specifies the mini-SWE-agent framework and model versions/temperatures but provides no requirements.txt, Dockerfile, or complete dependency specification within the paper itself.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions appear 
in the paper; only a GitHub link is provided without procedural guidance.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "No confidence intervals or error bars are reported for any results; all tables present single point estimates of resolve rate.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to any comparative claims despite multiple system comparisons across models and configurations.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Percentage-point improvements are reported throughout (e.g., 8.3pp over HGM in Table 2; 14pp improvement of LIVE-SWE-AGENT over mini-SWE-agent for Claude 4.5 Sonnet in Table 5).", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The ablation uses 50 randomly selected problems from SWE-bench Verified without justification for this sample size, power analysis, or random seed specification.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "Results are from single attempts per issue (explicitly stated: 'sample one patch per issue') with no variance or standard deviation reported across any runs.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Multiple baselines are included: mini-SWE-agent (direct ablation baseline), SICA, DGM, HGM (offline self-improving agents in Table 2), and SWE-agent (for SWE-Bench Pro in Table 3).", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines include 
state-of-the-art models (Claude 4.5 Sonnet, GPT-5, Gemini 3 Pro) and recent agents; Figure 1 compares against the latest proprietary systems including GPT-5.1 and Claude Opus 4.1.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Table 4 ablates tool creation and reflection components; Table 5 ablates across six LLM backends from nano to frontier models, systematically isolating each contribution.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Resolve rate (%) and average cost per issue ($) are both reported in all main result tables, providing effectiveness and efficiency dimensions.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Not applicable; SWE-bench uses automated test execution to determine issue resolution correctness.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "SWE-bench Verified (500 problems) and SWE-Bench Pro (731 problems) are established held-out benchmark sets with no indication the authors used them for any training.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": false, 218 "justification": "Performance results are not broken down by issue category, repository, problem difficulty, or language; only aggregate resolve rates appear in the main tables.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "The paper specifically analyzes the GPT-5-Nano failure mode: 'GPT-5-Nano fails to understand the goal of creating custom tools and is often stuck in a loop,' providing a concrete mechanistic explanation.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "Table 5 
explicitly reports that GPT-5-Nano drops from 44% to 14% with LIVE-SWE-AGENT — a significant negative result clearly presented and analyzed.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Claude 4.5 Sonnet is specified with snapshot date 'claude-sonnet-4-5-20250929'; Gemini 3 Pro, GPT-5, GPT-5-Mini, and GPT-5-Nano are named specifically throughout.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Full verbatim prompts are provided in Appendix D, including the complete initial prompt (Figure 7) and per-step feedback message template (Figure 8).", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Temperature settings are specified per model family (0.0 for Anthropic, 1.0 for Gemini and OpenAI), with maximum step limit of 250 and maximum cost cap of $3 per issue documented.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "Section 2 describes the scaffolding in detail: how the reflection message is appended after each step, how tool synthesis integrates into the agent loop, and what modifications were made to mini-SWE-agent.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Standard benchmarks are used without modification; ablation subsets are listed exhaustively in Appendix C, though random seeds are absent.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Raw agent trajectories and evaluation logs are not released; the GitHub repo contains code but the paper does not indicate per-instance outputs are available for independent verification.", 271 
"source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": false, 276 "justification": "The 50-problem ablation subsets are stated as 'randomly selected' but no random seed is provided, making exact replication of the ablation conditions impossible.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "Standard public benchmarks are used; no participant recruitment involved.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": false, 288 "justification": "The evaluation pipeline is described at a high level but details such as how SWE-bench harnesses were configured and how tie cases were handled are not fully documented.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No training cutoff dates are stated for Claude 4.5 Sonnet, GPT-5, or Gemini 3 Pro, despite SWE-bench issues being drawn from public GitHub repositories that could appear in training corpora.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "The paper does not mention whether SWE-bench problems (from public GitHub issues) may have been present in model training data, a known concern for this benchmark.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "Benchmark contamination is entirely unaddressed; SWE-bench Verified problems are from publicly available GitHub repositories and may have been seen during pretraining of the evaluated models.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants involved.", 317 "source": "haiku" 318 }, 319 
"irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants involved.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants involved.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants involved.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants involved.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants involved.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants involved.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Average cost per issue is reported in Tables 1, 3, and 6 across all evaluated models and benchmarks (ranging from $0.04 to $0.73).", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Total computational budget for all experiments is not stated; only per-issue average costs are reported without aggregate experiment costs.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "LIVE-SWE-AGENT achieves 77.4% resolve rate on SWE-bench Verified without test-time scaling, outperforming all existing open-source and proprietary agents at time of writing", 375 "evidence": "Table 1 shows 77.4% with Gemini 3 Pro backend; Figure 1 compares against 14 other systems including GPT-5.1(High) at 77.2% and Claude Opus 4.1 at 76.3%", 376 "supported": "strong" 377 }, 378 { 379 "claim": "LIVE-SWE-AGENT achieves 45.8% 
resolve rate on SWE-Bench Pro, the best-known result at time of writing", 380 "evidence": "Table 3: 45.8% vs 43.6% for SWE-agent baseline; Figure 1 shows it leading the Pro leaderboard over all listed systems", 381 "supported": "strong" 382 }, 383 { 384 "claim": "On-the-fly tool creation causally improves solve rate over base mini-SWE-agent", 385 "evidence": "Table 4 on 50 problems: without tool creation 62%, without reflection 64%, full system 76% with Claude 4.5 Sonnet — consistent direction across all four LLMs in Table 1", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "LIVE-SWE-AGENT outperforms offline self-evolving agents (DGM, HGM, SICA) at zero offline training cost", 390 "evidence": "Table 2: LIVE-SWE-AGENT 65.0% vs HGM 56.7% vs DGM 53.3% vs SICA 50.0% on the 60-problem subset chosen by prior work", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "The per-step reflection mechanism is necessary for optimal tool creation performance", 395 "evidence": "Table 4: removing reflection drops from 76% to 64%; reflection also increases average tools created from 2.92 to 3.28", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Weaker LLMs do not benefit from LIVE-SWE-AGENT and may perform significantly worse", 400 "evidence": "Table 5: GPT-5-Nano drops from 44% (mini-SWE-agent) to 14% (LIVE-SWE-AGENT); trajectory analysis shows it gets stuck in tool-creation loops", 401 "supported": "strong" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval" 406 ], 407 "key_findings": "LIVE-SWE-AGENT enables software engineering agents to self-evolve by creating and using custom Python tools on the fly during issue resolution, achieving state-of-the-art 77.4% on SWE-bench Verified and 45.8% on SWE-Bench Pro with no offline training cost. Ablation studies show both the initial tool-creation instruction and per-step reflection prompting are necessary: removing either degrades performance (from 76% to 62% without tool creation and to 64% without reflection on the 50-problem ablation subset). 
The approach scales with model capability — stronger LLMs (Claude 4.5 Sonnet, GPT-5) benefit substantially while weaker models (GPT-5-Nano) are hurt, getting stuck in unproductive synthesis loops — suggesting tool creation requires strong base reasoning. Custom tools span both general utilities (edit, view, search) and issue-specific analyzers, with repository-correlated clustering visible in t-SNE embeddings.", 408 "red_flags": [ 409 { 410 "flag": "No variance or statistical tests", 411 "detail": "All results are single-point estimates from one attempt per issue with no confidence intervals, error bars, or significance tests, despite multiple comparative claims across systems." 412 }, 413 { 414 "flag": "Small ablation sample without seed", 415 "detail": "Key ablation studies (Tables 4 and 5) use only 50 randomly selected problems (10% of SWE-bench Verified) without power analysis or random seed specification, making exact replication impossible." 416 }, 417 { 418 "flag": "Benchmark contamination unaddressed", 419 "detail": "SWE-bench issues come from public GitHub repositories that could appear in training data for all evaluated models; no training cutoffs are stated and contamination is never discussed." 420 }, 421 { 422 "flag": "Leaderboard comparison is time-sensitive", 423 "detail": "Figure 1 SOTA comparisons are against a rapidly-moving leaderboard; the paper acknowledges results are 'at the time of writing,' and the comparative claim may already be outdated (paper is from late 2025, evaluated in April 2026)." 424 }, 425 { 426 "flag": "Cross-system comparison confounded by LLM backend", 427 "detail": "Figure 1 compares LIVE-SWE-AGENT (Gemini 3 Pro) against proprietary systems using different underlying LLMs; performance gains may reflect the stronger base model rather than the scaffolding technique." 
428 }, 429 { 430 "flag": "No reproduction instructions or dependency spec", 431 "detail": "Despite releasing code on GitHub, no reproduction instructions, dependency specifications, or random seeds appear in the paper, limiting independent verification." 432 } 433 ], 434 "cited_papers": [ 435 { 436 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 437 "relevance": "Primary benchmark framework underlying all evaluation; foundational dataset for software agent assessment in the field" 438 }, 439 { 440 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 441 "relevance": "Direct baseline and the mini-SWE-agent base scaffold that LIVE-SWE-AGENT is built upon" 442 }, 443 { 444 "title": "Darwin Gödel Machine: Open-Ended Evolution of Self-Improving Agents", 445 "relevance": "Key comparison for offline self-evolving agents; LIVE-SWE-AGENT claims to outperform it with zero offline cost" 446 }, 447 { 448 "title": "Huxley-Gödel Machine: Human-Level Coding Agent Development by an Approximation of the Optimal Self-Improving Machine", 449 "relevance": "Direct baseline in Table 2; defines the 60-problem subset used for offline self-improving agent comparison" 450 }, 451 { 452 "title": "SICA: A Self-Improving Coding Agent", 453 "relevance": "Direct baseline in the offline self-evolving agent comparison (Table 2)" 454 }, 455 { 456 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 457 "relevance": "Major competing software agent framework used as comparison point throughout the paper" 458 }, 459 { 460 "title": "Agentless: Demystifying LLM-Based Software Engineering Agents", 461 "relevance": "Complementary workflow-based approach; positioned as a contrast showing simpler alternatives to complex agent scaffolds" 462 }, 463 { 464 "title": "SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?", 465 "relevance": "Second primary benchmark used for evaluation; captures 
more realistic, enterprise-level software problems" 466 }, 467 { 468 "title": "SWE-bench Multilingual", 469 "relevance": "Additional benchmark used for cross-language generalizability testing (50-problem subset)" 470 } 471 ], 472 "engagement_factors": { 473 "practical_relevance": { 474 "score": 3, 475 "justification": "Open-source code released with a live leaderboard; directly applicable to software engineering automation at under $1 per issue." 476 }, 477 "surprise_contrarian": { 478 "score": 2, 479 "justification": "Challenges the assumption that complex handcrafted scaffolds are necessary by showing a minimal prompt modification outperforms agents with thousands of lines of purpose-built code." 480 }, 481 "fear_safety": { 482 "score": 1, 483 "justification": "Self-modifying agents raise implicit safety questions about autonomous code execution, but the paper does not engage with safety concerns and modification scope is limited to tool scripts." 484 }, 485 "drama_conflict": { 486 "score": 2, 487 "justification": "Claims to beat all proprietary systems including GPT-5 and Claude Opus with an open-source scaffold, setting up a clear open-vs-proprietary narrative." 488 }, 489 "demo_ability": { 490 "score": 3, 491 "justification": "Code publicly available on GitHub with a live leaderboard at live-swe-agent.github.io; practitioners can immediately run it with their own LLM backend." 492 }, 493 "brand_recognition": { 494 "score": 1, 495 "justification": "UIUC academic group with prior SWE-bench publications (Agentless, ChatRepair) but not a major AI lab name." 
496 } 497 }, 498 "hn_data": { 499 "threads": [ 500 { 501 "hn_id": "25290469", 502 "title": "Electrocharged respirator fabrics with common materials: A candy machine N95", 503 "points": 3, 504 "comments": 0, 505 "url": "https://news.ycombinator.com/item?id=25290469", 506 "created_at": "2020-12-03T16:23:16Z" 507 }, 508 { 509 "hn_id": "46685772", 510 "title": "Can Highlighting Help GitHub Maintainers Track Security Fixes?", 511 "points": 2, 512 "comments": 0, 513 "url": "https://news.ycombinator.com/item?id=46685772", 514 "created_at": "2026-01-19T23:00:02Z" 515 }, 516 { 517 "hn_id": "42393379", 518 "title": "Hymba: A Hybrid-Head Architecture for Small Language Models", 519 "points": 2, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=42393379", 522 "created_at": "2024-12-11T21:51:17Z" 523 }, 524 { 525 "hn_id": "39524530", 526 "title": "Towards a measurement theory in QFT: \"Impossible\" quantum measurements", 527 "points": 2, 528 "comments": 0, 529 "url": "https://news.ycombinator.com/item?id=39524530", 530 "created_at": "2024-02-27T14:31:51Z" 531 } 532 ], 533 "top_points": 3, 534 "total_points": 9, 535 "total_comments": 0 536 } 537 }