ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25024B)


      1 {
      2   "paper": {
      3     "title": "LIVE-SWE-AGENT: Can Software Engineering Agents Self-Evolve on the Fly?",
      4     "authors": [
      5       "Chunqiu Steven Xia",
      6       "Zhe Wang",
      7       "Yan Yang",
      8       "Yuxiang Wei",
      9       "Lingming Zhang"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2511.13646"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "GitHub repository provided: https://github.com/OpenAutoCoder/live-swe-agent. Paper states it 'has been publicly available.'"
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Uses publicly available benchmarks: SWE-bench Verified, SWE-Bench Pro, and SWE-bench Multilingual. No proprietary data collected."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. Only mentions building on mini-SWE-agent (~100 lines of code)."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions in the paper. The GitHub link exists but the paper itself does not contain a 'Reproducing Results' section or specific commands."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results are reported as point estimates (e.g., '77.4%', '45.8%') with no confidence intervals or error bars."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Claims like 'outperforming all existing software agents' are based on comparing raw percentages without any statistical significance tests."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Reports improvements with baseline context throughout, e.g., 'improving the resolve rate by 8.3 percentage points compared to previous best approach' (Section 4.1), and Table 5 shows relative improvements like '↑22.6%'."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The ablation uses 50 'randomly selected' problems with no justification for why 50 is sufficient. The 60-problem subset from prior work is also used without power analysis."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Paper explicitly states 'we sample one patch per issue' (Section 3). All results are single-run with no variance, standard deviation, or spread measures reported."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Compares against mini-SWE-agent, SWE-agent, and self-evolving agents (SICA, DGM, HGM) in Tables 1-3 and Figure 1."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include state-of-the-art systems: Claude Sonnet 4.5, GPT-5, Gemini 3 Pro, and recent agents like DGM (2025), HGM (2025), OpenHands, Trae."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Table 4 ablates tool creation (w/o tool creation) and reflection (w/o reflection), showing contribution of each component on 50 problems."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Reports both resolve rate (%) and average cost ($) per issue across experiments (Tables 1, 3, 6). Table 4 also reports number of tools created."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "Evaluation is entirely automated via SWE-bench test suites (pass/fail). No human evaluation of patch quality, maintainability, or correctness beyond tests."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "SWE-bench Verified, SWE-Bench Pro, and SWE-bench Multilingual are standard held-out benchmarks not used for development of the method."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No per-repository or per-difficulty breakdown of resolve rates. SWE-Bench Pro spans 11 repositories and 4 languages but only aggregate results are reported in Table 3."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 4.3 discusses GPT-5-Nano failing with LIVE-SWE-AGENT: 'GPT-5-Nano fails to understand the goal of creating custom tools and is often stuck in a loop.' Also discusses when tool creation doesn't help."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Table 5 reports that GPT-5-Nano's performance drops by 68.2% with LIVE-SWE-AGENT relative to mini-SWE-agent, and that GPT-5-Mini's drops by 3.3%. These negative results are discussed."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims of 77.4% on Verified and 45.8% on Pro are supported by Tables 1 and 3. Claim of outperforming all existing agents is supported by Figure 1 comparisons."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Causal claim that tool creation improves performance is supported by controlled ablation in Table 4 (same 50 problems, same LLM, varying only the tool creation/reflection components)."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "Title says 'Software Engineering Agents' broadly. Claims 'outperforming all existing software agents' but tests only on SWE-bench family benchmarks (issue resolution). Section 4.4 speculates about applications to testing, vulnerability detection, and software synthesis without evidence."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper does not discuss alternative explanations for the performance gains. For example, the improvement could come from the additional reflection prompt rather than from tool creation itself; the Table 4 ablation partially separates these two factors, but the paper itself considers no alternative explanations."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Claims are stated at the level of measurements: 'resolve rate' on specific benchmarks. No overclaiming that resolve rate equals general software engineering ability (though Section 4.4 speculates about broader applications, these are framed as future work)."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Specifies 'claude-sonnet-4-5-20250929' with exact API identifier (Section 3). Temperature settings also specified per model family (Appendix A)."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Full initial prompt and feedback message provided in Appendix D (Figures 7 and 8), including the complete tool creation instructions and reflection message."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 3 and Appendix A report: max step limit 250, max cost $3/issue, temperature 0.0 for Anthropic models, temperature 1 for GPT and Gemini models."
    156       },
    157       "scaffolding_described": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 2 describes the scaffold in detail: agent loop, tool creation mechanism, reflection prompt, custom tool synthesis process. Figure 2 provides an overview diagram."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Uses standard benchmarks without modification. SWE-bench Verified (500 problems), SWE-Bench Pro (731 public problems), and a 50-problem random subset of SWE-bench Multilingual are clearly specified."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No dedicated limitations section. Section 4.4 is 'Discussion and Future Work' which discusses extensions and applications but does not substantively discuss limitations of the current work."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No threats to validity discussed. No mention of internal or external validity concerns."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No explicit scope boundaries stated. The paper does not state what the results do NOT show or which settings are excluded from the claims."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "Agent trajectories, tool scripts generated, and detailed per-problem results are not released. Only aggregate results are shown. The ablation problem lists are provided (Appendix C) but not the raw outputs."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Data sources are standard benchmarks with well-documented collection procedures: SWE-bench Verified (validated by human developers), SWE-Bench Pro (Scale AI), SWE-bench Multilingual."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. Data comes from standard public benchmarks."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Pipeline is clear: issue description → agent with initial prompt → iterative tool creation/usage loop → patch submission → automated evaluation via SWE-bench harness."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding source disclosed anywhere in the paper. No acknowledgments section with grant information."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "All authors' affiliations clearly stated: University of Illinois Urbana-Champaign. One author noted as research intern."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding disclosed, so independence cannot be assessed. The authors use commercial APIs (Anthropic, OpenAI, Google) but no disclosure of whether API credits were provided."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement present in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "Uses Claude 4.5 Sonnet, GPT-5 variants, Gemini 3 Pro without stating any of their training data cutoff dates."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No discussion of whether the LLMs' training data includes SWE-bench problems or their solutions from GitHub."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "SWE-bench problems come from public GitHub issues that could be in LLM training data. This contamination risk is not discussed."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Tables 1, 3, and 6 report average dollar cost per issue for each configuration. E.g., LIVE-SWE-AGENT with Claude 4.5 Sonnet costs $0.68/issue on Verified."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Maximum cost of $3 per issue and maximum step limit of 250 stated (Section 3). Per-issue average costs reported. Table 2 also contrasts offline training costs (DGM: 1231 hours vs LIVE-SWE-AGENT: 0 hours)."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Single attempt per problem: 'we sample one patch per issue' (Section 3). No seed sensitivity analysis."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": true,
    304         "justification": "Explicitly states 'sample one patch per issue' (Section 3), making clear these are single-run results."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No description of how the reflection prompt or tool creation instructions were developed. No hyperparameter search budget reported."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The final prompt design is presented without explaining what alternatives were tried or how the configuration was selected."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "No statistical tests performed, so multiple comparison correction is not applicable."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors evaluate their own system against baselines, including mini-SWE-agent results that they re-use. The paper does not acknowledge self-comparison bias."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Cost is reported alongside performance in Tables 1, 3, 6. The paper notes LIVE-SWE-AGENT achieves improvements with 'only a minimal increase in cost' and even 'slight cost savings' in some cases (Section 4.1)."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "No discussion of whether SWE-bench resolve rate actually measures real-world software engineering capability. The paper uses the benchmarks without questioning construct validity."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "The paper controls for scaffold: Table 1 compares mini-SWE-agent vs LIVE-SWE-AGENT with the same LLMs. Table 3 uses same LLM (Claude 4.5 Sonnet) for SWE-agent vs LIVE-SWE-AGENT comparison. The paper's core contribution IS the scaffold."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "SWE-bench problems are from public GitHub issues that predate model training. No discussion of temporal leakage risk."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the evaluation setup leaks information not available in real usage scenarios."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether training data includes solutions to the same GitHub issues used in SWE-bench."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No leakage detection or prevention method applied."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "LIVE-SWE-AGENT achieves 77.4% resolve rate on SWE-bench Verified without test-time scaling, outperforming all existing agents.",
    368       "evidence": "Table 1 and Figure 1 show 77.4% with Gemini 3 Pro. Figure 1 compares against open and proprietary agents.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "LIVE-SWE-AGENT achieves 45.8% on SWE-Bench Pro, the best reported result.",
    373       "evidence": "Table 3 shows 45.8% vs SWE-agent's 43.6% with the same LLM (Claude 4.5 Sonnet). Figure 1 confirms top position.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "On-the-fly tool creation improves performance over the base scaffold with minimal cost overhead.",
    378       "evidence": "Table 4 ablation on 50 problems: 62% (no tool creation) → 64% (no reflection) → 76% (full). Tables 1, 3, 6 show marginal cost increases.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "LIVE-SWE-AGENT outperforms offline self-evolving agents (DGM, HGM, SICA) while requiring zero offline training cost.",
    383       "evidence": "Table 2: 65.0% vs DGM's 53.3% on the 60-problem subset. DGM costs 1231 hours offline; LIVE-SWE-AGENT costs 0.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Stronger LLMs benefit more from on-the-fly tool creation.",
    388       "evidence": "Table 5: GPT-5-Nano drops 68.2%, GPT-5-Mini drops 3.3%, while Claude 4.5 Sonnet gains 22.6% with LIVE-SWE-AGENT.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "LIVE-SWE-AGENT enables LLM-based software agents to create and modify their own tools at runtime while solving issues, achieving state-of-the-art 77.4% on SWE-bench Verified and 45.8% on SWE-Bench Pro. The approach requires only minimal modifications to existing agent scaffolds (initial prompt + reflection message) with no offline training. Ablations show tool creation and step-level reflection both contribute to improvements, but weaker LLMs (GPT-5-Nano) lack the reasoning capability to benefit from on-the-fly tool synthesis.",
    394   "red_flags": [
    395     {
    396       "flag": "No statistical tests for any comparison",
    397       "detail": "All claims of 'outperforming' are based on comparing raw percentages without significance tests. On a 500-problem benchmark, a difference of 2-3 percentage points may not be statistically significant."
    398     },
    399     {
    400       "flag": "Single-run results",
    401       "detail": "All results are from a single attempt per problem with no variance estimates. LLM outputs are stochastic, so single-run results may not be stable, especially with temperature=1 for GPT and Gemini models."
    402     },
    403     {
    404       "flag": "Small ablation sample without justification",
    405       "detail": "Ablation experiments use only 50 randomly selected problems (10% of SWE-bench Verified) with no power analysis or justification for sample size adequacy."
    406     },
    407     {
    408       "flag": "No limitations section",
    409       "detail": "The paper has no limitations or threats-to-validity discussion. Section 4.4 discusses future work and applications but does not acknowledge weaknesses."
    410     },
    411     {
    412       "flag": "Contamination risk unaddressed",
    413       "detail": "SWE-bench problems come from public GitHub issues. Models like GPT-5 and Gemini 3 Pro may have seen both the issues and their solutions during training. This is not discussed."
    414     },
    415     {
    416       "flag": "Leaderboard comparison mixes scaffolds",
    417       "detail": "Figure 1 compares LIVE-SWE-AGENT against proprietary systems with different scaffolds (Claude Code, OpenAI's custom setup). While the paper's controlled experiments (Tables 1, 3) are fair, the leaderboard claims conflate scaffold and model differences."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    423       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    424       "year": 2024,
    425       "relevance": "Primary baseline agent scaffold; LIVE-SWE-AGENT builds on mini-SWE-agent from this work."
    426     },
    427     {
    428       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    429       "authors": ["Xingyao Wang"],
    430       "year": 2024,
    431       "arxiv_id": "2407.16741",
    432       "relevance": "Major open-source agentic framework for software engineering, compared in leaderboard."
    433     },
    434     {
    435       "title": "Agentless: Demystifying LLM-based software engineering agents",
    436       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    437       "year": 2024,
    438       "arxiv_id": "2407.01489",
    439       "relevance": "Contrasting approach arguing complex agent scaffolds are unnecessary; uses predefined workflows instead."
    440     },
    441     {
    442       "title": "Darwin Godel Machine: Open-ended evolution of self-improving agents",
    443       "authors": ["Jenny Zhang", "Shengran Hu", "Cong Lu", "Robert Lange", "Jeff Clune"],
    444       "year": 2025,
    445       "arxiv_id": "2505.22954",
    446       "relevance": "Key baseline for self-improving agents; requires costly offline training ($22K per run) compared to LIVE-SWE-AGENT's zero offline cost."
    447     },
    448     {
    449       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    450       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    451       "year": 2023,
    452       "arxiv_id": "2310.06770",
    453       "relevance": "Foundational benchmark for evaluating software engineering agents on real-world issues."
    454     },
    455     {
    456       "title": "SWE-RL: Advancing LLM reasoning via reinforcement learning on open software evolution",
    457       "authors": ["Yuxiang Wei"],
    458       "year": 2025,
    459       "arxiv_id": "2502.18449",
    460       "relevance": "RL-based approach to improving LLMs for software engineering tasks."
    461     },
    462     {
    463       "title": "Conversational automated program repair",
    464       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    465       "year": 2023,
    466       "arxiv_id": "2301.13246",
    467       "relevance": "Early work on interactive LLM-based bug fixing (ChatRepair), precursor to agentic approaches."
    468     },
    469     {
    470       "title": "Huxley-Gödel Machine: Human-level coding agent development by an approximation of the optimal self-improving machine",
    471       "authors": ["Wenyi Wang"],
    472       "year": 2025,
    473       "relevance": "Self-improving agent baseline requiring 512 hours offline training; outperformed by LIVE-SWE-AGENT."
    474     },
    475     {
    476       "title": "SICA: A self-improving coding agent",
    477       "authors": ["Maxime Robeyns", "Martin Szummer", "Laurence Aitchison"],
    478       "year": 2025,
    479       "relevance": "Self-improving agent baseline that enters infinite loops; outperformed by LIVE-SWE-AGENT."
    480     },
    481     {
    482       "title": "Large language models as tool makers",
    483       "authors": ["Tianle Cai", "Xuezhi Wang", "Tengyu Ma", "Xinyun Chen", "Denny Zhou"],
    484       "year": 2024,
    485       "relevance": "Prior work on LLMs creating tools for general reasoning, though not targeting real-world software engineering."
    486     },
    487     {
    488       "title": "Evaluating large language models trained on code",
    489       "authors": ["Mark Chen"],
    490       "year": 2021,
    491       "arxiv_id": "2107.03374",
    492       "relevance": "Codex/HumanEval paper foundational to code generation evaluation."
    493     },
    494     {
    495       "title": "AutoCodeRover: Autonomous program improvement",
    496       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    497       "year": 2024,
    498       "relevance": "Software engineering agent using code search and analysis for autonomous bug fixing."
    499     }
    500   ]
    501 }

Impressum · Datenschutz