scan.json (20631B)
1 { 2 "paper": { 3 "title": "A Comprehensive Survey on Benchmarks and Solutions in Software Engineering of LLM-Empowered Agentic System", 4 "authors": ["Jiale Guo", "Suizhi Huang", "Mei Li", "Dong Huang", "Xingsheng Chen", "Regina Zhang", "Zhijiang Guo", "Han Yu", "Siu-Ming Yiu", "Pietro Lio", "Kwok-Yan Lam"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2510.09721" 8 }, 9 "scan_version": 2, 10 "active_modules": ["survey_methodology"], 11 "methodology_tags": ["meta-analysis"], 12 "key_findings": "This survey reviews 150+ papers on LLM-powered software engineering, providing a taxonomy across Solutions (prompt-based, fine-tuning, agent-based) and Benchmarks (code generation, translation, repair, others). It maps 50+ benchmarks to their corresponding solution strategies and identifies research gaps including multi-agent collaboration, self-evolving systems, and formal verification integration. The analysis traces the evolution from simple prompt engineering to sophisticated agentic systems with planning, reasoning, memory, and tool augmentation.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper states 'We maintain a GitHub repository that continuously updates the reviewed and related papers at https://github.com/lisaGuojl/LLM-Agent-SE-Survey' in the abstract." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No structured dataset of the surveyed papers (e.g., extracted metadata, classification labels, analysis data) is released. The GitHub repo appears to be a paper list, not analysis data." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment or dependency specifications are provided. As a survey, analysis scripts and their dependencies could have been documented but were not." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step instructions for reproducing the survey's paper selection, classification, or analysis are provided." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": false, 39 "answer": false, 40 "justification": "This is a survey paper that does not run experiments or report quantitative results requiring confidence intervals." 41 }, 42 "significance_tests": { 43 "applies": false, 44 "answer": false, 45 "justification": "Survey paper with no statistical comparisons of its own." 46 }, 47 "effect_sizes_reported": { 48 "applies": false, 49 "answer": false, 50 "justification": "Survey paper with no experimental results." 51 }, 52 "sample_size_justified": { 53 "applies": false, 54 "answer": false, 55 "justification": "Survey paper; no experimental sample size to justify." 56 }, 57 "variance_reported": { 58 "applies": false, 59 "answer": false, 60 "justification": "Survey paper with no experimental runs." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Table I compares this survey against 5 prior surveys (Zhang et al., Jiang et al., Wang et al., Dong et al., Sapkota et al.) across features like taxonomy, pipeline, prompt, agent, benchmarks, solutions, and scope." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The compared surveys are from 2023-2025, which are contemporary and relevant." 73 }, 74 "ablation_study": { 75 "applies": false, 76 "answer": false, 77 "justification": "Survey paper with no system components to ablate." 78 }, 79 "multiple_metrics": { 80 "applies": false, 81 "answer": false, 82 "justification": "Survey paper; no system evaluation metrics." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "Human evaluation is not relevant for a survey's taxonomy claims." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "Survey paper with no test sets." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "The taxonomy provides per-category breakdowns across Solutions (prompt-based, fine-tune-based, agent-based subcategories) and Benchmarks (code generation, translation, repair, others). Figure 4 shows distribution across subcategories." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper discusses limitations and failure modes of reviewed approaches, e.g., 'current LLMs struggle to use iterative feedback when presented as I/O examples' (Sec. IV-A), performance degradation on R language (Sec. IV-A), and SWE-bench solutions failing in production (Sec. VII-B/VIII-B)." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports negative findings from reviewed work, e.g., 'incorporating semantic information from execution traces provided limited SFT benefit' (Sec. IV-B), LLM performance degrades on R language, and most LLM-correct solutions would fail code review in production." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims are supported: the paper does review 150+ papers, provides a taxonomy along Solutions and Benchmarks dimensions, connects 50+ benchmarks to solutions, and identifies future directions. These are structural claims about the survey's coverage, which the paper delivers." 115 }, 116 "causal_claims_justified": { 117 "applies": false, 118 "answer": false, 119 "justification": "The paper is a survey and does not make its own causal claims. It reports causal claims from reviewed papers." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims comprehensive coverage of 'Software Engineering of LLM-Empowered Agentic System' but the scope is primarily code generation, translation, and repair. Requirements engineering, deployment, DevOps, and other SE activities receive minimal coverage. The selection criteria mention top venues but actual coverage includes substantial arXiv preprints." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": false, 128 "answer": false, 129 "justification": "As a survey/taxonomy paper with no empirical results of its own, alternative explanations are not applicable." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": false, 133 "answer": false, 134 "justification": "Theoretical/survey paper with no measurements of its own." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "Survey paper that does not use any models." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "Survey paper that does not use prompting." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "Survey paper with no experiments." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "Survey paper with no agentic scaffolding." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": false, 161 "justification": "The paper states selection criteria (papers from 2023-2025 from top venues, novel benchmarks/methods, significant empirical evaluation) but does not provide counts at each filtering stage, the exact search queries used, or the specific process that led from an initial set to the final 150+ papers. No PRISMA-style flow or filtering pipeline with counts." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations section. The paper has 'Challenges and Future Directions' sections (VII and VIII, which appear duplicated) but these discuss challenges of the field, not limitations of the survey itself." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity of the survey methodology are discussed. No acknowledgment of potential selection bias in paper collection, classification subjectivity, or coverage gaps." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "While the paper states it covers papers from 2023-2025 and focuses on code generation, translation, and repair, it does not explicitly state what is NOT covered or what the results do NOT show. The title claims comprehensive coverage without bounding it." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No list of all reviewed papers with their classifications, extraction data, or analysis spreadsheets is made available for verification." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section I describes the paper selection process: sources (top conferences, journals, preprints), selection criteria (4 criteria listed), and time period (2023-2025)." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants; data source is published papers from known venues." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "The paper lists selection criteria but does not document the pipeline from initial search to final inclusion — no counts of papers found, screened, excluded at each stage, or who performed the screening." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: NTU, HKUST, HKU, Cambridge, SJTU." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Survey paper with no model evaluation." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Survey paper with no model evaluation." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this survey." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "Survey paper with no method or system to cost." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "Survey paper." 289 } 290 }, 291 "survey_methodology": { 292 "prisma_or_structured_protocol": { 293 "applies": true, 294 "answer": false, 295 "justification": "No PRISMA flow diagram, no protocol registration, no reproducible search queries. The paper lists general selection criteria and venue sources but does not follow a structured review protocol." 296 }, 297 "quality_assessment_of_sources": { 298 "applies": true, 299 "answer": false, 300 "justification": "The survey does not assess the methodological quality of its source papers. All papers are treated equally regardless of their rigor, sample sizes, or evaluation quality. No quality scoring or risk-of-bias assessment." 301 }, 302 "publication_bias_discussed": { 303 "applies": true, 304 "answer": false, 305 "justification": "No discussion of publication bias. The survey does not consider whether the papers it reviews skew toward positive results or whether negative results are underrepresented." 306 } 307 } 308 }, 309 "claims": [ 310 { 311 "claim": "This is the first comprehensive survey connecting benchmarks and solutions in LLM-empowered software engineering.", 312 "evidence": "Table I compares against 5 prior surveys, showing none cover both Benchmarks and Solutions columns with full SE scope. The paper reviews 150+ papers across both dimensions.", 313 "supported": "moderate" 314 }, 315 { 316 "claim": "The field has evolved from prompt-based methods to sophisticated agent-based systems incorporating planning, reasoning, memory, and tool augmentation.", 317 "evidence": "Sections IV-A through IV-C document this progression with specific papers at each stage, from instructional prompting through fine-tuning to multi-agent architectures.", 318 "supported": "strong" 319 }, 320 { 321 "claim": "Existing surveys lack coverage of benchmarks, making it difficult to evaluate and compare different approaches systematically.", 322 "evidence": "Table I shows that 4 of 5 compared surveys have '×' in the Benchmarks column. Only Zhang et al. (2023) covers benchmarks but is limited to program repair.", 323 "supported": "strong" 324 }, 325 { 326 "claim": "A majority of algorithmically successful solutions on benchmarks like SWE-Bench would fail in production due to security flaws, performance regressions, or coding standard violations.", 327 "evidence": "Stated in Section VII-B/VIII-B as a finding from 'recent analyses' but no specific citation or data is provided for this claim.", 328 "supported": "weak" 329 } 330 ], 331 "red_flags": [ 332 { 333 "flag": "No quality assessment of sources", 334 "detail": "The survey reviews 150+ papers but does not assess their methodological quality. Papers with rigorous evaluation and papers with weak evidence are treated equally, potentially laundering weak results." 335 }, 336 { 337 "flag": "No structured review protocol", 338 "detail": "No PRISMA flow, no registered protocol, no reproducible search queries. The paper selection process is described only in general terms, making the review difficult to reproduce or verify for completeness." 339 }, 340 { 341 "flag": "Duplicated sections", 342 "detail": "Sections VII and VIII are both titled 'Challenges and Future Directions' with substantially overlapping content (scalability, evaluation, deployment themes appear twice with slight rewording), suggesting incomplete editing." 343 }, 344 { 345 "flag": "Overclaimed scope", 346 "detail": "The title claims 'comprehensive' coverage of 'Software Engineering' but the taxonomy focuses primarily on code generation, translation, and repair. Other major SE activities (requirements, design, deployment, maintenance at scale) receive cursory treatment in the Applications section." 347 }, 348 { 349 "flag": "Unsupported production-readiness claims", 350 "detail": "Section VII-B/VIII-B claims 'a majority of algorithmically successful solutions on benchmarks like SWE-Bench would fail in production' without citing a specific source or data for this assertion." 351 } 352 ], 353 "cited_papers": [ 354 { 355 "title": "SWE-bench: Can language models resolve real-world github issues?", 356 "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. R. Narasimhan"], 357 "year": 2024, 358 "relevance": "Foundational benchmark for evaluating LLM agents on real-world GitHub issue resolution." 359 }, 360 { 361 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 362 "authors": ["J. Yang", "C. E. Jimenez", "A. Wettig", "K. Lieret", "S. Yao", "K. Narasimhan", "O. Press"], 363 "year": 2024, 364 "relevance": "Introduced the Agent-Computer Interface concept for software engineering agents." 365 }, 366 { 367 "title": "AutoCodeRover: Autonomous program improvement", 368 "authors": ["Y. Zhang", "H. Ruan", "Z. Fan", "A. Roychoudhury"], 369 "year": 2024, 370 "relevance": "Autonomous agent for program repair using iterative code search and patch generation." 371 }, 372 { 373 "title": "Evaluating large language models trained on code", 374 "authors": ["M. Chen"], 375 "year": 2021, 376 "arxiv_id": "2107.03374", 377 "relevance": "Introduced HumanEval benchmark, foundational for code generation evaluation." 378 }, 379 { 380 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 381 "authors": ["T. Y. Zhuo"], 382 "year": 2025, 383 "relevance": "Benchmark evaluating LLMs on complex code generation with diverse function calls." 384 }, 385 { 386 "title": "Demystifying llm-based software engineering agents", 387 "authors": ["C. S. Xia", "Y. Deng", "S. Dunn", "L. Zhang"], 388 "year": 2025, 389 "relevance": "Agentless approach to software engineering showing simple plans can be sufficient." 390 }, 391 { 392 "title": "MAGIS: LLM-based multi-agent framework for github issue resolution", 393 "authors": ["W. Tao", "Y. Zhou", "Y. Wang", "W. Zhang", "H. Zhang", "Y. Cheng"], 394 "year": 2024, 395 "relevance": "Multi-agent framework with Manager, Developer, QA Engineer roles for issue resolution." 396 }, 397 { 398 "title": "Training software engineering agents and verifiers with SWE-gym", 399 "authors": ["J. Pan", "X. Wang", "G. Neubig"], 400 "year": 2025, 401 "relevance": "Environment for training and evaluating tool-using SE agents via SFT." 402 }, 403 { 404 "title": "SWE-smith: Scaling data for software engineering agents", 405 "authors": ["J. Yang", "K. Lieret", "C. E. Jimenez"], 406 "year": 2025, 407 "relevance": "Extends SWE-bench for training SE agents at scale." 408 }, 409 { 410 "title": "Kimi K2: Open agentic intelligence", 411 "authors": ["K. Team"], 412 "year": 2025, 413 "arxiv_id": "2507.20534", 414 "relevance": "Open model optimized for agentic tool use in code generation and debugging." 415 }, 416 { 417 "title": "PatchAgent: A practical program repair agent mimicking human expertise", 418 "authors": ["Z. Yu", "Z. Guo", "Y. Wu"], 419 "year": 2025, 420 "relevance": "Program repair agent with integrated security and functional verification." 421 }, 422 { 423 "title": "MASAI: Modular architecture for software-engineering AI agents", 424 "authors": ["N. Wadhwa", "A. Sonwane", "D. Arora"], 425 "year": 2024, 426 "relevance": "Modular agent architecture decomposing issue resolution into specialized sub-agents." 427 } 428 ] 429 }