scan-v5.json (20225B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "Large Language Model-Based Agents for Software Engineering: A Survey", 6 "authors": [ 7 "Junwei Liu", 8 "Kaixin Wang", 9 "Yixuan Chen", 10 "Xin Peng", 11 "Zhenpeng Chen", 12 "Lingming Zhang", 13 "Yiling Lou" 14 ], 15 "year": 2024, 16 "venue": "arXiv", 17 "arxiv_id": "2409.02977", 18 "doi": null 19 }, 20 "checklist": { 21 "claims_and_evidence": { 22 "abstract_claims_supported": { 23 "applies": true, 24 "answer": true, 25 "justification": "All abstract claims (comprehensive survey, 124 papers, two-perspective categorization, open challenges discussion) are substantiated by the paper's content with documented methodology and dedicated sections.", 26 "source": "haiku" 27 }, 28 "causal_claims_justified": { 29 "applies": true, 30 "answer": true, 31 "justification": "Comparative claims about agents vs. standalone LLMs are supported by citing specific empirical results from reviewed papers (e.g., pass@1 results from [325],[94] for code generation), which is appropriate attribution for a survey.", 32 "source": "haiku" 33 }, 34 "generalization_bounded": { 35 "applies": true, 36 "answer": false, 37 "justification": "Section 7.1 makes broad claims that 'LLM-based agents offer stronger performance and wider applicability in real-world software engineering' without acknowledging publication bias in the reviewed corpus or that the sample is non-random.", 38 "source": "haiku" 39 }, 40 "alternative_explanations_discussed": { 41 "applies": true, 42 "answer": false, 43 "justification": "Section 7.2 only discusses threats to coverage (missed papers) but not threats to interpretation — publication bias, cherry-picked benchmarks, or confounds in why agents appear to outperform standalone LLMs are not considered.", 44 "source": "haiku" 45 }, 46 "proxy_outcome_distinction": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper explicitly notes metric limitations: 'software accuracy is merely 
assessed based on manually scored executability or code similarity, which may not fully capture the quality of the generated software.'", 50 "source": "haiku" 51 } 52 }, 53 "limitations_and_scope": { 54 "limitations_section_present": { 55 "applies": true, 56 "answer": true, 57 "justification": "Section 7.2 'Threats to Validity' is a dedicated multi-paragraph section with five distinct named threats, well beyond a single sentence in a conclusion.", 58 "source": "haiku" 59 }, 60 "threats_to_validity_specific": { 61 "applies": true, 62 "answer": true, 63 "justification": "Threats are specific: e.g., 'Multi-Agent Strategy in Requirements Engineering Lacks Sufficient Validation' with exact counts (3 of 4 papers use multi-agent, 2 of those are unreviewed preprints); 'Iterative Coverage Improvements in Unit Testing' supported by 1 of 3 published works.", 64 "source": "haiku" 65 }, 66 "scope_boundaries_stated": { 67 "applies": true, 68 "answer": true, 69 "justification": "Section 3.1 explicitly defines scope (LLM-based agents with iterative environment interaction, SE lifecycle tasks), explicitly excludes standalone LLM workflows, and states 'conducting extra experimental analysis is beyond the scope of this survey.'", 70 "source": "haiku" 71 } 72 }, 73 "conflicts_of_interest": { 74 "funding_disclosed": { 75 "applies": true, 76 "answer": false, 77 "justification": "No funding disclosure statement appears anywhere in the paper. 
The acknowledgements section only thanks authors who provided feedback on the draft.", 78 "source": "haiku" 79 }, 80 "affiliations_disclosed": { 81 "applies": true, 82 "answer": true, 83 "justification": "Author affiliations are clearly disclosed: Fudan University (China), Nanyang Technological University (Singapore), and University of Illinois Urbana-Champaign (USA).", 84 "source": "haiku" 85 }, 86 "funder_independent_of_outcome": { 87 "applies": false, 88 "answer": false, 89 "justification": "No funding is disclosed, making this criterion not applicable.", 90 "source": "haiku" 91 }, 92 "financial_interests_declared": { 93 "applies": true, 94 "answer": false, 95 "justification": "There is no competing interests or financial interests declaration anywhere in the paper.", 96 "source": "haiku" 97 } 98 }, 99 "scope_and_framing": { 100 "key_terms_defined": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 3.1 provides explicit operational definitions for both 'SE tasks' (along the software lifecycle) and 'LLM-based agents' (must iteratively perceive feedback from and act upon a dynamic environment), explicitly distinguishing agents from standalone LLMs.", 104 "source": "haiku" 105 }, 106 "intended_contribution_clear": { 107 "applies": true, 108 "answer": true, 109 "justification": "Three explicit contributions are listed: (1) comprehensive survey of 124 papers on LLM agents for SE, (2) analysis from SE and agent perspectives, (3) discussion of research opportunities and future directions.", 110 "source": "haiku" 111 }, 112 "engagement_with_prior_work": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 2.4 explicitly compares to five related surveys ([2],[3],[24],[34],[40]), specifying how this work differs: wider SE task coverage (end-to-end tasks), agent architecture taxonomy (memory/planning/action), and empirical survey vs. 
vision paper.", 116 "source": "haiku" 117 } 118 } 119 }, 120 "type_checklist": { 121 "survey": { 122 "search_and_selection": { 123 "search_strategy_reproducible": { 124 "applies": true, 125 "answer": true, 126 "justification": "DBLP was searched on July 1, 2024 with the complete keyword set provided verbatim, yielding 10,362 hits. Table 2 shows per-keyword hit counts. The iterative keyword refinement procedure is documented in detail.", 127 "source": "haiku" 128 }, 129 "inclusion_exclusion_explicit": { 130 "applies": true, 131 "answer": true, 132 "justification": "Table 1 provides 2 explicit inclusion criteria and 7 explicit exclusion criteria covering agent type, evaluation presence, LLM integration degree, paper length, grey literature exclusion, and deduplication.", 133 "source": "haiku" 134 }, 135 "prisma_or_structured_protocol": { 136 "applies": true, 137 "answer": false, 138 "justification": "Authors explicitly state 'we position this paper as a comprehensive survey rather than a systematic literature review' and no PRISMA or equivalent structured protocol is followed or mentioned.", 139 "source": "haiku" 140 }, 141 "search_terms_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "The complete final keyword query is provided verbatim in the text: ('agent' OR 'llm' OR 'language model') AND ('api' OR 'bug' OR 'code' OR ... 'vulnerab'), with 19 OR-combined SE terms.", 145 "source": "haiku" 146 }, 147 "databases_listed": { 148 "applies": true, 149 "answer": true, 150 "justification": "DBLP is explicitly named as the primary database with justification (covers 7M+ publications, 6,500+ conferences, 1,850+ journals, includes arXiv). Google Scholar is used for forward snowballing.", 151 "source": "haiku" 152 }, 153 "screening_process_documented": { 154 "applies": true, 155 "answer": true, 156 "justification": "Table 2 shows counts at each stage: 10,362 hits → 67 after manual inspection → 108 after snowballing → 124 after author feedback. 
Three screening stages are described with two independent reviewers and a third arbiter for disagreements.", 157 "source": "haiku" 158 }, 159 "review_scope_justified": { 160 "applies": true, 161 "answer": true, 162 "justification": "DBLP choice is justified by citing prior SE surveys showing other databases are typically DBLP subsets. The temporal scope (up to September 2024) is justified by the rapidly evolving field requiring timely coverage.", 163 "source": "haiku" 164 } 165 }, 166 "synthesis_quality": { 167 "conflicting_findings_acknowledged": { 168 "applies": true, 169 "answer": true, 170 "justification": "Conflicting findings are acknowledged in multiple places: simpler Agentless outperforms complex agents on SWE-bench Lite; pure autonomous localization (SWE-agent) performs worst despite full autonomy; multi-planner approaches don't always surpass single-planner (Flows experiment).", 171 "source": "haiku" 172 }, 173 "quality_assessment_of_sources": { 174 "applies": true, 175 "answer": false, 176 "justification": "No systematic quality rubric or risk-of-bias assessment is applied to the 124 reviewed papers. The only quality dimension noted is peer-review status (published vs. preprint), and this is flagged only for specific strategies in Section 7.2 rather than applied systematically.", 177 "source": "haiku" 178 }, 179 "publication_bias_discussed": { 180 "applies": true, 181 "answer": false, 182 "justification": "Publication bias is not discussed. 
The paper notes 75% peer-reviewed sources but never acknowledges that published papers skew toward positive results — a significant omission given the conclusion that agents broadly outperform standalone LLMs.", 183 "source": "haiku" 184 }, 185 "quantitative_synthesis_present": { 186 "applies": true, 187 "answer": false, 188 "justification": "Performance numbers from individual papers are tabulated (e.g., Figure 17 for SWE-bench Lite resolve rates) but no formal quantitative synthesis, meta-analysis, or effect size aggregation is performed across studies.", 189 "source": "haiku" 190 }, 191 "recommendations_supported_by_evidence": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 6 recommendations are grounded in specific evidence: the call for better benchmarks cites documented SWE-bench issues [260],[269]; the priority for efficiency metrics is backed by finding only 46.7% of surveyed papers quantify computational costs.", 195 "source": "haiku" 196 } 197 } 198 } 199 }, 200 "claims": [ 201 { 202 "claim": "59.7% of existing LLM-based agents for SE are multi-agent systems", 203 "evidence": "Internal count of the 124 collected papers", 204 "supported": "strong" 205 }, 206 { 207 "claim": "LLM-based agents consistently achieve higher pass@1 than foundation LLMs across code generation benchmarks of various complexity", 208 "evidence": "Cited from papers [325] and [94] reporting results on HumanEval, LiveCodeBench, and CodeAgentBench", 209 "supported": "moderate" 210 }, 211 { 212 "claim": "Simpler traditional fault localization approaches (Agentless) surpass many complex agentic approaches on SWE-bench Lite", 213 "evidence": "Figure 17 comparing resolve rates from original papers; explicitly discussed in Section 4.8.8", 214 "supported": "strong" 215 }, 216 { 217 "claim": "Only 46.7% of surveyed papers explicitly quantify efficiency through time, token consumption, monetary cost, or feedback loop metrics", 218 "evidence": "Internal analysis of 
reporting practices across the 124 reviewed papers", 219 "supported": "strong" 220 }, 221 { 222 "claim": "Agents employing dynamic patch verification and patch ranking generally achieve higher resolve rates on SWE-bench Lite", 223 "evidence": "Observation from Figure 17: all top-5 agents (Agentless, MASAI, SpecRover, DEIBase, CodeR) use dynamic checking or ranking", 224 "supported": "moderate" 225 }, 226 { 227 "claim": "Multi-agent collaboration is the predominant strategy in requirements engineering but lacks peer-reviewed validation", 228 "evidence": "3 of 4 RE papers use multi-agent architectures, but 2 of those 3 are unreviewed preprints", 229 "supported": "strong" 230 } 231 ], 232 "methodology_tags": [ 233 "qualitative" 234 ], 235 "key_findings": "This survey of 124 papers maps LLM-based agents for software engineering across both SE task types (requirements, code generation, static checking, testing, debugging, IT ops, end-to-end development and maintenance) and agent components (planning, memory, perception, action, multi-agent collaboration). Key structural finding: 59.7% of agents are multi-agent systems, with vertical collaboration architecture most common. A notable empirical counterpoint emerges: simpler, less-agentic approaches like Agentless outperform complex autonomous agents on SWE-bench Lite, suggesting agent complexity does not guarantee performance gains. The survey identifies critical methodological gaps in the field: lack of fine-grained metrics beyond final success rates, limited benchmarks that reflect real-world SE complexity, and insufficient exploration of human-agent collaboration beyond predefined interaction points.", 236 "red_flags": [ 237 { 238 "flag": "Publication bias not discussed", 239 "detail": "The survey concludes agents broadly outperform standalone LLMs (Section 7.1) but never acknowledges that the reviewed literature is a non-random sample skewed toward positive results by publication bias." 
240 }, 241 { 242 "flag": "No quality assessment of sources", 243 "detail": "All 124 papers are treated as equally credible regardless of evaluation rigor; no methodological quality rubric or risk-of-bias tool is applied to the reviewed papers." 244 }, 245 { 246 "flag": "Broad superiority claims from biased sample", 247 "detail": "Section 7.1 makes sweeping claims ('LLM-based agents offer stronger performance and wider applicability in real-world software engineering') that overstate what a survey of published papers can support." 248 }, 249 { 250 "flag": "25% of sources are unreviewed preprints", 251 "detail": "Approximately 31 arXiv preprints without peer review are included; several support key findings (e.g., in requirements engineering and bug detection sections) and may not be replicable." 252 }, 253 { 254 "flag": "No funding disclosure", 255 "detail": "No funding sources are disclosed despite the paper representing a large multi-institution collaborative effort." 256 } 257 ], 258 "cited_papers": [ 259 { 260 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 261 "relevance": "Primary benchmark for end-to-end software maintenance; central to the survey's performance comparison of agents" 262 }, 263 { 264 "title": "ChatDev: Communicative Agents for Software Development", 265 "relevance": "Major exemplar of multi-agent end-to-end software development using waterfall model with role specialization" 266 }, 267 { 268 "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework", 269 "relevance": "Demonstrates structured communication via documents and shared message pool in multi-agent SE systems" 270 }, 271 { 272 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 273 "relevance": "Key paper in end-to-end maintenance; notable finding that full autonomy leads to worst performance among compared agents" 274 }, 275 { 276 "title": "Demystifying LLM-based Software Engineering Agents 
(Agentless)", 277 "relevance": "Demonstrates that non-agentic simpler approaches can outperform complex agent systems on SWE-bench Lite" 278 }, 279 { 280 "title": "The Rise and Potential of Large Language Model Based Agents: A Survey", 281 "relevance": "Foundational general-purpose agent survey providing the four-component framework (planning, memory, perception, action) used throughout this paper" 282 }, 283 { 284 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 285 "relevance": "Foundational paper on iterative self-refinement and memory mechanisms in agents" 286 }, 287 { 288 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations", 289 "relevance": "Widely-used multi-agent framework enabling diverse SE agent architectures reviewed in this survey" 290 } 291 ], 292 "engagement_factors": { 293 "practical_relevance": { 294 "score": 3, 295 "justification": "Comprehensive taxonomy of agent tools across all major SE tasks with concrete performance comparisons is directly actionable for SE practitioners and researchers." 296 }, 297 "surprise_contrarian": { 298 "score": 2, 299 "justification": "The finding that simple non-agentic approaches (Agentless) outperform complex autonomous agents challenges the prevailing assumption that more sophisticated agents are better." 300 }, 301 "fear_safety": { 302 "score": 1, 303 "justification": "Security-related agents (PentestGPT, vulnerability detection) are covered but the survey does not foreground safety risks from autonomous agents." 304 }, 305 "drama_conflict": { 306 "score": 1, 307 "justification": "Mild debate about complex vs. simple agents; the simplicity-beats-complexity observation creates some implicit controversy." 308 }, 309 "demo_ability": { 310 "score": 2, 311 "justification": "GitHub repository at https://github.com/FudanSELab/Agent4SE-Paper-List is provided; many of the 124 reviewed tools have public implementations." 
312 }, 313 "brand_recognition": { 314 "score": 1, 315 "justification": "Fudan University and UIUC are respected but not the top-tier industry labs (Google/Meta/OpenAI) that drive viral attention for AI papers." 316 } 317 }, 318 "hn_data": { 319 "threads": [ 320 { 321 "hn_id": "40295298", 322 "title": "Agent Hospital that simulates the entire process of treating illness", 323 "points": 5, 324 "comments": 0, 325 "url": "https://news.ycombinator.com/item?id=40295298", 326 "created_at": "2024-05-08T07:31:20Z" 327 }, 328 { 329 "hn_id": "39870179", 330 "title": "SportsNGEN: Sustained Generation of Multi-Player Sports Gameplay", 331 "points": 3, 332 "comments": 0, 333 "url": "https://news.ycombinator.com/item?id=39870179", 334 "created_at": "2024-03-29T23:27:12Z" 335 }, 336 { 337 "hn_id": "39680785", 338 "title": "Abstracting Denotational Interpreters", 339 "points": 2, 340 "comments": 1, 341 "url": "https://news.ycombinator.com/item?id=39680785", 342 "created_at": "2024-03-12T15:36:26Z" 343 }, 344 { 345 "hn_id": "40556931", 346 "title": "Agent Hospital: A Simulacrum of Hospital with Evolvable Medical Agents", 347 "points": 2, 348 "comments": 0, 349 "url": "https://news.ycombinator.com/item?id=40556931", 350 "created_at": "2024-06-02T20:17:22Z" 351 }, 352 { 353 "hn_id": "39680189", 354 "title": "VideoMamba: State Space Model for Efficient Video Understanding", 355 "points": 2, 356 "comments": 0, 357 "url": "https://news.ycombinator.com/item?id=39680189", 358 "created_at": "2024-03-12T14:48:56Z" 359 }, 360 { 361 "hn_id": "28684178", 362 "title": "Inconsistency in Conference Peer Review: Revisiting the 2014 NeurIPS Experiment", 363 "points": 2, 364 "comments": 0, 365 "url": "https://news.ycombinator.com/item?id=28684178", 366 "created_at": "2021-09-28T15:47:40Z" 367 }, 368 { 369 "hn_id": "28615448", 370 "title": "Inconsistency in Conference Peer Review: Revisiting the 2014 NeurIPS Experiment", 371 "points": 2, 372 "comments": 0, 373 "url": 
"https://news.ycombinator.com/item?id=28615448", 374 "created_at": "2021-09-22T12:15:06Z" 375 }, 376 { 377 "hn_id": "38924816", 378 "title": "Complex systems approach to natural language", 379 "points": 1, 380 "comments": 0, 381 "url": "https://news.ycombinator.com/item?id=38924816", 382 "created_at": "2024-01-09T11:19:20Z" 383 } 384 ], 385 "top_points": 5, 386 "total_points": 19, 387 "total_comments": 1 388 } 389 }