scan.json (19067B)
1 { 2 "paper": { 3 "title": "A Survey on Code Generation with LLM-based Agents", 4 "authors": ["Yihong Dong", "Xue Jiang", "Jiaru Qian", "Tian Wang", "Kechi Zhang", "Zhi Jin", "Ge Li"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2508.00083" 8 }, 9 "scan_version": 2, 10 "active_modules": ["survey_methodology"], 11 "methodology_tags": ["meta-analysis"], 12 "key_findings": "This survey systematically reviews 100 core papers on LLM-based code generation agents, categorizing techniques into single-agent (planning, tool integration, reflection) and multi-agent systems (pipeline, hierarchical, self-negotiation, self-evolving workflows). It traces the evolution from passive code completion to autonomous development agents spanning the full SDLC, and catalogs deployed tools (Copilot, Cursor, Claude Code, Devin). The paper identifies five challenge dimensions: core capability limitations, robustness/updatability, tool integration/deployment, trustworthiness/security, and evaluation completeness.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper provides a GitHub link: https://github.com/JiaruQian/awesome-llm-based-agent4code for an 'awesome' list of related literature." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No structured dataset of the 447 candidate or 100 final papers is released. The GitHub repo appears to be a curated reading list, not the analysis data." 24 }, 25 "environment_specified": { 26 "applies": false, 27 "answer": false, 28 "justification": "This is a survey paper with no computational experiments requiring an environment." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No instructions are provided for reproducing the literature search, screening, or analysis pipeline." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": false, 39 "answer": false, 40 "justification": "Survey paper with no experiments producing statistical results." 41 }, 42 "significance_tests": { 43 "applies": false, 44 "answer": false, 45 "justification": "No statistical comparisons are performed." 46 }, 47 "effect_sizes_reported": { 48 "applies": false, 49 "answer": false, 50 "justification": "No experiments are run; the paper summarizes others' results narratively." 51 }, 52 "sample_size_justified": { 53 "applies": false, 54 "answer": false, 55 "justification": "No experiments requiring sample size justification." 56 }, 57 "variance_reported": { 58 "applies": false, 59 "answer": false, 60 "justification": "No experiments producing variance measures." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": false, 67 "justification": "The survey does not compare itself against prior surveys in a structured way. It mentions four prior surveys [37-40] but only describes them briefly without a systematic comparison of coverage or methodology." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The prior surveys mentioned [37-40] are from 2024-2025 and are contemporary." 73 }, 74 "ablation_study": { 75 "applies": false, 76 "answer": false, 77 "justification": "Survey paper with no system components to ablate." 78 }, 79 "multiple_metrics": { 80 "applies": false, 81 "answer": false, 82 "justification": "No evaluation metrics are applied to the survey's own analysis." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "Human evaluation is not relevant to a literature survey's claims." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "No test set is relevant to this survey." 
93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "The paper provides breakdowns by year (Figure 1a), by venue (Figure 1b), by technique category (Sections 4.1-4.2), and by application area (Section 5)." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 8 discusses challenges and limitations of code generation agents, and Section 7 notes Devin's 'low success rates, frequent loops, and difficult-to-resolve hallucination issues'." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper discusses negative aspects: Devin's poor practical reliability, error cascading in multi-agent systems, hallucination problems, and security risks (Sections 7-8)." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims the paper presents a 'systematic survey' with coverage of techniques, applications, benchmarks, and tools. The body delivers on all of these with structured sections." 115 }, 116 "causal_claims_justified": { 117 "applies": false, 118 "answer": false, 119 "justification": "The paper makes no causal claims; it describes and categorizes existing work." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title and scope claim comprehensive coverage of 'code generation with LLM-based agents' but the literature search is limited to CCF-recommended venues plus arXiv, from 2022 to June 2025. Non-CCF venues and non-English/non-Chinese papers are excluded without acknowledgment of this boundary." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": false, 128 "answer": false, 129 "justification": "As a pure survey/taxonomy paper with no empirical results, alternative explanations are not applicable."
130 }, 131 "proxy_outcome_distinction": { 132 "applies": false, 133 "answer": false, 134 "justification": "Theoretical/survey paper with no measurements of its own." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "No models are used in this survey paper." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "No prompting is used." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "No experiments are run." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used in the survey itself." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 2.1 describes the literature collection pipeline: databases searched (ACM, IEEE, SpringerLink, Google Scholar, DBLP, CNKI), search keywords, retrieval fields, time span (2022-June 2025), five screening criteria, and final count (447 candidates → 100 core papers). The filtering criteria are stated at each stage." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations section for the survey methodology itself. Section 8 discusses challenges of the surveyed technology, not limitations of the survey." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity of the survey itself are discussed. The paper does not address potential biases in its own literature selection or analysis." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what it does NOT cover. 
The venue restriction to CCF-recommended venues, the exclusion of non-English/non-Chinese literature, and the 2022 to June 2025 time boundary are mentioned in the method but not framed as scope limitations." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "The list of 447 candidate papers and the screening decisions are not made available. The 100 core papers appear only as reference-list entries, without any screening metadata." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 2.1 describes databases, keywords, time span, and screening criteria in detail." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants; data source is academic literature databases." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "While the screening criteria are listed, the paper does not report how many papers were removed at each screening step (only '447 → 100'). The intermediate counts per criterion are not provided." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding information is disclosed anywhere in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Authors are identified as being 'mainly with the School of Computer Science, Peking University, Beijing, China.'" 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present in the paper."
223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "Survey paper that does not evaluate any pre-trained model on a benchmark." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Survey paper with no model evaluation." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Survey paper with no model evaluation." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "Survey paper with no computational method of its own." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "Survey paper with no computational experiments." 
289 } 290 }, 291 "survey_methodology": { 292 "prisma_or_structured_protocol": { 293 "applies": true, 294 "answer": false, 295 "justification": "The paper describes a search strategy with databases and keywords (Section 2.1) but does not follow PRISMA or any named structured review protocol. No PRISMA flow diagram is provided. No protocol registration is mentioned." 296 }, 297 "quality_assessment_of_sources": { 298 "applies": true, 299 "answer": false, 300 "justification": "The survey does not assess the methodological quality of its 100 included papers. All papers are treated equally regardless of rigor. Screening criteria focus on publication venue and relevance, not study quality." 301 }, 302 "publication_bias_discussed": { 303 "applies": true, 304 "answer": false, 305 "justification": "No discussion of publication bias. The survey does not consider whether its sources skew toward positive results or whether negative-result papers on code generation agents are underrepresented." 306 } 307 } 308 }, 309 "claims": [ 310 { 311 "claim": "Code generation agents are characterized by three core features: autonomy, expanded task scope, and enhancement of engineering practicality.", 312 "evidence": "This is a framing claim developed throughout Sections 1 and 3, supported by categorization of reviewed literature rather than empirical evidence.", 313 "supported": "moderate" 314 }, 315 { 316 "claim": "447 candidate papers were retrieved and screened to 100 core papers.", 317 "evidence": "Section 2.1 describes the search and screening methodology across six databases with five screening criteria.", 318 "supported": "moderate" 319 }, 320 { 321 "claim": "The number of papers in this field shows a year-over-year increasing trend since 2023.", 322 "evidence": "Figure 1(a) shows 14 papers in 2023, 46 in 2024, and 40 in Jan-Jun 2025, supporting a growth trend.", 323 "supported": "strong" 324 }, 325 { 326 "claim": "Devin has 'low success rates, frequent loops, and 
difficult-to-resolve hallucination issues'.", 327 "evidence": "Stated in Section 7 (Table 1) as a known limitation, but no citation or data is provided to support this specific characterization.", 328 "supported": "weak" 329 } 330 ], 331 "red_flags": [ 332 { 333 "flag": "No quality assessment of reviewed papers", 334 "detail": "The survey treats all 100 included papers equally without assessing their methodological quality. As a result, weak results are presented alongside strong ones without distinction." 335 }, 336 { 337 "flag": "Screening criteria exclude empirical surveys and reviews", 338 "detail": "Criterion (4) explicitly excludes 'empirical surveys and review literature', which could omit important meta-research relevant to the field's methodological health." 339 }, 340 { 341 "flag": "No PRISMA protocol", 342 "detail": "Despite claiming a 'systematic' survey, the paper does not follow PRISMA or any established systematic review protocol, and does not provide intermediate screening counts." 343 }, 344 { 345 "flag": "Venue bias toward CCF-recommended venues", 346 "detail": "Literature search is restricted to CCF-recommended venues plus arXiv, potentially missing important work published in non-CCF venues or industry reports." 347 }, 348 { 349 "flag": "Devin criticism without evidence", 350 "detail": "The paper characterizes Devin as having 'low success rates' and 'difficult-to-resolve hallucination issues' without citing any empirical evaluation or source for these claims." 351 } 352 ], 353 "cited_papers": [ 354 { 355 "title": "Self-collaboration code generation via ChatGPT", 356 "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"], 357 "year": 2024, 358 "relevance": "Foundational multi-agent code generation framework using role-based collaboration."
359 }, 360 { 361 "title": "Chatdev: Communicative agents for software development", 362 "authors": ["Chen Qian"], 363 "year": 2023, 364 "relevance": "Influential multi-agent system simulating software company roles for code generation." 365 }, 366 { 367 "title": "Metagpt: Meta programming for multi-agent collaborative framework", 368 "authors": ["Sirui Hong"], 369 "year": 2023, 370 "relevance": "Multi-agent framework simulating complete software company organization for development." 371 }, 372 { 373 "title": "Swe-agent: Agent-computer interfaces enable automated software engineering", 374 "authors": ["John Yang"], 375 "year": 2024, 376 "relevance": "Key agent framework for automated software engineering evaluated on SWE-bench." 377 }, 378 { 379 "title": "Swe-bench: Can language models resolve real-world github issues?", 380 "authors": ["Carlos E. Jimenez"], 381 "year": 2024, 382 "relevance": "Primary benchmark for evaluating code generation agents on real-world software tasks." 383 }, 384 { 385 "title": "Evaluating large language models trained on code", 386 "authors": ["Mark Chen"], 387 "year": 2021, 388 "relevance": "Introduced HumanEval benchmark and Codex, foundational for LLM code generation evaluation." 389 }, 390 { 391 "title": "Self-refine: Iterative refinement with self-feedback", 392 "authors": ["Aman Madaan"], 393 "year": 2023, 394 "relevance": "Key reflection/self-improvement technique used across code generation agents." 395 }, 396 { 397 "title": "AutoCodeRover: Autonomous program improvement", 398 "authors": ["Yuntong Zhang"], 399 "year": 2024, 400 "relevance": "Autonomous agent for program repair demonstrating agent-based debugging approaches." 401 }, 402 { 403 "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code", 404 "authors": ["Naman Jain"], 405 "year": 2025, 406 "relevance": "Contamination-aware benchmark addressing data leakage in code generation evaluation." 
407 }, 408 { 409 "title": "From LLMs to LLM-based agents for software engineering: A survey of current, challenges and future", 410 "authors": ["Haolin Jin"], 411 "year": 2024, 412 "relevance": "Prior survey on LLM-based agents for SE, useful for comparison of survey coverage." 413 }, 414 { 415 "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models", 416 "authors": ["Yihong Dong"], 417 "year": 2024, 418 "relevance": "Addresses data contamination in LLM evaluation, directly relevant to benchmark validity." 419 }, 420 { 421 "title": "Swe-search: Enhancing software agents with monte carlo tree search and iterative refinement", 422 "authors": ["Antonis Antoniades"], 423 "year": 2024, 424 "relevance": "Applies MCTS to software engineering agents, advancing search-based code generation." 425 } 426 ] 427 }