scan.json (26648B)
1 { 2 "paper": { 3 "title": "AgentSpawn: Adaptive Multi-Agent Collaboration Through Dynamic Spawning for Long-Horizon Code Generation", 4 "authors": [ 5 "Igor Costa" 6 ], 7 "year": 2026, 8 "venue": "arXiv", 9 "arxiv_id": "2602.07072" 10 }, 11 "checklist": { 12 "artifacts": { 13 "code_released": { 14 "applies": true, 15 "answer": false, 16 "justification": "No repository URL, GitHub link, or archive is provided. The conclusion explicitly says future work includes 'implementing the AgentSpawn prototype,' indicating no implementation exists yet." 17 }, 18 "data_released": { 19 "applies": true, 20 "answer": false, 21 "justification": "The paper references SWE-bench and Defects4J as benchmarks, but no dataset or data release is provided. The custom refactoring task set (100 tasks) is not released anywhere." 22 }, 23 "environment_specified": { 24 "applies": true, 25 "answer": false, 26 "justification": "No environment specification is provided. No requirements.txt, Dockerfile, or dependency list is mentioned. The paper proposes future implementation using Claude 3.5 Sonnet or GPT-4 but does not specify any environment." 27 }, 28 "reproduction_instructions": { 29 "applies": true, 30 "answer": false, 31 "justification": "No reproduction instructions are provided. The conclusion explicitly lists 'empirical validation on SWE-bench and Defects4J with proper baseline configurations' as future work, confirming the experiments were not actually run." 32 } 33 }, 34 "statistical_methodology": { 35 "confidence_intervals_or_error_bars": { 36 "applies": true, 37 "answer": false, 38 "justification": "No confidence intervals or error bars are reported for any result. All tables show single point estimates (e.g., '+97%', '67%') with no uncertainty quantification." 39 }, 40 "significance_tests": { 41 "applies": true, 42 "answer": false, 43 "justification": "No statistical significance tests are reported anywhere in the paper. 
Comparative claims such as 'AgentSpawn achieves +97% over GPT-4 Single' are made with no p-values, t-tests, or other tests." 44 }, 45 "effect_sizes_reported": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper reports percentage improvements over baselines but these are presented without uncertainty or baseline absolute values for context (Table 2 uses 'baseline' without stating the actual rate), making effect sizes uninterpretable. More critically, the results are not from real experiments." 49 }, 50 "sample_size_justified": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper claims 300 SWE-bench tasks, 200 Defects4J tasks, and 100 custom tasks, but no justification is provided for these sizes. More critically, the conclusion confirms these experiments were never run." 54 }, 55 "variance_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "No variance, standard deviation, or spread measures are reported for any result. All numbers are single-point estimates with no indication of run-to-run variability." 59 } 60 }, 61 "evaluation_design": { 62 "baselines_included": { 63 "applies": true, 64 "answer": false, 65 "justification": "While Table 2 lists baselines (GPT-4 Single, AutoGen, CrewAI, AFLOW), the conclusion admits that 'empirical validation on SWE-bench and Defects4J with proper baseline configurations' is future work — meaning the baseline comparisons were never actually conducted." 66 }, 67 "baselines_contemporary": { 68 "applies": true, 69 "answer": false, 70 "justification": "The baselines listed (AutoGen, CrewAI, AFLOW) are contemporary, but since the experiments were never conducted (per the conclusion), this is moot. The numbers in Table 2 are not from real experiments." 
71 }, 72 "ablation_study": { 73 "applies": true, 74 "answer": false, 75 "justification": "Table 3 presents an ablation study but the conclusion confirms these are not real experimental results — 'empirical validation' is listed as future work. The ablation numbers are fabricated." 76 }, 77 "multiple_metrics": { 78 "applies": true, 79 "answer": false, 80 "justification": "Section 4.1 proposes five metrics (task completion rate, memory overhead, spawn count, coherence violations, cost per success) but no experiments were conducted. Proposing metrics in a design is not the same as using them in an evaluation. No metrics were actually measured." 81 }, 82 "human_evaluation": { 83 "applies": false, 84 "answer": false, 85 "justification": "The paper evaluates code generation quality using automated metrics (task completion, test pass rates). Human evaluation is not applicable and not claimed." 86 }, 87 "held_out_test_set": { 88 "applies": true, 89 "answer": false, 90 "justification": "No train/test split is described. The paper uses existing benchmarks (SWE-bench, Defects4J) as test sets, but since no experiments were run, this is irrelevant. No data splitting methodology is specified." 91 }, 92 "per_category_breakdown": { 93 "applies": true, 94 "answer": false, 95 "justification": "Tables 4 and 5 present breakdowns by task difficulty (Easy/Medium/Hard) and task type (Simple refactor/Multi-file fix/Complex feature). However, since the conclusion confirms no experiments were run, these numbers are fabricated. Presenting invented data in categorized tables does not constitute providing per-category breakdowns of actual results." 96 }, 97 "failure_cases_discussed": { 98 "applies": true, 99 "answer": false, 100 "justification": "No actual failure cases are shown or analyzed. Section 5.4 mentions limitations but does not present any empirical failure analysis or examples of when AgentSpawn fails." 
101 }, 102 "negative_results_reported": { 103 "applies": true, 104 "answer": false, 105 "justification": "The paper only presents positive results (AgentSpawn outperforms all baselines on all tasks). No negative results are reported, and since no experiments were conducted, this is expected." 106 } 107 }, 108 "claims_and_evidence": { 109 "abstract_claims_supported": { 110 "applies": true, 111 "answer": false, 112 "justification": "The abstract claims '34% higher completion rates than static baselines on benchmarks like SWE-bench' and '42% memory overhead reduction.' The conclusion contradicts this by listing empirical validation as future work, meaning the abstract's empirical claims are not supported by actual experiments." 113 }, 114 "causal_claims_justified": { 115 "applies": true, 116 "answer": false, 117 "justification": "The paper makes strong causal claims (e.g., 'memory slicing reduces context overflow failures by 42%', 'adaptive spawning enables specialized expertise application') with no experimental basis. Ablation in Table 3 purports to show component contributions causally, but no experiments were conducted." 118 }, 119 "generalization_bounded": { 120 "applies": true, 121 "answer": false, 122 "justification": "The paper claims results on 'benchmarks like SWE-bench' in the abstract without clarifying that no actual evaluation was performed. Section 5.6 also generalizes to 'document authoring, data analysis, system administration' without any evidence." 123 }, 124 "alternative_explanations_discussed": { 125 "applies": true, 126 "answer": false, 127 "justification": "No alternative explanations for the results are discussed. Given that results are fabricated, there is nothing to explain. Section 5.4 mentions hyperparameter sensitivity and domain generalization as limitations but not as alternative explanations for observed results." 
128 } 129 }, 130 "setup_transparency": { 131 "model_versions_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "The conclusion mentions 'implementing the AgentSpawn prototype using Claude 3.5 Sonnet or GPT-4' as future work, confirming no specific model was used. No model versions are specified for the reported experiments." 135 }, 136 "prompts_provided": { 137 "applies": true, 138 "answer": false, 139 "justification": "No actual prompts are provided. Skills are described abstractly as 'parameterized prompts s = (p, θ_s)' with example templates, but no actual prompt text used in experiments is given — and no experiments were conducted." 140 }, 141 "hyperparameters_reported": { 142 "applies": true, 143 "answer": true, 144 "justification": "Table 1 provides hyperparameter specifications (δ=0.7, θ=0.5, weights w_1–w_5, max spawn depth=3, concurrent spawn limit=4, timeout=600s, relevance weights α=0.3, β=0.3, γ=0.2, δ=0.2). These are proposed rather than empirically tuned, but they are stated." 145 }, 146 "scaffolding_described": { 147 "applies": true, 148 "answer": true, 149 "justification": "The paper extensively describes the agentic scaffolding: five components (Memory Manager, Skill Library, Spawn Controller, Resume Coordinator, Coherence Manager), three algorithms (memory slicing, adaptive spawning, coherence protocol), and data structures (SpawnPackage, ResumePackage) in Appendix A." 150 }, 151 "data_preprocessing_documented": { 152 "applies": true, 153 "answer": false, 154 "justification": "No data preprocessing steps are described. The paper mentions using SWE-bench (300 tasks) and Defects4J (200 tasks) and a custom set (100 tasks) but provides no description of how tasks were selected, filtered, or preprocessed." 
155 } 156 }, 157 "limitations_and_scope": { 158 "limitations_section_present": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 5.4 'Limitations' discusses four limitations: hyperparameter sensitivity, semantic merge complexity, domain generalization, and spawn depth scaling. This is a substantive (not just a single sentence) dedicated section." 162 }, 163 "threats_to_validity_specific": { 164 "applies": true, 165 "answer": false, 166 "justification": "Section 5.4 mentions limitations but does not address threats to validity of the experimental results. Crucially, it does not acknowledge that the results are not from real experiments — the most significant validity threat in the paper." 167 }, 168 "scope_boundaries_stated": { 169 "applies": true, 170 "answer": false, 171 "justification": "The paper claims results in the abstract and body without clarifying that no empirical validation has been performed. The conclusion's future work list reveals the scope, but the paper presents its (non-existent) results as if they are real without clearly bounding what was and was not demonstrated." 172 } 173 }, 174 "data_integrity": { 175 "raw_data_available": { 176 "applies": true, 177 "answer": false, 178 "justification": "No raw data is available. The experimental results presented in Tables 2-6 are not from actual experiments — the conclusion explicitly lists empirical validation as future work, meaning there is no underlying data to verify." 179 }, 180 "data_collection_described": { 181 "applies": true, 182 "answer": false, 183 "justification": "No data collection procedure is described because no data was collected. The paper refers to existing benchmarks (SWE-bench, Defects4J) and a 'custom refactoring task set' without describing how either was prepared." 184 }, 185 "recruitment_methods_described": { 186 "applies": false, 187 "answer": false, 188 "justification": "No human participants are involved. 
The paper evaluates code generation benchmarks, not human subjects." 189 }, 190 "data_pipeline_documented": { 191 "applies": true, 192 "answer": false, 193 "justification": "No data pipeline is documented because no experiments were conducted. The conclusion confirms empirical validation is future work, making any data pipeline description inapplicable." 194 } 195 }, 196 "conflicts_of_interest": { 197 "funding_disclosed": { 198 "applies": true, 199 "answer": false, 200 "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section, no grant information, and no mention of institutional support." 201 }, 202 "affiliations_disclosed": { 203 "applies": true, 204 "answer": true, 205 "justification": "The author lists 'AutoHand Evolve' as affiliation with the email igor@autohand.ai. The affiliation is stated on the title page." 206 }, 207 "funder_independent_of_outcome": { 208 "applies": true, 209 "answer": false, 210 "justification": "Since no funding is disclosed, it cannot be determined whether the funder is independent. The author is affiliated with AutoHand Evolve, a company that would presumably benefit from the proposed architecture being seen as successful." 211 }, 212 "financial_interests_declared": { 213 "applies": true, 214 "answer": false, 215 "justification": "No competing interests statement, patent disclosures, or financial interests declaration is present anywhere in the paper." 216 } 217 }, 218 "contamination": { 219 "training_cutoff_stated": { 220 "applies": true, 221 "answer": false, 222 "justification": "No model training cutoff is stated. The conclusion mentions 'implementing the AgentSpawn prototype using Claude 3.5 Sonnet or GPT-4' as future work, and no specific model with a training cutoff was used in experiments." 223 }, 224 "train_test_overlap_discussed": { 225 "applies": true, 226 "answer": false, 227 "justification": "No discussion of train/test overlap is present. 
SWE-bench is a public benchmark that LLMs may have been trained on; this is not addressed. No actual experiments were conducted." 228 }, 229 "benchmark_contamination_addressed": { 230 "applies": true, 231 "answer": false, 232 "justification": "SWE-bench (used as benchmark) was published before the training cutoffs of many current LLMs. The paper does not address contamination risk. No actual experiments were conducted." 233 } 234 }, 235 "human_studies": { 236 "pre_registered": { 237 "applies": false, 238 "answer": false, 239 "justification": "No human participants are involved. The study evaluates automated code generation benchmarks." 240 }, 241 "irb_or_ethics_approval": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants are involved. IRB approval is not applicable." 245 }, 246 "demographics_reported": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants are involved." 250 }, 251 "inclusion_exclusion_criteria": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are involved." 255 }, 256 "randomization_described": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants or randomized controlled trial design are involved." 260 }, 261 "blinding_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants are involved." 265 }, 266 "attrition_reported": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants are involved." 270 } 271 }, 272 "cost_and_practicality": { 273 "inference_cost_reported": { 274 "applies": true, 275 "answer": false, 276 "justification": "Table 6 presents cost figures ($2.10 vs. $2.30 per success, API call counts, timing) and Section 5.3 discusses O(n·ℓ) cost scaling. However, these are hypothetical projections from experiments that were never conducted (per the conclusion's future work list), not measured inference costs." 
277 }, 278 "compute_budget_stated": { 279 "applies": true, 280 "answer": false, 281 "justification": "No total computational budget for the experiments is stated. Since no experiments were conducted, there is no actual compute budget to report. Table 6 shows per-task estimates but not total experimental budget." 282 } 283 } 284 }, 285 "claims": [ 286 { 287 "claim": "AgentSpawn achieves 34% higher task completion rates than static baselines on benchmarks like SWE-bench.", 288 "evidence": "Abstract and Section 6 (Conclusion). Table 2 shows +97% over GPT-4 Single on SWE-bench. However, the conclusion (Section 6) explicitly lists 'empirical validation on SWE-bench and Defects4J with proper baseline configurations' as future work item (2), contradicting the claim that these results were obtained.", 289 "supported": "unsupported" 290 }, 291 { 292 "claim": "AgentSpawn reduces memory overhead by 42% through selective memory slicing.", 293 "evidence": "Abstract, Section 3.2, Table 5. The 42% figure is derived from the algorithm design (87K → 51K tokens for multi-file fix tasks shown in Figure 2). This appears to be a calculated/simulated figure, not measured from real experiments.", 294 "supported": "weak" 295 }, 296 { 297 "claim": "Adaptive spawning (Algorithm 2) provides the largest contribution to performance gains (+18-23% across benchmarks).", 298 "evidence": "Table 3 ablation study. Since no experiments were conducted (conclusion lists empirical validation as future work), these ablation numbers are not from real experiments.", 299 "supported": "unsupported" 300 }, 301 { 302 "claim": "73% of concurrent modification conflicts are resolvable via semantic (LLM-based) merge.", 303 "evidence": "Section 3.6, Equation 6, Figure 4. Stated as 'based on analysis of typical conflict patterns in multi-file refactoring tasks.' 
No empirical source or prior work is cited for this figure, and it is presented as a design assumption.", 304 "supported": "weak" 305 }, 306 { 307 "claim": "AgentSpawn achieves lower cost per successful completion ($2.10 vs. $2.30) despite higher per-task cost.", 308 "evidence": "Table 6 cost-benefit analysis. No actual experiments were run; these numbers are projections. The conclusion confirms empirical validation is future work.", 309 "supported": "unsupported" 310 } 311 ], 312 "methodology_tags": [ 313 "theoretical", 314 "benchmark-eval" 315 ], 316 "key_findings": "AgentSpawn proposes a multi-agent architecture enabling dynamic agent spawning based on runtime complexity metrics, with automatic memory slicing, skill inheritance, and coherence management for concurrent agents. The paper presents experimental results showing substantial gains over baselines (34-97% improvement on SWE-bench, Defects4J, and custom tasks) and a 42% memory reduction through selective slicing. Critically, the conclusion reveals that actual empirical validation is future work, meaning the reported experimental results were not obtained from real experiments. The paper is a theoretical design proposal with simulated or hypothetical results presented as if they were empirical findings.", 317 "red_flags": [ 318 { 319 "flag": "Fabricated experimental results presented as real", 320 "detail": "The paper presents Tables 2-6 as experimental results (complete with specific numbers like '+97% on SWE-bench', '$2.10 cost per success') while the conclusion explicitly lists 'empirical validation on SWE-bench and Defects4J with proper baseline configurations' as future work item (2). This is a fundamental integrity issue: the paper reads as if experiments were conducted when they were not." 321 }, 322 { 323 "flag": "No implementation exists", 324 "detail": "The conclusion states that future work includes 'implementing the AgentSpawn prototype using Claude 3.5 Sonnet or GPT-4.' 
An architectural paper presenting quantitative benchmarking results without any actual implementation is presenting simulated or invented results." 325 }, 326 { 327 "flag": "Claims dramatically outrun evidence", 328 "detail": "The abstract claims '34% higher completion rates than static baselines on benchmarks like SWE-bench' as an established finding, but this is actually a design goal unsupported by any empirical evidence. The gap between what is claimed and what exists (only architectural design with algorithms) is extreme." 329 }, 330 { 331 "flag": "No statistical validation", 332 "detail": "No confidence intervals, error bars, p-values, or variance measures are provided for any result. Even if experiments had been conducted, reporting single point estimates for systems with significant stochastic variation (LLM outputs, complex task outcomes) would be methodologically inadequate." 333 }, 334 { 335 "flag": "Conflict resolution statistics of uncertain provenance", 336 "detail": "The paper states '73% of conflicts resolvable via semantic merge' (Figure 4, Equation 6) based on 'analysis of typical conflict patterns in multi-file refactoring tasks' with no citation, source dataset, or methodology described. This figure appears to be asserted without evidence." 337 }, 338 { 339 "flag": "Single author with no affiliation to an institution with an established research track record", 340 "detail": "The paper is authored by a single researcher at 'AutoHand Evolve' with no institutional academic affiliation. No competing interests, funding, or peer review history is disclosed." 341 }, 342 { 343 "flag": "No code or artifacts released", 344 "detail": "No implementation, code, data, or reproducible artifact of any kind is provided. Since the system was never implemented, reproducibility is impossible." 
345 } 346 ], 347 "cited_papers": [ 348 { 349 "title": "A survey on code generation with llm-based agents", 350 "authors": [ 351 "Yihong Dong", 352 "Xue Jiang", 353 "Jiaru Qian", 354 "Tian Wang", 355 "Kechi Zhang", 356 "Zhi Jin", 357 "Ge Li" 358 ], 359 "year": 2025, 360 "arxiv_id": "2508.00083", 361 "relevance": "Survey of LLM-based agents for code generation, directly relevant to the survey scope." 362 }, 363 { 364 "title": "A comprehensive survey of self-evolving ai agents: A new paradigm bridging foundation models and lifelong agentic systems", 365 "authors": [ 366 "Jinyuan Fang", 367 "Yanwen Peng", 368 "Xi Zhang" 369 ], 370 "year": 2025, 371 "arxiv_id": "2508.07407", 372 "relevance": "Survey of self-evolving AI agents, relevant to agentic AI methodology evaluation." 373 }, 374 { 375 "title": "A survey of self-evolving agents: On path to artificial super intelligence", 376 "authors": [ 377 "Huan-ang Gao", 378 "Jiayi Geng", 379 "Wenyue Hua" 380 ], 381 "year": 2025, 382 "arxiv_id": "2507.21046", 383 "relevance": "Survey of self-evolving agent architectures, relevant to the agentic AI research landscape." 384 }, 385 { 386 "title": "MetaGPT: Meta programming for multi-agent collaborative framework", 387 "authors": [ 388 "Sirui Hong", 389 "Xiawu Zheng", 390 "Jonathan Chen" 391 ], 392 "year": 2023, 393 "arxiv_id": "2308.00352", 394 "relevance": "Multi-agent framework for software development with role-based specialization, core related work for code generation agents." 395 }, 396 { 397 "title": "Communicative agents for software development", 398 "authors": [ 399 "Chen Qian", 400 "Xin Cong", 401 "Cheng Yang" 402 ], 403 "year": 2023, 404 "arxiv_id": "2307.07924", 405 "relevance": "ChatDev: multi-agent system for software development, directly relevant to multi-agent code generation evaluation." 
406 }, 407 { 408 "title": "MemGPT: Towards LLMs as operating systems", 409 "authors": [ 410 "Charles Packer", 411 "Vivian Fang", 412 "Shishir G Patil", 413 "Kevin Wooders", 414 "Ion Stoica" 415 ], 416 "year": 2023, 417 "arxiv_id": "2310.08560", 418 "relevance": "Hierarchical memory management for LLM agents, foundational work for long-context agent systems." 419 }, 420 { 421 "title": "AFLOW: Automating agentic workflow generation", 422 "authors": [ 423 "Jiayi Zhang", 424 "Jinyu Xiang", 425 "Zhaoyang Yu" 426 ], 427 "year": 2025, 428 "relevance": "Workflow graph optimization for agentic systems, used as baseline comparison in this paper." 429 }, 430 { 431 "title": "Difficulty-aware agentic orchestration for query-specific multi-agent workflows", 432 "authors": [ 433 "Jinwei Su", 434 "Qizhen Lan", 435 "Yinghui Xia" 436 ], 437 "year": 2025, 438 "arxiv_id": "2509.11079", 439 "relevance": "Dynamic agent composition based on query difficulty, directly related to adaptive multi-agent orchestration." 440 }, 441 { 442 "title": "Multi-agent collaboration mechanisms: A survey of LLMs", 443 "authors": [ 444 "Khanh-Tung Tran", 445 "Dung Dao", 446 "Minh-Duong Nguyen" 447 ], 448 "year": 2025, 449 "arxiv_id": "2501.06322", 450 "relevance": "Survey of multi-agent collaboration mechanisms for LLMs, highly relevant to survey scope." 451 }, 452 { 453 "title": "A-MEM: Agentic memory for LLM agents", 454 "authors": [ 455 "Wujiang Xu", 456 "Zujie Liang", 457 "Kai Mei" 458 ], 459 "year": 2025, 460 "arxiv_id": "2502.12110", 461 "relevance": "Memory architecture for LLM agents using Zettelkasten-style interconnected knowledge, relevant to memory management in agentic systems." 
462 }, 463 { 464 "title": "Plan-and-act: Improving planning of agents for long-horizon tasks", 465 "authors": [ 466 "Lutfi Eren Erdogan", 467 "Nicholas Lee", 468 "Sehoon Kim" 469 ], 470 "year": 2025, 471 "arxiv_id": "2503.09572", 472 "relevance": "Long-horizon planning for agents, directly relevant to the code generation domain studied." 473 }, 474 { 475 "title": "Ai agentic programming: A survey of techniques, challenges, and opportunities", 476 "authors": [ 477 "Huanting Wang", 478 "Jingzhi Gong", 479 "Huawei Zhang", 480 "Jie Xu", 481 "Zheng Wang" 482 ], 483 "year": 2025, 484 "arxiv_id": "2508.11126", 485 "relevance": "Survey of AI agentic programming techniques, directly relevant to survey scope." 486 }, 487 { 488 "title": "When single-agent with skills replace multi-agent systems and when they fail", 489 "authors": [ 490 "Xiaoxiao Li" 491 ], 492 "year": 2026, 493 "arxiv_id": "2601.04748", 494 "relevance": "Analysis of when single-agent vs. multi-agent approaches are appropriate, highly relevant to multi-agent system evaluation." 495 } 496 ] 497 }