scan-v5.json (22538B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Bugs in Modern LLM Agent Frameworks: An Empirical Study", 6 "authors": [ 7 "Xinxue Zhu", 8 "Jiacong Wu", 9 "Xiaoyu Zhang", 10 "Tianlin Li", 11 "Yanzhou Mu", 12 "Juan Zhai", 13 "Chao Shen", 14 "Chunrong Fang", 15 "Yang Liu" 16 ], 17 "year": 2026, 18 "venue": "FSE", 19 "arxiv_id": "2602.21806", 20 "doi": null 21 }, 22 "checklist": { 23 "claims_and_evidence": { 24 "abstract_claims_supported": { 25 "applies": true, 26 "answer": true, 27 "justification": "All abstract claims (998 bugs analyzed, 15 root causes, 7 symptoms, 5 lifecycle stages, API Misuse 32.97%, API Incompatibility 22.34%, Self-Action concentration) are explicitly supported by Results section data.", 28 "source": "haiku" 29 }, 30 "causal_claims_justified": { 31 "applies": false, 32 "answer": false, 33 "justification": "Paper presents taxonomy and distributions, not causal claims. No causal inference required for descriptive taxonomy work.", 34 "source": "haiku" 35 }, 36 "generalization_bounded": { 37 "applies": true, 38 "answer": false, 39 "justification": "Study limited to CrewAI and LangChain, but title/conclusions generalize to 'modern LLM agent frameworks' and 'LLM software supply chain' without explicitly bounding these claims.", 40 "source": "haiku" 41 }, 42 "alternative_explanations_discussed": { 43 "applies": true, 44 "answer": false, 45 "justification": "Paper presents taxonomy without discussing alternative interpretations. No consideration of reporting bias, labeling bias, or alternative frameworks for organizing root causes.", 46 "source": "haiku" 47 }, 48 "proxy_outcome_distinction": { 49 "applies": true, 50 "answer": false, 51 "justification": "Root causes inferred from issue descriptions rather than code analysis. 
No explicit discussion of whether manually-inferred causes match actual code-level causation or whether GitHub issues capture true bug distribution.", 52 "source": "haiku" 53 } 54 }, 55 "limitations_and_scope": { 56 "limitations_section_present": { 57 "applies": true, 58 "answer": false, 59 "justification": "No dedicated limitations or threats-to-validity section. Conclusion mentions future work but not systematic discussion of study limitations.", 60 "source": "haiku" 61 }, 62 "threats_to_validity_specific": { 63 "applies": true, 64 "answer": false, 65 "justification": "No specific threats discussed. No inter-rater agreement metrics, annotator bias analysis, or discussion of sampling limitations despite manual labeling being the core process.", 66 "source": "haiku" 67 }, 68 "scope_boundaries_stated": { 69 "applies": true, 70 "answer": false, 71 "justification": "Scope limited to two frameworks and GitHub issues spanning Dec 2023-Jan 2026, but boundaries not stated as explicit limitations. 
Title claims broader applicability without qualification.", 72 "source": "haiku" 73 } 74 }, 75 "conflicts_of_interest": { 76 "funding_disclosed": { 77 "applies": true, 78 "answer": false, 79 "justification": "No funding sources disclosed in the paper.", 80 "source": "haiku" 81 }, 82 "affiliations_disclosed": { 83 "applies": true, 84 "answer": true, 85 "justification": "Each author lists institutional affiliation (Nantong, Nanjing, NTU Singapore, Beihang, UMass Amherst, Xi'an Jiaotong).", 86 "source": "haiku" 87 }, 88 "funder_independent_of_outcome": { 89 "applies": false, 90 "answer": false, 91 "justification": "No funding disclosed; not applicable.", 92 "source": "haiku" 93 }, 94 "financial_interests_declared": { 95 "applies": true, 96 "answer": false, 97 "justification": "No competing interests or financial disclosures statement provided.", 98 "source": "haiku" 99 } 100 }, 101 "scope_and_framing": { 102 "key_terms_defined": { 103 "applies": true, 104 "answer": false, 105 "justification": "Key terms like 'agent framework,' 'root cause,' and 'symptom' used without formal definitions, though operational definition of 'bug' is provided via two-stage filtering criteria.", 106 "source": "haiku" 107 }, 108 "intended_contribution_clear": { 109 "applies": true, 110 "answer": true, 111 "justification": "Paper explicitly lists three contributions: lifecycle-oriented taxonomy, empirical findings (15 root causes, 7 symptoms), and released artifacts. Contribution clearly framed.", 112 "source": "haiku" 113 }, 114 "engagement_with_prior_work": { 115 "applies": true, 116 "answer": true, 117 "justification": "Paper positions against prior work on agent-level failures vs. 
framework-level bugs (refs 3, 9, 10), though related work discussion is brief and concentrated in introduction.", 118 "source": "haiku" 119 } 120 } 121 }, 122 "type_checklist": { 123 "empirical": { 124 "artifacts": { 125 "code_released": { 126 "applies": true, 127 "answer": false, 128 "justification": "Paper states 'We release our curated dataset, taxonomy definitions, and analysis scripts' but provides no link, repository, or supplementary materials URL.", 129 "source": "haiku" 130 }, 131 "data_released": { 132 "applies": true, 133 "answer": false, 134 "justification": "Claims to release 'curated dataset' without providing link, repository, or supplementary materials. Original GitHub issues are public but labeled/processed version not available.", 135 "source": "haiku" 136 }, 137 "environment_specified": { 138 "applies": true, 139 "answer": false, 140 "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency declarations provided.", 141 "source": "haiku" 142 }, 143 "reproduction_instructions": { 144 "applies": true, 145 "answer": false, 146 "justification": "Methodology describes process but not in reproducible detail. Actual reproduction requires access to curated labeled dataset (not provided) or redoing entire manual annotation.", 147 "source": "haiku" 148 } 149 }, 150 "statistical_methodology": { 151 "confidence_intervals_or_error_bars": { 152 "applies": true, 153 "answer": false, 154 "justification": "Results report frequencies (329/998, 223/998) and percentages (32.97%, 22.34%) but no confidence intervals or uncertainty bounds.", 155 "source": "haiku" 156 }, 157 "significance_tests": { 158 "applies": true, 159 "answer": false, 160 "justification": "No statistical significance tests (chi-square, Fisher's exact, etc.) 
reported for distributions or comparisons across frameworks or lifecycle stages.", 161 "source": "haiku" 162 }, 163 "effect_sizes_reported": { 164 "applies": true, 165 "answer": false, 166 "justification": "Results report proportions as percentages but these are descriptive, not comparative. No effect sizes from between-group contrasts.", 167 "source": "haiku" 168 }, 169 "sample_size_justified": { 170 "applies": true, 171 "answer": false, 172 "justification": "Final sample of 998 bugs (from 2,773 collected) is not justified. No power analysis or discussion of adequacy for detecting patterns.", 173 "source": "haiku" 174 }, 175 "variance_reported": { 176 "applies": true, 177 "answer": false, 178 "justification": "Results presented as point counts and percentages without error bars, confidence intervals, or variance estimates. No uncertainty quantification.", 179 "source": "haiku" 180 } 181 }, 182 "evaluation_design": { 183 "baselines_included": { 184 "applies": false, 185 "answer": false, 186 "justification": "Descriptive taxonomy study, not a comparative evaluation; baseline comparisons not applicable.", 187 "source": "haiku" 188 }, 189 "baselines_contemporary": { 190 "applies": false, 191 "answer": false, 192 "justification": "Not applicable to taxonomy study.", 193 "source": "haiku" 194 }, 195 "ablation_study": { 196 "applies": false, 197 "answer": false, 198 "justification": "Not applicable to taxonomy work.", 199 "source": "haiku" 200 }, 201 "multiple_metrics": { 202 "applies": true, 203 "answer": true, 204 "justification": "Study examines bugs from multiple perspectives: 15 root cause categories, 7 symptom categories, and distribution across 5 lifecycle stages. Multi-faceted analysis provided.", 205 "source": "haiku" 206 }, 207 "human_evaluation": { 208 "applies": false, 209 "answer": false, 210 "justification": "Two annotators label bugs, but this is data labeling, not evaluation of system outputs. 
No user study or user-facing evaluation.", 211 "source": "haiku" 212 }, 213 "held_out_test_set": { 214 "applies": false, 215 "answer": false, 216 "justification": "Not applicable; not a prediction task.", 217 "source": "haiku" 218 }, 219 "per_category_breakdown": { 220 "applies": true, 221 "answer": true, 222 "justification": "Extensive per-category analysis: root causes broken down into 15 categories with counts (Figure 2), symptoms into 7 categories (Figure 3), and lifecycle stage distribution detailed across all stages.", 223 "source": "haiku" 224 }, 225 "failure_cases_discussed": { 226 "applies": true, 227 "answer": false, 228 "justification": "Taxonomy describes failure modes (root causes/symptoms) but provides limited detailed case examples or rich qualitative illustrations beyond category membership.", 229 "source": "haiku" 230 }, 231 "negative_results_reported": { 232 "applies": false, 233 "answer": false, 234 "justification": "Descriptive study without hypothesis-driven negative results. 
All findings presented uniformly without surprise or null findings.", 235 "source": "haiku" 236 } 237 }, 238 "setup_transparency": { 239 "model_versions_specified": { 240 "applies": false, 241 "answer": false, 242 "justification": "Not evaluating models; not applicable.", 243 "source": "haiku" 244 }, 245 "prompts_provided": { 246 "applies": false, 247 "answer": false, 248 "justification": "Not applicable; no prompts or LLM usage in the study.", 249 "source": "haiku" 250 }, 251 "hyperparameters_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "Not applicable.", 255 "source": "haiku" 256 }, 257 "scaffolding_described": { 258 "applies": true, 259 "answer": false, 260 "justification": "High-level framework characteristics mentioned ('LangChain offers rich abstractions; CrewAI focuses on role-based collaboration') but insufficient detail on internal APIs, execution semantics, or implementation to fully understand frameworks.", 261 "source": "haiku" 262 }, 263 "data_preprocessing_documented": { 264 "applies": true, 265 "answer": true, 266 "justification": "Two-stage preprocessing documented: (1) label filtering for 'bug' label, (2) manual inspection excluding 'documentation typos,' 'usage questions,' and 'infrastructure issues.' Criteria and process clearly described.", 267 "source": "haiku" 268 } 269 }, 270 "data_integrity": { 271 "raw_data_available": { 272 "applies": true, 273 "answer": false, 274 "justification": "Original GitHub issues are public but curated/labeled dataset is not provided. 
Cannot independently verify annotations.", 275 "source": "haiku" 276 }, 277 "data_collection_described": { 278 "applies": true, 279 "answer": true, 280 "justification": "Collection procedure well-documented: full scraping of GitHub (both open/closed issues), 2,773 total issues (1,660 CrewAI, 1,113 LangChain), time span Dec 7 2023–Jan 10 2026, and data elements extracted (title, labels, content, comments).", 281 "source": "haiku" 282 }, 283 "recruitment_methods_described": { 284 "applies": false, 285 "answer": false, 286 "justification": "Not a human subjects study; not applicable.", 287 "source": "haiku" 288 }, 289 "data_pipeline_documented": { 290 "applies": true, 291 "answer": true, 292 "justification": "Full pipeline documented: GitHub collection → label filtering → manual inspection → initial taxonomy construction (100 samples) → large-scale annotation. Process and stages clearly described with Figure 1 overview.", 293 "source": "haiku" 294 } 295 }, 296 "contamination": { 297 "training_cutoff_stated": { 298 "applies": false, 299 "answer": false, 300 "justification": "Not evaluating models on benchmarks; not applicable.", 301 "source": "haiku" 302 }, 303 "train_test_overlap_discussed": { 304 "applies": false, 305 "answer": false, 306 "justification": "Not applicable.", 307 "source": "haiku" 308 }, 309 "benchmark_contamination_addressed": { 310 "applies": false, 311 "answer": false, 312 "justification": "Not applicable.", 313 "source": "haiku" 314 } 315 }, 316 "human_studies": { 317 "pre_registered": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants; not applicable.", 321 "source": "haiku" 322 }, 323 "irb_or_ethics_approval": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human subjects; not applicable.", 327 "source": "haiku" 328 }, 329 "demographics_reported": { 330 "applies": false, 331 "answer": false, 332 "justification": "Not applicable.", 333 "source": "haiku" 334 }, 335 
"inclusion_exclusion_criteria": { 336 "applies": false, 337 "answer": false, 338 "justification": "Not applicable.", 339 "source": "haiku" 340 }, 341 "randomization_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "Not applicable.", 345 "source": "haiku" 346 }, 347 "blinding_described": { 348 "applies": false, 349 "answer": false, 350 "justification": "Not applicable.", 351 "source": "haiku" 352 }, 353 "attrition_reported": { 354 "applies": false, 355 "answer": false, 356 "justification": "Not applicable.", 357 "source": "haiku" 358 } 359 }, 360 "cost_and_practicality": { 361 "inference_cost_reported": { 362 "applies": false, 363 "answer": false, 364 "justification": "Taxonomy study, not a system with inference costs; not applicable.", 365 "source": "haiku" 366 }, 367 "compute_budget_stated": { 368 "applies": true, 369 "answer": false, 370 "justification": "No mention of computational resources or time investment for manual annotation of 998 bugs by two researchers over the study period.", 371 "source": "haiku" 372 } 373 } 374 } 375 }, 376 "claims": [ 377 { 378 "claim": "API Misuse (32.97%) and API Incompatibility (22.34%) together account for over 55% of agent framework bugs", 379 "evidence": "Analysis of 998 labeled bug reports; 329 API Misuse + 223 API Incompatibility = 552/998 bugs", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Self-Action stage contains the highest concentration of bugs (88% of issues)", 384 "evidence": "Lifecycle stage distribution: 882/998 bugs mapped to Self-Action stage; detailed breakdown across all 5 stages provided", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Framework bugs manifest primarily as Functional Error (78%), Crash (10%), and Build Failure (7%)", 389 "evidence": "Symptom analysis reported in Figure 3: S2 Functional Error 781/998, S1 Crash 100/998, S3 Build Failure 67/998", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Execution semantics mechanisms are the dominant source 
of framework failures", 394 "evidence": "Self-Action stage concentration (88%) and API-related root causes (55%) suggest execution-level problems dominate over interface issues", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "CrewAI and LangChain represent suitable agent frameworks for understanding modern LLM agent bugs", 399 "evidence": "Justified by \"representative and widely used,\" \"68.5k stars on GitHub,\" complementary design emphases; no independent validation", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "Curated dataset, taxonomy definitions, and analysis scripts will be released to enable replication", 404 "evidence": "Stated in abstract and contributions section; no link, repository, or supplementary materials provided with paper", 405 "supported": "weak" 406 } 407 ], 408 "methodology_tags": [ 409 "observational", 410 "case-study" 411 ], 412 "key_findings": "The paper characterizes 998 bug reports from CrewAI and LangChain via a lifecycle-oriented taxonomy, finding that 55% of bugs stem from API-related issues (misuse + incompatibility) and that 88% concentrate in the Self-Action (execution) stage, where planning and tool invocation occur. Bugs primarily manifest as functional errors (78%), crashes (10%), and build failures (7%), indicating execution-level disruptions rather than isolated interface problems. This taxonomy across five agent lifecycle stages (Initialization, Perception, Self-Action, Mutual Interaction, Evolution) provides a structured lens for understanding how framework-level issues propagate during agent execution.", 413 "red_flags": [ 414 { 415 "flag": "No inter-rater reliability metrics", 416 "detail": "Two annotators labeled all 998 bugs but no Cohen's kappa, agreement rate, or conflict resolution statistics reported. Prevents assessment of labeling consistency." 
417 }, 418 { 419 "flag": "No statistical analysis", 420 "detail": "Frequencies and percentages reported without confidence intervals, significance tests, or hypothesis testing. No uncertainty quantification." 421 }, 422 { 423 "flag": "Inferred root causes, not validated", 424 "detail": "Root causes inferred from GitHub issue descriptions rather than code analysis or detailed investigation. This leaves a possible gap between inferred and actual causes." 425 }, 426 { 427 "flag": "Limited generalization scope", 428 "detail": "Study limited to 2 frameworks (CrewAI, LangChain) but title and conclusions generalize to 'modern LLM agent frameworks' without explicit qualification." 429 }, 430 { 431 "flag": "No threats-to-validity discussion", 432 "detail": "Paper lacks dedicated limitations or threats section. No discussion of sampling bias, reporting bias, annotator bias, or other validity threats." 433 }, 434 { 435 "flag": "Artifacts not provided", 436 "detail": "Paper claims to release curated dataset and analysis scripts but provides no link, repository URL, or supplementary materials." 437 }, 438 { 439 "flag": "Potential reporting bias", 440 "detail": "GitHub issues reflect what users report, not the full universe of bugs. Some bugs go unreported while others are over-reported, so observed frequencies may not reflect actual prevalence." 441 }, 442 { 443 "flag": "Framework selection not justified", 444 "detail": "CrewAI and LangChain chosen for being 'representative,' but no systematic justification or comparison against other agent frameworks." 
445 } 446 ], 447 "cited_papers": [ 448 { 449 "title": "Why do multi-agent LLM systems fail?", 450 "authors": "Cemri et al.", 451 "year": 2025, 452 "relevance": "Prior work on agent-level failures; this paper studies framework-level bugs as distinct from agent reasoning failures" 453 }, 454 { 455 "title": "A Characterization Study of Bugs in LLM Agent Workflow Orchestration Frameworks", 456 "authors": "Xue et al.", 457 "year": 2025, 458 "relevance": "Closely related work analyzing agent library bugs; distinguishes this paper's dynamic lifecycle approach from static component mapping" 459 }, 460 { 461 "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems", 462 "authors": "Zhang et al.", 463 "year": 2025, 464 "relevance": "Related work on agent failure analysis; complements framework-level bug taxonomy" 465 }, 466 { 467 "title": "Large language model supply chain: A research agenda", 468 "authors": "Wang et al.", 469 "year": 2025, 470 "relevance": "Contextualizes framework bugs within LLM software supply chain security and quality concerns" 471 } 472 ], 473 "engagement_factors": { 474 "practical_relevance": { 475 "score": 2, 476 "justification": "Taxonomy helps framework developers and maintainers identify high-risk areas (Self-Action stage, API-related bugs) but provides limited actionable guidance for improvement." 477 }, 478 "surprise_contrarian": { 479 "score": 1, 480 "justification": "Finding that execution/orchestration is the main bug source is fairly predictable given complexity of agent execution semantics; no surprising contrarian insight." 481 }, 482 "fear_safety": { 483 "score": 1, 484 "justification": "Mentions 'security risks' and 'supply chain threat' once but does not investigate or emphasize safety/security concerns beyond acknowledgment." 
485 }, 486 "drama_conflict": { 487 "score": 0, 488 "justification": "Neutral technical taxonomy work without contentious claims, novel controversies, or dramatic findings." 489 }, 490 "demo_ability": { 491 "score": 1, 492 "justification": "CrewAI and LangChain are publicly available and can be used, but the study's curated dataset, taxonomy definitions, and analysis scripts are not linked from the paper, limiting reproducibility or demonstration of findings." 493 }, 494 "brand_recognition": { 495 "score": 2, 496 "justification": "Studies well-known frameworks (CrewAI, LangChain) but authors span diverse institutions of mixed prestige (Nantong, Nanjing, NTU Singapore, Beihang, UMass Amherst, Xi'an Jiaotong)." 497 } 498 }, 499 "hn_data": { 500 "threads": [], 501 "top_points": 0, 502 "total_points": 0, 503 "total_comments": 0 504 } 505 }