scan.json (17245B)
1 { 2 "scan_version": 2, 3 "active_modules": [], 4 "paper": { 5 "title": "Bugs in Modern LLM Agent Frameworks: An Empirical Study", 6 "authors": ["Xinxue Zhu", "Jiacong Wu", "Xiaoyu Zhang", "Tianlin Li", "Yanzhou Mu", "Juan Zhai", "Chao Shen", "Chunrong Fang", "Yang Liu"], 7 "year": 2026, 8 "venue": "FSE 2026 Companion", 9 "arxiv_id": "2602.21806" 10 }, 11 "methodology_tags": ["qualitative", "observational"], 12 "key_findings": "Analysis of 998 bug reports from CrewAI and LangChain identifies 15 root cause categories and 7 symptom categories across five agent lifecycle stages. API Misuse (32.97%) and API Incompatibility (22.34%) account for over 55% of all bugs, concentrated in the Self-Action execution stage. Symptoms predominantly manifest as Functional Error (781/998), Crash (100/998), and Build Failure (67/998), indicating framework bugs mainly disrupt workflow progression rather than causing isolated interface issues.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "The paper claims 'We release our curated dataset, taxonomy definitions, and analysis scripts' in contributions but no repository URL or download link is provided in the paper." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "Same claim of releasing curated dataset but no URL or link is given. The 998 issue reports are drawn from public GitHub but the curated/labeled dataset is not linked." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment specifications, requirements, or tooling details are provided." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No reproduction instructions are provided. The methodology describes the process but not how to replicate the analysis." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "No confidence intervals or error bars are reported. Results are presented as raw counts and percentages only." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "No statistical significance tests are used despite claims about distributions and concentrations of bugs across stages." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": false, 50 "justification": "No effect sizes reported. Only raw counts and percentages are provided." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification for why 998 issues are sufficient or why only two frameworks were selected. The sample size is a result of filtering, not a design choice." 56 }, 57 "variance_reported": { 58 "applies": false, 59 "answer": false, 60 "justification": "This is a manual classification study, not an experimental study with multiple runs. There are no experimental runs to report variance across." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper positions against prior work [3, 9, 10] which study agent-level failures or static library components, and explains how their lifecycle-oriented perspective differs." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "References [3], [9], and [10] are all from 2025, which is contemporary work." 73 }, 74 "ablation_study": { 75 "applies": false, 76 "answer": false, 77 "justification": "This is a taxonomy/classification study, not a system with components to ablate." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The study examines both root causes (15 categories) and symptoms (7 categories) across lifecycle stages, providing multiple analytical dimensions." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "Human evaluation of system outputs is not relevant here — the study IS a manual analysis of bug reports, not a system producing outputs to evaluate." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "Not an ML evaluation study. No train/test split applies." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by root cause category (15 types), symptom category (7 types), and lifecycle stage (5 stages), with counts for each combination." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "No specific example bug reports are discussed in detail. The taxonomy is presented only with aggregate counts, not illustrative cases." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": false, 107 "justification": "No negative results or surprising non-findings are discussed. All findings are presented positively." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims about API Misuse/Incompatibility dominance and Self-Action concentration are supported by the counts in Figures 2-3 and the lifecycle distribution analysis." 115 }, 116 "causal_claims_justified": { 117 "applies": false, 118 "answer": false, 119 "justification": "The paper is descriptive — it classifies and counts bug types without making causal claims about why bugs occur." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title says 'Modern LLM Agent Frameworks' but only two frameworks (CrewAI, LangChain) are studied. The paper does not clearly bound its generalizability to these two frameworks." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No alternative explanations are discussed. For instance, the concentration in Self-Action could reflect reporting bias (users more likely to report execution bugs) rather than actual bug distribution." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper uses GitHub issue reports labeled 'bug' as a proxy for actual framework bugs, but does not discuss the gap between reported issues and actual bug prevalence, severity, or distribution." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "No LLMs are used in the methodology. This is a manual analysis study." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "No prompting is used in this study." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "No ML models or hyperparameters are involved in the methodology." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used in the study methodology." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 2.2 describes a two-stage filtering process: label filtering (retaining 'bug' labeled issues) reducing from 2,773 to 1,010, then manual inspection removing three categories of irrelevant reports, yielding 998." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "No limitations or threats-to-validity section is present. Section 4 is 'Conclusion & Future Work' with no substantive limitations discussion." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity are discussed anywhere in the paper." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "No explicit scope boundaries are stated. The paper does not clarify what its results do NOT show or which frameworks/scenarios are excluded from its claims." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "Although the paper claims to release artifacts, no URL or archive is provided. The underlying GitHub issues are public but the labeled dataset is not available." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 2.1 describes collecting from GitHub repositories of CrewAI and LangChain, spanning December 7, 2023 to January 10, 2026, with 2,773 original issues (1,660 CrewAI, 1,113 LangChain)." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data source is public GitHub issue reports from specific repositories." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is documented: 2,773 collected → label filtering to 1,010 → manual inspection to 998. Section 2.2 describes each stage with counts and criteria." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding information or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed: Nantong University, Nanjing University, NTU Singapore, Beihang University, UMass Amherst, Xi'an Jiaotong University. No obvious conflicts with the evaluated frameworks." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "This study does not evaluate any pre-trained model on a benchmark. It is a manual bug classification study." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "No model evaluation on benchmarks is performed." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "No model evaluation on benchmarks is performed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants. The study analyzes public GitHub issue reports." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "This is a manual analysis study, not a system with inference costs." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "Manual analysis study with no significant compute requirements." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "API Misuse (32.97%) and API Incompatibility (22.34%) together account for over 55% of all agent framework bugs.", 295 "evidence": "Figure 2 shows 329 API Misuse and 223 API Incompatibility out of 998 total bugs. Section 3.1.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "Root causes concentrate in the Self-Action lifecycle stage, which contains most reported cases.", 300 "evidence": "Section 3.1 lifecycle distribution analysis shows 882/998 bugs in Self-Action stage, with API Misuse (289/882) and API Incompatibility (211/882) dominating.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "Functional Error is the dominant symptom (781/998), followed by Crash (100/998) and Build Failure (67/998).", 305 "evidence": "Figure 3 and Section 3.2 provide the distribution counts.", 306 "supported": "strong" 307 }, 308 { 309 "claim": "Framework bugs mainly manifest as breakdowns in workflow progression rather than isolated interface issues.", 310 "evidence": "The concentration of Functional Error, Crash, and Build Failure symptoms in the Self-Action stage (Section 3.2) supports this, though the claim is somewhat interpretive.", 311 "supported": "moderate" 312 } 313 ], 314 "red_flags": [ 315 { 316 "flag": "No inter-rater reliability metrics", 317 "detail": "Two annotators labeled 998 issues but no Cohen's kappa, Krippendorff's alpha, or agreement rate is reported. The paper says annotators 'cross-check results' and hold 'online meetings to reach an agreement' but does not quantify initial disagreement or reliability." 318 }, 319 { 320 "flag": "No limitations section", 321 "detail": "A 5-page empirical study with no discussion of threats to validity. Key unaddressed threats include: selection bias (only two frameworks), reporting bias (GitHub issues may not represent actual bug distribution), and generalizability beyond CrewAI/LangChain." 322 }, 323 { 324 "flag": "Overclaiming from narrow sample", 325 "detail": "Title claims 'Modern LLM Agent Frameworks' (plural, general) but only studies CrewAI and LangChain. Other major frameworks (AutoGen, DSPy, LlamaIndex agents) are excluded without justification." 326 }, 327 { 328 "flag": "No illustrative examples", 329 "detail": "998 bugs are classified into 15 root causes and 7 symptoms but not a single concrete bug example is shown. The reader cannot verify whether the taxonomy categories are applied correctly." 330 }, 331 { 332 "flag": "Artifacts promised but not delivered", 333 "detail": "The paper lists 'Reproducible Artifacts' as a contribution, claiming to release dataset, taxonomy, and scripts, but provides no URL, repository link, or archive reference." 334 } 335 ], 336 "cited_papers": [ 337 { 338 "title": "Why do multi-agent llm systems fail?", 339 "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"], 340 "year": 2025, 341 "arxiv_id": "2503.13657", 342 "relevance": "Directly studies failure modes in multi-agent LLM systems, complementary to framework-level bug analysis." 343 }, 344 { 345 "title": "Evaluating Large Language Models Trained on Code", 346 "authors": ["Mark Chen", "Jerry Tworek"], 347 "year": 2021, 348 "arxiv_id": "2107.03374", 349 "relevance": "Foundational Codex/HumanEval paper on LLM code generation capabilities." 350 }, 351 { 352 "title": "Large language model supply chain: A research agenda", 353 "authors": ["Shenao Wang", "Yanjie Zhao", "Xinyi Hou", "Haoyu Wang"], 354 "year": 2025, 355 "relevance": "Frames the LLM software supply chain problem that framework bugs contribute to." 356 }, 357 { 358 "title": "A Characterization Study of Bugs in LLM Agent Workflow Orchestration Frameworks", 359 "authors": ["Ziluo Xue", "Yanjie Zhao", "Shenao Wang"], 360 "year": 2025, 361 "relevance": "Most closely related prior work studying bugs in LLM agent libraries via static component mapping." 362 }, 363 { 364 "title": "Which agent causes task failures and when? on automated failure attribution of llm multi-agent systems", 365 "authors": ["Shaokun Zhang", "Ming Yin", "Jieyu Zhang"], 366 "year": 2025, 367 "arxiv_id": "2505.00212", 368 "relevance": "Studies automated failure attribution in multi-agent systems, complementary perspective on agent failures." 369 } 370 ] 371 }