scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (23193B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Bugs in Modern LLM Agent Frameworks: An Empirical Study",
      6     "authors": [
      7       "Xinxue Zhu",
      8       "Jiacong Wu",
      9       "Xiaoyu Zhang",
     10       "Tianlin Li",
     11       "Yanzhou Mu",
     12       "Juan Zhai",
     13       "Chao Shen",
     14       "Chunrong Fang",
     15       "Yang Liu"
     16     ],
     17     "year": 2026,
     18     "venue": "FSE'26",
     19     "arxiv_id": "2602.21806",
     20     "doi": null
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The abstract claims about API Misuse/Incompatibility dominance and Self-Action concentration are supported by the counts in Figures 2-3 and the lifecycle distribution analysis.",
     28         "source": "opus"
     29       },
     30       "causal_claims_justified": {
     31         "applies": false,
     32         "answer": false,
     33         "justification": "The paper is descriptive — it classifies and counts bug types without making causal claims about why bugs occur.",
     34         "source": "opus"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The title says 'Modern LLM Agent Frameworks' but only two frameworks (CrewAI, LangChain) are studied. The paper does not clearly bound its generalizability to these two frameworks.",
     40         "source": "opus"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No alternative explanations are discussed. For instance, the concentration in Self-Action could reflect reporting bias (users more likely to report execution bugs) rather than actual bug distribution.",
     46         "source": "opus"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper uses GitHub issue reports labeled 'bug' as a proxy for actual framework bugs, but does not discuss the gap between reported issues and actual bug prevalence, severity, or distribution.",
     52         "source": "opus"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No limitations or threats-to-validity section is present. Section 4 is 'Conclusion & Future Work' with no substantive limitations discussion.",
     60         "source": "opus"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No threats to validity are discussed anywhere in the paper.",
     66         "source": "opus"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No explicit scope boundaries are stated. The paper does not clarify what its results do NOT show or which frameworks/scenarios are excluded from its claims.",
     72         "source": "opus"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No funding information or acknowledgments section is present in the paper.",
     80         "source": "opus"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Author affiliations are listed: Nantong University, Nanjing University, NTU Singapore, Beihang University, UMass Amherst, Xi'an Jiaotong University. No obvious conflicts with the evaluated frameworks.",
     86         "source": "opus"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No funding is disclosed, so independence cannot be assessed.",
     92         "source": "opus"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests or financial interests statement is present.",
     98         "source": "opus"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "Key terms ('agent framework bug', 'root cause', 'lifecycle stage', 'symptom') are explained through examples and context but lack formal definitions. Definitions emerge only through the taxonomy itself, not stated upfront.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Contributions explicitly stated: (1) Innovative lifecycle-oriented perspective, (2) Empirical taxonomy of 15 root causes and 7 symptoms, (3) Reproducible artifacts (dataset, scripts).",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Introduction explicitly engages: [3,10] focus on agent-level failures (not framework), [9] analyzes library bugs but overlooks 'dynamic execution and temporal workflows'. Paper positions itself as filling the execution-semantics gap.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The paper claims 'We release our curated dataset, taxonomy definitions, and analysis scripts' in contributions but no repository URL or download link is provided in the paper.",
    129           "source": "opus"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Same claim of releasing curated dataset but no URL or link is given. The 998 issue reports are drawn from public GitHub but the curated/labeled dataset is not linked.",
    135           "source": "opus"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No environment specifications, requirements, or tooling details are provided.",
    141           "source": "opus"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No reproduction instructions are provided. The methodology describes the process but not how to replicate the analysis.",
    147           "source": "opus"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No confidence intervals or error bars are reported. Results are presented as raw counts and percentages only.",
    155           "source": "opus"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "No statistical significance tests are used despite claims about distributions and concentrations of bugs across stages.",
    161           "source": "opus"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No effect sizes reported. Only raw counts and percentages are provided.",
    167           "source": "opus"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No justification for why 998 issues are sufficient or why only two frameworks were selected. The sample size is a result of filtering, not a design choice.",
    173           "source": "opus"
    174         },
    175         "variance_reported": {
    176           "applies": false,
    177           "answer": false,
    178           "justification": "This is a manual classification study, not an experimental study with multiple runs. There are no experimental runs to report variance across.",
    179           "source": "opus"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The paper positions against prior work [3, 9, 10] which study agent-level failures or static library components, and explains how their lifecycle-oriented perspective differs.",
    187           "source": "opus"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "References [3], [9], and [10] are all from 2025, which is contemporary work.",
    193           "source": "opus"
    194         },
    195         "ablation_study": {
    196           "applies": false,
    197           "answer": false,
    198           "justification": "This is a taxonomy/classification study, not a system with components to ablate.",
    199           "source": "opus"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "The study examines both root causes (15 categories) and symptoms (7 categories) across lifecycle stages, providing multiple analytical dimensions.",
    205           "source": "opus"
    206         },
    207         "human_evaluation": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "Human evaluation of system outputs is not relevant here — the study IS a manual analysis of bug reports, not a system producing outputs to evaluate.",
    211           "source": "opus"
    212         },
    213         "held_out_test_set": {
    214           "applies": false,
    215           "answer": false,
    216           "justification": "Not an ML evaluation study. No train/test split applies.",
    217           "source": "opus"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Results are broken down by root cause category (15 types), symptom category (7 types), and lifecycle stage (5 stages), with counts for each combination.",
    223           "source": "opus"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "No specific example bug reports are discussed in detail. The taxonomy is presented only with aggregate counts, not illustrative cases.",
    229           "source": "opus"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": false,
    234           "justification": "No negative results or surprising non-findings are discussed. All findings are presented positively.",
    235           "source": "opus"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": false,
    241           "answer": false,
    242           "justification": "No LLMs are used in the methodology. This is a manual analysis study.",
    243           "source": "opus"
    244         },
    245         "prompts_provided": {
    246           "applies": false,
    247           "answer": false,
    248           "justification": "No prompting is used in this study.",
    249           "source": "opus"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No ML models or hyperparameters are involved in the methodology.",
    255           "source": "opus"
    256         },
    257         "scaffolding_described": {
    258           "applies": false,
    259           "answer": false,
    260           "justification": "No agentic scaffolding is used in the study methodology.",
    261           "source": "opus"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Section 2.2 describes a two-stage filtering process: label filtering (retaining 'bug' labeled issues) reducing from 2,773 to 1,010, then manual inspection removing three categories of irrelevant reports, yielding 998.",
    267           "source": "opus"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "Although the paper claims to release artifacts, no URL or archive is provided. The underlying GitHub issues are public but the labeled dataset is not available.",
    275           "source": "opus"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Section 2.1 describes collecting from GitHub repositories of CrewAI and LangChain, spanning December 7, 2023 to January 10, 2026, with 2,773 original issues (1,660 CrewAI, 1,113 LangChain).",
    281           "source": "opus"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": false,
    285           "answer": false,
    286           "justification": "No human participants. Data source is public GitHub issue reports from specific repositories.",
    287           "source": "opus"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "The pipeline is documented: 2,773 collected → label filtering to 1,010 → manual inspection to 998. Section 2.2 describes each stage with counts and criteria.",
    293           "source": "opus"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "This study does not evaluate any pre-trained model on a benchmark. It is a manual bug classification study.",
    301           "source": "opus"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "No model evaluation on benchmarks is performed.",
    307           "source": "opus"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No model evaluation on benchmarks is performed.",
    313           "source": "opus"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants. The study analyzes public GitHub issue reports.",
    321           "source": "opus"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "opus"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "opus"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "opus"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "opus"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "opus"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "No human participants.",
    357           "source": "opus"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": false,
    363           "answer": false,
    364           "justification": "This is a manual analysis study, not a system with inference costs.",
    365           "source": "opus"
    366         },
    367         "compute_budget_stated": {
    368           "applies": false,
    369           "answer": false,
    370           "justification": "Manual analysis study with no significant compute requirements.",
    371           "source": "opus"
    372         }
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "API Misuse (32.97%) and API Incompatibility (22.34%) together account for 55.3% of all framework bugs",
    379       "evidence": "Section 3.1: Figure 2 shows distribution with raw counts (329 and 223 out of 998); paper explicitly states 'over half of the cases'",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Bugs concentrate in the Self-Action stage with API-related issues (289 and 211 out of 882 bugs in this stage)",
    384       "evidence": "Section 3.1 'Analysis on Lifecycle Distribution': Self-Action stage analysis shows API Misuse and Incompatibility dominate; Finding 1 summarizes",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Crash (100/998), Functional Error (781/998), and Build Failure (67/998) are the dominant symptoms",
    389       "evidence": "Section 3.2, Figure 3, and Finding 2 report these symptom counts and percentages",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Most bugs stem from execution semantics rather than infrastructure issues",
    394       "evidence": "Finding 1 states this, based on concentration of API Misuse/Incompatibility in Self-Action stage; conclusion reiterates 'execution-semantic bugs'",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "CrewAI and LangChain are representative frameworks suitable for understanding LLM agent framework bugs broadly",
    399       "evidence": "Section 2.1 describes them as 'representative and widely used' with 68.5k GitHub stars combined, but no empirical evidence provided that these findings generalize to other frameworks",
    400       "supported": "weak"
    401     },
    402     {
    403       "claim": "The lifecycle-oriented taxonomy reveals where bugs arise during agent execution",
    404       "evidence": "Taxonomy constructed (15 root causes, 7 symptoms, 5 lifecycle stages) and mapped, but no independent validation that this taxonomy is optimal or uniquely explanatory",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "observational",
    410     "case-study",
    411     "qualitative"
    412   ],
    413   "key_findings": "Bugs in LLM agent frameworks are dominated by API-related issues: API Misuse (33%) and API Incompatibility (22%) together account for over half of all 998 reported bugs analyzed from CrewAI and LangChain. These bugs concentrate in the Self-Action (execution) stage, where framework mechanisms coordinate tool invocation and planning. The most visible symptoms are Functional Error (78%), Crash (10%), and Build Failure (7%), indicating that framework bugs manifest primarily as disruptions to task execution flow rather than isolated interface problems.",
    414   "red_flags": [
    415     {
    416       "flag": "No inter-rater reliability metrics",
    417       "detail": "Two independent annotators labeled all 998 reports, but no Cohen's kappa or agreement percentage is reported, and the disagreement-resolution process is not quantified beyond 'cross-checked for consistency'"
    418     },
    419     {
    420       "flag": "Limited generalizability",
    421       "detail": "Only two frameworks studied (CrewAI, LangChain). Results claimed for 'Modern LLM Agent Frameworks' broadly without explicit scope limits to these two or discussion of applicability to proprietary/commercial systems"
    422     },
    423     {
    424       "flag": "No independent validation of taxonomy",
    425       "detail": "Initial taxonomy constructed on only 100 samples; unclear whether the 15 root causes and 7 symptoms would emerge from different samples or if categories are stable across frameworks"
    426     },
    427     {
    428       "flag": "No causal analysis",
    429       "detail": "Identifies alleged root causes through manual categorization but does not establish causation through controlled experiments, ablations, or interventions"
    430     },
    431     {
    432       "flag": "Reported bugs as proxy for true distribution",
    433       "detail": "GitHub bug reports may not represent actual distribution of bugs in production or unreported failures; sampling bias not discussed"
    434     },
    435     {
    436       "flag": "Undefined filtering criteria",
    437       "detail": "Manual filtering uses subjective language ('non-functional textual errors', 'does not involve framework logic') without operational definitions or examples"
    438     },
    439     {
    440       "flag": "No limitations or threats-to-validity section",
    441       "detail": "Paper lacks formal discussion of study limitations, sampling-frame bias, inter-rater disagreement rates, or scope constraints"
    442     },
    443     {
    444       "flag": "Artifacts promised but not provided",
    445       "detail": "Abstract states 'We release our curated dataset, taxonomy definitions, and analysis scripts' but no link, repository, or access method provided in paper"
    446     },
    447     {
    448       "flag": "No funding disclosure",
    449       "detail": "Affiliations listed but no funding source disclosed; potential conflicts of interest regarding framework selection not addressed"
    450     }
    451   ],
    452   "cited_papers": [
    453     {
    454       "title": "Why do multi-agent llm systems fail?",
    455       "authors": "Cemri, Pan, Yang, et al.",
    456       "year": 2025,
    457       "relevance": "Directly addresses agent-level failures in multi-agent systems; complements framework-level analysis"
    458     },
    459     {
    460       "title": "A Characterization Study of Bugs in LLM Agent Workflow Orchestration Frameworks",
    461       "authors": "Xue, Zhao, Wang, Chen, Wang",
    462       "year": 2025,
    463       "relevance": "Concurrent work analyzing framework bugs; cited as analyzing 'static components' without dynamic execution lifecycle"
    464     },
    465     {
    466       "title": "Which agent causes task failures and when? on automated failure attribution of llm multi-agent systems",
    467       "authors": "Zhang, Yin, Zhang, et al.",
    468       "year": 2025,
    469       "relevance": "Addresses failure attribution in multi-agent systems; complements taxonomy of bug manifestations"
    470     },
    471     {
    472       "title": "Large language model supply chain: A research agenda",
    473       "authors": "Wang, Zhao, Hou, Wang",
    474       "year": 2025,
    475       "relevance": "Frames framework bugs as LLM software supply chain security issue; contextualizes framework reliability in broader ecosystem"
    476     },
    477     {
    478       "title": "Evaluating Large Language Models Trained on Code",
    479       "authors": "Chen et al.",
    480       "year": 2021,
    481       "relevance": "Foundational work on code-related LLM capabilities; relevant to understanding agent framework design constraints"
    482     }
    483   ],
    484   "engagement_factors": {
    485     "practical_relevance": {
    486       "score": 2,
    487       "justification": "Developers working with CrewAI and LangChain can reference this taxonomy to diagnose and understand bug patterns, but taxonomy is descriptive (no fixes or solutions provided)."
    488     },
    489     "surprise_contrarian": {
    490       "score": 1,
    491       "justification": "Finding that execution stage has most bugs is somewhat expected (execution is complex), though the dominance of API misuse over other categories is somewhat notable."
    492     },
    493     "fear_safety": {
    494       "score": 2,
    495       "justification": "Paper mentions that framework bugs 'amplify their impact, leading to incorrect execution, resource misuse, and security risks' and discusses 'serious security threat to agent-based software systems', raising infrastructure reliability concerns."
    496     },
    497     "drama_conflict": {
    498       "score": 0,
    499       "justification": "Technical taxonomy paper with no controversy, vendor conflict, or dramatic angle."
    500     },
    501     "demo_ability": {
    502       "score": 0,
    503       "justification": "Analysis paper; no demo possible. Taxonomy is presented as tables and figures, not interactive or runnable."
    504     },
    505     "brand_recognition": {
    506       "score": 2,
    507       "justification": "CrewAI and LangChain are popular frameworks in the LLM community, but authors are from mid-tier academic institutions (Nanjing, NTU, Beihang, UMass), not mega-labs (OpenAI, Google, Meta)."
    508     }
    509   },
    510   "hn_data": {
    511     "threads": [],
    512     "top_points": 0,
    513     "total_points": 0,
    514     "total_comments": 0
    515   }
    516 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs