ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (17245B)


      1 {
      2   "scan_version": 2,
      3   "active_modules": [],
      4   "paper": {
      5     "title": "Bugs in Modern LLM Agent Frameworks: An Empirical Study",
      6     "authors": ["Xinxue Zhu", "Jiacong Wu", "Xiaoyu Zhang", "Tianlin Li", "Yanzhou Mu", "Juan Zhai", "Chao Shen", "Chunrong Fang", "Yang Liu"],
      7     "year": 2026,
      8     "venue": "FSE 2026 Companion",
      9     "arxiv_id": "2602.21806"
     10   },
     11   "methodology_tags": ["qualitative", "observational"],
     12   "key_findings": "Analysis of 998 bug reports from CrewAI and LangChain identifies 15 root cause categories and 7 symptom categories across five agent lifecycle stages. API Misuse (32.97%) and API Incompatibility (22.34%) account for over 55% of all bugs, concentrated in the Self-Action execution stage. Symptoms predominantly manifest as Functional Error (781/998), Crash (100/998), and Build Failure (67/998), indicating framework bugs mainly disrupt workflow progression rather than causing isolated interface issues.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The paper claims 'We release our curated dataset, taxonomy definitions, and analysis scripts' in contributions but no repository URL or download link is provided in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper claims to release its curated dataset, but no URL or link is given. The 998 issue reports are drawn from public GitHub but the curated/labeled dataset is not linked."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements, or tooling details are provided."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No reproduction instructions are provided. The methodology describes the process but not how to replicate the analysis."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No confidence intervals or error bars are reported. Results are presented as raw counts and percentages only."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are used despite claims about distributions and concentrations of bugs across stages."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No effect sizes reported. Only raw counts and percentages are provided."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification for why 998 issues are sufficient or why only two frameworks were selected. The sample size is a result of filtering, not a design choice."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "This is a manual classification study, not an experimental study with multiple runs. There are no experimental runs to report variance across."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper positions against prior work [3, 9, 10] which study agent-level failures or static library components, and explains how their lifecycle-oriented perspective differs."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "References [3], [9], and [10] are all from 2025, which is contemporary work."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "This is a taxonomy/classification study, not a system with components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The study examines both root causes (15 categories) and symptoms (7 categories) across lifecycle stages, providing multiple analytical dimensions."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation of system outputs is not relevant here — the study IS a manual analysis of bug reports, not a system producing outputs to evaluate."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Not an ML evaluation study. No train/test split applies."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by root cause category (15 types), symptom category (7 types), and lifecycle stage (5 stages), with counts for each combination."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No specific example bug reports are discussed in detail. The taxonomy is presented only with aggregate counts, not illustrative cases."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No negative results or surprising non-findings are discussed. All findings are presented positively."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims about API Misuse/Incompatibility dominance and Self-Action concentration are supported by the counts in Figures 2-3 and the lifecycle distribution analysis."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper is descriptive — it classifies and counts bug types without making causal claims about why bugs occur."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title says 'Modern LLM Agent Frameworks' but only two frameworks (CrewAI, LangChain) are studied. The paper does not clearly bound its generalizability to these two frameworks."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No alternative explanations are discussed. For instance, the concentration in Self-Action could reflect reporting bias (users more likely to report execution bugs) rather than actual bug distribution."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper uses GitHub issue reports labeled 'bug' as a proxy for actual framework bugs, but does not discuss the gap between reported issues and actual bug prevalence, severity, or distribution."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "No LLMs are used in the methodology. This is a manual analysis study."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompting is used in this study."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No ML models or hyperparameters are involved in the methodology."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used in the study methodology."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 2.2 describes a two-stage filtering process: label filtering (retaining 'bug' labeled issues) reducing from 2,773 to 1,010, then manual inspection removing three categories of irrelevant reports, yielding 998."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No limitations or threats-to-validity section is present. Section 4 is 'Conclusion & Future Work' with no substantive limitations discussion."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed anywhere in the paper."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No explicit scope boundaries are stated. The paper does not clarify what its results do NOT show or which frameworks/scenarios are excluded from its claims."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Although the paper claims to release artifacts, no URL or archive is provided. The underlying GitHub issues are public but the labeled dataset is not available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 2.1 describes collecting from GitHub repositories of CrewAI and LangChain, spanning December 7, 2023 to January 10, 2026, with 2,773 original issues (1,660 CrewAI, 1,113 LangChain)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data source is public GitHub issue reports from specific repositories."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: 2,773 collected → label filtering to 1,010 → manual inspection to 998. Section 2.2 describes each stage with counts and criteria."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: Nantong University, Nanjing University, NTU Singapore, Beihang University, UMass Amherst, Xi'an Jiaotong University. No obvious conflicts with the evaluated frameworks."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This study does not evaluate any pre-trained model on a benchmark. It is a manual bug classification study."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No model evaluation on benchmarks is performed."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No model evaluation on benchmarks is performed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants. The study analyzes public GitHub issue reports."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "This is a manual analysis study, not a system with inference costs."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Manual analysis study with no significant compute requirements."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "API Misuse (32.97%) and API Incompatibility (22.34%) together account for over 55% of all agent framework bugs.",
    295       "evidence": "Figure 2 shows 329 API Misuse and 223 API Incompatibility out of 998 total bugs. Section 3.1.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Root causes concentrate in the Self-Action lifecycle stage, which contains most reported cases.",
    300       "evidence": "Section 3.1's lifecycle distribution analysis shows 882/998 bugs in the Self-Action stage, with API Misuse (289/882) and API Incompatibility (211/882) dominating.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "Functional Error is the dominant symptom (781/998), followed by Crash (100/998) and Build Failure (67/998).",
    305       "evidence": "Figure 3 and Section 3.2 provide the distribution counts.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "Framework bugs mainly manifest as breakdowns in workflow progression rather than isolated interface issues.",
    310       "evidence": "The concentration of Functional Error, Crash, and Build Failure symptoms in the Self-Action stage (Section 3.2) supports this, though the claim is somewhat interpretive.",
    311       "supported": "moderate"
    312     }
    313   ],
    314   "red_flags": [
    315     {
    316       "flag": "No inter-rater reliability metrics",
    317       "detail": "Two annotators labeled 998 issues but no Cohen's kappa, Krippendorff's alpha, or agreement rate is reported. The paper says annotators 'cross-check results' and hold 'online meetings to reach an agreement' but does not quantify initial disagreement or reliability."
    318     },
    319     {
    320       "flag": "No limitations section",
    321       "detail": "A 5-page empirical study with no discussion of threats to validity. Key unaddressed threats include: selection bias (only two frameworks), reporting bias (GitHub issues may not represent actual bug distribution), and generalizability beyond CrewAI/LangChain."
    322     },
    323     {
    324       "flag": "Overclaiming from narrow sample",
    325       "detail": "Title claims 'Modern LLM Agent Frameworks' (plural, general) but only studies CrewAI and LangChain. Other major frameworks (AutoGen, DSPy, LlamaIndex agents) are excluded without justification."
    326     },
    327     {
    328       "flag": "No illustrative examples",
    329       "detail": "998 bugs are classified into 15 root causes and 7 symptoms but not a single concrete bug example is shown. The reader cannot verify whether the taxonomy categories are applied correctly."
    330     },
    331     {
    332       "flag": "Artifacts promised but not delivered",
    333       "detail": "The paper lists 'Reproducible Artifacts' as a contribution, claiming to release dataset, taxonomy, and scripts, but provides no URL, repository link, or archive reference."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "Why do multi-agent LLM systems fail?",
    339       "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"],
    340       "year": 2025,
    341       "arxiv_id": "2503.13657",
    342       "relevance": "Directly studies failure modes in multi-agent LLM systems, complementary to framework-level bug analysis."
    343     },
    344     {
    345       "title": "Evaluating Large Language Models Trained on Code",
    346       "authors": ["Mark Chen", "Jerry Tworek"],
    347       "year": 2021,
    348       "arxiv_id": "2107.03374",
    349       "relevance": "Foundational Codex/HumanEval paper on LLM code generation capabilities."
    350     },
    351     {
    352       "title": "Large language model supply chain: A research agenda",
    353       "authors": ["Shenao Wang", "Yanjie Zhao", "Xinyi Hou", "Haoyu Wang"],
    354       "year": 2025,
    355       "relevance": "Frames the LLM software supply chain problem that framework bugs contribute to."
    356     },
    357     {
    358       "title": "A Characterization Study of Bugs in LLM Agent Workflow Orchestration Frameworks",
    359       "authors": ["Ziluo Xue", "Yanjie Zhao", "Shenao Wang"],
    360       "year": 2025,
    361       "relevance": "Most closely related prior work studying bugs in LLM agent libraries via static component mapping."
    362     },
    363     {
    364       "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems",
    365       "authors": ["Shaokun Zhang", "Ming Yin", "Jieyu Zhang"],
    366       "year": 2025,
    367       "arxiv_id": "2505.00212",
    368       "relevance": "Studies automated failure attribution in multi-agent systems, complementary perspective on agent failures."
    369     }
    370   ]
    371 }

Impressum · Datenschutz