ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (22538B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Bugs in Modern LLM Agent Frameworks: An Empirical Study",
      6     "authors": [
      7       "Xinxue Zhu",
      8       "Jiacong Wu",
      9       "Xiaoyu Zhang",
     10       "Tianlin Li",
     11       "Yanzhou Mu",
     12       "Juan Zhai",
     13       "Chao Shen",
     14       "Chunrong Fang",
     15       "Yang Liu"
     16     ],
     17     "year": 2026,
     18     "venue": "FSE",
     19     "arxiv_id": "2602.21806",
     20     "doi": null
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "All abstract claims (998 bugs analyzed, 15 root causes, 7 symptoms, 5 lifecycle stages, API Misuse 32.97%, API Incompatibility 22.34%, Self-Action concentration) are explicitly supported by Results section data.",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": false,
     32         "answer": false,
     33         "justification": "Paper presents taxonomy and distributions, not causal claims. No causal inference required for descriptive taxonomy work.",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Study limited to CrewAI and LangChain, but title/conclusions generalize to 'modern LLM agent frameworks' and 'LLM software supply chain' without explicitly bounding these claims.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Paper presents taxonomy without discussing alternative interpretations. No consideration of reporting bias, labeling bias, or alternative frameworks for organizing root causes.",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Root causes inferred from issue descriptions rather than code analysis. No explicit discussion of whether manually-inferred causes match actual code-level causation or whether GitHub issues capture true bug distribution.",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No dedicated limitations or threats-to-validity section. Conclusion mentions future work but offers no systematic discussion of study limitations.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No specific threats discussed. No inter-rater agreement metrics, annotator bias analysis, or discussion of sampling limitations despite manual labeling being the core process.",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "Scope limited to two frameworks and GitHub issues spanning Dec 2023-Jan 2026, but boundaries not stated as explicit limitations. Title claims broader applicability without qualification.",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No funding sources disclosed in the paper.",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Each author lists institutional affiliation (Nantong, Nanjing, NTU Singapore, Beihang, UMass Amherst, Xi'an Jiaotong).",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No funding disclosed; not applicable.",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests or financial disclosures statement provided.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "Key terms like 'agent framework,' 'root cause,' and 'symptom' used without formal definitions, though operational definition of 'bug' is provided via two-stage filtering criteria.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Paper explicitly lists three contributions: lifecycle-oriented taxonomy, empirical findings (15 root causes, 7 symptoms), and released artifacts. Contribution clearly framed.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Paper positions against prior work on agent-level failures vs. framework-level bugs (refs 3, 9, 10), though related work discussion is brief and concentrated in introduction.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "Paper states 'We release our curated dataset, taxonomy definitions, and analysis scripts' but provides no link, repository, or supplementary materials URL.",
    129           "source": "haiku"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Claims to release 'curated dataset' without providing link, repository, or supplementary materials. Original GitHub issues are public but labeled/processed version not available.",
    135           "source": "haiku"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency declarations provided.",
    141           "source": "haiku"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Methodology describes process but not in reproducible detail. Actual reproduction requires access to curated labeled dataset (not provided) or redoing entire manual annotation.",
    147           "source": "haiku"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "Results report frequencies (329/998, 223/998) and percentages (32.97%, 22.34%) but no confidence intervals or uncertainty bounds.",
    155           "source": "haiku"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "No statistical significance tests (chi-square, Fisher's exact, etc.) reported for distributions or comparisons across frameworks or lifecycle stages.",
    161           "source": "haiku"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Results report proportions as percentages but these are descriptive, not comparative. No effect sizes from between-group contrasts.",
    167           "source": "haiku"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Final sample of 998 bugs (from 2,773 collected) is not justified. No power analysis or discussion of adequacy for detecting patterns.",
    173           "source": "haiku"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "Results presented as point counts and percentages without error bars, confidence intervals, or variance estimates. No uncertainty quantification.",
    179           "source": "haiku"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": false,
    185           "answer": false,
    186           "justification": "Descriptive taxonomy study, not a comparative evaluation; baseline comparisons not applicable.",
    187           "source": "haiku"
    188         },
    189         "baselines_contemporary": {
    190           "applies": false,
    191           "answer": false,
    192           "justification": "Not applicable to taxonomy study.",
    193           "source": "haiku"
    194         },
    195         "ablation_study": {
    196           "applies": false,
    197           "answer": false,
    198           "justification": "Not applicable to taxonomy work.",
    199           "source": "haiku"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Study examines bugs from multiple perspectives: 15 root cause categories, 7 symptom categories, and distribution across 5 lifecycle stages. Multi-faceted analysis provided.",
    205           "source": "haiku"
    206         },
    207         "human_evaluation": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "Two annotators label bugs, but this is data labeling, not evaluation of system outputs. No user study or user-facing evaluation.",
    211           "source": "haiku"
    212         },
    213         "held_out_test_set": {
    214           "applies": false,
    215           "answer": false,
    216           "justification": "Not applicable; not a prediction task.",
    217           "source": "haiku"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Extensive per-category analysis: root causes broken down into 15 categories with counts (Figure 2), symptoms into 7 categories (Figure 3), and lifecycle stage distribution detailed across all stages.",
    223           "source": "haiku"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "Taxonomy describes failure modes (root causes/symptoms) but provides limited detailed case examples or rich qualitative illustrations beyond category membership.",
    229           "source": "haiku"
    230         },
    231         "negative_results_reported": {
    232           "applies": false,
    233           "answer": false,
    234           "justification": "Descriptive study without hypothesis-driven negative results. All findings presented uniformly without surprise or null findings.",
    235           "source": "haiku"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": false,
    241           "answer": false,
    242           "justification": "Not evaluating models; not applicable.",
    243           "source": "haiku"
    244         },
    245         "prompts_provided": {
    246           "applies": false,
    247           "answer": false,
    248           "justification": "Not applicable; no prompts or LLM usage in the study.",
    249           "source": "haiku"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "Not applicable.",
    255           "source": "haiku"
    256         },
    257         "scaffolding_described": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "High-level framework characteristics mentioned ('LangChain offers rich abstractions; CrewAI focuses on role-based collaboration') but insufficient detail on internal APIs, execution semantics, or implementation to fully understand frameworks.",
    261           "source": "haiku"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Two-stage preprocessing documented: (1) label filtering for 'bug' label, (2) manual inspection excluding 'documentation typos,' 'usage questions,' and 'infrastructure issues.' Criteria and process clearly described.",
    267           "source": "haiku"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "Original GitHub issues are public but curated/labeled dataset is not provided. Cannot independently verify annotations.",
    275           "source": "haiku"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Collection procedure well-documented: full scraping of GitHub (both open/closed issues), 2,773 total issues (1,660 CrewAI, 1,113 LangChain), time span Dec 7 2023–Jan 10 2026, and data elements extracted (title, labels, content, comments).",
    281           "source": "haiku"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": false,
    285           "answer": false,
    286           "justification": "Not a human subjects study; not applicable.",
    287           "source": "haiku"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "Full pipeline documented: GitHub collection → label filtering → manual inspection → initial taxonomy construction (100 samples) → large-scale annotation. Process and stages clearly described with Figure 1 overview.",
    293           "source": "haiku"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "Not evaluating models on benchmarks; not applicable.",
    301           "source": "haiku"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "Not applicable.",
    307           "source": "haiku"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "Not applicable.",
    313           "source": "haiku"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants; not applicable.",
    321           "source": "haiku"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human subjects; not applicable.",
    327           "source": "haiku"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "Not applicable.",
    333           "source": "haiku"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "Not applicable.",
    339           "source": "haiku"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "Not applicable.",
    345           "source": "haiku"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "Not applicable.",
    351           "source": "haiku"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "Not applicable.",
    357           "source": "haiku"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": false,
    363           "answer": false,
    364           "justification": "Taxonomy study, not a system with inference costs; not applicable.",
    365           "source": "haiku"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No mention of computational resources or time investment for manual annotation of 998 bugs by two researchers over the study period.",
    371           "source": "haiku"
    372         }
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "API Misuse (32.97%) and API Incompatibility (22.34%) together account for over 55% of agent framework bugs",
    379       "evidence": "Analysis of 998 labeled bug reports; 329 API Misuse + 223 API Incompatibility = 552/998 bugs",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Self-Action stage contains the highest concentration of bugs (88% of bugs)",
    384       "evidence": "Lifecycle stage distribution: 882/998 bugs mapped to Self-Action stage; detailed breakdown across all 5 stages provided",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Framework bugs manifest primarily as Functional Error (78%), Crash (10%), and Build Failure (7%)",
    389       "evidence": "Symptom analysis reported in Figure 3: S2 Functional Error 781/998, S1 Crash 100/998, S3 Build Failure 67/998",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Execution semantics mechanisms are the dominant source of framework failures",
    394       "evidence": "Self-Action stage concentration (88%) and API-related root causes (55%) suggest execution-level problems dominate over interface issues",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "CrewAI and LangChain represent suitable agent frameworks for understanding modern LLM agent bugs",
    399       "evidence": "Justified by \"representative and widely used,\" \"68.5k stars on GitHub,\" complementary design emphases; no independent validation",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "Curated dataset, taxonomy definitions, and analysis scripts will be released to enable replication",
    404       "evidence": "Stated in abstract and contributions section; no link, repository, or supplementary materials provided with paper",
    405       "supported": "weak"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "observational",
    410     "case-study"
    411   ],
    412   "key_findings": "The paper characterizes 998 bug reports from CrewAI and LangChain via a lifecycle-oriented taxonomy, finding that 55% of bugs stem from API-related issues (misuse + incompatibility) and that 88% concentrate in the Self-Action (execution) stage, where planning and tool invocation occur. Bugs primarily manifest as functional errors (78%), crashes (10%), and build failures (7%), indicating execution-level disruptions rather than isolated interface problems. This taxonomy across five agent lifecycle stages (Initialization, Perception, Self-Action, Mutual Interaction, Evolution) provides a structured lens for understanding how framework-level issues propagate during agent execution.",
    413   "red_flags": [
    414     {
    415       "flag": "No inter-rater reliability metrics",
    416       "detail": "Two annotators labeled all 998 bugs but no Cohen's kappa, agreement rate, or conflict resolution statistics reported. Prevents assessment of labeling consistency."
    417     },
    418     {
    419       "flag": "No statistical analysis",
    420       "detail": "Frequencies and percentages reported without confidence intervals, significance tests, or hypothesis testing. No uncertainty quantification."
    421     },
    422     {
    423       "flag": "Inferred root causes, not validated",
    424       "detail": "Root causes inferred from GitHub issue descriptions rather than code analysis or detailed investigation. Gap between inferred and actual causation."
    425     },
    426     {
    427       "flag": "Limited generalization scope",
    428       "detail": "Study limited to 2 frameworks (CrewAI, LangChain) but title and conclusions generalize to 'modern LLM agent frameworks' without explicit qualification."
    429     },
    430     {
    431       "flag": "No threats-to-validity discussion",
    432       "detail": "Paper lacks dedicated limitations or threats section. No discussion of sampling bias, reporting bias, annotator bias, or other validity threats."
    433     },
    434     {
    435       "flag": "Artifacts not provided",
    436       "detail": "Paper claims to release curated dataset and analysis scripts but provides no link, repository URL, or supplementary materials."
    437     },
    438     {
    439       "flag": "Potential reporting bias",
    440       "detail": "GitHub issues reflect what users report, not the full universe of bugs. Some bugs unreported, others over-reported. Frequency may not reflect actual prevalence."
    441     },
    442     {
    443       "flag": "Framework selection not justified",
    444       "detail": "CrewAI and LangChain chosen for being 'representative,' but no systematic justification or comparison against other agent frameworks."
    445     }
    446   ],
    447   "cited_papers": [
    448     {
    449       "title": "Why do multi-agent LLM systems fail?",
    450       "authors": "Cemri et al.",
    451       "year": 2025,
    452       "relevance": "Prior work on agent-level failures; this paper studies framework-level bugs as distinct from agent reasoning failures"
    453     },
    454     {
    455       "title": "A Characterization Study of Bugs in LLM Agent Workflow Orchestration Frameworks",
    456       "authors": "Xue et al.",
    457       "year": 2025,
    458       "relevance": "Closely related work analyzing agent library bugs; distinguishes this paper's dynamic lifecycle approach from static component mapping"
    459     },
    460     {
    461       "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems",
    462       "authors": "Zhang et al.",
    463       "year": 2025,
    464       "relevance": "Related work on agent failure analysis; complements framework-level bug taxonomy"
    465     },
    466     {
    467       "title": "Large language model supply chain: A research agenda",
    468       "authors": "Wang et al.",
    469       "year": 2025,
    470       "relevance": "Contextualizes framework bugs within LLM software supply chain security and quality concerns"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "Taxonomy helps framework developers and maintainers identify high-risk areas (Self-Action stage, API-related bugs) but provides limited actionable guidance for improvement."
    477     },
    478     "surprise_contrarian": {
    479       "score": 1,
    480       "justification": "Finding that execution/orchestration is the main bug source is fairly predictable given complexity of agent execution semantics; no surprising contrarian insight."
    481     },
    482     "fear_safety": {
    483       "score": 1,
    484       "justification": "Mentions 'security risks' and 'supply chain threat' once but does not investigate or emphasize safety/security concerns beyond acknowledgment."
    485     },
    486     "drama_conflict": {
    487       "score": 0,
    488       "justification": "Neutral technical taxonomy work without contentious claims, novel controversies, or dramatic findings."
    489     },
    490     "demo_ability": {
    491       "score": 1,
    492       "justification": "CrewAI and LangChain are publicly available and can be used, but study's taxonomy and curated dataset are not provided, limiting reproducibility or demonstration of findings."
    493     },
    494     "brand_recognition": {
    495       "score": 2,
    496       "justification": "Studies well-known frameworks (CrewAI, LangChain) but authors span diverse institutions of mixed prestige (Nantong, Nanjing, NTU Singapore, Beihang, UMass Amherst, Xi'an Jiaotong)."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [],
    501     "top_points": 0,
    502     "total_points": 0,
    503     "total_comments": 0
    504   }
    505 }

Impressum · Datenschutz