scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25469B)
      1 {
      2   "paper": {
      3     "title": "AgentFM: Role-Aware Failure Management for Distributed Databases with LLM-Driven Multi-Agents",
      4     "authors": [
      5       "Lingzhe Zhang",
      6       "Yunpeng Zhai",
      7       "Tong Jia",
      8       "Xiaosong Huang",
      9       "Chiming Duan",
     10       "Ying Li"
     11     ],
     12     "year": 2025,
     13     "venue": "FSE '25 (33rd ACM International Conference on the Foundations of Software Engineering)",
     14     "arxiv_id": "2504.06614",
     15     "doi": "10.1145/3696630.3728492"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No GitHub link, repository URL, or archive is provided anywhere in the paper. There is no mention of code release."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The injected anomaly dataset used for evaluation is not released. The paper uses Apache IoTDB as the test platform but the specific experimental data (injected anomalies, collected traces, metrics, logs) is not made available."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No requirements file, Dockerfile, or dependency listing is provided. The paper mentions using Qwen2.5-72b and Apache IoTDB but gives no environment setup details."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No README, script, or step-by-step instructions for reproducing the experiments are provided. The evaluation section (Section 4) describes the experimental design but not how to reproduce it."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results in Tables 1, 2, and 3 are reported as point estimates only (e.g., 'Anomaly Detection: F1-Score 95.76%'). No confidence intervals or error bars are reported."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No statistical significance tests are conducted. The preliminary study (Table 1) and main evaluation (Table 3) compare numbers directly without any statistical testing."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No effect sizes are reported. There are no baseline comparisons for the main evaluation of AgentFM, so effect size reporting is not possible in any case."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The evaluation uses 10 anomaly types with 20 injections each (200 total), but no justification is given for why this sample size was chosen or whether it is sufficient for the precision/recall claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "All results are single-run point estimates. No variance, standard deviation, or results across multiple runs are reported."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The main evaluation of AgentFM (Table 3) includes no baselines. PLELog is used in the preliminary empirical study (Table 1) as a demonstration tool, not as a comparison to AgentFM. AgentFM's detection/diagnosis numbers are reported in isolation."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No baselines are included in the AgentFM evaluation, so there are none whose recency could be assessed. The preliminary study uses PLELog (2021) as the sole method, without comparing against more recent approaches."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No ablation study is provided. The paper proposes a multi-component system (system agents, data agents, task agents, meta-agent) but does not test the contribution of any individual component."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section 4.2 reports precision, recall, and F1-score for both anomaly detection and failure diagnosis, and additionally includes manual analysis of mitigation solutions (Figure 3)."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section 4.2 states 'the mitigation solutions generated are manually analyzed to determine their usefulness' and 'For failure diagnosis, we manually review the LLM-generated results.' Human evaluation of the system's outputs is included."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "The paper describes manually injecting anomalies into Apache IoTDB but does not describe a train/test split. The RAG+CoT approach uses 'historical data as labeled examples,' but it is unclear whether the test examples are separate from the historical examples used to guide the model."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Table 3 reports aggregate precision, recall, and F1 for anomaly detection and failure diagnosis as a whole, but does not break down results by anomaly type (e.g., CPU saturation vs. network partition vs. excessive import)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 4.2 explicitly notes: 'the model struggles to identify correct results when dealing with complex internal anomalies (e.g., excessive data import).' This is a specific failure case discussion."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "No systematic negative results or ablation failures are reported. The single failure mention (excessive data import) is qualitative and not quantified. The paper presents AgentFM's performance favorably without discussing approaches that were tried and abandoned."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims 'preliminary evaluations using Apache IoTDB demonstrate the effectiveness of AgentFM.' The results in Table 3 (F1 of 95.76% for detection, 87.62% for diagnosis) support this modest claim, and the paper consistently uses hedging language ('preliminary')."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes causal claims that role-awareness leads to better failure management (the core design argument), and that AgentFM's architecture 'facilitates specialized agents for each role—ensuring a more nuanced and effective approach.' However, without baselines, there is no causal evidence that role-awareness itself improves performance."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper consistently describes results as 'preliminary' and explicitly states experiments are conducted on Apache IoTDB only. The conclusion says 'preliminary experiments demonstrate the feasibility of AgentFM' and future work plans are acknowledged, so the scope is appropriately bounded."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "No alternative explanations for the observed performance are considered. The paper does not discuss whether simpler baselines might achieve similar results, or whether the high detection F1 could be inflated by the test design (e.g., 20 injections per anomaly type, relatively simple scenarios)."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper states 'Qwen2.5-72b' is used but provides no snapshot date, API version, or specific model checkpoint identifier. Model behavior can vary significantly across versions."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Section 3.3 describes that 'all agents follow a common RAG+CoT approach' and that 'prompts vary by task,' but no actual prompt text is provided. The paper describes what prompts do in natural language without showing the actual text sent to the model."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No temperature, top-p, max tokens, or other LLM hyperparameters are reported. The paper mentions using 'function call feature' for detection but provides no configuration details."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 3 describes the multi-agent architecture in detail, including the meta-agent orchestration (Figure 1), System Role Manager workflow (Figure 2), data agent preprocessing pipelines (Sections 3.1-3.2), and the sequential task agent execution (Section 3.3). The scaffolding is described at a design level, though implementation details are sparse."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3.2 documents the Metric Agent preprocessing pipeline (noise removal, imputation, conversion to natural language) and Log Agent processing (sequence compression via log parsing, semantic compression via LLM summarization), with formal notation."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated limitations or threats-to-validity section. Section 5 (Conclusion) mentions only that the model 'struggles to identify correct results when dealing with complex internal anomalies' as a future-work direction, not as a substantive limitations discussion."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No threats to validity are discussed at all. Key threats—such as the evaluation on a single database system, lack of baselines, manual anomaly injection methodology, and the use of manually reviewed mitigation solutions—are not addressed."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "While the paper uses 'preliminary' language throughout, it does not explicitly state what the results do not show. For example, it does not state that results are limited to time-series databases, single-cluster configurations, or the specific anomaly types tested."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw data (traces, metrics, logs, injected anomaly records) is made available. The experimental data is entirely inaccessible to independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4.1 describes the data collection procedure: 10 anomaly types (listed explicitly) were manually injected into Apache IoTDB, each 20 times, generating traces/metrics/logs from the running system."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "This is a benchmark-style evaluation with manually injected anomalies on a database system; there are no human participants or samples that require recruitment description."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "While the preprocessing components are described in Section 3.2, the full pipeline from anomaly injection to final evaluation numbers is not documented. It is unclear how many log/metric instances were collected, how anomalous vs. normal windows were defined for detection, and how training examples were selected for RAG."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Acknowledgement section states: 'This work is supported by Key-Area Research and Development Program of Guangdong Province, China (NO.2020B010164003).'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are listed on the title page: five authors are from Peking University and one (Yunpeng Zhai) is from Alibaba Group. Affiliations are clearly disclosed."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The funder (Guangdong Province government research program) is a government funding body with no apparent financial stake in the AgentFM framework or Apache IoTDB performance results."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "There is no competing interests or financial disclosure statement. Yunpeng Zhai is affiliated with Alibaba Group, and the paper references Alibaba OceanBase and Alibaba Cloud business impacts, but no conflict-of-interest declaration is provided."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper uses Qwen2.5-72b but does not state the model's training data cutoff date. This is relevant because the model might have been trained on documentation or logs from Apache IoTDB or similar systems."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of potential overlap between Qwen2.5-72b's training data and the Apache IoTDB documentation, configuration files, or known anomaly patterns used in the evaluation."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "The evaluation uses manually injected anomalies on a live Apache IoTDB instance rather than a public benchmark dataset. The test cases were created by the authors and not published prior to the model's training, so benchmark contamination in the traditional sense does not apply."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "This is a system evaluation paper with no human subjects study; pre-registration is not applicable."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved; IRB approval is not applicable."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved; demographics are not applicable."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants are involved; inclusion/exclusion criteria are not applicable."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants or experimental human study design; randomization is not applicable."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human-participant experimental study; blinding is not applicable."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants; attrition is not applicable."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost, API cost, latency, or tokens consumed are reported. AgentFM calls Qwen2.5-72b across multiple agents per failure event, which could be expensive in practice, but costs are not quantified."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No GPU hours, API spend, hardware configuration, or compute time is stated. The paper does not disclose what compute resources were used to run the Qwen2.5-72b model or the experiments."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Different nodes in a distributed database have varying significance for failure management; Node6 achieves the best anomaly detection performance due to hosting the largest number of leader partitions.",
    294       "evidence": "Table 1 in Section 2.1 shows PLELog-based anomaly detection F1-scores per node ranging from 47.53% (Node4) to 90.73% (Node6) for a manually injected 'excessive data export' anomaly.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "Metrics are effective for detecting resource anomalies (CPU, memory) while logs excel at identifying internal database issues (data export/import, configuration errors).",
    299       "evidence": "Table 2 in Section 2.2 shows anomaly classification results using DTW-based classification on metrics vs. log data across 5 anomaly categories, with an F1 > 50% threshold for identifiability.",
    300       "supported": "weak"
    301     },
    302     {
    303       "claim": "AgentFM achieves 95.76% F1-score for anomaly detection and 87.62% F1-score for failure diagnosis on Apache IoTDB.",
    304       "evidence": "Table 3 in Section 4.2 reports precision/recall/F1 for anomaly detection (95.14%/97.03%/95.76%) and failure diagnosis (89.61%/87.04%/87.62%) using Qwen2.5-72b with 200 injected anomalies.",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "AgentFM generates targeted and effective mitigation solutions for system remediation.",
    309       "evidence": "Section 4.2 and Figure 3 present one example mitigation output (recommendations for CPU resource increase and load balancing), described as 'targeted and effective assistance.'",
    310       "supported": "weak"
    311     }
    312   ],
    313   "methodology_tags": [
    314     "benchmark-eval",
    315     "case-study"
    316   ],
    317   "key_findings": "AgentFM proposes a role-aware multi-agent framework for distributed database failure management that distinguishes system roles (node types), data roles (metrics vs. logs), and task roles (detection, diagnosis, mitigation). A preliminary empirical study on Apache IoTDB shows that node role correlates with anomaly detection performance and that metrics and logs identify different anomaly types. Preliminary evaluation using Qwen2.5-72b on 200 manually injected anomalies shows 95.76% F1 for detection and 87.62% F1 for diagnosis, though without baselines for comparison.",
    318   "red_flags": [
    319     {
    320       "flag": "No baseline comparison",
    321       "detail": "The main evaluation of AgentFM (Table 3) includes no comparison to any prior system or baseline. There is no way to assess whether the reported detection/diagnosis scores represent an improvement over simpler or existing approaches."
    322     },
    323     {
    324       "flag": "Single-run results, no variance",
    325       "detail": "All performance numbers are reported as single-point estimates with no repeated runs, standard deviation, or confidence intervals. Results may not be stable across different random seeds, prompt orderings, or anomaly injection timings."
    326     },
    327     {
    328       "flag": "Manual mitigation evaluation with single example",
    329       "detail": "The mitigation solution quality is assessed by manual review, and only a single example is shown (Figure 3). This provides very weak evidence of effectiveness across diverse failure scenarios."
    330     },
    331     {
    332       "flag": "No per-category breakdown despite multiple anomaly types",
    333       "detail": "Although 10 different anomaly types are injected, results are reported only as aggregate F1 scores. The single qualitative failure mention (excessive data import) suggests the aggregate score masks significant variation across anomaly types."
    334     },
    335     {
    336       "flag": "Model version underspecified",
    337       "detail": "The paper uses 'Qwen2.5-72b' without a snapshot date or checkpoint identifier. Given that Qwen2.5-72b may have been trained on Apache IoTDB documentation and logs, the model version specification matters for reproducibility and contamination assessment."
    338     },
    339     {
    340       "flag": "Alibaba affiliation without conflict-of-interest disclosure",
    341       "detail": "Co-author Yunpeng Zhai is affiliated with Alibaba Group, and the paper cites Alibaba Cloud and OceanBase as motivation. No competing interests statement is provided despite the potential financial interest."
    342     },
    343     {
    344       "flag": "Unclear train/test separation for RAG examples",
    345       "detail": "The RAG+CoT approach uses 'historical data as labeled examples' but it is not clear whether these labeled examples are drawn from the same 200 injected anomalies used for evaluation, which would constitute test leakage."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "Recommending root-cause and mitigation steps for cloud incidents using large language models",
    351       "authors": ["Toufique Ahmed", "Supriyo Ghosh", "Chetan Bansal", "Thomas Zimmermann", "Xuchao Zhang", "Saravan Rajmohan"],
    352       "year": 2023,
    353       "relevance": "Directly related work on using LLMs for cloud incident root cause analysis and mitigation recommendation."
    354     },
    355     {
    356       "title": "Eadro: An end-to-end troubleshooting framework for microservices on multi-source data",
    357       "authors": ["Cheryl Lee", "Tianyi Yang", "Zhuangbin Chen", "Yuxin Su", "Michael R Lyu"],
    358       "year": 2023,
    359       "relevance": "Related work on multimodal (trace+log+metric) failure management for microservice systems, a key comparison point."
    360     },
    361     {
    362       "title": "A Survey of AIOps for Failure Management in the Era of Large Language Models",
    363       "authors": ["Lingzhe Zhang", "Tong Jia", "Mengxi Jia", "Yifan Wu", "Aiwei Liu", "Yong Yang", "Zhonghai Wu", "Xuming Hu", "Philip S Yu", "Ying Li"],
    364       "year": 2024,
    365       "arxiv_id": "2406.11213",
    366       "relevance": "Survey of LLM-based AIOps for failure management, directly scoping the research landscape this paper contributes to."
    367     },
    368     {
    369       "title": "Building AI Agents for Autonomous Clouds: Challenges and Design Principles",
    370       "authors": ["Manish Shetty", "Yinfang Chen", "Gagan Somashekar", "Minghua Ma", "Yogesh Simmhan", "Xuchao Zhang", "Jonathan Mace", "Dax Vandevoorde", "Pedro Las-Casas", "Shachee Mishra Gupta"],
    371       "year": 2024,
    372       "relevance": "Related work on autonomous cloud operations using AI agents, relevant to the agentic AIOps theme."
    373     },
    374     {
    375       "title": "Exploring llm-based agents for root cause analysis",
    376       "authors": ["Devjeet Roy", "Xuchao Zhang", "Rashi Bhave", "Chetan Bansal", "Pedro Las-Casas", "Rodrigo Fonseca", "Saravan Rajmohan"],
    377       "year": 2024,
    378       "relevance": "LLM-based agent approach for root cause analysis, a direct comparison to AgentFM's diagnosis capability."
    379     },
    380     {
    381       "title": "mABC: multi-Agent Blockchain-Inspired Collaboration for root cause analysis in micro-services architecture",
    382       "authors": ["Wei Zhang", "Hongcheng Guo", "Jian Yang", "Yi Zhang", "Chaoran Yan", "Zhoujin Tian", "Hangyuan Ji", "Zhoujun Li", "Tongliang Li", "Tieqiao Zheng"],
    383       "year": 2024,
    384       "arxiv_id": "2404.12135",
    385       "relevance": "Multi-agent approach to root cause analysis in microservices, closely related to AgentFM's multi-agent architecture."
    386     },
    387     {
    388       "title": "Semi-supervised log-based anomaly detection via probabilistic label estimation",
    389       "authors": ["Lin Yang", "Junjie Chen", "Zan Wang", "Weijing Wang", "Jiajun Jiang", "Xuyuan Dong", "Wenbin Zhang"],
    390       "year": 2021,
    391       "relevance": "PLELog is the anomaly detection method used in the preliminary empirical study (Table 1); it is not evaluated as a baseline against AgentFM."
    392     },
    393     {
    394       "title": "Automated root causing of cloud incidents using in-context learning with gpt-4",
    395       "authors": ["Xuchao Zhang", "Supriyo Ghosh", "Chetan Bansal", "Rujia Wang", "Minghua Ma", "Yu Kang", "Saravan Rajmohan"],
    396       "year": 2024,
    397       "relevance": "In-context learning with GPT-4 for cloud incident root causing, directly relevant to LLM-based AIOps."
    398     },
    399     {
    400       "title": "Multivariate Log-based Anomaly Detection for Distributed Database",
    401       "authors": ["Lingzhe Zhang", "Tong Jia", "Mengxi Jia", "Ying Li", "Yong Yang", "Zhonghai Wu"],
    402       "year": 2024,
    403       "relevance": "Prior work from the same group on log-based anomaly detection for distributed databases, a direct predecessor to AgentFM."
    404     },
    405     {
    406       "title": "Robust failure diagnosis of microservice system through multimodal data",
    407       "authors": ["Shenglin Zhang", "Pengxiang Jin", "Zihan Lin", "Yongqian Sun", "Bicheng Zhang", "Sibo Xia", "Zhengdan Li", "Zhenyu Zhong", "Minghua Ma", "Wa Jin"],
    408       "year": 2023,
    409       "relevance": "Multimodal failure diagnosis for microservices, relevant as a comparison system for multi-source data integration approaches."
    410     }
    411   ]
    412 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs