calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (14607B)
      1 {
      2   "paper_slug": "agentfm-roleaware-failure-2025",
      3   "calibration_date": "2026-02-28",
      4   "model": "opus",
      5   "total_questions": 50,
      6   "agreement_count": 50,
      7   "disagreement_count": 0,
      8   "agreement_rate": 1.0,
      9   "disagreements": [],
     10   "opus_checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, GitHub link, Zenodo archive, or any other code release is mentioned anywhere in the paper."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The injected anomaly dataset, collected traces, metrics, and logs used for evaluation are not released. Apache IoTDB itself is public but the experimental data is not provided."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, conda environment, or dependency listing is provided. Only 'Qwen2.5-72b' and 'Apache IoTDB' are mentioned without environment setup details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No README, reproduction scripts, or step-by-step instructions are provided. Section 4 describes the experimental design at a high level but not how to reproduce it."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "All results in Tables 1, 2, and 3 are reported as point estimates only (e.g., 'F1-Score 95.76%'). No confidence intervals or error bars are provided."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No statistical significance tests are used. The preliminary study (Table 1) and main evaluation (Table 3) present raw numbers without any testing."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No effect sizes are reported. Without baselines in the main AgentFM evaluation, there is no reference point for effect size computation."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The evaluation uses 10 anomaly types with 20 injections each (200 total) but no justification is provided for why this sample size was chosen or whether it is sufficient for the claims made."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "All results are single-run point estimates. No variance, standard deviation, or results across multiple runs are reported."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The main evaluation of AgentFM (Table 3) includes no baseline comparisons. PLELog in Table 1 is used in the preliminary empirical study as a tool to motivate the approach, not as a comparison to AgentFM."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No baselines are included in the AgentFM evaluation, making this criterion impossible to satisfy. The preliminary study uses only PLELog (2021)."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "AgentFM has multiple components (system agents, data agents, task agents, meta-agent) but no ablation study tests the contribution of any individual component."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Section 4.2 reports precision, recall, and F1-score for both anomaly detection and failure diagnosis tasks."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 4.2 states: 'the mitigation solutions generated are manually analyzed to determine their usefulness' and 'we manually review the LLM-generated results.' The system's outputs are evaluated by humans."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The RAG+CoT approach uses 'historical data as labeled examples' for in-context learning, but it is not clear whether the test examples are separated from the historical examples used to guide the model."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Although 10 different anomaly types are injected, Table 3 reports only aggregate precision, recall, and F1 without per-anomaly-type breakdowns."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 4.2 notes: 'the model struggles to identify correct results when dealing with complex internal anomalies (e.g., excessive data import).' This is a specific failure case."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No systematic negative results are reported. The brief mention of difficulty with complex internal anomalies is qualitative and not quantified. No approaches tried and abandoned are described."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims 'preliminary evaluations using Apache IoTDB demonstrate the effectiveness of AgentFM.' Table 3 results (F1 95.76% detection, 87.62% diagnosis) support this appropriately hedged claim."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper argues role-awareness leads to better failure management and that AgentFM's architecture 'ensures a more nuanced and effective approach.' Without baselines or ablations, there is no causal evidence that role-awareness itself drives the performance."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper consistently uses 'preliminary' language and tests only on Apache IoTDB. The conclusion says 'preliminary experiments demonstrate the feasibility of AgentFM.' Scope is appropriately bounded."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "No alternative explanations for results are discussed. The paper does not consider whether simpler approaches or the LLM's pre-existing knowledge of Apache IoTDB could explain the results."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper states 'Qwen2.5-72b' but does not specify the variant (base vs. instruct), snapshot date, or checkpoint identifier. Multiple Qwen2.5-72B variants exist with substantially different behaviors."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "Section 3.3 describes a 'RAG+CoT approach' and notes that 'prompts vary by task,' but no actual prompt text is provided. Prompts are described only in natural language."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No temperature, top-p, max tokens, or other LLM configuration parameters are reported despite using an LLM API for all agents."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section 3 describes the multi-agent architecture in detail: meta-agent orchestration (Figure 1), System Role Manager (Figure 2), data agent pipelines (Sections 3.1-3.2), and sequential task agent execution (Section 3.3)."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3.2 documents the Metric Agent preprocessing (noise removal, imputation, NL conversion with formal notation) and Log Agent processing (sequence compression via log parsing, semantic compression via LLM summarization)."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion briefly mentions future work directions but does not substantively discuss limitations."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No threats to validity are discussed. Key threats such as single-system evaluation, lack of baselines, manual anomaly injection methodology, and unclear train/test separation are not addressed."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "While 'preliminary' language is used throughout, the paper does not explicitly state what the results do not show or what settings/systems are excluded from the claims."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw data (traces, metrics, logs, anomaly injection records) is made available for independent verification."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 4.1 describes the data collection: 10 anomaly types are listed explicitly (CPU saturation, IO saturation, memory saturation, etc.), each injected 20 times into Apache IoTDB."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "This is a system evaluation with manually injected anomalies on a database; there are no human participants or recruited samples, so no recruitment description is required."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "While Section 3.2 describes preprocessing components, the full pipeline from anomaly injection to final evaluation numbers is not documented. How normal vs. anomalous windows were defined, how many instances per class were collected, and how RAG examples were selected remain unclear."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgement section states: 'This work is supported by Key-Area Research and Development Program of Guangdong Province, China (NO.2020B010164003).'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All author affiliations are listed on the title page: five from Peking University and one (Yunpeng Zhai) from Alibaba Group."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funder (Guangdong Province government research program) is a government body with no apparent financial stake in AgentFM's performance or Apache IoTDB."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial disclosure statement is present. Yunpeng Zhai is from Alibaba Group, and the paper references Alibaba Cloud and OceanBase as motivation, but no conflict-of-interest declaration is provided."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper uses Qwen2.5-72b for anomaly detection and diagnosis but does not state its training data cutoff. The LLM's prior knowledge of Apache IoTDB patterns could influence results."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No discussion of whether Qwen2.5-72b's training data includes Apache IoTDB documentation, log patterns, or similar system operational knowledge that could affect evaluation results."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The evaluation uses manually injected anomalies on a live Apache IoTDB instance, not a published benchmark dataset. The test cases were created by the authors and were not publicly available before the model's training."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants; pre-registration is not applicable."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants; IRB approval is not applicable."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants; demographics are not applicable."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants; inclusion/exclusion criteria are not applicable."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants or experimental human study design; randomization is not applicable."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants or experimental human study; blinding is not applicable."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants; attrition is not applicable."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference cost, API cost, latency, or token consumption is reported despite AgentFM making multiple LLM calls across several agents per failure event."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No GPU hours, total API spend, hardware configuration, or compute time is stated for running Qwen2.5-72b or the experiments."
    281       }
    282     }
    283   }
    284 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs