scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (16362B)
      1 {
      2   "paper": {
      3     "title": "Chaos Engineering 2.0: A Review of AI-Driven, Policy-Guided Resilience for Multi-Cloud Systems",
      4     "authors": ["Lasbrey Chibuzo Opara", "Ogheneruemu Nathaniel Akatakpo", "Ifeanyi Charles Ironuru", "Kingsley Anyaene", "Benjamin Osaze Enobakhare"],
      5     "year": 2025,
      6     "venue": "Journal of Computer, Software, and Program (JCSP)",
      7     "doi": "10.69739/jcsp.v2i2.846"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code or analysis scripts are released. The paper includes illustrative Terraform snippets but no repository or archive."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset, search corpus, or extracted data is released."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a narrative review with no computational experiments requiring an environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No instructions for reproducing the literature search or analysis are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "Narrative review with no original quantitative experiments."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No statistical comparisons are made by the authors."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No original experiments; the paper cites vendor-reported figures but does not conduct its own analysis."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Narrative review, not an empirical study."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Narrative review, no experimental runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 2 compares Chaos 1.0 vs Chaos 2.0 across multiple dimensions, serving as a baseline comparison framework."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No experimental evaluation where baseline recency is relevant."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system with components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No experimental evaluation with metrics."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No system outputs to evaluate."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No experimental evaluation."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 1 provides per-tool breakdowns across multi-cloud support, AI hooks, IaC integration, observability, and policy guardrails."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The case studies discuss specific failure modes discovered (cache stampede, shared storage choke point in blue/green rollback)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that open-source chaos tools have limited production adoption (single-digit percentage) and that Chaos 1.0 had blind spots."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims the review synthesizes resilience patterns and presents case studies, which the paper body delivers. Claims are descriptive rather than quantitative."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims like 'organizations using AI planning cut mean time-to-resolution by up to 90%' and 'chaos drills could have mitigated' CrowdStrike impact, based on vendor marketing data with no independent verification or causal identification strategy."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims coverage of 'Multi-Cloud Systems' broadly, but the two case studies are illustrative anecdotes without stated generalization boundaries. The paper does not clearly state which settings its findings do and do not apply to."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed for the claimed benefits. Vendor-reported ROI figures are taken at face value without considering selection bias, confounds, or alternative causes for improved resilience."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No LLM or ML model evaluation is conducted."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No model experiments."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used by the authors."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section 3 describes databases and search queries but provides no counts of results at each screening stage, no specific inclusion/exclusion criteria beyond broad categories, and no PRISMA-style flow."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section anywhere in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "Apart from a remark in Section 3.1 that it is 'not a scoping or full systematic review', the paper does not explicitly state what it does not cover or what claims it is not making."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No list of included/excluded papers, no raw extraction data, no supplementary materials."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 describes the databases searched, the time window (2011 to August 2025), and the core queries used."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; data sources are databases, not recruited subjects."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The methodology section lists databases and queries but does not document how many papers were found, screened, or included at each stage. The extraction framework (Section 3.4) lists fields but not counts."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding disclosure or acknowledgments section is present."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: universities in Nigeria and Lithuania, plus one industry engineer at Peterbilt Motors."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "No pre-trained model is evaluated on any benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No model evaluation."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No model evaluation."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Survey/review paper with no method of its own to cost."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Survey/review paper."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Organizations using AI planning for chaos engineering cut mean time-to-resolution by up to 90%",
    286       "evidence": "Cited from Gremlin's 2023 'State of Chaos Engineering' report (Section 1, Introduction). No independent verification provided.",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "Chaos Engineering 2.0 with AI-guided orchestration reduces experiment setup time from hours to minutes",
    291       "evidence": "Based on Harness's January 2025 GenAI agent release (Section 4.2.1). Vendor marketing claim, no controlled study.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "A cache-stampede chaos drill revealed a blind spot that could have cost ~$140,000 in abandoned carts during a two-hour peak",
    296       "evidence": "Illustrative case study in Section 4.4.1. Based on 'synthetic A/B modeling' with no methodology details.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Respondents logged a 245% ROI once chaos became continuous, mostly from shorter outages and faster incident triage",
    301       "evidence": "Cited from Forrester cost-benefit survey via Gremlin (Section 4.5.3). Vendor-commissioned study.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "89% of large enterprises run workloads across two or more cloud providers",
    306       "evidence": "Cited from Flexera 2024 State of the Cloud survey (Section 1, Introduction). Industry survey, not peer-reviewed.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["meta-analysis", "case-study"],
    311   "key_findings": "This narrative review synthesizes chaos engineering evolution from Netflix's Chaos Monkey (Chaos 1.0) to AI-driven, policy-guarded, service-mesh-delivered fault injection (Chaos 2.0) for multi-cloud systems. It provides a tool comparison matrix across six platforms and two illustrative case studies showing how latency chaos uncovered a cache-stampede vulnerability and how partition chaos exposed a flawed blue/green rollback. The paper identifies five future directions including autonomous chaos agents, security chaos engineering, and regulatory-driven adoption (DORA, FCA, PCI DSS 4.0).",
    312   "red_flags": [
    313     {
    314       "flag": "Heavy reliance on vendor marketing as evidence",
    315       "detail": "Most quantitative claims (90% MTTR reduction, 245% ROI, 70% reduction in manual edits) come from vendor blogs, marketing pages, and vendor-commissioned studies (Gremlin, Harness, Forrester/Gremlin). These are presented as findings without critical assessment of their reliability or methodology."
    316     },
    317     {
    318       "flag": "Case studies lack verifiable detail",
    319       "detail": "The two case studies are presented as anonymized anecdotes ('a mid-market retailer', 'a payment processor') with no verifiable specifics, no company names, no dates, and no independent confirmation. The $140K loss estimate comes from unspecified 'synthetic A/B modeling'."
    320     },
    321     {
    322       "flag": "No limitations section",
    323       "detail": "The paper has no section covering limitations, threats to validity, or scope boundaries, which is a significant methodological gap for a review paper."
    324     },
    325     {
    326       "flag": "Methodology lacks rigor for a review paper",
    327       "detail": "Section 3 describes a search protocol but provides no PRISMA flow, no counts of papers found/screened/included, no quality assessment of included sources, and no clear distinction between peer-reviewed and grey literature evidence."
    328     },
    329     {
    330       "flag": "Uncritical quality laundering",
    331       "detail": "The review treats blog posts, Medium articles, vendor documentation, and marketing pages with the same evidential weight as peer-reviewed research, effectively laundering low-quality sources into a peer-reviewed publication."
    332     },
    333     {
    334       "flag": "ChatGPT artifacts in references",
    335       "detail": "Multiple reference URLs contain 'utm_source=chatgpt.com' parameters, suggesting sources were found via ChatGPT rather than systematic database searches as claimed in the methodology."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Lineage-driven Fault Injection",
    341       "authors": ["Peter Alvaro", "Joshua Rosen", "Joseph M. Hellerstein"],
    342       "year": 2015,
    343       "doi": "10.1145/2723372.2723711",
    344       "relevance": "Foundational work on formalizing fault selection as causal lineage queries, relevant to AI-guided testing methodology."
    345     },
    346     {
    347       "title": "Service-Level Fault Injection Testing",
    348       "authors": ["Christopher S. Meiklejohn", "Andrea Estrada", "Yiwen Song", "Heather Miller", "Rohan Padhye"],
    349       "year": 2021,
    350       "doi": "10.1145/3472883.3487005",
    351       "relevance": "Advances fault injection to RPC boundaries in microservices, relevant to automated testing of distributed systems."
    352     },
    353     {
    354       "title": "Unveiling the microservices testing methods, challenges, solutions, and solutions gaps: A systematic mapping study",
    355       "authors": ["Muqeet Hui", "Ling Wang", "Huaning Li"],
    356       "year": 2025,
    357       "doi": "10.1016/j.jss.2024.112232",
    358       "relevance": "Systematic mapping of microservice testing methods, directly relevant to evaluation methodology in distributed systems."
    359     },
    360     {
    361       "title": "Chaos Engineering in the Cloud-Native Era: Evaluating Distributed AI Model Resilience on Kubernetes",
    362       "authors": ["Anila Gogineni"],
    363       "year": 2025,
    364       "doi": "10.51219/JAIMLD/anila-gogineni/477",
    365       "relevance": "Applies chaos engineering to AI model resilience, connecting fault injection with LLM/AI system reliability."
    366     }
    367   ]
    368 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs