scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28777B)
      1 {
      2   "paper": {
      3     "title": "Toward Trustworthy Agentic AI: A Multimodal Framework for Preventing Prompt Injection Attacks",
      4     "authors": [
      5       "Toqeer Ali Syed",
      6       "Mishal Ateeq Almutairi",
      7       "Mahmoud Abdel Moaty"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2512.23557",
     12     "doi": "10.48550/arXiv.2512.23557"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "The paper proposes a Cross-Agent Multimodal Provenance-Aware Defense Framework that integrates text sanitization, visual sanitization, provenance tracking, and output validation for multi-agent LLM systems. The framework reports 94% detection accuracy for multimodal prompt injection (vs 52–66% for the filtering and fine-tuning baselines and 78% for the single-VLM baseline), a 70% reduction in cross-modal trust leakage, and 96% task accuracy retention on benign inputs. However, the evaluation dataset, sample sizes, and experimental methodology are almost entirely unspecified, making these numbers impossible to verify or reproduce.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No source code repository, GitHub link, or archive is provided anywhere in the paper. The implementation is described in Section IV but no code is released."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No evaluation dataset is released or even named. The paper does not specify what attack samples or benign inputs were used for evaluation, nor provide any download links."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Section IV mentions Python, RoBERTa, PaddleOCR, CLIP, Redis, GPT-4o-mini, and LLaVA/BLIP-2, but no version numbers, requirements.txt, Dockerfile, or dependency specifications are provided. This is insufficient to recreate the environment."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. Section IV describes the architecture and integration but not how to set up and run the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Figure 3 shows only point estimates for detection accuracy, trust leakage, and task accuracy retention. No confidence intervals, error bars, or uncertainty measures are reported anywhere."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims the proposed method outperforms all baselines (Section V.A: 94% vs 52–66%) but provides no statistical significance tests. Comparisons are made by simply presenting two numbers side by side."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports raw performance values with baseline context: 94% detection vs 52/61/66% baselines, trust leakage reduced from 0.24 to 0.07 (explicitly stated as '70% reduction'), and 96% vs 94% task retention. This provides enough context to understand effect magnitude."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No sample sizes are stated at all. The paper does not specify how many attack samples, benign inputs, or evaluation instances were used. There is no power analysis or justification for the evaluation scale."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or any spread measure is reported. All results appear as single point estimates with no indication of run-to-run variability."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section V compares against four baselines: keyword filtering, safety fine-tuning, post-hoc output filtering, and single VLM baseline. Results are shown in Figure 3."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The baselines are generic technique categories (keyword filtering, fine-tuning, output filtering) rather than specific contemporary defense systems. No particular implementations or recent defense papers are cited as baselines. The paper does not compare against any named state-of-the-art prompt injection defense system."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The system has four distinct components (Text Sanitizer, Visual Sanitizer, Provenance Ledger, Output Validator) but no ablation study is conducted to measure the individual contribution of each component."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Three evaluation metrics are used: detection accuracy (%), cross-modal trust leakage, and task accuracy retention (%). All three are reported in Section V and shown in Figure 3."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation is conducted. All evaluation is automated. For a security defense system, human review of false positives/negatives or adversarial robustness testing by human red-teamers would be relevant but is not included."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No mention of train/test splits, held-out data, or any data partitioning strategy. The evaluation dataset itself is not described, so it is impossible to determine whether proper held-out testing was used."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Only aggregate numbers are reported. No breakdown by attack type (text vs image vs metadata vs agent-to-agent), attack difficulty, or modality is provided despite the multimodal nature of the framework."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No failure cases are shown or discussed. The paper does not analyze what types of attacks the system misses (the 6% not detected) or when the sanitization fails."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "No negative results are reported. Every metric shows the proposed system outperforming all baselines. No failed approaches, ablations that hurt performance, or configurations that did not work are mentioned."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims 'multimodal injection detection accuracy is significantly enhanced' (94% vs 52–66%), 'cross-agent trust leakage is minimized' (0.07 vs 0.24), and 'agentic execution pathways become stable' (96% task retention). These qualitative claims are directionally supported by the numbers in Section V, though the evidence quality is low."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes causal claims throughout — e.g., the framework 'enhances' detection, sanitization 'prevents' injection propagation, trust masking 'ensures' safety. Without ablation studies or controlled experiments isolating individual components, these causal claims are not adequately justified. The comparison against baselines does not control for the added complexity of the multi-component system."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims 'Trustworthy Agentic AI' broadly. The abstract generalizes to 'agentic AI systems' and 'LangChain or GraphChain-style workflows.' No evaluation dataset is specified, so it is impossible to know what attack types, modalities, or system configurations were actually tested. No generalization bounds are stated."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations for the results are discussed. The paper does not consider whether the performance gains might stem from the extra multi-stage processing itself, the specific attack types chosen, or other confounds rather than from the proposed trust-aware architecture."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures detection accuracy, trust leakage, and task retention — and frames these as exactly what they are. There is no proxy gap; the claims match the granularity of the measurements."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Section IV.A mentions 'GPT-4o-mini via OpenAI API', 'LLaVA/BLIP-2 via Hugging Face Transformers', and 'RoBERTa-based pattern detector' but provides no specific version numbers, snapshot dates, or model sizes for any of these."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No prompts or system instructions are provided. The paper describes what the agents do algorithmically (Algorithms 1–4) but does not show the actual prompt text sent to GPT-4o-mini or other LLMs."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "No hyperparameters are reported — no trust score thresholds, detection thresholds, temperature settings, attention mask parameters, or any tunable values. The algorithms reference trust scores and thresholds abstractly without specifying concrete values."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The multi-agent scaffolding is described in substantial detail. Section III provides algorithms for each agent (Algorithms 1–4), Figure 1 shows the architecture, Figure 2 shows the sequence diagram, Section IV describes the integration points with LangChain/GraphChain, and Table II lists required modifications."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "No description of how the evaluation data was prepared, what attack samples were used, how benign inputs were selected, or any preprocessing steps applied to the test data."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "There is no limitations section. The paper goes directly from the Discussion (Section V.D) to the Conclusion (Section VI) without acknowledging any limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats to validity are discussed anywhere in the paper."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No scope boundaries are stated. The paper does not explicitly state what was not tested, what attack types are excluded, or what limitations apply to the generality of the results."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw data is available. Neither the evaluation dataset nor the raw experimental results are released."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The data collection procedure is not described at all. The paper does not specify what attack samples were used, where they came from, how many there were, or what benign inputs were included."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "While there are no human participants, the evaluation data source is completely unspecified. The paper does not describe how attack samples or test cases were selected or constructed, making the sample provenance unknown."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No data pipeline is documented. There is no description of how evaluation data was processed from collection to analysis."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The Acknowledgment section states: 'The author acknowledges Islamic University of Madinah University's support for ongoing research in AI security and trust assurance.'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are listed: Islamic University of Madinah (two authors) and Arab Open University-Bahrain (one author). The paper does not evaluate a commercial product, so there is no undisclosed product affiliation."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "The funder (Islamic University of Madinah) is an academic institution with no apparent financial stake in the outcome of the research on prompt injection defense."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper evaluates a defense framework against prompt injection attacks, not a pre-trained model's capability on a benchmark. Contamination of model training data is not the relevant concern."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The paper tests a defense system rather than evaluating model knowledge on a benchmark. Train/test overlap in the LLM sense is not applicable."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "The paper tests a defense framework, not a model's inherent capabilities on a benchmark. Benchmark contamination is not the relevant threat."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants are involved in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants are involved in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost or latency is reported despite the system adding multiple sanitization layers (RoBERTa, OCR, CLIP, two LLM calls) to every request. The overhead of this multi-agent pipeline is never quantified."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No computational budget is stated. The paper does not report GPU hours, API costs, training time for the RoBERTa classifier, or total resources consumed."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of multiple random seeds or seed sensitivity. All results appear to be from a single run."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is never stated. It is unclear whether results are from one run or averaged across multiple."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search is described. Trust thresholds, detection parameters, and other tunable values are not discussed, let alone searched over."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No discussion of how the reported configuration was selected. No validation set or configuration selection procedure is described."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors evaluate their own system against baseline categories (not specific implementations). They do not acknowledge the bias of implementing and evaluating their own system, nor that their implementation of baselines may be suboptimal."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No comparison of compute budgets across methods. The proposed framework uses multiple neural models (RoBERTa, CLIP, GPT-4o-mini) plus Redis, while baselines like keyword filtering are far cheaper. This compute asymmetry is never discussed."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The evaluation benchmark/dataset is not even specified, let alone analyzed for construct validity. The paper does not discuss whether its three metrics adequately capture real-world defense effectiveness."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "The scaffold IS the thing being tested — the paper evaluates its multi-agent defense framework as a bundled system."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of temporal leakage. The RoBERTa classifier may have been trained on data that includes the test attack patterns, but this is never addressed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of feature leakage. The evaluation setup is so poorly described that it is impossible to assess whether the defense system had access to information not available in realistic deployment."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether training and test data for the classifiers are independent. The data pipeline is not described at all."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is used or discussed."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "The proposed framework achieves 94% multimodal prompt injection detection accuracy, outperforming keyword filtering (52%), post-hoc output filtering (61%), safety fine-tuning (66%), and single VLM (78%).",
    369       "evidence": "Section V.A and Figure 3 show comparative bar chart. No dataset specification, sample sizes, or statistical tests accompany the numbers.",
    370       "supported": "weak"
    371     },
    372     {
    373       "claim": "The framework reduces cross-modal trust leakage from 0.24 to 0.07, a 70% reduction.",
    374       "evidence": "Section V.B states this figure and attributes it to trust-aware attention masking and provenance metadata propagation. No dataset, sample size, or statistical validation is provided.",
    375       "supported": "weak"
    376     },
    377     {
    378       "claim": "The framework retains 96% task accuracy on benign multimodal tasks.",
    379       "evidence": "Section V.C reports 96% retention vs 94% for the single VLM baseline. No details on what benign tasks were used or how accuracy was measured.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "The framework provides end-to-end security for LangChain and GraphChain environments through dual-stage sanitization.",
    384       "evidence": "Section III describes the architecture with pre-agent and pre-LLM sanitization stages, Algorithms 1-4, and Table II lists required framework modifications. This is a design claim supported by the architecture description but not empirically validated for end-to-end security.",
    385       "supported": "weak"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Phantom evaluation dataset",
    391       "detail": "The paper reports precise performance numbers (94%, 0.07, 96%) but never specifies what evaluation dataset was used, how it was constructed, how many samples it contains, or what types of attacks are included. Results appear without any verifiable experimental setup."
    392     },
    393     {
    394       "flag": "No sample sizes anywhere",
    395       "detail": "Not a single sample size is reported in the entire paper. The reader cannot assess whether these results come from 10 or 10,000 test cases."
    396     },
    397     {
    398       "flag": "No error bars or statistical tests",
    399       "detail": "All results are single point estimates with no uncertainty quantification. Claims of superiority over baselines are made by comparing bare numbers without any statistical testing."
    400     },
    401     {
    402       "flag": "Excessive self-citation",
    403       "detail": "Six of sixteen references (37.5%) are by the first author (Syed): refs [8]-[13]. These are primarily other 'Agentic AI Framework' papers by the same group on topics like finance, disabilities, cloudburst prediction, and inventory management — none directly related to prompt injection defense."
    404     },
    405     {
    406       "flag": "No ablation study despite multi-component system",
    407       "detail": "The framework has four distinct components (Text Sanitizer, Visual Sanitizer, Provenance Ledger, Output Validator), yet no ablation is performed to determine which components contribute to the reported gains."
    408     },
    409     {
    410       "flag": "Claims outrun evidence",
    411       "detail": "The title promises 'Trustworthy Agentic AI' and the abstract claims the framework 'enhances the establishment of secure, understandable and reliable agentic AI systems.' The evidence is a single bar chart with three metrics and no dataset specification."
    412     },
    413     {
    414       "flag": "Generic baselines, not specific systems",
    415       "detail": "Baselines are technique categories (keyword filtering, fine-tuning, output filtering) rather than specific implementations of state-of-the-art defense systems. It is unclear how these baselines were implemented or configured."
    416     },
    417     {
    418       "flag": "No limitations section",
    419       "detail": "The paper has no limitations, threats to validity, or scope boundaries section. Every result is presented as unambiguously positive."
    420     },
    421     {
    422       "flag": "Results appear too clean",
    423       "detail": "The proposed method dominates on every single metric with no trade-offs, failure modes, or edge cases discussed. A 94% detection rate with 96% benign retention and minimal leakage, all without any negative results, is suspiciously clean for a first prototype."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "More than you've asked for: A comprehensive analysis of prompt injection attacks in large language models",
    429       "authors": ["T. Greshake"],
    430       "year": 2023,
    431       "arxiv_id": "2302.12173",
    432       "relevance": "Foundational work on indirect prompt injection attacks in LLMs, directly relevant to the survey's coverage of PI attack surfaces."
    433     },
    434     {
    435       "title": "Red teaming large language models",
    436       "authors": ["T. Zhuo", "Y. Wang"],
    437       "year": 2023,
    438       "arxiv_id": "2309.07890",
    439       "relevance": "Red-teaming methodology for evaluating LLM safety vulnerabilities, relevant to the survey's assessment of AI safety evaluation methods."
    440     },
    441     {
    442       "title": "Universal and transferable adversarial attacks on aligned language models",
    443       "authors": ["A. Zou"],
    444       "year": 2023,
    445       "arxiv_id": "2307.15043",
    446       "relevance": "Demonstrates universal adversarial prompts that transfer across models and modalities, a key attack vector in the LLM security landscape."
    447     },
    448     {
    449       "title": "Prompt injection defense mechanisms in large language models: A survey",
    450       "authors": ["C. Xu"],
    451       "year": 2024,
    452       "relevance": "Survey of PI defense mechanisms including sandboxing, filtering, and RL guardrails — directly relevant to the survey's coverage of defense taxonomy."
    453     },
    454     {
    455       "title": "Multimodal Prompt Injection Attacks against Vision-Language Models",
    456       "authors": ["L. Wolff", "L. Shen", "M. Faezi", "N. Akhtar", "A. Mian"],
    457       "year": 2024,
    458       "arxiv_id": "2403.04888",
    459       "relevance": "Demonstrates adversarial instructions encoded in images via steganography and metadata manipulation, extending PI attacks to the visual modality."
    460     },
    461     {
    462       "title": "MM-SafetyBench: Evaluating Safety Risks of Multimodal Large Language Models with Generated Prompts",
    463       "authors": ["Z. Liu", "R. Zhang", "J. Xu", "D. Lin"],
    464       "year": 2024,
    465       "arxiv_id": "2402.12323",
    466       "relevance": "Safety benchmark for multimodal LLMs showing VLMs fail to identify harmful visual prompts, relevant to benchmarking methodological quality."
    467     },
    468     {
    469       "title": "Visual Prompt Injection Attacks and Defenses for Vision-Language Models",
    470       "authors": ["Y. Liu", "Z. Han", "B. Li", "N. Z. Gong"],
    471       "year": 2024,
    472       "arxiv_id": "2404.00562",
    473       "relevance": "Evaluates visual prompt injection exploiting vision encoder / language decoder vulnerabilities, relevant to multimodal AI safety research."
    474     },
    475     {
    476       "title": "Exploring interpretability and safety of large language models under adversarial prompting",
    477       "authors": ["S. Casper", "D. Hadfield-Menell"],
    478       "year": 2023,
    479       "arxiv_id": "2305.13636",
    480       "relevance": "Taxonomy of prompt-based attack types and evasion/mitigation strategies, relevant to understanding the adversarial prompting landscape."
    481     }
    482   ],
    483   "engagement_factors": {
    484     "practical_relevance": {
    485       "score": 1,
    486       "justification": "Describes LangChain/GraphChain integration conceptually but releases no code, no library, and no working implementation a practitioner could use."
    487     },
    488     "surprise_contrarian": {
    489       "score": 0,
    490       "justification": "Confirms the widely-held view that multimodal defenses and provenance tracking are needed for agentic AI security — no surprising or contrarian findings."
    491     },
    492     "fear_safety": {
    493       "score": 2,
    494       "justification": "Addresses prompt injection in multi-agent AI systems, a real and growing security concern that resonates with AI safety discourse."
    495     },
    496     "drama_conflict": {
    497       "score": 0,
    498       "justification": "No controversy, no critique of specific companies or products, no conflict angle."
    499     },
    500     "demo_ability": {
    501       "score": 0,
    502       "justification": "No code released, no demo, no package — nothing for anyone to try."
    503     },
    504     "brand_recognition": {
    505       "score": 0,
    506       "justification": "From Islamic University of Madinah and Arab Open University-Bahrain — not well-known AI research labs."
    507     }
    508   }
    509 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs