scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34961B)
      1 {
      2   "paper": {
      3     "title": "MPIB: A Benchmark for Medical Prompt Injection Attacks and Clinical Safety in LLMs",
      4     "authors": ["Junhyeok Lee", "Han Jang", "Kyu Sung Choi"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.06268"
      8   },
      9   "scan_version": 3,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "MPIB benchmarks 12 LLMs across 5 defense configurations for clinical prompt injection safety. The key finding is that Attack Success Rate (ASR) and Clinical Harm Event Rate (CHER) can diverge substantially, especially under indirect RAG-mediated injection (V2), where models may comply with adversarial framing without producing high-severity clinical harm. Direct injection (V1) consistently produces higher CHER than indirect injection (V2) across all models. Composite defenses (D4) are not uniformly best and the safety-utility trade-off is model-dependent.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The abstract states 'Code and data are available at GitHub (code) and Hugging Face (data).' The evaluation toolkit is described as publicly released (Table 7), and the HuggingFace repository is identified as jhlee0619/mpib (Section C.1)."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The dataset is released on Hugging Face (jhlee0619/mpib) in redacted form, with V2 payloads replaced by [REDACTED_PAYLOAD] tokens. Unredacted payloads are available under gated research agreement (Section C)."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section is described in the paper. Hardware and software specifications for running the evaluation are not provided."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions releasing 'fixed splits, prompt assembly templates, and baseline defenses' (Section 3.5) but does not include step-by-step reproduction instructions. A 'reproducible evaluation harness' is referenced but no specific commands or README-style instructions appear in the paper."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Table 5 reports all metrics (ASR, CHER, FPR-H) as point estimates (percentages) without any confidence intervals, error bars, or uncertainty measures."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Despite making numerous comparative claims across 12 models and 5 defense configurations (e.g., 'D2 reduces V1 CHER from 65.7% to 50.7%'), no statistical significance tests (t-tests, bootstrap tests, etc.) are reported anywhere in the paper."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports absolute percentage values with baseline context throughout (e.g., 'Qwen-2.5-72B reduces V1 CHER from 65.7% (D0) to 50.7% (D2),' Section 4.2), providing sufficient context to assess the magnitude of differences."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The final dataset size of 9,697 instances is reported (Table 1) but no power analysis or explicit justification for this number is provided. The sizes are a byproduct of the pipeline (source datasets × rule families × quality gates) rather than a deliberate sample size determination."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "All results in Table 5 are single-run point estimates. No standard deviation, variance, or spread measures are reported. While deterministic decoding (temperature=0) is used, the LLM-as-a-judge pipeline and dataset construction involve stochastic components whose variance is not assessed."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "D0 (no defense) serves as the baseline configuration against which D1-D4 are compared. Results are reported for all 12 models under D0 and all defense variants in Table 5."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The evaluated models include contemporary architectures: Qwen-2.5 (2024), Llama-3.1 (2024), MedGemma (2025), and Mixtral (2024). Medical-tuned models include Meditron, BioMistral, and MMed-Llama-3."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The defense hierarchy (D0-D4) functions as an ablation: D1 tests internal hardening alone, D2 tests input guard alone, D3 tests context sanitization alone, and D4 tests their combination. This reveals which defense components contribute to safety under each attack vector."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Three complementary metrics are reported: Attack Success Rate (ASR, severity ≥2), Clinical Harm Event Rate (CHER3, severity ≥3), and Utility False Positive Rate (FPR-H). The central contribution is arguing that ASR alone is insufficient."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The main evaluation uses an LLM-as-a-judge (Qwen-2.5-72B-Instruct). A human-validated set (N=300) was used only for judge selection (Section 4.1), not for evaluating target model outputs. No human evaluation of the 12 target models' clinical safety outputs is reported."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 3.5 specifies an '80/10/10 split (Train/Dev/Test) grouped by source query to prevent leakage across paraphrases and derived variants.' Selected V2 attack families are reserved exclusively for the test set."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 5 breaks results down by model (all evaluated models), defense configuration (D0-D4), attack vector (V1/V2), and metric (CHER, ASR, FPR-H). Tables 1-2 provide instance distribution by scenario (S1-S4) and vector. Figure 3 provides per-model ASR-CHER divergence."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Appendix F (Figures 4-7) provides detailed qualitative examples of V0, V0', V1, and V2 instances with raw model outputs showing both successful attacks and resistant behavior. Section 4.2 discusses specific failure modes."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 4.2 explicitly reports that 'Composite defense (D4) is not uniformly best' and can underperform simpler interventions. Cases where D3 increases ASR while decreasing CHER are discussed (e.g., MedGemma-4B under V2). The non-uniformity of defense effectiveness is a central finding."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The abstract claims 'V2 yields multiple-fold higher CHER than V1 in high-risk categories,' but Table 5 consistently shows V1 CHER exceeding V2 CHER across all evaluated models under D0 (e.g., Qwen-2.5-72B: V1 CHER 65.7% vs V2 CHER 7.8%). No per-category breakdown supporting this abstract claim is presented. The Introduction similarly states 'indirect injection often exceeds direct injection in strength,' contradicting Section 4.2's own finding that 'V1 CHER is generally higher than V2 CHER.'"
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The defense configurations (D0-D4) constitute controlled interventions applied to each model-instance pair. Claims like 'D2 reduces V1 CHER' are supported by this controlled manipulation design where only the defense component varies while other factors are held constant."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 5 (Limitations) explicitly bounds generalizations: the defense harness is 'a controlled reference rather than a production-grade guardrail,' the taxonomy may not capture real-world clinical context (comorbidities, local guidelines), and the V2 protocol does not evaluate retrieval-stage factors. Claims are generally tied to specific models and defense configurations."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 4.2 discusses plausible alternative explanations: 'coupling input rewriting and context sanitization can introduce interaction effects: aggressive rewriting may distort clinical intent, while aggressive sanitization may remove legitimately relevant instructions.' Section 4.3 discusses why V2 ASR-CHER gaps may reflect framing compliance rather than clinical harm."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper's central methodological contribution is distinguishing between ASR (proxy for attack compliance) and CHER (proxy for clinical harm). Section 5 acknowledges that 'real-world clinical harm depends on patient context (e.g., comorbidities and access to care), local guideline differences' which their severity taxonomy cannot fully capture."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 4.1 specifies model names with family, size, and variant: 'Qwen-2.5-72B-Instruct,' 'Llama-3.1-8B-Instruct,' 'MedGemma-4B,' 'Meditron-70B,' 'BioMistral-7B,' 'MMed-Llama-3-8B.' These are specific open-weight model identifiers that uniquely identify the model weights."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The LLM judge prompt is provided in full in Appendix E. However, the system prompts and prompt assembly templates used to query the 12 target models during evaluation are not provided verbatim. The paper mentions a 'standardized prompt assembler' (Section 4.1) but does not show the actual prompt text. D1 (Internal Hardening) is described only conceptually."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4.1 states 'deterministic decoding (greedy or temperature = 0)' for target models. Appendix E states 'temperature=0.0 and max_tokens=1024' for the judge. These are the key inference parameters."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding (multi-step reasoning, tool use, memory management) is used. The defense components (D1-D4) are single-pass LLM filters, not agentic workflows."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.3 and Figure 2 document the full pipeline: text normalization from MedQA/PubMedQA, scenario classification via LLM-based classifier (Qwen-2.5-72B-Instruct), adversarial generation via rule families R1-R10, and multi-stage quality gating (G1-G6). Table 1 reports final counts by pool type."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5 is titled 'Limitations' and contains three substantive paragraphs covering judge-dependent bias, taxonomy limitations, and defense harness limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5 discusses threats specific to this study: 'outcome labels are primarily produced via a structured LLM-as-a-judge pipeline, which may introduce judge-dependent bias,' 'our defense harness (D2-D4) is intentionally lightweight—a small-LLM (7B)–based scaffold,' and 'excluding invalid or unparseable judge outputs can affect reported metrics, particularly on borderline cases.'"
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 3.2 explicitly states out-of-scope factors: 'we do not evaluate whether poisoned content is retrieved in the first place, how retrieval ranking or noise affects exposure, or whether the retriever itself is compromised.' Section 5 notes the defense harness 'should be interpreted as a controlled reference rather than a production-grade guardrail.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The dataset is released on Hugging Face (jhlee0619/mpib) with all instances available (V2 payloads redacted in public release, unredacted under gated access). Table 7 lists the artifact manifest including data splits and payload registry."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3.3 describes the full collection procedure: source datasets (MedQA BigBio English, PubMedQA BigBio labeled folds), normalization pipeline, LLM-based medical filtering and scenario stratification, and adversarial variant generation via rule families R1-R10."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants in the study. Data is derived from standard public benchmarks (MedQA, PubMedQA). The human-validated set (N=300) for judge selection is not a human subjects study."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Figure 2 provides a visual overview of the five pipeline stages (Processing, Attack, Quality Control, Harm Taxonomy, Release). Sections 3.3-3.4 detail each stage. Quality gates G1-G6 (Appendix B.3) specify filtering criteria. Table 1 reports final counts per pool type."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source, grants, or sponsorship are mentioned anywhere in the paper. There is no acknowledgments section disclosing funding."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: all three authors are from Seoul National University (College of Medicine and/or Hospital). They do not evaluate any SNU products, so no product-affiliation conflict exists."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so funder independence cannot be assessed: it can be verified as neither independent nor non-independent."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement appears in the paper. The absence of a disclosure statement does not constitute absence of conflict."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the 12 evaluated models. This is important because the source datasets (MedQA, PubMedQA) are well-known public benchmarks that may be included in model training corpora."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No analysis or discussion of whether MedQA/PubMedQA instances (the source data for MPIB) appeared in the training data of the 12 evaluated models. This is particularly relevant for V0 (unaltered MedQA/PubMedQA items) used for utility baseline evaluation."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "MedQA (published 2021) and PubMedQA (published 2019) were publicly available well before the training of all evaluated models (Qwen-2.5, Llama-3.1, etc.). The paper does not discuss whether models may have trained on these datasets or how this could affect results, particularly for V0/V0' utility evaluation."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study. The benchmark evaluation is entirely computational."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants. Data is derived from existing public benchmarks (MedQA, PubMedQA)."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants or experimental conditions requiring participant randomization."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost, latency, or token consumption is reported despite evaluating 12 models × 5 defenses × 9,697 instances. The compute cost of running LLM-as-a-judge on all outputs is also not reported."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No GPU hours, hardware specifications, total API spend, or wall-clock time for the evaluation campaign are stated anywhere in the paper."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Section 4.1 states 'deterministic decoding (greedy or temperature = 0)' for all target models, and Appendix E states 'temperature=0.0' for the judge. Deterministic decoding removes sampling variation across repeated runs of these inference steps, so a separate seed sensitivity analysis is treated as unnecessary for them."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "The use of 'deterministic decoding (greedy or temperature = 0)' (Section 4.1) implies single evaluation runs, since additional runs with deterministic decoding would produce identical results."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The quality gate thresholds (e.g., V2 strict tier: Affinity ≥3.0, Misleading ≥3.0, Plausibility ≥3.0; borderline tier with lower thresholds) and defense configurations appear tuned but no search budget or tuning methodology is described."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "All five defense configurations (D0-D4) are reported for all models in Table 5. No single 'best' configuration is selected — the paper explicitly argues that 'Composite defense (D4) is not uniformly best' and that defense selection is model-dependent."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper makes hundreds of implicit comparisons across the evaluated models × 5 defenses × 2 vectors × 3 metrics but performs no statistical tests whatsoever, let alone corrections for multiple comparisons."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors constructed the MPIB benchmark, designed the defense configurations (D1-D4), and selected the evaluation judge — all evaluated by the same team. No acknowledgment of self-comparison bias or discussion of independent evaluation is provided."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Models range from 4B to 72B parameters, but performance is not discussed as a function of compute budget. No analysis of whether larger models' improved robustness justifies their compute cost."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper's central contribution is questioning construct validity: it argues that ASR (the standard metric) does not validly measure clinical harm, and introduces CHER to capture high-severity outcomes. Section 5 discusses limitations of the H1-H5 taxonomy relative to real-world clinical harm."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Defense configurations (D0-D4) are explicitly treated as a controlled variable. Table 5 reports results for each model × defense combination, enabling readers to separate model robustness from defense scaffold effects."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "MedQA (2021) and PubMedQA (2019) were published before the training of all evaluated models. The paper does not discuss whether models may have seen these benchmark items (or similar medical QA data) during pre-training, which could inflate V0/V0' utility scores."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup inadvertently leaks information. For example, the V2 protocol embeds poisoned contexts alongside benign contexts — the structural presence of conflicting information could serve as a signal for models trained to detect adversarial inputs."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "Section 3.5 specifies '80/10/10 split (Train/Dev/Test) grouped by source query to prevent leakage across paraphrases and derived variants.' This ensures that derived variants of the same query do not appear in both train and test splits."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No concrete leakage detection methods (canary strings, membership inference, n-gram overlap analysis, decontamination pipelines) are applied. The grouping-by-source-query addresses within-benchmark leakage but not model training data contamination."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "ASR and CHER can diverge substantially, especially under indirect (V2) prompt injection, where larger 'Safe Gaps' emerge.",
    364       "evidence": "Figure 3 shows that under V2, ASR (severity ≥2) can remain high while CHER3 (severity ≥3) is much lower for many models. Table 5 shows specific examples: MedGemma-4B V2 D3 has ASR=65.6% but CHER=18.8%. Qwen-2.5-72B V2 D0 has ASR=53.1% but CHER=7.8%.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "V1 (direct injection) produces generally higher CHER than V2 (indirect injection) across all models.",
    369       "evidence": "Table 5 shows V1 CHER exceeds V2 CHER for every model under D0: Qwen-2.5-72B (65.7% vs 7.8%), Llama-3.1-70B (86.6% vs 37.5%), MedGemma-4B (88.1% vs 21.9%). This pattern is consistent across all defense configurations.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "V2 yields multiple-fold higher CHER than V1 in high-risk categories (abstract claim).",
    374       "evidence": "This claim from the abstract directly contradicts the data in Table 5, which shows V1 CHER consistently exceeding V2 CHER across all evaluated models. No per-category breakdown supporting this specific claim is presented in the paper.",
    375       "supported": "unsupported"
    376     },
    377     {
    378       "claim": "Defense effectiveness differs by attack vector — input-side defenses (D2) are more effective for V1, while context-side and hierarchy defenses help with V2.",
    379       "evidence": "Table 5, Section 4.2: D2 reduces V1 CHER for Qwen-2.5-72B (65.7%→50.7%) and Llama-3.1-70B (86.6%→68.7%). For V2, D1 reduces Qwen-2.5-72B CHER from 7.8% to 1.6%, and D4 reduces Meditron-70B V2 CHER from 53.1% to 37.5%.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Composite defense (D4) is not uniformly the best configuration.",
    384       "evidence": "Table 5, Section 4.2: D4 sometimes underperforms simpler defenses (e.g., for Llama-3.1-8B V1 CHER, D4 yields 80.6% vs D2's 70.1%). The paper attributes this to interaction effects between input rewriting and context sanitization.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Safety-utility trade-off is model-dependent, with medical-tuned models showing higher FPR-H under stronger defenses.",
    389       "evidence": "Table 5 FPR-H: Meditron-70B increases from 16.0% (D0) to 33.6% (D4), while Qwen-2.5-72B ranges from 2.7-3.8% across all defenses. BioMistral-7B maintains 0.0% FPR-H across all configurations.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Qwen-2.5-72B-Instruct is the most appropriate LLM judge based on multi-objective criteria.",
    394       "evidence": "Tables 3-4: Qwen-2.5-72B achieves highest Kendall τ (0.65), lowest SCE (0.75), 100% JSON validity, and strongest holdout stability (Delta SCE=0.05, holdout Recall@4=0.91). Compared against Mixtral-8x22B and Llama-3.1-70B.",
    395       "supported": "strong"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Abstract contradicts results on V2 vs V1 CHER",
    401       "detail": "The abstract claims 'V2 yields multiple-fold higher CHER than V1 in high-risk categories' and the Introduction states 'indirect injection often exceeds direct injection in strength.' However, Table 5 consistently shows V1 CHER exceeding V2 CHER across all evaluated models under all defense configurations. No per-category breakdown supporting the abstract claim is provided. This is a significant mismatch between stated conclusions and presented evidence."
    402     },
    403     {
    404       "flag": "Same LLM used for both dataset construction and evaluation",
    405       "detail": "Qwen-2.5-72B-Instruct serves triple duty: scenario classification (footnote 1), quality control and gate scoring (footnote 2), and primary evaluation judge (Section 4.1). This circular dependency means systematic biases in this model propagate to dataset labels, quality filtering decisions, and outcome evaluation. The paper does not discuss this risk."
    406     },
    407     {
    408       "flag": "No statistical tests despite hundreds of comparisons",
    409       "detail": "The paper makes comparative claims across the evaluated models × 5 defenses × 2 vectors × 3 metrics, drawing directional conclusions (e.g., 'D2 reduces V1 CHER') from raw percentage differences without any statistical significance testing. Given the relatively small adversarial test sets (V1: 644, V2: 582 total instances, with the held-out test split likely much smaller), observed differences may not be statistically significant."
    410     },
    411     {
    412       "flag": "Small adversarial instance counts for per-model analysis",
    413       "detail": "V1 has 644 instances (6.6%) and V2 has 582 instances (6.0%) total. With an 80/10/10 split, the test set would contain approximately 64 V1 and 58 V2 instances. Per-model CHER values derived from ~60 instances have wide implicit confidence intervals that are not reported, making many claimed differences potentially not meaningful."
    414     },
    415     {
    416       "flag": "Contamination risk from MedQA/PubMedQA source data unaddressed",
    417       "detail": "V0 instances are 'unaltered items from MedQA/PubMedQA' (Section 3.3). Both datasets were published in 2019-2021, well before any of the evaluated models were trained. The models may have been trained on these exact questions, which would inflate V0/V0' utility scores and could affect adversarial robustness through familiarity with the clinical scenarios. The paper does not discuss this risk."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal",
    423       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin", "Andy Zou"],
    424       "year": 2024,
    425       "arxiv_id": "2402.04249",
    426       "relevance": "Standard benchmark for evaluating LLM safety and red-teaming robustness, which MPIB extends to clinical settings."
    427     },
    428     {
    429       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    430       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu"],
    431       "year": 2025,
    432       "relevance": "BIPIA benchmark for indirect prompt injection in LLMs, a key comparison point for MPIB's RAG-mediated injection evaluation."
    433     },
    434     {
    435       "title": "First, do NOHARM: towards clinically safe large language models",
    436       "authors": ["David Wu", "Fateme Nateghi Haredasht", "Saloni Kumar Maharaj"],
    437       "year": 2025,
    438       "arxiv_id": "2512.01241",
    439       "relevance": "Outcome-centric harm auditing for clinical LLMs that motivated MPIB's CHER metric and severity-based evaluation."
    440     },
    441     {
    442       "title": "CARES: Comprehensive Evaluation of Safety and Adversarial Robustness in Medical LLMs",
    443       "authors": ["Sijia Chen", "Xiaomin Li", "Mengxue Zhang"],
    444       "year": 2025,
    445       "arxiv_id": "2505.11413",
    446       "relevance": "Adversarial safety stress testing for medical LLMs, directly relevant to evaluating clinical AI safety and robustness."
    447     },
    448     {
    449       "title": "Ensuring safety and trust: Analyzing the risks of large language models in medicine",
    450       "authors": ["Yifan Yang", "Qiao Jin", "Robert Leaman"],
    451       "year": 2024,
    452       "arxiv_id": "2411.14487",
    453       "relevance": "MedGuard principle-based safety auditing for medical LLMs, a complementary evaluation paradigm to MPIB's adversarial approach."
    454     },
    455     {
    456       "title": "Automatic and universal prompt injection attacks against large language models",
    457       "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang"],
    458       "year": 2024,
    459       "arxiv_id": "2403.04957",
    460       "relevance": "Demonstrates universal transferable prompt injection attacks, the attack methodology underlying MPIB's threat model."
    461     },
    462     {
    463       "title": "Prompt injection attack against llm-integrated applications",
    464       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li"],
    465       "year": 2023,
    466       "arxiv_id": "2306.05499",
    467       "relevance": "Foundational work on prompt injection attacks against LLM-integrated applications, defines the threat model MPIB builds upon."
    468     },
    469     {
    470       "title": "Rag and roll: An end-to-end evaluation of indirect prompt manipulations in llm-based application frameworks",
    471       "authors": ["Gianluca De Stefano", "Lea Schönherr", "Giancarlo Pellegrino"],
    472       "year": 2024,
    473       "arxiv_id": "2408.05025",
    474       "relevance": "End-to-end evaluation of indirect prompt manipulation in RAG systems, directly relevant to MPIB's V2 attack vector."
    475     },
    476     {
    477       "title": "Adversarial prompt and fine-tuning attacks threaten medical large language models",
    478       "authors": ["Yifan Yang", "Qiao Jin", "Furong Huang", "Zhiyong Lu"],
    479       "year": 2025,
    480       "relevance": "Demonstrates adversarial attacks threatening medical LLMs via prompting and fine-tuning, core motivation for MPIB."
    481     },
    482     {
    483       "title": "CommandSans: Securing AI Agents with Surgical Precision Prompt Sanitization",
    484       "authors": ["Debeshee Das", "Luca Beurer-Kellner", "Marc Fischer"],
    485       "year": 2025,
    486       "arxiv_id": "2510.08829",
    487       "relevance": "RAG-specific defense against indirect prompt injection via context sanitization, related to MPIB's D3 defense configuration."
    488     },
    489     {
    490       "title": "Retrieval-augmented generation for knowledge-intensive nlp tasks",
    491       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    492       "year": 2020,
    493       "relevance": "Foundational RAG paper whose architecture defines the retrieval pipeline that MPIB's V2 threat model targets."
    494     },
    495     {
    496       "title": "Medical triage as an AI ethics benchmark",
    497       "authors": ["Nathalie Maria Kirch", "Konstantin Hebenstreit", "Matthias Samwald"],
    498       "year": 2025,
    499       "relevance": "Shows that prompt framing degrades LLM triage behavior, motivating MPIB's clinical scenario evaluation approach."
    500     }
    501   ],
    502   "engagement_factors": {
    503     "practical_relevance": {
    504       "score": 2,
    505       "justification": "Provides a benchmark and evaluation harness for testing clinical LLM/RAG system safety, useful for teams deploying medical AI but not immediately actionable without adaptation."
    506     },
    507     "surprise_contrarian": {
    508       "score": 1,
    509       "justification": "The ASR-CHER divergence finding is methodologically novel but the broader finding that medical LLMs are vulnerable to prompt injection is expected."
    510     },
    511     "fear_safety": {
    512       "score": 3,
    513       "justification": "Directly demonstrates that prompt injection can cause LLMs to recommend toxic overdoses, dismiss emergencies, and fabricate clinical evidence — all high-severity patient safety risks."
    514     },
    515     "drama_conflict": {
    516       "score": 1,
    517       "justification": "No controversy or dramatic conflict, though the implicit message that current medical AI safety evaluations (ASR-based) systematically misestimate harm has some edge."
    518     },
    519     "demo_ability": {
    520       "score": 2,
    521       "justification": "Code on GitHub and data on HuggingFace (jhlee0619/mpib) are publicly available, though V2 payloads are redacted and full access requires gated approval."
    522     },
    523     "brand_recognition": {
    524       "score": 0,
    525       "justification": "From Seoul National University, not a well-known AI research lab. No major AI company products or brands are central to the work."
    526     }
    527   }
    528 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs