scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19267B)
      1 {
      2   "paper": {
      3     "title": "Cross-Platform Evaluation of Large Language Model Safety in Pediatric Consultations: Evolution of Adversarial Robustness and the Scale Paradox",
      4     "authors": ["Vahideh Zolfaghari"],
      5     "year": 2025,
      6     "venue": "",
      7     "doi": ""
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repositories provided: https://github.com/vzm1399/PediatricAnxietyBench and https://github.com/vzm1399/PediatricAnxietyBench-CrossPlatform with evaluation, scoring, statistical analysis, and visualization scripts."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "PediatricAnxietyBench dataset (300 queries) and all 900 model responses are released via the GitHub repositories under MIT License."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Paper specifies Python 3.10, SciPy v1.11.0, scikit-learn v1.3.0, NumPy v1.24.0, and mentions README files with step-by-step setup instructions including requirements.txt."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Paper states 'README files in each repository provide step-by-step instructions for setup, execution, and result reproduction.' Code availability section lists specific scripts for each pipeline stage."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "95% confidence intervals computed via bias-corrected and accelerated (BCa) bootstrapping with 10,000 iterations are reported throughout results and figures."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Paired t-tests for model comparisons and independent t-tests for adversarial vs non-adversarial subsets, with p-values reported (e.g., p=0.0001, p=0.051)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Cohen's d reported for all comparisons (e.g., d=0.225 for Llama-3.1-8B vs Llama-3.3-70B, d=0.72 for Mistral adversarial effect, d=1.52 for hedging-safety correlation)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No power analysis or justification for why 300 queries (or 30 adversarial) were chosen. The limitations section acknowledges n=30 adversarial queries provided limited statistical power but does not justify the original choice."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviations reported (e.g., SD=1.45 vs SD=2.90 for Llama models). Distribution plots and violin plots show variance across models."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Three models compared against each other, and temporal comparison with prior evaluation of earlier model generations on the same benchmark provides a baseline."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Only three open-source models evaluated (Llama-3.3-70B, Llama-3.1-8B, Mistral-7B). No proprietary frontier models (GPT-4, Claude, Gemini) included. The paper acknowledges this as a limitation but does not compare against the most capable current systems."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "The paper evaluates existing models as black boxes via APIs; there is no system with components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Composite safety score (0-15) plus five sub-dimensions (diagnostic restraint, referral adherence, hedging language, emergency recognition, prescriptive resistance), plus individual metric breakdowns (inappropriate diagnosis rate, referral rate, hedging frequency)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Clinical expert review of a random sample (n=50) showed 86% agreement with automated scores, validating the automated safety scoring approach."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No model training or tuning is performed; all queries are used for evaluation of pre-existing models. There is no train/dev/test split needed."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Per-topic breakdowns across 10 clinical categories provided (e.g., seizures, post-vaccination, infant crying), with Figure 5 showing topic-specific safety scores."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Specific failure modes discussed: seizure queries (33% inappropriate diagnosis rate), post-vaccination concerns (elevated safety issues), universal failure in emergency escalation language across all models."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Universal failure in emergency escalation is reported as a critical negative result. Also reports that the adversarial effect for Llama-3.3-70B was borderline non-significant (p=0.051)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about scale paradox (smaller model outperforming larger), safety score ranges, Cohen's d values, and adversarial robustness reversal are all supported by results in the paper."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper uses causal language ('challenging assumptions that scale monotonically predicts safety') based on comparing only three models. The scale paradox claim is correlational at best with N=3 model sizes, and confounds scale with architecture, training data, and alignment approach."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Limitations section explicitly bounds generalization: English-only, U.S. healthcare context, three open-source models, specific system prompt dependency, proxy metrics not actual health outcomes."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Discussion addresses alternative explanations for the scale paradox (version-specific regressions vs inherent scale limitations, alignment quality vs parameter count, compute-optimal training). Limitations discuss temporal confounds in cross-generation comparisons."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model versions given: Llama-3.3-70B-Versatile, Llama-3.1-8B-Instant (via Groq), Mistral-7B-Instruct-v0.2 (via HuggingFace). These are specific version identifiers."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The standardized system prompt is provided in full in the paper (6-point safety instruction list). Adversarial query generation prompt template referenced in Supplementary Materials."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Generation parameters explicitly stated: temperature=0.7, max_tokens=500, top_p=1.0."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding used; models are queried directly via APIs with a single system prompt and query."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Describes benchmark composition (150 authentic from HealthCareMagic, 150 synthetic adversarial), adversarial generation process, and handling of malformed responses (3 excluded from Llama-3.3-70B, sensitivity analysis performed)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Dedicated 'Limitations' section spanning approximately 28 lines discussing seven specific limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats discussed: n=30 adversarial subset limiting statistical power (p=0.051 borderline), rule-based metrics missing 14% of contextually appropriate responses, temporal confounding in cross-generation comparison, prompt engineering dependency."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Explicitly states: English-only queries, U.S. healthcare contexts, three open-source models only, metrics are safety proxies not actual health outcomes, results reflect specific prompt design interaction."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "All 900 model responses, safety scores, and the benchmark queries are released via GitHub repositories."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Describes 150 authentic queries from HealthCareMagic dataset and 150 adversarial queries generated via Claude 3.5 Sonnet with systematic prompt template. API endpoints and platforms specified."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants recruited; the study evaluates LLM outputs on a benchmark dataset."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Pipeline documented: query generation, API submission across platforms, response collection, automated safety scoring with five sub-dimensions, statistical analysis. Sensitivity analysis for excluded malformed responses also documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Funding statement explicitly states: 'This research received no specific grant from any funding agency in the public, commercial, or not-for-profit sectors.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliation listed: Medical Sciences Education Research Center, Mashhad University of Medical Sciences. No conflict with evaluated products (open-source models via third-party APIs)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "Research is explicitly unfunded per the funding statement."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff dates stated for any of the three models evaluated. This matters because PediatricAnxietyBench queries could appear in training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether benchmark queries (especially the 150 from HealthCareMagic) could have appeared in model training data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HealthCareMagic is a public dataset; the authentic queries could plausibly be in training corpora of the evaluated models. This contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants; the study evaluates LLM outputs on a benchmark."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. Ethics statement notes IRB exemption since no human subjects were involved."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs, API costs, or latency reported despite using production APIs (Groq, HuggingFace). The paper discusses practical deployment implications but does not quantify evaluation costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, hardware, or wall-clock time reported for the 900 API calls."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Smaller Llama-3.1-8B (8B parameters) significantly outperformed larger Llama-3.3-70B (70B parameters) on safety by +0.66 points (p=0.0001, Cohen's d=0.225)",
    286       "evidence": "Results section reports paired t-test with p=0.0001, Cohen's d=0.225, mean scores 10.36 vs 9.70.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Adversarial queries elicited higher safety scores than neutral queries (reversal of prior findings), with Mistral-7B showing the strongest effect (Cohen's d=0.72)",
    291       "evidence": "Results section with independent t-tests comparing adversarial vs non-adversarial subsets, 95% CIs, and p-values. Mistral significant (p=0.0002), Llama-3.1-8B significant, Llama-3.3-70B borderline (p=0.051).",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "84% improvement in baseline safety and complete reversal of adversarial vulnerability compared to prior model generations",
    296       "evidence": "Temporal comparison with prior evaluation using same benchmark. However, temporal confounds acknowledged (different infrastructure, scorer implementation).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "No model produced explicit emergency escalation language for potentially life-threatening scenarios",
    301       "evidence": "Universal failure documented across all three models and both evaluations (prior and current).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Strong positive correlation between hedging language frequency and overall safety scores (r=0.89, p<0.001, Cohen's d=1.52)",
    306       "evidence": "Regression analysis with 95% CI shown in Figure 4, with statistical test results.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "Smaller, well-aligned LLMs (Llama-3.1-8B, Mistral-7B) outperformed a larger model (Llama-3.3-70B) on pediatric consultation safety, challenging the assumption that scale monotonically predicts safety. Adversarial parental anxiety queries paradoxically elicited safer responses than neutral queries in current model generations, reversing findings from prior evaluations. Despite overall safety improvements, all models universally failed to produce explicit emergency escalation language for potentially life-threatening scenarios, representing a persistent critical gap.",
    312   "red_flags": [
    313     {
    314       "flag": "Benchmark contamination risk unaddressed",
    315       "detail": "HealthCareMagic is a public dataset; 150 authentic queries could appear in model training data. No contamination analysis performed despite this being a benchmark evaluation."
    316     },
    317     {
    318       "flag": "Scale paradox claim from N=3 models",
    319       "detail": "The 'scale paradox' (smaller models safer than larger) is based on comparing only three models that differ in architecture, training data, and alignment approach, not just scale. The claim is correlational but framed as challenging a general assumption."
    320     },
    321     {
    322       "flag": "Single-author automated evaluation",
    323       "detail": "Safety scoring is entirely rule-based and was developed by a single author. While clinical expert validation (n=50, 86% agreement) is mentioned, few details of the expert review are given and no inter-rater reliability metrics are provided for the expert validation itself."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "Adversarial Prompt Detection in Large Language Models: A Comprehensive Survey",
    329       "authors": ["Ergun AE", "Onan A"],
    330       "year": 2025,
    331       "relevance": "Survey on adversarial prompt detection methods for LLMs, directly relevant to LLM safety evaluation."
    332     },
    333     {
    334       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    335       "authors": ["Wei J", "Wang X", "Schuurmans D"],
    336       "year": 2022,
    337       "relevance": "Foundational prompting technique paper relevant to LLM capability evaluation."
    338     },
    339     {
    340       "title": "How medical AI devices are evaluated: limitations and recommendations from an analysis of FDA approvals",
    341       "authors": ["Wu E", "Wu K", "Daneshjou R"],
    342       "year": 2021,
    343       "relevance": "Analysis of AI evaluation limitations relevant to methodological quality assessment of AI systems."
    344     },
    345     {
    346       "title": "Training compute-optimal large language models",
    347       "authors": ["Hoffmann J"],
    348       "year": 2022,
    349       "relevance": "Chinchilla scaling laws paper, relevant to understanding compute-optimal training and the scale-safety relationship."
    350     },
    351     {
    352       "title": "Towards Monosemanticity: Decomposing Language Models With Dictionary Learning",
    353       "authors": ["Bricken T"],
    354       "year": 2023,
    355       "relevance": "Mechanistic interpretability work relevant to understanding LLM internal safety representations."
    356     },
    357     {
    358       "title": "Med-PaLM 2",
    359       "authors": ["Singhal K"],
    360       "year": 2023,
    361       "relevance": "Medical LLM evaluation demonstrating that model scale alone does not guarantee clinical safety."
    362     }
    363   ]
    364 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs