scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28039B)
      1 {
      2   "paper": {
      3     "title": "Policy-as-Prompt: Turning AI Governance Rules into Guardrails for AI Agents",
      4     "authors": ["Gauri Kholkar", "Ratinder Ahuja"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025 Workshop on Regulatable ML",
      7     "arxiv_id": "2509.23994"
      8   },
      9   "scan_version": 3,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval", "case-study"],
     12   "key_findings": "The Policy-as-Prompt framework converts unstructured design documents (PRDs, TDDs) into enforceable LLM guardrails via a two-stage pipeline: policy tree extraction and prompt-based classification. O1 significantly outperforms other models in policy extraction (60% F1 on HR vs ≤25% for others). For runtime policy enforcement, GPT-4o achieves 73%/71% accuracy on input/output classification while smaller models (Qwen3-1.7B) reach 66%/59%. The evaluation is limited to two enterprise domains (HR, SOC) with small proprietary gold sets (100 inputs/outputs each) that cannot be released.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The evaluation data consists of proprietary internal enterprise artifacts. Section 6 states: 'Reproducibility is further hindered by confidentiality of internal artifacts and logs: we cannot release the full corpora.'"
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, dependency lists, or setup instructions are provided."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No reproduction instructions are provided. The proprietary nature of the data and the lack of released code make reproduction impossible."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Tables 2, 3, and 4 report only point estimates (e.g., '0.73' accuracy, '60.0' F1) with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims 'o1 model significantly outperforms all other models' but provides no statistical significance tests — the comparison is based solely on comparing raw numbers."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Only raw accuracy and F1 scores are reported. No formal effect sizes (Cohen's d, odds ratios) are provided for the model comparisons."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The gold set consists of 100 inputs and 100 outputs per application. No justification for this sample size is given, and no power analysis is discussed."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Section 4 states 'All reported metrics are the average of multiple runs to ensure stability' but never reports standard deviation, variance, or the number of runs, making it impossible to assess result stability."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Multiple models are compared: O1, GPT-OSS 120B, Llama 405B, Claude 3.5 for POLICY-TREE-GEN (Tables 2, 4), and GPT-4o, Qwen3-1.7B, Gemma-1B for POLICY-AS-PROMPT-GEN (Table 3)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The models used are contemporary: o1, GPT-4o, GPT-OSS 120B, Llama 3 405B, Claude Sonnet 3.5, Gemma 3 1B, and Qwen3 1.7B."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The system has multiple components (POLICY-TREE-GEN with two-step parse/classify and enrich, POLICY-AS-PROMPT-GEN with input/output classifiers). No ablation study examines the contribution of individual components."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics are used: Recall, F1, Macro-F1, per-class F1 (Table 2); Detection Precision, Micro-F1, Span Exact, Token-F1, Substr, Emb Cos (Table 4); Input/Output Accuracy (Table 3)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Security engineers reviewed the generated policies as part of deployment approval ('Both policies were judged request review by security engineers'), but this is a workflow step, not a systematic human evaluation with reported inter-rater agreement or quantified assessments."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "100 gold inputs and outputs are used for testing, but there is no explicit separation of dev and test splits. It is unclear whether prompts were tuned using any portion of this gold set."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by application (HR vs SOC), by model, and by per-class F1 (ID-I, ID-O, OOD-I, OOD-O) in Table 2. Input vs output accuracy is shown separately in Table 3."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No qualitative failure analysis is provided. The paper notes some models had low scores and two were excluded for poor performance, but does not show specific examples of where the approach breaks down or analyze error patterns."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Gemma 3 1B and Qwen3 1.7B were excluded from POLICY-TREE-GEN due to poor performance. Claude 3.5 Sonnet achieved very low F1 scores (8.0% on HR, 5.4% on SOC). These negative results are reported transparently."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims the system 'reduces prompt-injection risk, blocks out-of-scope requests, and limits toxic outputs.' Tables 2-4 show the system can classify inputs/outputs as ID/OOD with measured accuracy, supporting these claims at the measured performance level (68-73% for GPT-4o)."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract claims the system 'reduces prompt-injection risk' and 'limits toxic outputs,' which are causal claims. The evaluation only measures classification accuracy on gold examples, not actual risk reduction in deployment. No causal design (RCT, A/B test) supports these claims."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The abstract and title claim 'scalable AI safety and AI security assurance for regulatable ML' and 'Guardrails for AI Agents' broadly, but the evaluation covers only two enterprise domains (HR, SOC) with proprietary data. The limitations section acknowledges this but the framing throughout is much broader than the evidence."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The limitations section mentions prompt tuning bias and limited domains but does not discuss alternative explanations for the observed results — e.g., whether gold test set difficulty, policy structure simplicity, or model familiarity with similar tasks could explain the performance patterns."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper measures classification accuracy on gold ID/OOD labels and frames this as 'reduces prompt-injection risk' and 'enables secure-by-design deployment.' The gap between accuracy on curated gold examples and actual risk reduction in production is not acknowledged."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are identified by marketing names: 'o1', 'GPT-4o', 'Claude Sonnet 3.5', 'Llama 3 405B', 'Gemma 3 1B', 'Qwen 3 1.7B'. No API version identifiers, snapshot dates, or model checksums are provided."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompt text is provided in the appendix: Figure 2 (Pass 1 parsing prompt), Figure 3 (Pass 2 example extraction prompt), Figure 4 (HR input classifier prompt), and Figure 5 (SOC content classifier prompt)."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No hyperparameters (temperature, top-p, max tokens, sampling settings) are reported for any of the models used."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The two-stage pipeline (POLICY-TREE-GEN → POLICY-AS-PROMPT-GEN) is described in Sections 2-3 with Figure 1 showing the overall architecture. The two-pass extraction process and the AI double-checking agent are described."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Only minimal preprocessing is mentioned: 'all documents were converted to Markdown format, and any embedded images were replaced with textual descriptions generated by gpt-4o.' No details on how the 100 gold inputs/outputs were created, selected, or validated."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 6 is a dedicated 'Limitations' section with substantive discussion of generalizability, reproducibility, and evaluation constraints."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The limitations section identifies threats specific to this study: 'modest gold sets (100 inputs and 100 outputs per application)', 'prompting effort was greater for GPT-family models, potentially biasing results', and 'confidentiality of internal artifacts and logs' preventing reproduction."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 explicitly states: 'Our evaluation spans two enterprise-style domains (HR, SOC)...which constrains transferability to other settings and larger, more heterogeneous corpora.' This acknowledges the narrow scope of the evaluation."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Section 6 states: 'Reproducibility is further hindered by confidentiality of internal artifacts and logs: we cannot release the full corpora.' No raw data is available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper states artifacts were 'sourced from real-world, internal enterprise projects' and 'gold policies were created by security engineers,' but provides no detail on how projects were selected, how gold labels were determined, or the annotation process."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "No description of how the enterprise projects were selected, how security engineers who created gold policies were chosen, or whether the selection could introduce bias."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The high-level pipeline is shown in Figure 1, but the full data pipeline from raw enterprise documents to final evaluation metrics is not documented. How gold examples were constructed from the artifacts is unexplained."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source is disclosed. Both authors are from Pure Storage but there is no acknowledgments section or funding statement."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Both authors list Pure Storage as their affiliation with institutional email addresses."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed. The authors work at Pure Storage and evaluate a framework potentially useful for their employer's products. Independence of outcome cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial interests declaration is provided."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the evaluated models (o1, GPT-4o, Claude 3.5, Llama 405B, etc.)."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of potential train/test overlap. While the evaluation data is proprietary (reducing contamination risk), this is not explicitly discussed."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether models could have been exposed to similar policy documents or classification tasks during training."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in the study. Security engineers who create gold policies and review outputs are part of the development team, not research subjects."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in the study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in the study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost or latency is reported despite the paper claiming 'lightweight' and 'real-time' monitoring. The framework involves multiple LLM calls per input but no cost quantification is provided."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No computational budget (API costs, GPU hours, total runs) is stated despite evaluating multiple large language models across multiple runs."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Results are stated to be 'the average of multiple runs' but no seed sensitivity analysis, standard deviation, or variation across runs is reported."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Section 4 says 'All reported metrics are the average of multiple runs to ensure stability' but the exact number of runs is never stated."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget is reported. The prompts appear tuned (especially for GPT models per the limitations section) but no search details are provided."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No description of how the final configurations or prompt versions were selected. The limitations section acknowledges GPT-biased tuning but provides no selection methodology."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Multiple models are compared across two applications with many metrics. No correction for multiple comparisons is applied."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors evaluate their own framework without acknowledging self-evaluation bias or using independent evaluators. Gold policies were created by security engineers, but whether these were independent of the framework developers is not stated."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Models ranging from 1B to 405B parameters are compared without discussing compute budget differences. Performance is not reported as a function of compute."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether the gold set of 100 inputs/outputs actually measures real-world policy enforcement capability, or whether accuracy on this specific benchmark correlates with deployment safety."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "The same prompts and pipeline structure are used across all model comparisons within each stage (POLICY-TREE-GEN and POLICY-AS-PROMPT-GEN), holding the scaffold constant. The limitations section acknowledges prompt tuning was GPT-biased."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether any models could have been trained on similar policy documents or classification tasks."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup or prompt context leaks classification information to the models."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of independence between training data and the proprietary test artifacts."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is used or discussed."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "O1 significantly outperforms all other models in policy tree extraction, with 60% F1 on HR and 22.6% on SOC.",
    364       "evidence": "Table 2 shows O1 achieving 60.0% F1 and 53.3% recall on HR, compared to 25.0% F1 for GPT-OSS 120B and 14.5% for Llama 405B. On SOC, O1 achieves 22.6% F1 vs ≤10.3% for others.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "GPT-4o achieves 73% input and 71% output classification accuracy for policy enforcement, demonstrating 'significant utility in a real-world context.'",
    369       "evidence": "Table 3 shows GPT-4o accuracies of 0.73 (input) and 0.71 (output) for HR, and 0.70/0.68 for SOC. No confidence intervals, significance tests, or comparison to a no-guardrail baseline.",
    370       "supported": "weak"
    371     },
    372     {
    373       "claim": "Smaller models can effectively enforce policies with curated prompts.",
    374       "evidence": "Table 3 shows Qwen3-1.7B at 0.66/0.59 (HR) and 0.66/0.61 (SOC), roughly 4-12 percentage points below GPT-4o. Gemma-1B at 0.40/0.32 is substantially lower.",
    375       "supported": "weak"
    376     },
    377     {
    378       "claim": "The system reduces prompt-injection risk, blocks out-of-scope requests, and limits toxic outputs.",
    379       "evidence": "Table 1 shows examples of blocking prompt injection and PII-containing requests. Table 3 shows overall classification accuracy but does not isolate prompt-injection-specific performance. No before/after comparison of attack surface reduction.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "Models with high span quality metrics can still have low classification performance, indicating extraction and classification are separable capabilities.",
    384       "evidence": "Table 4 shows Llama 405B and Claude 3.5 with high Token-F1 (1.000 and 0.992) and Substr (100%) on HR but low Micro-F1 (10.9% and 4.0%), demonstrating the extraction-classification gap.",
    385       "supported": "moderate"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Unreproducible evaluation",
    391       "detail": "The entire evaluation relies on proprietary internal enterprise artifacts that cannot be released. The authors explicitly state 'we cannot release the full corpora,' making independent verification impossible."
    392     },
    393     {
    394       "flag": "No variance despite averaging",
    395       "detail": "The paper states results are 'the average of multiple runs to ensure stability' but reports no standard deviation, number of runs, or any measure of variability. This makes it impossible to assess whether differences between models are meaningful."
    396     },
    397     {
    398       "flag": "Uneven prompt tuning acknowledged",
    399       "detail": "The limitations section admits 'prompting effort was greater for GPT-family models, potentially contributing to performance gaps,' introducing systematic bias favoring GPT models in the evaluation."
    400     },
    401     {
    402       "flag": "Small evaluation set for broad claims",
    403       "detail": "Only 100 gold inputs and 100 gold outputs per application are used, yet the paper claims 'scalable AI safety and AI security assurance for regulatable ML.' The gap between the evaluation scope and the claim scope is large."
    404     },
    405     {
    406       "flag": "Company employees evaluating own framework",
    407       "detail": "Both authors are from Pure Storage and evaluate their own proposed framework. Gold policies were created by 'security engineers' whose independence from the authors is not stated."
    408     }
    409   ],
    410   "cited_papers": [
    411     {
    412       "title": "Security of AI agents",
    413       "authors": ["Yifeng He", "Ethan Wang", "Yuyang Rong", "Zifei Cheng", "Hao Chen"],
    414       "year": 2024,
    415       "arxiv_id": "2406.08689",
    416       "relevance": "Directly addresses security risks of autonomous AI agents, the core motivation for the Policy-as-Prompt framework."
    417     },
    418     {
    419       "title": "Building guardrails for large language models",
    420       "authors": ["Yi Dong", "Ronghui Mu", "Gaojie Jin", "Yi Qi", "Jinwei Hu", "Xingyu Zhao", "Jie Meng", "Wenjie Ruan", "Xiaowei Huang"],
    421       "year": 2024,
    422       "arxiv_id": "2402.01822",
    423       "relevance": "Foundational work on LLM guardrails — the concept this paper operationalizes via policy-derived prompts."
    424     },
    425     {
    426       "title": "TrustAgent: Towards safe and trustworthy LLM-based agents",
    427       "authors": ["Wenyue Hua", "Xianjun Yang", "Mingyu Jin", "Zelong Li", "Wei Cheng", "Ruixiang Tang", "Yongfeng Zhang"],
    428       "year": 2024,
    429       "arxiv_id": "2402.01586",
    430       "relevance": "Proposes static safety principles for LLM agents, which the Policy-as-Prompt paper argues are insufficient for dynamic settings."
    431     },
    432     {
    433       "title": "Prompt injection attack against LLM-integrated applications",
    434       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang"],
    435       "year": 2024,
    436       "arxiv_id": "2306.05499",
    437       "relevance": "Describes prompt injection attacks that the Policy-as-Prompt system aims to detect and block."
    438     },
    439     {
    440       "title": "Commercial LLM agents are already vulnerable to simple yet dangerous attacks",
    441       "authors": ["Ang Li", "Yin Zhou", "Vethavikashini Chithrra Raghuram", "Tom Goldstein", "Micah Goldblum"],
    442       "year": 2025,
    443       "arxiv_id": "2502.08586",
    444       "relevance": "Demonstrates real-world vulnerability of commercial LLM agents to attacks, motivating the need for guardrail frameworks."
    445     },
    446     {
    447       "title": "Contextual agent security: A policy for every purpose",
    448       "authors": ["Lillian Tsai", "Eugene Bagdasarian"],
    449       "year": 2025,
    450       "arxiv_id": "2501.17070",
    451       "relevance": "Argues for context-aware security for AI agents, which the Policy-as-Prompt framework claims to implement."
    452     },
    453     {
    454       "title": "LLM agents should employ security principles",
    455       "authors": ["Kaiyuan Zhang", "Zian Su", "Pin-Yu Chen", "Elisa Bertino", "Xiangyu Zhang", "Ninghui Li"],
    456       "year": 2025,
    457       "arxiv_id": "2505.24019",
    458       "relevance": "Proposes security principles (including least privilege) for LLM agents, directly related to the guardrail enforcement approach."
    459     },
    460     {
    461       "title": "Capture: Context-aware prompt injection testing and robustness enhancement",
    462       "authors": ["Gauri Kholkar", "Ratinder Ahuja"],
    463       "year": 2025,
    464       "arxiv_id": "2505.12368",
    465       "relevance": "Prior work by the same authors on prompt injection testing, forming the security testing foundation for this framework."
    466     },
    467     {
    468       "title": "ProPILE: Probing Privacy Leakage in Large Language Models",
    469       "authors": ["Siwon Kim", "Sangdoo Yun", "Hwaran Lee", "Martin Gubri", "Sungroh Yoon", "Seong Joon Oh"],
    470       "year": 2023,
    471       "arxiv_id": "2307.01881",
    472       "relevance": "Addresses privacy leakage in LLMs, relevant to the PII protection guardrails evaluated in the HR application."
    473     },
    474     {
    475       "title": "A survey of LLM-based agents in medicine: How far are we from Baymax?",
    476       "authors": ["Wenxuan Wang", "Zizhan Ma", "Zheng Wang"],
    477       "year": 2025,
    478       "arxiv_id": "2502.11211",
    479       "relevance": "Surveys LLM-based agents in regulated settings (medicine), contextualizing the need for governance guardrails."
    480     }
    481   ],
    482   "engagement_factors": {
    483     "practical_relevance": {
    484       "score": 2,
    485       "justification": "The policy-to-guardrail pipeline addresses a real enterprise need, but 68-73% accuracy and unreleased code limit immediate adoption."
    486     },
    487     "surprise_contrarian": {
    488       "score": 0,
    489       "justification": "Confirms the expected approach of using LLMs to enforce policy compliance — no counterintuitive findings."
    490     },
    491     "fear_safety": {
    492       "score": 1,
    493       "justification": "Addresses AI safety/guardrails but does not demonstrate novel attacks or reveal new risks."
    494     },
    495     "drama_conflict": {
    496       "score": 0,
    497       "justification": "No controversy, no critique of existing approaches or companies."
    498     },
    499     "demo_ability": {
    500       "score": 0,
    501       "justification": "No code, demo, or tool is released. Prompts are shown in the appendix but cannot be independently run."
    502     },
    503     "brand_recognition": {
    504       "score": 1,
    505       "justification": "Pure Storage is a known enterprise storage company but not prominent in AI research."
    506     }
    507   }
    508 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs