calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (15934B)
      1 {
      2   "paper_slug": "adversarial-threat-vectors-2025",
      3   "calibration_date": "2026-02-28",
      4   "total_questions": 50,
      5   "agreement_count": 49,
      6   "disagreement_count": 1,
      7   "agreement_rate": 0.98,
      8   "disagreements": [
      9     {
     10       "category": "evaluation_design",
     11       "question": "multiple_metrics",
     12       "sonnet": { "applies": true, "answer": true },
     13       "opus": { "applies": false, "answer": false },
     14       "direction": "applies_boundary",
     15       "explanation": "Sonnet treated the OWASP-derived risk scoring factors (Likelihood Factor, Impact Factor, etc.) as 'multiple evaluation metrics.' Opus disagrees: the schema asks whether multiple evaluation metrics are used to evaluate a system's performance. This paper does not evaluate any system — it applies a qualitative risk-scoring model to a hypothetical RAG architecture. The OWASP factors are dimensions of a risk assessment framework, not evaluation metrics for measuring system output quality. The criterion is structurally inapplicable to a framework/position paper that runs no experiments."
     16     }
     17   ],
     18   "opus_checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No code repository or archive is provided anywhere in the paper. The paper presents a qualitative threat modeling framework with no accompanying software artifacts."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No datasets were collected or released. The paper conducts qualitative analysis referencing existing frameworks (MITRE ATLAS, OWASP) rather than any original data collection."
     29       },
     30       "environment_specified": {
     31         "applies": false,
     32         "answer": false,
     33         "justification": "The paper is a theoretical/analytical framework paper with no computational experiments, so environment specification is not applicable."
     34       },
     35       "reproduction_instructions": {
     36         "applies": false,
     37         "answer": false,
     38         "justification": "No experiments were conducted that would require reproduction instructions. The paper presents a threat modeling methodology applied to a hypothetical system."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "The paper presents qualitative risk scores (likelihood × impact) derived from OWASP-style factor analysis, not statistical experiments. No confidence intervals are applicable."
     46       },
     47       "significance_tests": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No statistical comparisons between conditions are made. The risk reduction figures are deterministic scoring calculations based on expert judgment, not empirical measurements."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "The paper uses a qualitative risk-scoring model (inherent vs. residual risk) rather than empirical experiments that would yield effect sizes."
     56       },
     57       "sample_size_justified": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No sample of participants, examples, or cases is drawn. The analysis is applied to a generic hypothetical RAG system architecture."
     61       },
     62       "variance_reported": {
     63         "applies": false,
     64         "answer": false,
     65         "justification": "No experimental runs were conducted. The risk scores are deterministic expert assessments, not repeated measurements."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "The paper proposes a threat modeling framework and control prioritization scheme rather than evaluating a system empirically; there are no baselines to compare against."
     73       },
     74       "baselines_contemporary": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No baseline comparison is conducted; this criterion does not apply to a framework/position paper."
     78       },
     79       "ablation_study": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No implemented system is evaluated. The paper applies qualitative risk analysis to a generic RAG architecture, so component ablation is not applicable."
     83       },
     84       "multiple_metrics": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "The schema asks whether multiple evaluation metrics are used to evaluate a system's performance. This paper does not evaluate any system — it applies a qualitative risk-scoring model (OWASP factors) to a hypothetical RAG architecture. The OWASP factors (Likelihood, Impact, Ease of Exploit, etc.) are dimensions of a risk assessment framework, not evaluation metrics for measuring system output quality. This criterion is structurally inapplicable to a framework paper with no experiments."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "There are no AI system outputs being evaluated; the paper is a framework paper with qualitative risk analysis, making human evaluation of system outputs irrelevant."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "No machine learning evaluation is conducted. The paper does not train or test any model on held-out data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper provides breakdowns by threat model (Sensitive Information Disclosure in Section 3.3.1 vs. RAG System Poisoning in Section 3.3.2) and by attack stage, with separate risk scoring figures for each (Figures 7-10)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 4 discusses residual risks that remain after applying controls, including 'sophisticated insider threats or advanced supply chain compromises' that 'will continue to challenge even well-defended RAG architectures.'"
    108       },
    109       "negative_results_reported": {
    110         "applies": false,
    111         "answer": false,
    112         "justification": "The paper does not run experiments, so there are no negative empirical results to report. The discussion of residual risk is conceptual, not experimental."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims to identify prominent attack vectors and propose a prioritized control list. The paper delivers on both: Section 3.3 identifies prompt injection, data poisoning, and adversarial query manipulation; Table 3 provides the prioritized control list. Claims are descriptive and matched by content."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes causal claims about control effectiveness, e.g., 'Adversarial Training... mitigate risks by increasing the Skill Level required' and reports numerical risk reductions (Overall Risk Severity from 19.5 to 10.41). These causal claims rely entirely on expert judgment applied to an OWASP scoring formula with no empirical validation — no red-team tests, measured attack success rates, or before/after comparisons support the specific numerical reductions."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper frames conclusions broadly ('organizations can better protect the integrity and reliability of their systems') without restricting claims to specific RAG architectures, deployment contexts, or adversary capability models. The generic RAG system is hypothetical, and results are presented as generally applicable."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for why the proposed controls would or would not work, alternative threat prioritizations, or reasons the risk reduction estimates might be inaccurate. Section 4 acknowledges residual risks but does not consider alternative threat models or limitations of the scoring methodology."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "No specific LLM is used or evaluated in experiments. The paper analyzes a generic, hypothetical RAG architecture."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompts are used in experiments. The paper does not conduct any LLM prompting as part of its methodology."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No models are trained or queried. This is a framework paper with no computational experiments."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "The paper proposes a threat modeling methodology for RAG systems but does not itself implement or evaluate an agentic scaffold."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No data collection or preprocessing is performed. The risk scoring is based on expert judgment applied to a hypothetical system, not on processed datasets."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section. Section 4 (Discussion and Future Work) briefly notes residual risks in a single paragraph, but this does not constitute a substantive limitations discussion."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to the validity of the threat modeling analysis or risk reduction calculations are discussed. The paper does not address whether the OWASP scoring factors are calibrated for LLM systems, whether the Pyramid of Pain framework transfers to AI, or whether the residual risk numbers are meaningful."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the analysis does NOT cover. It analyzes a 'generic RAG system' for enterprise knowledge management but does not delineate which deployment contexts, adversary capabilities, or RAG variants are excluded from its conclusions."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No empirical data was collected. The risk scores are constructed from expert judgment applied to OWASP scoring factors; there is no raw dataset to verify."
    186       },
    187       "data_collection_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No primary data was collected. The analysis applies existing frameworks (MITRE ATLAS, OWASP Top 10 for LLMs) to a generic hypothetical RAG architecture."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No participants or samples were recruited. The paper is a framework analysis with no human subjects or empirical data collection."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No data pipeline exists. The methodology is a five-stage qualitative threat modeling process applied to a hypothetical system, not a data processing pipeline."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The acknowledgments section thanks Dr. Mike Tan for conversations but discloses no funding source or grant. The authors are from Fire Mountain Labs (a private company) but no sponsoring entity is stated."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly stated: both are from Fire Mountain Labs, San Diego, CA. Reference [9] reveals the AI Security Pyramid of Pain framework is their own prior work (Ward, Harguess et al., 2024)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": false,
    216         "answer": false,
    217         "justification": "No funding source is disclosed. The schema specifies N/A when the work is unfunded. The paper appears to be either unfunded or self-funded by Fire Mountain Labs."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "There is no competing interests statement or financial disclosure. The authors promote their own prior framework (AI Security Pyramid of Pain) as the central organizing structure without disclosing any commercial interests in its adoption."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "The paper does not evaluate any pre-trained model's benchmark performance. It proposes a threat modeling framework for generic RAG systems."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No benchmark evaluation of a pre-trained model is conducted. Contamination is not applicable to this framework paper."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No benchmark is used to evaluate a pre-trained model's knowledge. The paper is a threat modeling framework, not a capability evaluation."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved. The paper presents a qualitative threat modeling framework."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants or experimental conditions are involved."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants or evaluators are involved."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants are involved."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "The paper is a framework/position paper proposing a threat modeling methodology, not an empirical system. No inference costs are incurred by the paper's own method."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No computational experiments were run. The methodology is expert-judgment-based qualitative analysis requiring no compute budget."
    289       }
    290     }
    291   }
    292 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs