calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (15102B)
      1 {
      2   "paper_slug": "agentic-ai-security-survey-2025",
      3   "calibration_model": "opus",
      4   "scan_model": "sonnet",
      5   "timestamp": "2026-02-28",
      6   "total_questions": 50,
      7   "agreement_count": 49,
      8   "disagreement_count": 1,
      9   "agreement_rate": 0.98,
     10   "disagreements": [
     11     {
     12       "category": "conflicts_of_interest",
     13       "question": "funder_independent_of_outcome",
     14       "sonnet": {"applies": false, "answer": false},
     15       "opus": {"applies": true, "answer": false},
     16       "direction": "applies_boundary",
     17       "note": "Sonnet treats the absence of a funding disclosure as equivalent to 'unfunded' (applies=false/NA). Opus treats it as applies=true, answer=false: the schema says 'NA if unfunded', but the paper never confirms that it is unfunded, and university researchers typically have some form of funding. The absence of a funding disclosure does not confirm absence of funding. Strict reading: if we cannot confirm the work is unfunded, we cannot claim NA."
     18     }
     19   ],
     20   "opus_checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No code repository, GitHub link, or Zenodo archive is mentioned anywhere in the paper. The survey releases no analysis scripts or supporting code."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No dataset of reviewed papers is released. The bibliography is cited inline but no structured, machine-readable corpus is provided for download."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No environment or tooling specification is provided. No mention of reference management software, analysis tools, or any computational environment used to conduct the review."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No reproduction instructions are included. The paper does not describe how a reader could replicate the literature search, paper selection, or taxonomy construction process."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "This is a narrative survey with no quantitative meta-analysis conducted by the authors. Statistical uncertainty measures are structurally inapplicable."
     48       },
     49       "significance_tests": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "The survey authors make no comparative statistical claims of their own. NA for a narrative survey."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No effect sizes are computed by the survey authors. Statistics cited (e.g., '94.4% of agents vulnerable') are from cited works, not the authors' own analysis."
     58       },
     59       "sample_size_justified": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "No experiments are conducted by the survey authors, so sample size justification is structurally inapplicable."
     63       },
     64       "variance_reported": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No experimental runs are conducted by the survey authors. NA for a narrative survey."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The survey does not systematically compare its taxonomy or coverage against prior surveys. Section 2 briefly mentions existing surveys [47-52] and claims this work is 'distinct' but provides no structured comparison of coverage, methodology, or scope."
     75       },
     76       "baselines_contemporary": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No baseline comparison is conducted, so assessing whether baselines are contemporary is structurally inapplicable. There are no baselines to evaluate."
     80       },
     81       "ablation_study": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No system with components to ablate is proposed. This is a survey paper."
     85       },
     86       "multiple_metrics": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No empirical evaluation is performed by the survey itself that would require multiple metrics."
     90       },
     91       "human_evaluation": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No system outputs are generated by this survey that would warrant human evaluation."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "No empirical evaluation with train/test splits is conducted."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The survey provides detailed per-category breakdowns: Figure 2 and Section 3 organize threats into five major categories with subcategories; Table 1 provides defense coverage across seven dimensions; Table 2 organizes benchmarks by capability vs. security-specific with detailed attribute columns."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The survey discusses failure modes of defense approaches throughout. Section 4.1.1 notes fine-tuning 'can degrade the general-purpose capabilities of LLMs without providing significant defensive capabilities against adaptive attacks.' Section 4.1.4 notes training-free defenses 'remain fragile against adaptive attacks.' Section 6 discusses open challenges representing current failure areas."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The survey consistently reports negative results from cited work: defensive fine-tuning degrading general capabilities (Section 4.1.1, citing [179]), prompt augmentation being bypassed by adaptive attacks (Section 4.1.3), sandboxing reducing collaborative efficiency (Section 4.3), and agents' poor CAPTCHA performance (Section 3.4.3)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims the paper 'outlines a taxonomy of threats specific to agentic AI, reviews recent benchmarks and evaluation methodologies, and discusses defense strategies.' All three are delivered in Sections 3, 5, and 4 respectively. No overclaiming is present."
    122       },
    123       "causal_claims_justified": {
    124         "applies": false,
    125         "answer": false,
    126         "justification": "The survey makes no causal claims of its own. It summarizes causal findings from cited works with appropriate attribution (e.g., 'Lupinacci et al. demonstrated...', 'Fang et al. demonstrate...')."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The survey appropriately attributes claims to specific systems, models, or studies (e.g., 'GPT-4 achieving 87% success rate'). Section 6.5 explicitly states 'this survey primarily focuses on software-based agents' and notes physically embodied agents as underexplored."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "This is a pure survey/taxonomy paper that presents no original empirical results. The schema states NA for 'papers that present no empirical results (e.g., pure surveys or taxonomies).'"
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "The survey does not run any LLM experiments itself. When discussing cited work, it uses model names as reported in those papers, which is appropriate for a survey."
    144       },
    145       "prompts_provided": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "The survey does not use prompting in its own methodology."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No experiments are conducted by the survey authors. Hyperparameters are structurally inapplicable."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No agentic scaffolding is used by the survey itself."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No systematic search methodology is described. The paper does not explain how papers were identified for inclusion, what databases were searched, what search terms were used, or what inclusion/exclusion criteria were applied. There is no PRISMA diagram or paper selection pipeline."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "There is no dedicated limitations or threats-to-validity section. Section 6 ('Open Challenges') discusses gaps in the field of agentic AI security, but does not discuss limitations of the survey itself (e.g., potential coverage gaps, selection bias, recency bias)."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No threats to the validity of the survey's own methodology are discussed. The paper does not address whether it missed relevant papers, whether its taxonomy is complete, or whether its selection of cited papers is representative."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6.5 explicitly states 'this survey primarily focuses on software-based agents' and notes that 'physically embodied agents introduce additional and largely unexplored security risks.' The introduction also distinguishes the security focus from capability surveys."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No structured dataset of reviewed papers is released. The bibliography provides references but not a machine-readable corpus with coded attributes, making independent verification of coverage claims impossible."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No description of how papers were collected. The survey does not specify search databases, date ranges, search queries, or any systematic selection process. Papers appear selected based on the authors' judgment without a documented protocol."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants are involved in this survey. This is a literature review, not a human subjects study."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "There is no documented pipeline from paper collection to final taxonomy. The survey presents a taxonomy and cites supporting papers, but the process by which papers were screened, included, or excluded is entirely opaque."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No acknowledgments section is present. No funding sources are disclosed anywhere in the paper."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations are clearly listed on the title page: 'Bellini College of AI, Cybersecurity, and Computing, University of South Florida.' Contact email addresses are also provided."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding is disclosed, so funder independence cannot be assessed. The schema says 'NA if unfunded,' but the paper does not confirm it is unfunded—university researchers typically receive some form of funding support. The absence of a funding disclosure is not the same as confirmation of being unfunded."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "There is no competing interests or financial disclosure statement in the paper. Absence of declaration is not the same as absence of conflict. The corresponding author (Chhabra) cites two of his own works [145, 146]."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "This is a survey paper that does not evaluate any pre-trained model's capabilities on benchmarks. Contamination questions are structurally inapplicable."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No model evaluation is conducted by the survey itself. NA for a survey paper."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No model evaluation is conducted. NA for a survey paper."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved. This is a literature survey."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants or experimental conditions are involved."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants or experimental conditions are involved."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants are involved."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "This is a survey paper that does not propose or evaluate a system with inference costs."
    286       },
    287       "compute_budget_stated": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "This is a survey paper requiring no computational budget for its own methodology."
    291       }
    292     }
    293   }
    294 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs