calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (19940B)
      1 {
      2   "paper_slug": "aegis-automated-coevolutionary-2025",
      3   "calibration_date": "2026-02-28",
      4   "calibration_model": "opus",
      5   "scan_model": "sonnet",
      6   "total_questions": 50,
      7   "agreement_count": 44,
      8   "disagreement_count": 6,
      9   "agreement_rate": 0.88,
     10   "disagreements": [
     11     {
     12       "question": "evaluation_design.per_category_breakdown",
     13       "sonnet": {"applies": false, "answer": false},
     14       "opus": {"applies": true, "answer": false},
     15       "direction": "applies_boundary",
     16       "explanation": "Sonnet marked applies=false, reasoning that 'the task is binary classification with a single dataset' and 'there are no meaningful sub-categories.' Opus disagrees: the 143 malicious articles contain 'a wide variety of successful prompt injections' with different strategies. A per-category breakdown by injection type/strategy would be meaningful and informative. The criterion applies — the paper could have provided such a breakdown but did not."
     17     },
     18     {
     19       "question": "setup_transparency.model_versions_specified",
     20       "sonnet": {"applies": true, "answer": true},
     21       "opus": {"applies": true, "answer": false},
     22       "direction": "sonnet_generous",
     23       "explanation": "Sonnet credited 'gpt-4.1-mini', 'GPT-5-mini', 'GPT-4.1-nano', 'Gemini-2.0-flash', 'Gemini-2.5-flash-lite' as specific model versions. The schema explicitly states: 'Marketing names like \"Gemini-2.5\" or \"GPT-4o\" without a snapshot date or API version do NOT count as specified versions.' None of these model names include a snapshot date or API version identifier. They are marketing names and do not satisfy the criterion."
     24     },
     25     {
     26       "question": "setup_transparency.hyperparameters_reported",
     27       "sonnet": {"applies": true, "answer": true},
     28       "opus": {"applies": true, "answer": false},
     29       "direction": "sonnet_generous",
     30       "explanation": "Sonnet credited Table 5 (Appendix A.3) as a complete hyperparameter table. While the table lists GAN iterations, optimization iterations, LLM model, weights, and power parameters, it does NOT include LLM API hyperparameters: temperature, top-p, max tokens, or other sampling settings. The schema explicitly states: 'If the paper uses an LLM API without stating temperature/sampling settings, NO — these significantly affect output.' The paper uses GPT and Gemini APIs extensively without reporting temperature/sampling settings."
     31     },
     32     {
     33       "question": "contamination.training_cutoff_stated",
     34       "sonnet": {"applies": true, "answer": false},
     35       "opus": {"applies": false, "answer": false},
     36       "direction": "applies_boundary",
     37       "explanation": "Sonnet marked applies=true because the paper uses GPT and Gemini models. Opus marks applies=false because the schema says NA for papers that 'test defenses/tools rather than model knowledge.' AEGIS is a defense framework that uses LLMs as tools to detect prompt injections — it is not evaluating the LLM's inherent capability on a benchmark. The paper tests whether an automated optimization framework generates better defense prompts, not whether the underlying model has memorized benchmark answers."
     38     },
     39     {
     40       "question": "contamination.train_test_overlap_discussed",
     41       "sonnet": {"applies": true, "answer": false},
     42       "opus": {"applies": false, "answer": false},
     43       "direction": "applies_boundary",
     44       "explanation": "Same reasoning as training_cutoff_stated. The paper tests a defense tool/framework, not model knowledge on a benchmark. Contamination concerns (whether the model saw test data during training) are less relevant when the model is used as a tool for classification rather than being evaluated for its own learned capabilities."
     45     },
     46     {
     47       "question": "contamination.benchmark_contamination_addressed",
     48       "sonnet": {"applies": true, "answer": false},
     49       "opus": {"applies": false, "answer": false},
     50       "direction": "applies_boundary",
     51       "explanation": "Same reasoning as the other contamination questions. The framework uses LLMs as classification tools rather than evaluating their benchmark knowledge. The schema guidance specifies NA for 'studies that test defenses/tools rather than model knowledge.'"
     52     }
     53   ],
     54   "opus_checklist": {
     55     "artifacts": {
     56       "code_released": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No GitHub link, Zenodo archive, or other repository URL is provided anywhere in the paper. No code release is mentioned."
     60       },
     61       "data_released": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The evaluation dataset consists of student-submitted articles from an NTU course. The paper states they were 'manually modified to anonymize personal data and for copyright purposes' but no download link or public release is provided."
     65       },
     66       "environment_specified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Only the LLM model name (gpt-4.1-mini) is listed in Table 5 (Appendix A.3). No requirements.txt, Dockerfile, library versions, or environment setup section is provided."
     70       },
     71       "reproduction_instructions": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "Algorithms 1 and 2 describe the method at a high level, but no step-by-step reproduction instructions, README, or runnable scripts are provided."
     75       }
     76     },
     77     "statistical_methodology": {
     78       "confidence_intervals_or_error_bars": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Section 4.2 states experiments were conducted three times with standard deviation calculated. Figures 4 and 5 show shaded regions representing standard deviation across runs."
     82       },
     83       "significance_tests": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No statistical significance tests (t-tests, p-values, bootstrap, etc.) are applied when comparing AEGIS to baselines. Claims of outperformance are based solely on comparing point estimates without any significance testing."
     87       },
     88       "effect_sizes_reported": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Abstract states 'improves the true positive rate (TPR) by 0.20 compared to the previous state of the art, with only a slight decrease in the true negative rate (TNR) of 0.02.' Table 1 provides baseline context (from 0.64 to 0.84 TPR)."
     92       },
     93       "sample_size_justified": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Training uses 50 GPT-generated articles (40 train, 10 validation); evaluation uses 143 malicious + 100 benign student articles. No power analysis or justification for these sample sizes is provided."
     97       },
     98       "variance_reported": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4.2: 'all experiments were conducted three times. The results presented in this paper are the average values from these three runs. We also calculated the standard deviation.' Figures 4 and 5 show shaded standard deviation regions."
    102       }
    103     },
    104     "evaluation_design": {
    105       "baselines_included": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 1 compares AEGIS against three baselines: Perplexity-based Detection (Alon and Kamfonas, 2023), LLaMA 3.1 Guard (Inan et al., 2023), and Human-Crafted Prompt (Chiang et al., 2024)."
    109       },
    110       "baselines_contemporary": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Baselines are from 2023-2024. Human-Crafted Prompt (Chiang et al., 2024) is the direct predecessor on the same dataset. LLaMA 3.1 Guard (2023) is a recent standard safety tool."
    114       },
    115       "ablation_study": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 6 presents three ablations: removing gradient buffer (6.1), removing multiple gradients (6.2), and single-sided training (6.3). Figures 4 and 5 show performance differences."
    119       },
    120       "multiple_metrics": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "TPR and TNR for defense evaluation; ASR and delta_Srel for attacker evaluation."
    124       },
    125       "human_evaluation": {
    126         "applies": false,
    127         "answer": false,
    128         "justification": "The paper evaluates binary classification (attack vs. benign) against pre-labeled ground truth data. Human evaluation of the system's classification outputs is not relevant to the core claims about detection accuracy."
    129       },
    130       "held_out_test_set": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 4.1 clearly separates training data (50 GPT-generated benign articles) from evaluation data (143 malicious + 100 benign student articles from a prior course). The evaluation set was not used during training."
    134       },
    135       "per_category_breakdown": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The 143 malicious articles contain 'a wide variety of successful prompt injections' with different strategies. A per-category breakdown by injection type would be meaningful but is not provided. Only aggregate TPR and TNR are reported, hiding potential variation across injection strategies."
    139       },
    140       "failure_cases_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No error analysis or qualitative examples of failure cases are provided. Appendix A.4 shows defense prompt improvement (successes) but no cases where the defense fails to detect injections."
    144       },
    145       "negative_results_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Ablation studies (Section 6) report components that degrade performance when removed: gradient buffer removal leads to ~5% TPR degradation; removing multiple gradients leads to >10% degradation; single-sided training leads to overfitting."
    149       }
    150     },
    151     "claims_and_evidence": {
    152       "abstract_claims_supported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Abstract claims TPR improvement of 0.20 over state of the art with 0.02 TNR decrease. Table 1 confirms: AEGIS Iteration 8 TPR=0.84 vs. Human-Crafted Prompt TPR=0.64 (difference 0.20), TNR=0.89 vs. 0.91 (difference -0.02)."
    156       },
    157       "causal_claims_justified": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Ablation studies (Section 6) use controlled single-variable manipulation to support causal claims about component contributions. Co-evolution, gradient buffer, and multi-route gradients are each removed individually."
    161       },
    162       "generalization_bounded": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "Title claims a general framework for 'Guarding Prompt Injection' but evaluation is limited to a single task (automated assignment grading at NTU). The schema penalizes broad titles when results are narrow. While the Limitation section acknowledges this, the abstract and title make unbounded general claims."
    166       },
    167       "alternative_explanations_discussed": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No discussion of alternative explanations for the results. No consideration of whether improvements might be due to dataset-specific factors, model-specific properties of gpt-4.1-mini, or other confounds."
    171       }
    172     },
    173     "setup_transparency": {
    174       "model_versions_specified": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper uses 'gpt-4.1-mini', 'GPT-5-mini', 'GPT-4.1-nano', 'Gemini-2.0-flash', 'Gemini-2.5-flash-lite'. Per the schema: 'Marketing names like \"Gemini-2.5\" or \"GPT-4o\" without a snapshot date or API version do NOT count as specified versions.' No snapshot dates or API version identifiers are provided for any model."
    178       },
    179       "prompts_provided": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Appendix A.4 (Table 6) shows actual full defense prompt text at GAN iterations 0, 4, and 8. Appendix A.6 (Table 11) provides the full Human-Crafted Prompt baseline text."
    183       },
    184       "hyperparameters_reported": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "Table 5 (Appendix A.3) lists framework-level hyperparameters (GAN iterations, optimization iterations, weights, power parameters) but does NOT include LLM API settings: temperature, top-p, max tokens, or sampling parameters. The schema states: 'If the paper uses an LLM API without stating temperature/sampling settings, NO.' The paper calls GPT and Gemini APIs extensively without reporting these settings."
    188       },
    189       "scaffolding_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The co-evolutionary scaffolding is described in detail in Section 3 and Appendix A.1-A.2: Algorithm 1 (co-evolution framework), Algorithm 2 (TGO+ workflow), evaluation functions Eval() and Val(), gradient acquisition/application, and gradient buffer mechanism."
    193       },
    194       "data_preprocessing_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 4.1 documents that student articles were 'manually modified to anonymize personal data and for copyright purposes, while preserving their original strategic intent.' Training/validation split (40/10) and evaluation set composition (143 malicious + 100 benign) are clearly stated."
    198       }
    199     },
    200     "limitations_and_scope": {
    201       "limitations_section_present": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Dedicated 'Limitation' section after the Conclusion with substantive discussion of three specific limitations."
    205       },
    206       "threats_to_validity_specific": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Three specific threats: (1) evaluation limited to automated assignment grading task, (2) defense targets only text-based dialogue systems, (3) no large-scale human evaluation conducted. These are specific to this study."
    210       },
    211       "scope_boundaries_stated": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The Limitation section explicitly states what was NOT shown: generalizability to diverse real-world security tasks, effectiveness in multimodal systems, and large-scale human evaluation results."
    215       }
    216     },
    217     "data_integrity": {
    218       "raw_data_available": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The student-submitted articles used for evaluation are not publicly released. No download link or data access mechanism is provided."
    222       },
    223       "data_collection_described": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Section 4.1 describes data sources: 50 GPT-generated benign articles for training/validation; 143 malicious articles and 100 benign articles from student submissions in a prior NTU course (Chiang et al., 2024). Selection criteria stated: malicious articles are ones that 'achieve full scores without being detected by the defense.'"
    227       },
    228       "recruitment_methods_described": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The data consists of archival course submissions, not recruited participants. This is not a human subjects study requiring participant recruitment description."
    232       },
    233       "data_pipeline_documented": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Section 4.1 documents the data pipeline: GPT-generated benign articles for training (40) and validation (10); real-world evaluation uses 143 malicious student articles + 100 benign student articles. Anonymization/modification transformation step is mentioned."
    237       }
    238     },
    239     "conflicts_of_interest": {
    240       "funding_disclosed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No acknowledgments section and no mention of funding sources anywhere in the paper."
    244       },
    245       "affiliations_disclosed": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "All authors are affiliated with Electrical Engineering at National Taiwan University, as stated on the first page with institutional email addresses."
    249       },
    250       "funder_independent_of_outcome": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No funding is disclosed. Appears to be unfunded student research (student email addresses b10901039 etc. suggest undergraduate students). Cannot assess funder independence when no funding is disclosed."
    254       },
    255       "financial_interests_declared": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "No competing interests statement in the paper. Absence of disclosure is not the same as absence of conflict."
    259       }
    260     },
    261     "contamination": {
    262       "training_cutoff_stated": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "The paper tests a defense framework/tool against prompt injection attacks; it does not evaluate a pre-trained model's capability on a benchmark. Per the schema: NA for 'studies that test defenses/tools rather than model knowledge.' The LLMs are used as tools within the defense system."
    266       },
    267       "train_test_overlap_discussed": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "Same reasoning: the paper tests a defense tool, not model knowledge on a benchmark. Whether the model memorized benchmark data is not the primary concern here."
    271       },
    272       "benchmark_contamination_addressed": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "Same reasoning: the paper evaluates a defense framework, not a model's benchmark performance. The schema specifies NA for papers that test defenses/tools rather than model knowledge."
    276       }
    277     },
    278     "human_studies": {
    279       "pre_registered": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in experiments. The student-submitted data is archival course data."
    283       },
    284       "irb_or_ethics_approval": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No prospective human subject enrollment. Uses archival student course data."
    288       },
    289       "demographics_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants involved in the experiments."
    293       },
    294       "inclusion_exclusion_criteria": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "No human participants recruited; data consists of archival course submissions."
    298       },
    299       "randomization_described": {
    300         "applies": false,
    301         "answer": false,
    302         "justification": "No human participants involved; no randomization to experimental conditions needed."
    303       },
    304       "blinding_described": {
    305         "applies": false,
    306         "answer": false,
    307         "justification": "No human participants or human evaluators of system outputs involved."
    308       },
    309       "attrition_reported": {
    310         "applies": false,
    311         "answer": false,
    312         "justification": "No human participants involved in the experiments."
    313       }
    314     },
    315     "cost_and_practicality": {
    316       "inference_cost_reported": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The framework calls GPT-4.1-mini and other LLM APIs repeatedly across 8 GAN iterations with multiple optimization iterations each, but no API costs, token usage, or wall-clock time is reported."
    320       },
    321       "compute_budget_stated": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "No total computational budget, GPU hours, total API spend, or hardware specifications are provided."
    325       }
    326     }
    327   }
    328 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs