calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (22579B)
      1 {
      2   "paper_slug": "aegisagent-autonomous-defense-2025",
      3   "calibration_model": "opus",
      4   "scan_model": "sonnet",
      5   "timestamp": "2026-02-28",
      6   "total_questions": 50,
      7   "agreement_count": 46,
      8   "disagreement_count": 4,
      9   "agreement_rate": 0.92,
     10   "disagreements": [
     11     {
     12       "category": "statistical_methodology",
     13       "question": "effect_sizes_reported",
     14       "sonnet": {"applies": true, "answer": false},
     15       "opus": {"applies": true, "answer": true},
     16       "direction": "opus_generous",
     17       "explanation": "The paper reports percentage improvements with baseline context throughout — e.g., ASR drops from 62.2% to 33.4% (Table 1), and DA drops from ~85% for the full system to 7-8% when the Robust Reasoner is removed (Section 5.3). Per the schema, 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper consistently provides before/after numbers with sufficient context to understand the magnitude of effects."
     18     },
     19     {
     20       "category": "evaluation_design",
     21       "question": "human_evaluation",
     22       "sonnet": {"applies": false, "answer": false},
     23       "opus": {"applies": true, "answer": false},
     24       "direction": "applies_boundary",
     25       "explanation": "The schema says applies=false only when 'Human evaluation is clearly irrelevant to the claims.' This paper claims AegisAgent produces 'secure outputs' and preserves 'semantic fidelity' — human evaluation of whether the corrected prompts maintain semantic correctness would be relevant. The paper uses automated metrics only (DA, ASR, SC, etc.) but human evaluation could meaningfully assess the quality of the 'repaired' outputs. Since human evaluation is relevant but not performed, applies=true, answer=false is more appropriate."
     26     },
     27     {
     28       "category": "evaluation_design",
     29       "question": "negative_results_reported",
     30       "sonnet": {"applies": true, "answer": true},
     31       "opus": {"applies": true, "answer": false},
     32       "direction": "sonnet_generous",
     33       "explanation": "Sonnet credits the ablation study as 'negative results.' However, ablation results showing that removing a component hurts performance are not negative results — they are expected positive validation of each component's contribution. The schema asks for 'ablations that hurt performance, approaches that were tried and abandoned, configurations that failed' — read here as variants of the authors' own approach that turned out worse than expected, not the routine finding that removing a component degrades the full system. The paper shows that every experiment confirms AegisAgent's superiority. There are no reports of approaches that didn't work, configurations that failed, or unexpected findings. The case study showing prior defenses failing is about baselines, not AegisAgent's own approaches."
     34     },
     35     {
     36       "category": "setup_transparency",
     37       "question": "prompts_provided",
     38       "sonnet": {"applies": true, "answer": true},
     39       "opus": {"applies": true, "answer": false},
     40       "direction": "sonnet_generous",
     41       "explanation": "Appendix B provides prompt templates for the five target LLM-HAR models (LLaSA, IMUGPT-2.0, HARGPT, ContextGPT, MotionGPT) showing signal-to-prompt pipelines. However, these are the TARGET SYSTEM prompts, not AegisAgent's own prompts. The paper does not provide the actual prompts used by AegisAgent's Planning Agent, Executor Agent, or Robust Reasoner when they invoke Gemma-2-9B. The schema requires 'the prompts or system instructions used in experiments' — a reader cannot reconstruct the actual prompts sent to the defense system's LLM backbone."
     42     }
     43   ],
     44   "opus_checklist": {
     45     "artifacts": {
     46       "code_released": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The Ethical Considerations section states: 'Upon the publication of this work, we will release our source code.' This is a promise of future release, not an actual release. No repository URL or archive is provided. Per schema, a promise of future release counts as NO."
     50       },
     51       "data_released": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The evaluation uses three standard public benchmarks: USC-HAD, UCI HAR, and PAMAP2, all publicly available. The paper references the original publications for each dataset. No proprietary data was collected."
     55       },
     56       "environment_specified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper mentions 'a workstation equipped with three NVIDIA RTX 3090 GPUs (24GB memory each)' and uses Gemma-2-9B, but provides no requirements.txt, Dockerfile, conda environment, or detailed library version listings."
     60       },
     61       "reproduction_instructions": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No README, reproduction scripts, or step-by-step instructions are provided. Code is promised for future release. The paper describes the system design but lacks operational detail for independent reproduction."
     65       }
     66     },
     67     "statistical_methodology": {
     68       "confidence_intervals_or_error_bars": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "All reported metrics (DA, SC, ASR, RR, HS) in Tables 1-3 and Figures 6-7 are point estimates only. No confidence intervals, error bars, or standard deviations are reported despite Section 5.1 stating experiments are repeated five times."
     72       },
     73       "significance_tests": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The paper claims AegisAgent 'significantly' outperforms baselines (Section 5.5) and delivers 'significant defensive improvements' (Section 5.2), but no statistical significance tests (p-values, t-tests, etc.) are performed. Comparisons are based solely on comparing point estimates."
     77       },
     78       "effect_sizes_reported": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper consistently reports before/after comparisons with baseline context: ASR drops from 62.2% to 33.4% (Table 1), DA drops from ~85% to 7-8% without the Robust Reasoner (Section 5.3), AegisAgent achieves 93.0% DA vs. 64.9% for the best baseline (Section 5.5). Per the schema example ('12% improvement from 45% to 57%' = YES), these provide sufficient context for the magnitude of effects."
     82       },
     83       "sample_size_justified": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The paper does not justify the number of test instances per attack, per model, or per dataset. No power analysis is discussed. The number of evaluation examples from each dataset is not stated."
     87       },
     88       "variance_reported": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "Section 5.1 states 'each experiment is repeated five times, with average results reported' but no standard deviation, IQR, or any other spread measure is reported across the 5 runs. Only averages are given in all tables and figures."
     92       }
     93     },
     94     "evaluation_design": {
     95       "baselines_included": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5.5 and Figure 7 compare AegisAgent against three baseline categories: text-only defenses (SafeDecoding), classical HAR defenses (FGSM/PGD adversarial training), and multimodal detection-only defenses (cross-modal consistency methods)."
     99       },
    100       "baselines_contemporary": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "SafeDecoding (2024) is recent and relevant. FGSM/PGD are standard adversarial training methods appropriate for this domain. The multimodal detection methods represent the cross-modal consistency checking approach. Given the novelty of the LLM-HAR defense problem, these baselines are reasonable."
    104       },
    105       "ablation_study": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 5.3 (RQ2) presents a systematic ablation study removing each of the three components (Input Sanitizer, Consistency Verifier, Robust Reasoner) individually. Results across all five target models are shown in Figure 6."
    109       },
    110       "multiple_metrics": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports five distinct metrics: Detection Accuracy (DA), Attack Success Rate (ASR), Recovery Rate (RR), Semantic Consistency (SC), and Harm Score (HS). These cover different aspects of defense effectiveness."
    114       },
    115       "human_evaluation": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "No human evaluation is performed. The paper claims AegisAgent preserves 'semantic fidelity' and produces 'secure outputs,' but evaluation is entirely automated using metrics like DA, ASR, SC. Human evaluation of the repaired output quality would be relevant to validate claims about semantic correctness and practical safety."
    119       },
    120       "held_out_test_set": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper does not explicitly describe train/test splits for evaluation. AegisAgent is training-free, but it is unclear whether the threshold parameters (tau_san, tau_MAD, etc.) were tuned on a separate validation set or on the same data used for evaluation. No explicit separation of dev and test is described."
    124       },
    125       "per_category_breakdown": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Table 2 breaks down Harm Score by individual attack type and category (Signal, Text, Prompt, Hybrid). Table 1 shows per-model and per-LLM-classifier results. Figure 6 shows per-model ablation results. Detailed per-category breakdowns are provided."
    129       },
    130       "failure_cases_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper does not discuss specific failure cases where AegisAgent fails. No error analysis, qualitative examples of failures, or discussion of attack types the system cannot handle is provided. The presentation focuses exclusively on AegisAgent's successes."
    134       },
    135       "negative_results_reported": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No negative results are reported. The ablation study (Section 5.3) shows component removal hurts performance, which is expected positive validation, not a negative result. No approaches that were tried and abandoned, no configurations that failed, no unexpected findings are reported. Every experiment confirms AegisAgent's effectiveness."
    139       }
    140     },
    141     "claims_and_evidence": {
    142       "abstract_claims_supported": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The abstract claims '85% detection accuracy on average' (supported by Table 1 averaging to ~85% DA), 'reduces attack success rate by 30%' (supported by ASR comparisons in Table 1), and '78.6 ms of latency overhead' (supported by Table 3). All abstract claims are supported by the results."
    146       },
    147       "causal_claims_justified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The ablation study in Section 5.3 uses controlled single-variable removal (one component at a time) to justify causal claims about each component's contribution. This controlled ablation design is adequate for the causal claims made about component importance."
    151       },
    152       "generalization_bounded": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The paper title claims defense for 'LLM-HARs' generally but only tests five specific architectures with Gemma-2-9B as the internal model. Section 5.5 states AegisAgent 'delivers the most consistent and comprehensive defense' and the conclusion claims 'robustness against a wide range of adversarial perturbations.' These generalizations extend beyond the tested setting of 5 systems, 3 datasets, and 1 backbone LLM."
    156       },
    157       "alternative_explanations_discussed": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No alternative explanations for AegisAgent's performance gains are discussed. The paper does not consider whether improvements could be partly due to the added computational budget (Gemma-2-9B reasoning overhead), dataset-specific properties, or the specific characteristics of the attack implementations. No threats to validity or confounds are addressed."
    161       }
    162     },
    163     "setup_transparency": {
    164       "model_versions_specified": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The paper uses ChatGPT-4o, Gemini, LLaMA-2, DeepSeek-V3, and Gemma-2-9B. None of these have specific API version identifiers or snapshot dates. 'GPT-4o' and 'Gemini' without version qualifiers are insufficient per schema. 'LLaMA-2' lacks specific size/variant. 'DeepSeek-V3' lacks checkpoint date. Only 'Gemma-2-9B' specifies the size but not the exact checkpoint."
    168       },
    169       "prompts_provided": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "Appendix B provides prompt templates for the five target LLM-HAR models (how they convert signals to prompts), but these are the TARGET SYSTEM prompts, not AegisAgent's own prompts. The actual prompts used by AegisAgent's Planning Agent, Executor Agent, and Robust Reasoner when invoking Gemma-2-9B are not provided. A reader cannot reconstruct the defense system's LLM queries."
    173       },
    174       "hyperparameters_reported": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Key hyperparameters are specified in Section 4.2: similarity threshold tau_san=0.75, MAD threshold tau_MAD=2.5, graylist coefficient alpha=0.4, semantic mismatch threshold tau_c=0.35, semantic conflict threshold tau_sem=0.6, temporal threshold tau_temp=0.6, executor threshold tau_exec=0.85."
    178       },
    179       "scaffolding_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 4 provides detailed description of AegisAgent's agentic scaffolding: Input Sanitizer, Consistency Verifier, Robust Reasoner, Memory Hub, Planning Agent, and Executor Agent. The workflow is formalized mathematically, Figure 5 provides a system diagram, and inter-component communication is described in detail."
    183       },
    184       "data_preprocessing_documented": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not document how raw IMU data from USC-HAD, UCI HAR, and PAMAP2 is preprocessed before being fed to the LLM-HAR models. Appendix B shows prompt templates but the numerical preprocessing steps (windowing, normalization, feature extraction from raw sensors) are not described."
    188       }
    189     },
    190     "limitations_and_scope": {
    191       "limitations_section_present": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion (Section 7) is brief (5 sentences) and does not discuss any limitations. The Ethical Considerations section addresses responsible disclosure but not methodological limitations."
    195       },
    196       "threats_to_validity_specific": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No threats to validity are discussed anywhere in the paper. There is no discussion of threats specific to this study such as: limited number of attack types, only white-box attacks tested, restricted model coverage, evaluation only using simulated attacks, or potential overfitting of thresholds to these specific benchmarks."
    200       },
    201       "scope_boundaries_stated": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The paper does not explicitly state what the results do NOT show. The conclusion claims 'robustness against a wide range of adversarial perturbations' without bounding this to the tested setting. No explicit statements about untested scenarios, model types, or attack categories that fall outside the evaluation scope."
    205       }
    206     },
    207     "data_integrity": {
    208       "raw_data_available": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "Only aggregated performance metrics are presented in tables and figures. Raw prediction results, per-example detection outcomes, specific adversarial prompts used in testing, and detailed experiment logs are not released or made available."
    212       },
    213       "data_collection_described": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The paper uses standard public datasets (USC-HAD, UCI HAR, PAMAP2) that are well-documented and cites the original publications for each. The 15 attack types are formally defined in Section 3.2 and cataloged in Appendix A (Table 4) with formal operators and descriptions."
    217       },
    218       "recruitment_methods_described": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "No human participants were recruited. The evaluation is entirely automated using existing LLM-HAR systems and standard public datasets. Per the schema's NA rule, this question is not applicable when there are 'no human participants and data source is a standard benchmark.'"
    222       },
    223       "data_pipeline_documented": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "While the high-level pipeline is described (IMU -> text -> prompt -> LLM), intermediate steps are insufficiently documented. The number of evaluation instances per dataset/attack/model combination, the way attack instances were generated in practice, and the implementation of the 5-run averaging are not described in enough detail for independent verification."
    227       }
    228     },
    229     "conflicts_of_interest": {
    230       "funding_disclosed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No acknowledgments section or funding disclosure is present in the paper. There is no mention of grants, institutional support, or corporate funding. The authors are from City University of Hong Kong and Deakin University but no funding is disclosed."
    234       },
    235       "affiliations_disclosed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Author affiliations are listed in the header: City University of Hong Kong (authors 1, 2, 4) and Deakin University (author 3). The authors are not affiliated with any of the commercial LLM providers whose models are evaluated (OpenAI, Google, Meta, DeepSeek)."
    239       },
    240       "funder_independent_of_outcome": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No funding source is disclosed, making this question inapplicable. There is no identified funder whose independence can be assessed. The schema states NA if unfunded."
    244       },
    245       "financial_interests_declared": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "There is no competing interests statement or financial interests declaration in the paper. Per schema, 'If there is no competing interests statement at all, NO — absence of disclosure is not the same as absence of conflict.'"
    249       }
    250     },
    251     "contamination": {
    252       "training_cutoff_stated": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "This paper evaluates a defense system against prompt injection attacks, not a pre-trained model's benchmark performance. AegisAgent is training-free. The evaluation tests defense capabilities, not model knowledge recall. Per schema, NA applies when the paper 'tests defenses/tools rather than model knowledge.'"
    256       },
    257       "train_test_overlap_discussed": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "The evaluation tests defense capabilities against prompt injection attacks, not benchmark accuracy that could be contaminated by training data. The HAR datasets are sensor data, not language benchmarks. Contamination is not a relevant concern for this evaluation design."
    261       },
    262       "benchmark_contamination_addressed": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "The evaluation uses IMU sensor datasets (USC-HAD, UCI HAR, PAMAP2) for activity recognition, not language benchmarks that LLMs might have memorized. The paper tests defense mechanisms, not model knowledge. Contamination is structurally inapplicable."
    266       }
    267     },
    268     "human_studies": {
    269       "pre_registered": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved. The evaluation is entirely automated using existing LLM-HAR systems and public datasets."
    273       },
    274       "irb_or_ethics_approval": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved. The Ethical Considerations section addresses responsible disclosure of attack vectors, not human subjects ethics."
    278       },
    279       "demographics_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants. Demographics are not applicable to this automated benchmark evaluation."
    283       },
    284       "inclusion_exclusion_criteria": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants. Inclusion/exclusion criteria for participants are not applicable."
    288       },
    289       "randomization_described": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants. No randomization of experimental conditions over participants was needed."
    293       },
    294       "blinding_described": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "No human participants. Evaluation is entirely automated with ground truth labels from the datasets."
    298       },
    299       "attrition_reported": {
    300         "applies": false,
    301         "answer": false,
    302         "justification": "No human participants. Attrition is not applicable."
    303       }
    304     },
    305     "cost_and_practicality": {
    306       "inference_cost_reported": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Section 5.6 (RQ5) reports detailed per-query latency: total 78.6 ms including per-module breakdowns (Input Sanitization: 6.3 ms, Consistency Verification: 31.1 ms, Memory Hub Retrieval: 9.6 ms, Planning Agent: 11.2 ms, Executor Agent: 12.0 ms, Robust Reasoner: 8.4 ms). Hardware specified as three NVIDIA RTX 3090 GPUs."
    310       },
    311       "compute_budget_stated": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "While per-query latency is reported, the total computational budget for the full evaluation campaign (total GPU hours, API costs for GPT-4o and Gemini calls, total number of queries across 15 attacks x 5 models x 3 datasets x 5 runs) is not stated."
    315       }
    316     }
    317   }
    318 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs