scan.json (24932B)
1 { 2 "paper": { 3 "title": "Combining Cost-Constrained Runtime Monitors for AI Safety", 4 "authors": ["Tim Tian Hua", "James Baskerville", "Henri Lemoine", "Mia Hopman", "Aryan Bhatt", "Tyler Tracy"], 5 "year": 2025, 6 "venue": "NeurIPS 2025", 7 "arxiv_id": "2507.15886", 8 "doi": "10.48550/arXiv.2507.15886" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "The paper provides a GitHub link: github.com/tim-hua-01/combining_monitors_public (Section 1, end of introduction). An Alignment Forum practitioner's guide is also linked." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper uses the publicly available BigCodeBench dataset (v.0.1.3, Apache-2.0 license) and references the public code repository which would contain the generated backdoored code and monitor scores." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "Appendix H describes the hardware (AMD EPYC 7413, 754 GB RAM) and mentions scikit-learn for KDE, but no requirements.txt, Dockerfile, or detailed dependency/version listing is provided in the paper." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper mentions a GitHub repository and names the experiment script (neurips_experiments.py in Appendix H), but does not provide step-by-step reproduction instructions in the paper itself. A reader would need to find the repository and figure out the setup." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": true, 37 "justification": "The paper reports quantile-based 90% confidence intervals on test set results (Section 4.1: 'We report means and quantile-based 90% CIs on the test set'). Appendix A shows standard errors on relative performance comparisons." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": true, 42 "justification": "Appendix A (Figure 6) shows that the difference between ΠSR and baselines is 'statistically significant at the 10% level' for budgets between $0.50 and $1.75, using quantile-based bootstrap standard errors." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports effect sizes with baseline context: 'more than double our recall rate compared to a naive baseline' (abstract), and specific recall values at given budgets are shown in figures with both absolute and relative comparisons." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper uses 1,140 aligned outputs and 318 misaligned outputs with 500 random train/test splits, but does not justify why these sample sizes are adequate or discuss statistical power. The paper acknowledges sensitivity to tail distributions (footnote 7) but does not perform a power analysis." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Results are averaged over 500 random train/test splits of misaligned scores (Section 4.1), and 90% quantile-based confidence intervals are reported. Appendix A reports bootstrap standard errors for relative performance." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper compares against Πnaive (from Greenblatt et al., 2024), ΠNP, ΠNPS single-monitor baselines, and a two-monitor baseline ΠBM that always calls both monitors (Section 6)." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The baselines are contemporary: Πnaive from Greenblatt et al. (2024) is the standard AI control monitoring approach, and ΠBM represents a natural two-monitor extension. These are appropriate for this setting." 
70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper systematically builds up from Πnaive → ΠNP (adding likelihood ratios) → ΠNPS (adding stochastic monitoring) → ΠSR (adding two monitors), effectively ablating each contribution. Section 5.2 explicitly examines the trade-off between monitoring and auditing." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper evaluates safety (recall on misaligned outputs) across multiple budget levels and audit cost ratios (Figures 1, 2b, 3a, 3b, 5), and also reports cost/FPR trade-offs. Pareto curves are used to show the full cost-safety trade-off." 80 }, 81 "human_evaluation": { 82 "applies": false, 83 "answer": false, 84 "justification": "This is a mathematical framework paper with automated experiments on code backdoor detection. Human evaluation of the system's outputs is not relevant to the claims about optimal monitor combination." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "Section 4.1 explicitly states: 'We assess policy safety by measuring recall on the test split of misaligned responses.' The 318 misaligned samples are split into 218 train and 100 test across 500 random splits." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Performance is broken down by budget level, audit cost ratio (ca/c1), and policy class. Figures show Pareto curves across multiple budget configurations. Appendix G shows out-of-distribution performance on a different attack set." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 7 (Limitations) discusses when the approach can fail: distribution shift between training and actual attacks, the 'winner's curse' effect. Appendix G shows degraded absolute performance on out-of-distribution backdoors from Wen et al. (2025)." 
100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports that stochastic monitoring gains diminish as audit-to-monitor cost ratio increases (Section 5.2, Figure 3b). Appendix G shows much lower absolute safety on OOD attacks, though relative gains are maintained." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims (1) more than doubling recall compared to naive baseline, (2) combining monitors yields Pareto dominance, and (3) a principled methodology — all are supported by the experimental results in Sections 5 and 6 and corresponding figures." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper makes causal claims through ablations (removing/adding components like likelihood ratios, stochastic monitoring, second monitor). These are controlled single-variable manipulations within a mathematical framework, which is adequate for the causal claims made." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": true, 121 "justification": "Section 4.1 explicitly states: 'our empirical results should be primarily viewed as demonstrating how one could apply our framework, rather than a definitive measure of absolute monitor performance.' Section 7 discusses limitations regarding distribution shift." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 7 discusses that the optimality assumes training distribution matches real attacks, and that distributional shifts could undermine the approach. Appendix G tests an alternative attack distribution. The paper also acknowledges the brute-force search is intractable for many monitors." 
127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper specifies 'instruction-tuned versions of Llama 3.1 70B and 8B' (Section 4) as monitors, and 'o1-mini' for generating code. However, o1-mini lacks a specific version/snapshot date. Llama 3.1 is a specific release (Grattafiori et al., 2024). On balance, the monitor models are specified but the code-generating model (o1-mini) lacks a version date." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "Appendix D provides the complete system prompt and monitor prompt template with placeholders for question and code. The fill values (BigCodeBench questions and o1-mini solutions) are from the public dataset and reproducible." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 4 reports: 5 samples per monitor with averaged scores, Gaussian noise addition, Gaussian kernel KDE with Silverman's rule, 218/100 train/test split, 500 random splits, 200 quantiles for grid search. Monitor costs (1.0 and 0.21) and audit cost (20x) are specified." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "No agentic scaffolding is used. The monitors are simple prompted LLMs that output a suspicion score. The framework is a mathematical optimization over monitor outputs, not an agentic system." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 4 documents: o1-mini generates solutions to all 1,140 BigCodeBench questions; backdoored solutions are verified to pass test cases but fail on specific inputs (yielding 318 misaligned samples); monitor scores are averaged over 5 samples with minimal Gaussian noise added; KDE with Gaussian kernel and Silverman's rule is used." 
154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 7 begins with a 'Limitations' subsection that discusses distribution shift, overfitting to training attacks, and scalability limitations of the brute-force search approach." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "The limitations are specific to this study: (1) the optimality assumes the real attack distribution matches the training distribution, (2) ΠNP could underperform if the true attack distribution is easier, (3) brute-force search is intractable for more than 4 monitors. Appendix G empirically tests OOD robustness." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 4.1 explicitly bounds scope: 'our empirical results should be primarily viewed as demonstrating how one could apply our framework, rather than a definitive measure of absolute monitor performance.' Section 7 notes the approach only works for few monitors and that distribution shift is a real concern." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "BigCodeBench is publicly available (Apache-2.0 license). The code repository (github.com/tim-hua-01/combining_monitors_public) presumably includes the generated monitor scores and backdoored code, enabling independent verification." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 4 describes: o1-mini prompted to solve all 1,140 BigCodeBench coding questions for aligned data; o1-mini separately prompted to create backdoored solutions verified against criteria, yielding 318 misaligned responses; monitors called 5 times per sample with scores averaged." 
183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants. The data consists of AI-generated code and monitor scores on the BigCodeBench benchmark." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The pipeline is documented: BigCodeBench questions → o1-mini generates aligned solutions (1,140) and backdoored solutions (verified, 318) → monitors score each solution 5 times → scores averaged with Gaussian noise → KDE for likelihood ratios → policy optimization. Each stage is described with counts." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "The Acknowledgments section states: 'This work was supported through the MARS (Mentorship for Alignment Research Students) program at the Cambridge AI Safety Hub (caish.org/mars).' Author affiliations include MARS and Redwood Research." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly listed: MARS for Hua, Baskerville, Lemoine, Hopman; Redwood Research for Bhatt; MARS & Redwood Research for Tracy. These are AI safety research organizations." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "MARS/Cambridge AI Safety Hub is an academic mentorship program. Redwood Research is an AI safety organization. Neither has a commercial product being evaluated. The monitors used are open-source Llama models. The funder has no financial stake in a particular outcome." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper. While the affiliations are disclosed, there is no explicit declaration of competing interests or absence thereof." 
215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper uses Llama 3.1 70B and 8B as monitors evaluating code for backdoors. The training cutoff of these models is not stated. While contamination is less critical here (the monitors are evaluating code patterns, not benchmark knowledge), the cutoff is still relevant since BigCodeBench problems could appear in training data." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "No discussion of whether BigCodeBench problems or similar code patterns appeared in Llama 3.1's training data. The monitors' ability to detect backdoors could be influenced by exposure to similar code during training." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "BigCodeBench was published (2025) potentially after Llama 3.1's training cutoff, but this is not discussed. The paper does not address whether the benchmark or similar problems could have been in the monitors' training data." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants. The study is purely computational, using AI-generated code and automated monitors." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants." 
259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "The paper reports relative costs: Llama 3.1 70B costs 1 unit per sample, Llama 3.1 8B costs 0.21 units (Section 4). Audit cost is 20x the 70B monitor cost. The entire framework is built around cost optimization, with explicit cost budgets in all experiments." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": true, 280 "justification": "Appendix H provides detailed compute specifications: AMD EPYC 7413, total runtime of 500.2 seconds (~8.3 minutes) for all experiments, 87-90 CPU workers in parallel. Per-experiment task rates are also provided." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "Likelihood-ratio-based policies (ΠNP) outperform naive threshold-based policies (Πnaive) due to the 'winner's curse' effect in monitor scores.", 287 "evidence": "Figure 2b shows ΠNP achieves higher safety (recall) than Πnaive at low audit FPR. Figure 2a shows the log likelihood ratio dips between scores 7.2-8.5 for Llama 3.1 70B (Section 5.1).", 288 "supported": "strong" 289 }, 290 { 291 "claim": "Stochastic monitor calling (ΠNPS) more than doubles recall compared to deterministic monitoring when audit cost is 20x monitor cost and budget is $1.20.", 292 "evidence": "Section 5.2 states the optimal ΠNPS policy calls the monitor 71% of the time, audits top 3.4% by LLR, achieving 2.4% FPR vs. 1% for deterministic case, more than doubling safety.",
293 "supported": "strong" 294 }, 295 { 296 "claim": "Combining two monitors (Llama 3.1 70B and 8B) via ΠSR yields Pareto improvements over any single-monitor policy.", 297 "evidence": "Figure 5 shows ΠSR Pareto dominates ΠNPS (70B only), ΠNPS (8B only), and ΠBM (both always-on) at budgets between $0.50 and $1.75. Figure 6 in Appendix A confirms statistical significance at 10% level.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "The framework is robust to out-of-distribution attacks: ΠSR fitted on training data still outperforms baselines on OOD backdoors.", 302 "evidence": "Appendix G tests on 300 backdoors from Wen et al. (2025). Figure 10 shows ΠSR outperforms baselines across most budgets, though absolute safety is much lower.", 303 "supported": "moderate" 304 } 305 ], 306 "methodology_tags": ["benchmark-eval", "theoretical"], 307 "key_findings": "The paper develops a mathematical framework for optimally combining multiple runtime safety monitors under a unified cost budget, using the Neyman-Pearson lemma. In a backdoor detection experiment on BigCodeBench using Llama 3.1 70B and 8B as monitors, likelihood-ratio-based policies outperform naive threshold policies, stochastic monitor calling can more than double recall, and combining two monitors yields Pareto improvements over single-monitor approaches. The framework degrades gracefully under distribution shift, maintaining relative advantages over baselines on out-of-distribution attacks.", 308 "red_flags": [ 309 { 310 "flag": "Single empirical setting", 311 "detail": "All experiments use the same BigCodeBench backdoor detection task with the same two Llama monitors. While the theoretical framework is general, empirical validation is limited to one narrow setting, making it unclear how well results transfer to other monitoring scenarios (e.g., jailbreak detection, content moderation)." 
312 }, 313 { 314 "flag": "Small misaligned sample size", 315 "detail": "Only 318 misaligned (backdoored) samples are available, split into 218 train and 100 test. The paper acknowledges sensitivity to tail distributions (footnote 7) but does not perform a power analysis. With 100 test samples, recall estimates have relatively high variance." 316 }, 317 { 318 "flag": "o1-mini version unspecified", 319 "detail": "The paper uses o1-mini to generate both aligned and backdoored code but does not specify which version/snapshot of o1-mini was used, affecting reproducibility." 320 } 321 ], 322 "cited_papers": [ 323 { 324 "title": "AI control: Improving safety despite intentional subversion", 325 "authors": ["Ryan Greenblatt", "Buck Shlegeris", "Kshitij Sachan", "Fabien Roger"], 326 "year": 2024, 327 "relevance": "Foundational AI control paper that proposes the monitoring framework this paper builds upon, directly relevant to AI safety evaluation methodology." 328 }, 329 { 330 "title": "Sabotage evaluations for frontier models", 331 "authors": ["Joe Benton", "Misha Wagner", "Eric Christiansen"], 332 "year": 2024, 333 "arxiv_id": "2410.21514", 334 "relevance": "Evaluates sabotage capabilities of frontier AI models, directly relevant to the threat model this paper's monitoring framework addresses." 335 }, 336 { 337 "title": "Constitutional classifiers: Defending against universal jailbreaks across thousands of hours of red teaming", 338 "authors": ["Mrinank Sharma", "Meg Tong", "Jesse Mu"], 339 "year": 2025, 340 "arxiv_id": "2501.18837", 341 "relevance": "Presents safety classifiers for defending against jailbreaks, a key application area for runtime monitoring combinations." 
342 }, 343 { 344 "title": "Cost-effective constitutional classifiers via representation re-use", 345 "authors": ["Hoagy Cunningham", "Alwin Peng", "Jerry Wei"], 346 "year": 2025, 347 "relevance": "Proposes multi-stage classifiers using cheap model-internals monitors followed by larger monitors, directly related to the multi-monitor combination problem." 348 }, 349 { 350 "title": "Monitoring reasoning models for misbehavior and the risks of promoting obfuscation", 351 "authors": ["Bowen Baker", "Joost Huizinga", "Leo Gao"], 352 "year": 2025, 353 "arxiv_id": "2503.11926", 354 "relevance": "Studies chain-of-thought monitoring for detecting misbehavior in reasoning models, relevant to runtime safety monitoring." 355 }, 356 { 357 "title": "Reasoning models don't always say what they think", 358 "authors": ["Yanda Chen", "Joe Benton", "Ansh Radhakrishnan"], 359 "year": 2025, 360 "arxiv_id": "2505.05410", 361 "relevance": "Demonstrates that reasoning models' chain-of-thought can be unfaithful, motivating the need for multiple monitoring approaches." 362 }, 363 { 364 "title": "Adaptive deployment of untrusted LLMs reduces distributed threats", 365 "authors": ["Jiaxin Wen", "Vivek Hebbar", "Caleb Larson"], 366 "year": 2025, 367 "relevance": "Studies adaptive deployment strategies for untrusted LLMs with safety considerations, relevant to the AI control framework." 368 }, 369 { 370 "title": "Control tax: The price of keeping AI in check", 371 "authors": ["Mikhail Terekhov", "Zhen Ning David Liu", "Caglar Gulcehre"], 372 "year": 2025, 373 "relevance": "Explores the cost of using different language models as monitors in AI control settings, directly relevant to cost-constrained monitoring." 374 }, 375 { 376 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 377 "authors": ["Terry Yue Zhuo"], 378 "year": 2025, 379 "relevance": "The benchmark used in this paper's experiments for code generation evaluation." 
380 }, 381 { 382 "title": "A sketch of an AI control safety case", 383 "authors": ["Tomek Korbak", "Joshua Clymer", "Benjamin Hilton", "Buck Shlegeris", "Geoffrey Irving"], 384 "year": 2025, 385 "arxiv_id": "2501.17315", 386 "relevance": "Proposes structure for AI control safety cases where monitoring is a cornerstone component." 387 }, 388 { 389 "title": "How to evaluate control measures for LLM agents? A trajectory from today to superintelligence", 390 "authors": ["Tomek Korbak", "Mikita Balesni", "Buck Shlegeris", "Geoffrey Irving"], 391 "year": 2025, 392 "arxiv_id": "2504.05259", 393 "relevance": "Discusses evaluation methodology for AI control measures, relevant to the broader safety evaluation landscape." 394 }, 395 { 396 "title": "Detecting strategic deception using linear probes", 397 "authors": ["Nicholas Goldowsky-Dill", "Bilal Chughtai", "Stefan Heimersheim", "Marius Hobbhahn"], 398 "year": 2025, 399 "arxiv_id": "2502.03407", 400 "relevance": "Proposes activation probing for detecting deception, one type of monitor that could be combined using this paper's framework." 401 } 402 ] 403 }