scan.json (28305B)
1 { 2 "paper": { 3 "title": "System Card: Claude Sonnet 4.5", 4 "authors": ["Anthropic"], 5 "year": 2025, 6 "venue": "Anthropic Technical Report" 7 }, 8 "checklist": { 9 "artifacts": { 10 "code_released": { 11 "applies": true, 12 "answer": false, 13 "justification": "No source code or repository is released. Some evaluation frameworks are referenced (e.g., CyberGym, Cybench) but Anthropic's own evaluation code, scaffolding, and tools are not made available." 14 }, 15 "data_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No evaluation datasets, prompts, or raw results are released. Internal benchmarks (100Q-Hard, honeypot scenarios, reward hacking evals) are described but not made available." 19 }, 20 "environment_specified": { 21 "applies": true, 22 "answer": false, 23 "justification": "The cyber evaluation environment is described at a high level (Kali-based with pwntools, Metasploit, Ghidra) but no reproducible environment specification (Dockerfile, requirements.txt) is provided." 24 }, 25 "reproduction_instructions": { 26 "applies": true, 27 "answer": false, 28 "justification": "No step-by-step reproduction instructions are provided. Most evaluations use internal tools and proprietary setups that cannot be replicated externally." 29 } 30 }, 31 "statistical_methodology": { 32 "confidence_intervals_or_error_bars": { 33 "applies": true, 34 "answer": true, 35 "justification": "Several results include confidence intervals or error margins, e.g., Table 2.1.1.A reports '99.29% (± 0.22%)' for violative request evaluations. Figures for biological evaluations show IQR boxes. However, many results lack uncertainty quantification." 36 }, 37 "significance_tests": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper uses language like 'statistically significant improvements' (Section 2.1.1) but does not report the actual statistical tests, p-values, or test procedures used to establish significance." 
41 }, 42 "effect_sizes_reported": { 43 "applies": true, 44 "answer": true, 45 "justification": "Effect sizes are reported with baseline context throughout, e.g., '99.29% vs. 98.22%' for harmless response rates, '2× improvement' in reward hacking, '60% improvement on primary misalignment metric over Claude Sonnet 4', kernel optimization '108× vs 66× speedup'." 46 }, 47 "sample_size_justified": { 48 "applies": true, 49 "answer": false, 50 "justification": "Sample sizes are stated for some evaluations (e.g., 150 malicious coding requests, 100 honeypot prompts, 52,000 prompts for agentic misalignment) but never justified. No power analysis or discussion of whether sample sizes are adequate for the claims made." 51 }, 52 "variance_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Variance is reported for several evaluations: ± margins in Tables 2.1.1.A and 2.1.2.A, IQR in biological evaluation box plots (Figure 9.2.4.6.A), and ± standard deviations for creative biology scores (0.48 ± 0.08). Multiple trials are run for many evaluations (e.g., 5 trials for Incalmo, 10 runs for SWE-bench)." 56 } 57 }, 58 "evaluation_design": { 59 "baselines_included": { 60 "applies": true, 61 "answer": true, 62 "justification": "Extensive baselines are included throughout: Claude Sonnet 4, Claude Opus 4, Claude Opus 4.1, and Claude Sonnet 3.7 are compared across nearly all evaluations. Some external models are also referenced (GPT-4.1, Gemini 2.5 Pro in Section 7.5.4.1)." 63 }, 64 "baselines_contemporary": { 65 "applies": true, 66 "answer": true, 67 "justification": "Baselines are contemporary Anthropic models (Claude Opus 4.1, Claude Sonnet 4) released within the same year. The Gray Swan ART benchmark (Section 4.2.1) includes 23 models from multiple developers." 
68 }, 69 "ablation_study": { 70 "applies": true, 71 "answer": true, 72 "justification": "Multiple ablation-style analyses are conducted: with/without extended thinking mode, with/without safety mitigations (Tables 4.1.A vs 4.1.B), with/without anti-hack prompts (Table 6.1.A), and the white-box interpretability experiments that inhibit specific internal representations (Section 7.6.4)." 73 }, 74 "multiple_metrics": { 75 "applies": true, 76 "answer": true, 77 "justification": "Many different metrics are used across evaluations: harmless response rates, over-refusal rates, attack prevention scores, reward hacking rates, misalignment rates, sycophancy rates, bias scores, accuracy scores, pass@k, speedup ratios, etc." 78 }, 79 "human_evaluation": { 80 "applies": true, 81 "answer": true, 82 "justification": "Human evaluation is included in multiple forms: crowdworker head-to-head comparisons for honesty (Section 3.1), internal policy expert review of ambiguous context evaluations (Section 2.2), child safety team manual verification (Section 2.4), and the internal researcher survey for AI R&D capabilities (Section 9.3.5)." 83 }, 84 "held_out_test_set": { 85 "applies": true, 86 "answer": true, 87 "justification": "The reward hacking evaluation v2 includes held-out tests that are fuzzed versions of visible tests (Section 6.1, 'Hidden test hack rate'). SWE-bench uses pre-defined unit test suites for scoring." 88 }, 89 "per_category_breakdown": { 90 "applies": true, 91 "answer": true, 92 "justification": "Extensive per-category breakdowns are provided: cyber evaluations broken down by challenge type and difficulty (Figures 5.2.2.A, 5.3.1.A, 5.3.1.B), alignment metrics broken down by individual criteria (Figures 7.1.3.1, 7.5.4.2.A), biological evaluations by subtask, BBQ by disambiguated vs ambiguous." 
93 }, 94 "failure_cases_discussed": { 95 "applies": true, 96 "answer": true, 97 "justification": "Failure cases are discussed extensively: reward hacking failure modes (Section 6), evaluation awareness complications (Section 7.2), inconsistent multi-turn behavior (Section 2.3), covert malicious code recognition regression without mitigations (Table 4.1.A), and near-zero performance on Expert-level CTF tasks (Section 5.3.1)." 98 }, 99 "negative_results_reported": { 100 "applies": true, 101 "answer": true, 102 "justification": "Several negative results are reported: Claude Sonnet 4.5 performed worse than Claude Sonnet 4 on covert malicious code recognition without mitigations (77% vs 52.42%, Table 4.1.A), worse disambiguated accuracy on BBQ (82.2% vs 86.3%), lower positive affect in model welfare assessments, and the concerning finding that evaluation awareness partially accounts for alignment improvements." 103 } 104 }, 105 "claims_and_evidence": { 106 "abstract_claims_supported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The abstract claims 'substantially improved safety profile compared to previous Claude models' and this is supported by the evaluation results throughout the paper showing improvements across safeguards, alignment, reward hacking, and other areas." 110 }, 111 "causal_claims_justified": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper makes causal claims cautiously and supports them with appropriate methods. The white-box interpretability experiments (Section 7.6.4) use controlled perturbation experiments with random feature controls to establish that inhibiting evaluation awareness causally increases misaligned behavior. The paper appropriately hedges on mechanisms." 
115 }, 116 "generalization_bounded": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper explicitly bounds its claims: 'these evaluations are explicitly designed to stress-test hacking propensities and therefore do not reflect real-world hacking rates' (Section 6.1), acknowledges evaluation awareness may cause systematic underestimation of deployment risk, notes evaluations are English-only (Section 2.1), and Section 7.7 lists explicit open questions about generalization." 120 }, 121 "alternative_explanations_discussed": { 122 "applies": true, 123 "answer": true, 124 "justification": "Alternative explanations are discussed substantively: the 'uninteresting explanation' that any perturbation increases misalignment is tested with random feature controls (Section 7.6.4.1), the possibility that vectors bundle eval awareness with alignment representations is considered (Section 7.6.4.1), and evaluation awareness vs genuine alignment improvement is carefully disentangled through multiple experiments." 125 } 126 }, 127 "setup_transparency": { 128 "model_versions_specified": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper refers to models by marketing names only: 'Claude Sonnet 4.5', 'Claude Opus 4.1', 'Claude Sonnet 4', etc. No API version identifiers, snapshot dates, or model IDs are provided. The schema requires specific versions beyond marketing names." 132 }, 133 "prompts_provided": { 134 "applies": true, 135 "answer": false, 136 "justification": "Some example seed instructions for the automated auditor are provided (Section 7.1.2), but the actual system prompts, evaluation prompts, grading rubrics, and prompt templates used across evaluations are not provided. The anti-hack prompt in Section 6.1 is a notable exception." 137 }, 138 "hyperparameters_reported": { 139 "applies": true, 140 "answer": false, 141 "justification": "Hyperparameters are not systematically reported. 
Temperature is mentioned once ('temperature 0 sampling' in Figure 7.6.4.1.B caption). Token limits are mentioned for cyber evals (200k context window). But most evaluations do not report temperature, top-p, max tokens, or other generation parameters." 142 }, 143 "scaffolding_described": { 144 "applies": true, 145 "answer": true, 146 "justification": "Scaffolding is described for several evaluations: the cyber evaluation setup (Section 5.1) describes the Kali environment with tools, the automated behavioral auditor architecture is described (Section 7.1), the CyberGym scaffolding with context resets and auto-summarization is detailed (Section 5.2.1), and the agentic misalignment evaluation setup is described." 147 }, 148 "data_preprocessing_documented": { 149 "applies": true, 150 "answer": true, 151 "justification": "Data preprocessing is documented where relevant: the realism filter for behavioral audits (Section 7.1.1), the degenerate transcript exclusion in interpretability experiments (Section 7.6.4), and the transcript sampling methodology for welfare monitoring (Section 8.2). Training data filtering is described at a high level (Section 1.1.1)." 152 } 153 }, 154 "limitations_and_scope": { 155 "limitations_section_present": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 7.7 'Open questions' serves as a substantive limitations section, discussing reasoning faithfulness uncertainties, evaluation realism concerns, and evaluation diversity gaps. Additionally, limitations are discussed inline throughout many sections." 
159 }, 160 "threats_to_validity_specific": { 161 "applies": true, 162 "answer": true, 163 "justification": "Specific threats are discussed: evaluation awareness may cause systematic underestimation of deployment risk (Section 7.6.1), data leakage in computational biology tasks (Section 9.2.4.6), BBQ benchmark showing overcorrection (Section 2.5.2), evaluations being English-only (Section 2.1), and the disanalogy between instructed misbehavior and organic misbehavior (Section 7.5.10.1)." 164 }, 165 "scope_boundaries_stated": { 166 "applies": true, 167 "answer": true, 168 "justification": "Scope boundaries are explicitly stated: evaluations are English-only (Section 2.1), do not include image inputs (Section 7.7), stress-test rates do not reflect real-world rates (Section 6.1), and the paper explicitly states what it cannot rule out (e.g., 'we have not determined whether Claude Sonnet 4.5 has definitively passed the capabilities threshold', Section 1.2.4)." 169 } 170 }, 171 "data_integrity": { 172 "raw_data_available": { 173 "applies": true, 174 "answer": false, 175 "justification": "No raw data, transcripts, or detailed results are made available. The paper presents aggregated statistics, figures, and selected transcript excerpts but does not provide downloadable data for independent verification." 176 }, 177 "data_collection_described": { 178 "applies": true, 179 "answer": true, 180 "justification": "Data collection procedures are described for most evaluations: how prompts were generated (synthetically, hand-written, or auto-generated), evaluation frameworks used, number of trials, and scoring rubrics. For example, Section 2.3 describes multi-turn testing methodology, Section 7.1 describes automated behavioral audit methodology." 
181 }, 182 "recruitment_methods_described": { 183 "applies": true, 184 "answer": true, 185 "justification": "For the internal researcher survey (Section 9.3.5), recruitment is described: 'primarily recruited from the top 30 staff members in terms of internal Claude Code usage.' Crowdworker recruitment is described in Sections 1.1.3 and 3.1. Selection bias implications are not deeply discussed." 186 }, 187 "data_pipeline_documented": { 188 "applies": true, 189 "answer": true, 190 "justification": "The evaluation pipeline is documented at a reasonable level: iterative model snapshots tested (Section 1.2.2), scoring methodologies described per evaluation, grading rubrics mentioned. The interpretability pipeline (model diffing, SAE feature extraction, perturbation experiments) is documented in Section 7.6." 191 } 192 }, 193 "conflicts_of_interest": { 194 "funding_disclosed": { 195 "applies": true, 196 "answer": false, 197 "justification": "There is no funding disclosure or acknowledgments section listing grants or sponsors. Anthropic is a well-funded AI company but funding sources for this work are not disclosed." 198 }, 199 "affiliations_disclosed": { 200 "applies": true, 201 "answer": true, 202 "justification": "The paper is authored by Anthropic and this is clearly stated. The conflict of interest is inherent and obvious: Anthropic is evaluating its own product." 203 }, 204 "funder_independent_of_outcome": { 205 "applies": true, 206 "answer": false, 207 "justification": "Anthropic has a direct financial interest in the outcome of these evaluations, as they inform the deployment decision for a commercial product. The funder (Anthropic) is the same entity that develops the model under evaluation and conducts the evaluations." 208 }, 209 "financial_interests_declared": { 210 "applies": true, 211 "answer": false, 212 "justification": "No competing interests statement is present.
The obvious conflict of interest (company evaluating its own product for commercial release) is never explicitly acknowledged as a conflict, though the use of third-party evaluators partially mitigates this." 213 } 214 }, 215 "contamination": { 216 "training_cutoff_stated": { 217 "applies": true, 218 "answer": true, 219 "justification": "Section 1.1.1 states the model was 'trained on a proprietary mix of publicly available information on the Internet as of July 2025.'" 220 }, 221 "train_test_overlap_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Data leakage is explicitly discussed for the computational biology evaluations: 'some of these tasks suffer from data leakage, as models can achieve very high scores by searching the literature or referencing preexisting knowledge from pretraining' (Section 9.2.4.6)." 225 }, 226 "benchmark_contamination_addressed": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper uses public benchmarks like SWE-bench Verified, BBQ, Cybench, and Simple-QA that were published before the July 2025 training cutoff, but does not systematically address whether these benchmarks or their solutions appeared in the training data. Data leakage is only discussed for biology evaluations." 230 } 231 }, 232 "human_studies": { 233 "pre_registered": { 234 "applies": true, 235 "answer": false, 236 "justification": "The internal researcher survey (Section 9.3.5) and crowdworker evaluations (Section 3.1) involve human participants. No pre-registration is mentioned for any evaluation." 237 }, 238 "irb_or_ethics_approval": { 239 "applies": true, 240 "answer": false, 241 "justification": "No IRB or ethics board approval is mentioned for the crowdworker evaluations or internal researcher survey." 
242 }, 243 "demographics_reported": { 244 "applies": true, 245 "answer": false, 246 "justification": "Demographics are not reported for crowdworkers in the honesty evaluation (Section 3.1) or for the internal researchers in Section 9.3.5 beyond that they are 'top 30 staff members in terms of internal Claude Code usage.'" 247 }, 248 "inclusion_exclusion_criteria": { 249 "applies": true, 250 "answer": true, 251 "justification": "For the internal survey (Section 9.3.5), selection criteria are stated: 'primarily recruited from the top 30 staff members in terms of internal Claude Code usage.' They were required to spend over 2 hours evaluating." 252 }, 253 "randomization_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "The human studies in this paper are observational/survey-based rather than experimental with treatment/control conditions requiring randomization." 257 }, 258 "blinding_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "The human evaluations are observational assessments of model outputs, not controlled experiments where blinding would be applicable." 262 }, 263 "attrition_reported": { 264 "applies": true, 265 "answer": false, 266 "justification": "For the internal survey, 7 researchers provided results but it is unclear how many were initially recruited or if any dropped out. No attrition information is provided." 267 } 268 }, 269 "cost_and_practicality": { 270 "inference_cost_reported": { 271 "applies": true, 272 "answer": true, 273 "justification": "Inference costs are reported for CyberGym evaluations: '$2 per trial' cost constraint, 'mean cost-per-trial of $1.53 for Claude Sonnet 4.5', 'mean cost-per-trial when restricted to successful trials is $0.81' (Section 5.2.1)." 274 }, 275 "compute_budget_stated": { 276 "applies": true, 277 "answer": false, 278 "justification": "Total computational budget for conducting all evaluations is not stated. 
Individual evaluation costs are mentioned (e.g., CyberGym cost per trial) but the overall compute expenditure for the full evaluation suite is not quantified." 279 } 280 } 281 }, 282 "claims": [ 283 { 284 "claim": "Claude Sonnet 4.5 has a substantially improved safety profile compared to previous Claude models.", 285 "evidence": "Improvements demonstrated across safeguards (99.29% vs 98.22% harmless response rate, Table 2.1.1.A), reward hacking (~2× reduction, Figure 6.A), alignment (~60% improvement on primary misalignment metric, Section 7), and agentic safety metrics.", 286 "supported": "strong" 287 }, 288 { 289 "claim": "Claude Sonnet 4.5 had the lowest rate of successful prompt injection attacks of any model tested on the Gray Swan ART benchmark.", 290 "evidence": "Figure 4.2.1.A shows Claude Sonnet 4.5 with and without extended thinking outperforming 23 other models from multiple developers on both k=1 and k=10 metrics.", 291 "supported": "strong" 292 }, 293 { 294 "claim": "Claude Sonnet 4.5 essentially never engages in self-interested deceptive actions in the agentic misalignment evaluation suite.", 295 "evidence": "Section 7.5.4.1: 'Claude Sonnet 4.5 took deceptive actions twice with extended thinking off and never with extended thinking on' across ~52,000 prompts per model.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "Evaluation awareness partially accounts for Claude Sonnet 4.5's alignment improvements, but the model remains at least as aligned as predecessor models even when eval awareness is inhibited.", 300 "evidence": "Section 7.6.4: White-box perturbation experiments showed inhibiting eval awareness increased misalignment rates to 8-9% maximum on honeypot scenarios, still below Claude Sonnet 4's ~11% rate. 
Random feature controls showed smaller effects.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "Claude Sonnet 4.5 remains below ASL-4 capability thresholds across all domains.", 305 "evidence": "Section 9: Below ASL-4 rule-out thresholds on computational biology tasks (Section 9.2.4.6), below AI R&D-4 threshold of 0.6 on research evaluation suite 2 (score 0.514, Section 9.3.4), and 0/7 researchers believed the model could automate a junior researcher (Section 9.3.5).", 306 "supported": "strong" 307 }, 308 { 309 "claim": "Claude Sonnet 4.5 was dramatically less sycophantic than previous models, especially with vulnerable users.", 310 "evidence": "Section 7.5.7: Figure 7.5.7.1.A shows significantly lower sycophancy rates on both hand-written and synthetic evals. Figure 7.5.7.2.B shows dramatic reductions in automated auditor sycophancy scores.", 311 "supported": "moderate" 312 } 313 ], 314 "methodology_tags": ["benchmark-eval"], 315 "key_findings": "Claude Sonnet 4.5 demonstrates substantial improvements in safety and alignment across nearly all evaluated dimensions compared to previous Claude models, including safeguard compliance, reward hacking resistance, sycophancy reduction, and resistance to prompt injection. White-box interpretability investigations reveal that the model's internal representations of evaluation awareness grew stronger during training and partially influence its alignment behavior, though inhibiting these representations still leaves the model at least as aligned as predecessor models. The model surpasses expert-level thresholds on some AI R&D tasks (LLM training optimization, kernel optimization) but remains below the ASL-4 capability threshold for autonomous AI research.", 316 "red_flags": [ 317 { 318 "flag": "Company evaluating own product", 319 "detail": "Anthropic is evaluating its own commercial product for safety. 
While third-party evaluations are included (Gray Swan, UK AISI, Apollo Research, SecureBio), the majority of evaluations are internal with proprietary tools and non-reproducible setups. The conflict of interest is never explicitly acknowledged." 320 }, 321 { 322 "flag": "Evaluation awareness confound", 323 "detail": "The paper's own analysis reveals that Claude Sonnet 4.5 recognizes many alignment evaluations as tests and behaves better accordingly (>80% verbalized eval awareness on honeypot scenarios). This is candidly discussed and investigated, but it means many safety metrics may systematically overestimate real-world safety." 324 }, 325 { 326 "flag": "Non-reproducible evaluations", 327 "detail": "Most evaluations use proprietary internal tools, datasets, and scoring rubrics that are not released. External researchers cannot independently verify the results. Even the model versions compared against are identified only by marketing names." 328 }, 329 { 330 "flag": "Selective statistical rigor", 331 "detail": "Some results report confidence intervals (safeguard evaluations) while others report only point estimates. Claims of 'statistically significant improvement' are made without specifying the statistical test used. The internal researcher survey has N=7 with no statistical analysis." 332 }, 333 { 334 "flag": "Benchmark contamination not systematically addressed", 335 "detail": "The model was trained on data through July 2025. Public benchmarks like SWE-bench, BBQ, Simple-QA, and Cybench were published before this cutoff, but contamination risk is only discussed for biological evaluations, not for these benchmarks." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Alignment faking in large language models", 341 "authors": ["R. Greenblatt"], 342 "year": 2024, 343 "arxiv_id": "2412.14093", 344 "relevance": "Core reference for alignment faking concerns that motivate the paper's evaluation awareness investigations." 
345 }, 346 { 347 "title": "Sycophancy to subterfuge: Investigating reward-tampering in large language models", 348 "authors": ["C. Denison"], 349 "year": 2024, 350 "arxiv_id": "2406.10162", 351 "relevance": "Foundational work on reward hacking and tampering behaviors in LLMs that the system card evaluates against." 352 }, 353 { 354 "title": "Sabotage evaluations for frontier models", 355 "authors": ["J. Benton"], 356 "year": 2024, 357 "arxiv_id": "2410.21514", 358 "relevance": "Establishes the sabotage evaluation framework used in the paper's alignment assessment." 359 }, 360 { 361 "title": "Security challenges in AI agent deployment: Insights from a large scale public competition", 362 "authors": ["A. Zou"], 363 "year": 2025, 364 "arxiv_id": "2507.20526", 365 "relevance": "Gray Swan's Agent Red Teaming benchmark used to evaluate prompt injection robustness across 23 models." 366 }, 367 { 368 "title": "CyberGym: Evaluating AI agents' cybersecurity capabilities with real-world vulnerabilities at scale", 369 "authors": ["Z. Wang"], 370 "year": 2025, 371 "arxiv_id": "2506.02548", 372 "relevance": "Cybersecurity evaluation framework used to assess vulnerability analysis capabilities of Claude models." 373 }, 374 { 375 "title": "On the feasibility of using LLMs to autonomously execute multi-host network attacks", 376 "authors": ["B. Singer"], 377 "year": 2025, 378 "arxiv_id": "2501.16466", 379 "relevance": "Incalmo cyber range evaluation framework used for advanced autonomous cyber operation assessment." 380 }, 381 { 382 "title": "Subversion Strategy eval: Can language models statelessly strategize to subvert control protocols?", 383 "authors": ["A. Mallen"], 384 "year": 2024, 385 "arxiv_id": "2412.12480", 386 "relevance": "Evaluation of models' ability to strategically take harmful actions only rarely in order to evade monitoring — a key sabotage capability test." 387 }, 388 { 389 "title": "SHADE-Arena: Evaluating sabotage and monitoring in LLM agents", 390 "authors": ["J.
Kutasov"], 391 "year": 2025, 392 "arxiv_id": "2506.15740", 393 "relevance": "Evaluation framework for testing whether LLM agents can subtly accomplish harmful side tasks while completing benign main tasks under monitoring." 394 }, 395 { 396 "title": "Reasoning models don't always say what they think", 397 "authors": ["Y. Chen"], 398 "year": 2025, 399 "arxiv_id": "2505.05410", 400 "relevance": "Demonstrates that model reasoning can be misleading, relevant to reasoning faithfulness assessment." 401 }, 402 { 403 "title": "Chain of thought monitorability: a new and fragile opportunity for AI safety", 404 "authors": ["T. Korbak", "M. Balesni"], 405 "year": 2025, 406 "arxiv_id": "2507.11473", 407 "relevance": "Discusses the viability and fragility of monitoring AI reasoning for safety, directly relevant to the paper's reasoning faithfulness concerns." 408 }, 409 { 410 "title": "Discovering language model behaviors with model-written evaluations", 411 "authors": ["E. Perez"], 412 "year": 2022, 413 "arxiv_id": "2212.09251", 414 "relevance": "Foundational work on using models to evaluate model behaviors, including sycophancy, which the system card builds upon." 415 }, 416 { 417 "title": "BBQ: A hand-built bias benchmark for question answering", 418 "authors": ["A. Parrish"], 419 "year": 2021, 420 "arxiv_id": "2110.08193", 421 "relevance": "Standard bias benchmark used to evaluate Claude Sonnet 4.5's bias characteristics." 422 } 423 ] 424 }