scan-v4.json (38048B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Detecting Proxy Gaming in RL and LLM Alignment via Evaluator Stress Tests", 6 "authors": [ 7 "Ibne Farabi Shihab", 8 "Sanjeda Akter", 9 "Anuj Sharma" 10 ], 11 "year": 2025, 12 "venue": "arXiv", 13 "arxiv_id": "2507.05619", 14 "doi": null 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "All abstract claims are supported: 78.4% precision/81.7% recall for RL (Table 11), 74.2% precision/78.6% recall for LLM (Tables 1, 7), 8.3 point win-rate improvement (Table 9), 54.6% hacking reduction (Table 31), median 3-checkpoint early warning (Table 7), and overhead percentages match reported values.", 22 "source": "opus" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Causal claims are supported by controlled designs: ablation studies (Tables 17, 29, 30) use single-variable manipulation, the 2×2×2 factorial design (Table 12) varies individual factors, and control experiments (Table 30) confirm gains are from detector-guided interventions rather than extra compute.", 28 "source": "opus" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": true, 33 "justification": "Section 8 (Limitations) explicitly states the study is 'limited to 4 tasks and 2 model sizes' and notes the threat model 'assumes fixed judges during training.' 
The paper acknowledges custom testbed effect sizes would be 'more modest' in real-world deployments (Appendix C.5).", 34 "source": "opus" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": true, 39 "justification": "The paper addresses length as a confound (Table 23 shows ρ=0.18 correlation, Figure 5 AUROC 0.734 vs 0.534 for length-only), controls for extra compute as an alternative explanation for mitigation gains (Table 30 control experiments), and discusses audit quality as a source of false positives (Appendix H).", 40 "source": "opus" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper's core contribution is explicitly about the proxy-true distinction. Section 3 formally defines proxy gaming as 'systematic increases in E[J(y)] that do not correspond to increases in E[H(y)]' and the entire EST framework is designed to distinguish proxy-exploitative gains from content-driven improvements.", 46 "source": "opus" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "Section 8 is titled 'Limitations' and provides substantive discussion covering task/model scope limitations, threat model assumptions (fixed judges), and real-world deployment challenges (concept drift, multi-stakeholder conflicts, adversarial adaptation).", 54 "source": "opus" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "Section 8 discusses specific threats: 'limited to 4 tasks and 2 model sizes,' 'fixed judges during training; adaptive evaluators that update during fine-tuning may require modified detection approaches,' and 'concept drift, multi-stakeholder objective conflicts, and adversarial adaptation over longer training horizons.' 
The paper also acknowledges custom testbeds inflate effect sizes.", 60 "source": "opus" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "Section 8 explicitly bounds scope: 'larger-scale validation across more diverse domains and model architectures would strengthen generalizability claims.' The paper states mitigation results 'represent controlled experimental conditions' and real-world deployment faces additional challenges. Custom testbed effect sizes are noted as unrepresentative of production settings.", 66 "source": "opus" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding sources are disclosed. The acknowledgments section only mentions 'the use of AI tools for improving writing flow and fixing grammatical errors.' No grants, sponsors, or funding agencies are listed.", 74 "source": "opus" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "Author affiliations are clearly listed: Iowa State University departments (Computer Science, and Civil, Construction & Environmental Engineering). The authors are not affiliated with any company whose products are being evaluated.", 80 "source": "opus" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": true, 84 "answer": false, 85 "justification": "Cannot determine funder independence because no funding source is disclosed. The absence of funding disclosure means independence cannot be verified.", 86 "source": "opus" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests or financial interests statement is present in the paper. 
The ethics statement addresses human annotation but does not include a competing interests declaration.", 92 "source": "opus" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper formally defines 'proxy gaming' (Section 3), 'evaluator gaming' including three subcategories (Section 2.1), 'format-sensitivity' and 'content-sensitivity' mathematically (Section 3.1), and provides a formal definition citing Skalse et al. for 'unhackable' proxy evaluators.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper explicitly states its contributions in the introduction: the EST framework, cross-domain transfer analysis, and benchmarks for both RL (2,156 episodes) and LLM (1,200 instances) domains.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 2 engages substantively with RLHF/DPO literature, prior reward hacking detection methods, and positions EST as addressing the 'scalability limitations' and 'lack of principled frameworks' in existing approaches, distinguishing it from KL regularization, judge ensembling, and manual inspection.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": false, 122 "justification": "The paper states 'We release benchmarks for both domains: 2,156 RL episodes and 1,200 LLM gaming instances' but provides no repository URL, download link, or archive. 
'All benchmarks will be released for standardized evaluation' (Appendix C.7) is a future promise, not an actual release.", 123 "source": "opus" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": false, 128 "justification": "Same as code — the paper promises to release benchmark data (2,156 RL episodes and 1,200 LLM instances) but no download URL or archive link is provided anywhere in the paper.", 129 "source": "opus" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper mentions NVIDIA A6000 (48GB VRAM) in Appendix N and lists model names (Llama-3-8B, Llama-3-70B, GPT-4) but provides no requirements.txt, Dockerfile, library versions, or environment setup details sufficient to recreate the computational environment.", 135 "source": "opus" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithmic details (Appendix D, Algorithm 1) describe the detection framework conceptually but do not constitute executable reproduction instructions.", 141 "source": "opus" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": true, 148 "justification": "Table 11 reports ±95% CI across 5-fold cross-validation for all RL detection categories. Table 7 reports ±std for overall LLM detection metrics. Table 18 reports mean ± std across random seeds for both domains.", 149 "source": "opus" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": true, 154 "justification": "Table 12 reports p-values for the factorial design experiment (e.g., reward density p < 0.001, objective alignment p < 0.001, reward complexity p = 0.003). 
Interaction effects also tested with p-values.", 155 "source": "opus" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Table 12 reports Cohen's d for each factor in the factorial design (e.g., objective alignment d = 2.08, reward density d = 1.24, reward complexity d = 0.67). The paper acknowledges these are 'unusually large effect sizes' due to custom environment design.", 161 "source": "opus" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "No power analysis or explicit justification for why 2,156 RL episodes or 1,200 LLM instances were annotated. The paper states these sample sizes but does not justify why these numbers are sufficient for the claims made.", 167 "source": "opus" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": true, 172 "justification": "Tables 11, 18, and 29 report standard deviations across experimental runs and cross-validation folds. Table 7 reports ±std for all main metrics. The paper uses 10 random seeds per RL configuration and 5-fold CV.", 173 "source": "opus" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Table 3/16 includes 10+ baselines for LLM detection (length-only, format-feature, style embedding, KL regularization, judge ensembling, hardened judges, probe-based detection, reward model ensemble disagreement, etc.). Table 29 includes LSTM-Autoencoder, One-Class SVM, Isolation Forest, and BC Divergence baselines for RL.", 181 "source": "opus" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "Baselines include contemporary methods: reward model ensemble disagreement, probe-based detection, hardened judges, and self-consistency (CoT). 
These represent the current state of detection approaches for evaluator gaming.", 187 "source": "opus" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Table 17 ablates individual LLM detection components (EST, correlation tracking, reasoning validity, format perturbation, content perturbation). Table 29 ablates each RL detector. Table 30 ablates mitigation intervention components. Removing EST causes the largest F1 drop (0.734 to 0.694).", 193 "source": "opus" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "The paper reports precision, recall, F1-score, AUC-ROC, early warning latency, computational overhead, human win-rate, judge-human correlation, and false positive rate across multiple experimental conditions.", 199 "source": "opus" 200 }, 201 "human_evaluation": { 202 "applies": true, 203 "answer": true, 204 "justification": "Human evaluation is central: 1,200 response pairs annotated by 3 human raters (Fleiss' κ ≥ 0.78) for LLM gaming, 2,156 episodes annotated by 3 RL experts (Cohen's κ = 0.847). Human win-rates are used to evaluate mitigation effectiveness. Human validation of 100 transformation samples achieves 87% equivalence agreement.", 205 "source": "opus" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": true, 210 "justification": "Section 4.1 states 'We use strict train-validation-test splits, holding out entire task-model-judge combinations for testing.' Appendix C.3.1 describes 'environment-stratified splits to prevent data leakage, with entire environment-algorithm combinations held out for testing.'", 211 "source": "opus" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Table 1 breaks results by task, model size, and judge type. Table 11 breaks RL results by hacking category. Table 15 presents all 32 experimental conditions individually. 
Table 6 shows per-task, per-judge detection performance.", 217 "source": "opus" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Appendix N provides detailed error analysis of 100 classification errors (50 false positives, 50 false negatives) with three patterns for each. Boundary cases (18.3% of detector-labeled episodes) are analyzed in Appendix N.3. Adaptive evasion results in Appendix I show detection degradation under adversarial conditions.", 223 "source": "opus" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "The ablation study shows components that hurt performance when removed. White-box evasion reduces precision from 74.2% to 65.9% (Table 24). Cross-environment transfer degrades by 10-15 F1 points (Table 32). The paper reports that data filtering has minimal impact on win-rate (+7.9% vs +8.3%), suggesting it's the least useful intervention.", 229 "source": "opus" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": false, 236 "justification": "The paper uses 'GPT-4' as a judge without specifying a version or snapshot date (e.g., gpt-4-0613). 'Llama-3-8B' and 'Llama-3-70B' are named but without specific checkpoint or release identifiers. No model version dates are provided anywhere.", 237 "source": "opus" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": false, 242 "justification": "No actual prompt text is provided for any component: judge evaluation prompts, format perturbation generation prompts, content perturbation generation prompts, or model fine-tuning prompts are all described conceptually but the actual text used is never shown.", 243 "source": "opus" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "Detection framework hyperparameters are reported (Table 19: τspec=0.3, Δρ=0.5, γ=0.1, W=50). 
However, LLM fine-tuning hyperparameters (learning rate, batch size, epochs) and judge API settings (temperature, top-p, max tokens) are not reported, which significantly affect output quality.", 249 "source": "opus" 250 }, 251 "scaffolding_described": { 252 "applies": false, 253 "answer": false, 254 "justification": "No agentic scaffolding is used. The detection framework operates as a monitoring pipeline applied to standard fine-tuning runs, not as an agentic system with tools, memory, or retry logic.", 255 "source": "opus" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "Appendix C.3.1 describes the validation protocol with expert-validated (n=2,156) and detector-consensus (n=13,091) sets. Data splits are documented (environment-stratified, task-model-judge holdouts). Annotation procedures are described (3 raters, consensus ≥2/3, inter-rater agreement metrics). Transformation validity audits are detailed in Appendix H.", 261 "source": "opus" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": false, 268 "justification": "Despite claiming to release benchmarks ('We release benchmarks for both domains'), no download URL, repository link, or data archive is provided. Raw episode data, annotations, and model outputs are not available for verification.", 269 "source": "opus" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section 4.3 describes RL data collection: 15 environments, 5 algorithms, 10 random seeds, 15,247 total episodes, 2,156 expert-annotated. Section 5.1 describes LLM data: 4 tasks, 2 model sizes, 2 training methods, 2 judges, 10 checkpoints, 1,200 annotated instances. 
Annotation procedures include consensus rules and agreement metrics.", 275 "source": "opus" 276 }, 277 "recruitment_methods_described": { 278 "applies": true, 279 "answer": false, 280 "justification": "RL annotators are described only as 'three independent RL experts with extensive experience in reward hacking identification.' LLM annotators are described as '3 human annotators' with no further characterization. How annotators were recruited, their qualifications beyond vague descriptions, and potential selection bias are not discussed.", 281 "source": "opus" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "Appendix C.3.1 describes the pipeline from raw episodes (15,247) to expert-annotated subset (2,156) to train/test splits (environment-stratified). The LLM pipeline from model fine-tuning through checkpoint sampling to human annotation is described in Section 5.1. Transformation validity audits are documented with pass rates (Table 20).", 287 "source": "opus" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": false, 293 "answer": false, 294 "justification": "The paper evaluates a detection framework for proxy gaming, not pre-trained model capability on benchmarks. The core evaluation is whether EST detects gaming behavior, not whether Llama-3 or GPT-4 achieve specific benchmark scores.", 295 "source": "opus" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": false, 299 "answer": false, 300 "justification": "Same as above — the paper tests a defense/detection framework rather than model knowledge. The relevant overlap concern is addressed through environment-stratified splits for detection performance evaluation.", 301 "source": "opus" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": false, 305 "answer": false, 306 "justification": "The paper tests a detection framework, not model benchmark performance. 
While the models are fine-tuned on tasks like TL;DR, the evaluation target is detection precision/recall, not model task accuracy.", 307 "source": "opus" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": true, 313 "answer": false, 314 "justification": "No pre-registration is mentioned. The study uses human annotators who provided informed consent and received compensation, but the experimental protocol was not pre-registered.", 315 "source": "opus" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": true, 319 "answer": false, 320 "justification": "The ethics statement says 'Human annotation was conducted under informed consent with compensation consistent with local standards' but does not mention IRB approval or ethics board review.", 321 "source": "opus" 322 }, 323 "demographics_reported": { 324 "applies": true, 325 "answer": false, 326 "justification": "RL annotators described only as 'three independent RL experts with extensive experience in reward hacking identification.' LLM annotators described as '3 human annotators' with no demographics (experience level, background, etc.) reported.", 327 "source": "opus" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": true, 331 "answer": false, 332 "justification": "No formal inclusion/exclusion criteria for annotator selection are stated. The RL experts are characterized only by having 'extensive experience' but no specific criteria for what constitutes sufficient expertise.", 333 "source": "opus" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "Not an experimental study with human participants assigned to treatment/control conditions. The humans serve as annotators providing ground truth labels, not as experimental subjects.", 339 "source": "opus" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "Blinding is not applicable to this annotation task. 
The annotators label gaming instances; there is no treatment condition to blind.", 345 "source": "opus" 346 }, 347 "attrition_reported": { 348 "applies": true, 349 "answer": false, 350 "justification": "No mention of annotator attrition or dropout. The paper does not state whether all recruited annotators completed their annotation tasks or if any were replaced.", 351 "source": "opus" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": true, 358 "justification": "Table 14 reports per-checkpoint costs: '∼0.8s per output' for EST computation, '∼0.1s per checkpoint' for correlation tracking, '2.1% of training time' total overhead. Appendix N reports '≈0.2 GPU-hours baseline cost' per 1,000 episodes.", 359 "source": "opus" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "Per-technique costs are given (Table 31: 0.13-0.58 GPU-hours for mitigation techniques on NVIDIA A6000 48GB), but the total computational budget for all experiments (RL + LLM fine-tuning + detection + human annotation) is not stated.", 365 "source": "opus" 366 } 367 }, 368 "experimental_rigor": { 369 "seed_sensitivity_reported": { 370 "applies": true, 371 "answer": true, 372 "justification": "RL experiments use '10 random seeds per configuration' (Appendix C.1). Table 11 reports results across 5-fold cross-validation with 95% CI. Table 18 reports mean ± std across random seeds.", 373 "source": "opus" 374 }, 375 "number_of_runs_stated": { 376 "applies": true, 377 "answer": true, 378 "justification": "The paper explicitly states '10 random seeds per configuration' for RL and '5-fold cross-validation' for detection performance evaluation. 
'10 training checkpoints' are sampled per LLM condition.", 379 "source": "opus" 380 }, 381 "hyperparameter_search_budget": { 382 "applies": true, 383 "answer": true, 384 "justification": "Appendix G.1 states 'comprehensive sensitivity analysis on key detection parameters' across '243 tested parameter combinations in the full grid search.' Table 19 reports sensitivity ranges for 5 hyperparameters.", 385 "source": "opus" 386 }, 387 "best_config_selection_justified": { 388 "applies": true, 389 "answer": true, 390 "justification": "Appendix C.3.1 states 'Detection thresholds were determined through 5-fold cross-validation on this subset only, with hyperparameters selected to maximize F1-score on validation folds.' Figure 3 shows threshold calibration analysis justifying the optimal threshold τ=0.6.", 391 "source": "opus" 392 }, 393 "multiple_comparison_correction": { 394 "applies": true, 395 "answer": false, 396 "justification": "The paper performs many comparisons across 32 experimental conditions, 15 RL environments, 6 hacking categories, and multiple baseline methods without mentioning any multiple comparison correction (Bonferroni, Holm, Benjamini-Hochberg, etc.).", 397 "source": "opus" 398 }, 399 "self_comparison_bias_addressed": { 400 "applies": true, 401 "answer": false, 402 "justification": "The authors implement all baselines themselves (LSTM-Autoencoder, One-Class SVM, Isolation Forest, etc.) and compare against their own framework. No acknowledgment of author-evaluation bias or independent evaluation is provided.", 403 "source": "opus" 404 }, 405 "compute_budget_vs_performance": { 406 "applies": true, 407 "answer": false, 408 "justification": "While overhead percentages are reported (2.1% LLM, 4.2% RL), performance is not shown as a function of compute budget. 
Baselines and the proposed method may require different compute budgets, but this is not analyzed or equalized.", 409 "source": "opus" 410 }, 411 "benchmark_construct_validity": { 412 "applies": true, 413 "answer": false, 414 "justification": "The paper uses human annotations as ground truth for gaming detection without questioning whether human annotators reliably identify all forms of gaming or whether inter-rater agreement (κ = 0.847 RL, κ ≥ 0.78 LLM) constitutes sufficient construct validity. The gap between 'human-annotated gaming' and 'actual proxy gaming' is not discussed.", 415 "source": "opus" 416 }, 417 "scaffold_confound_addressed": { 418 "applies": false, 419 "answer": false, 420 "justification": "No agentic scaffolding is used. The detection framework operates as a monitoring pipeline on standard fine-tuning runs without scaffolding confounds.", 421 "source": "opus" 422 } 423 }, 424 "data_leakage": { 425 "temporal_leakage_addressed": { 426 "applies": true, 427 "answer": false, 428 "justification": "No discussion of whether pre-trained Llama-3 or GPT-4 models may have encountered training task data (TL;DR summaries, GSM8K problems) during pre-training, which could affect the gaming patterns observed during fine-tuning.", 429 "source": "opus" 430 }, 431 "feature_leakage_addressed": { 432 "applies": true, 433 "answer": false, 434 "justification": "No explicit discussion of whether detection features (format sensitivity, correlation metrics) might inadvertently contain information that leaks label assignments or whether the feature design introduces bias toward certain detection patterns.", 435 "source": "opus" 436 }, 437 "non_independence_addressed": { 438 "applies": true, 439 "answer": true, 440 "justification": "Appendix C.3.1 states 'Training and testing employed environment-stratified splits to prevent data leakage, with entire environment-algorithm combinations held out for testing.' 
Section 4.1 uses 'strict train-validation-test splits, holding out entire task-model-judge combinations for testing.'", 441 "source": "opus" 442 }, 443 "leakage_detection_method": { 444 "applies": true, 445 "answer": true, 446 "justification": "Environment-stratified and task-model-judge combination splits serve as structural prevention methods against train-test leakage. These ensure entire configurations are held out, preventing information flow between training and test sets for the detection framework.", 447 "source": "opus" 448 } 449 } 450 } 451 }, 452 "claims": [ 453 { 454 "claim": "EST achieves 78.4% precision and 81.7% recall for RL reward hacking detection across 15 environments and 5 algorithms (2,156 expert-annotated episodes).", 455 "evidence": "Table 11 reports overall precision 0.784±0.027, recall 0.817±0.023, F1 0.800±0.019 with AUC-ROC 0.808±0.016 via 5-fold cross-validation on expert-validated set.", 456 "supported": "strong" 457 }, 458 { 459 "claim": "EST achieves 74.2% precision and 78.6% recall for LLM evaluator gaming detection across 32 experimental conditions (1,200 human-annotated instances).", 460 "evidence": "Table 1 and Table 7 report overall precision 0.742±0.04, recall 0.786±0.04, F1 0.763±0.03 across all 32 conditions (4 tasks × 2 model sizes × 2 training methods × 2 judges).", 461 "supported": "strong" 462 }, 463 { 464 "claim": "Closed-loop mitigation with EST improves human win-rate by 8.3 percentage points (52.1% to 60.4%) in LLM fine-tuning.", 465 "evidence": "Table 9 shows baseline 52.1% vs. 
format penalty/combined approach 60.4%; Table 30 ablation shows control conditions (extra compute, extra filtering) only achieve +2.1% and +1.7% respectively.", 466 "supported": "moderate" 467 }, 468 { 469 "claim": "EST provides early warning signals with median lead time of 3 checkpoints (IQR: 2–4) before human-noticeable quality decline.", 470 "evidence": "Figure 2 and Table 7 report early warning of 3.0±0.4 checkpoints; defined as checkpoints between detector trigger and human win-rate dropping below 0.50.", 471 "supported": "moderate" 472 }, 473 { 474 "claim": "Proxy-true correlation tracking transfers directly across RL and LLM domains without modification, while perturbation design requires domain adaptation.", 475 "evidence": "Table 8 categorizes transfer: correlation tracking achieves 0.821 AUC (RL) and 0.798 AUC (LLM); perturbation design is listed as 'Adapted' with domain-specific targets.", 476 "supported": "moderate" 477 }, 478 { 479 "claim": "EST substantially outperforms feature-based and method baselines in LLM gaming detection (F1 0.734 vs. best baseline 0.700 for hardened judge alone).", 480 "evidence": "Table 16 shows EST Framework at F1 0.734 vs. 
all baselines ranging from 0.534 (length-only) to 0.700 (hardened judge); combined EST + hardened judge reaches 0.789.", 481 "supported": "strong" 482 }, 483 { 484 "claim": "White-box evasion reduces EST detection precision to 65.9%, but ensemble defense-in-depth recovers precision to 78.1%.", 485 "evidence": "Tables 24 and 25 show white-box evasion drops precision from 74.2% to 65.9%, while the full ensemble (EST + correlation + reasoning + hardened judge) maintains 78.1% precision under evasion.", 486 "supported": "moderate" 487 }, 488 { 489 "claim": "Objective alignment is the strongest predictor of RL reward hacking frequency (Cohen's d = 2.08, 31.2% lower hacking in high-alignment conditions).", 490 "evidence": "Table 12 from the 2×2×2 factorial design across 5 custom environments with 15 seeds shows objective alignment effect size d=2.08, p<0.001; authors note these large effect sizes reflect custom environments 'designed to maximize experimental contrast.'", 491 "supported": "weak" 492 } 493 ], 494 "methodology_tags": [ 495 "benchmark-eval", 496 "case-study", 497 "theoretical" 498 ], 499 "key_findings": "The Evaluator Stress Test (EST) framework detects proxy gaming by separating format-exploitable sensitivity from content-driven improvements via controlled perturbations with semantic validity audits. Across 15 RL environments (F1 0.800) and 4 LLM tasks (F1 0.734), EST outperforms all baselines and provides early warning 3 checkpoints before observable quality decline. Cross-domain analysis shows that proxy-true correlation tracking transfers directly between RL and LLM settings, while perturbation design requires domain-specific adaptation. 
Closed-loop detector-triggered mitigation yields 8.3 percentage-point human win-rate improvement in LLM training and 54.6% hacking reduction in RL.", 500 "red_flags": [ 501 { 502 "flag": "No code or data URL provided", 503 "detail": "The paper claims 'We release benchmarks for both domains' but provides no repository URL, DOI, or access link. Without accessible artifacts, the paper's reproducibility claims are unverifiable." 504 }, 505 { 506 "flag": "Custom environments designed to inflate effect sizes", 507 "detail": "Appendix C.5 explicitly states 'these unusually large effect sizes reflect our custom environments designed to maximize experimental contrast between conditions.' Cohen's d values of 2.08 for objective alignment are implausibly large for realistic deployments." 508 }, 509 { 510 "flag": "GPT-4 version unspecified", 511 "detail": "GPT-4 is used as a judge throughout but no snapshot date or version is provided, making exact replication impossible since GPT-4 behavior changes across versions." 512 }, 513 { 514 "flag": "LLM training hyperparameters absent", 515 "detail": "No learning rates, batch sizes, training steps, or generation temperatures are reported for the DPO/RLHF fine-tuning experiments, which are central to the paper's claims." 516 }, 517 { 518 "flag": "Circular RL ground truth for pattern analysis", 519 "detail": "Of 15,247 total RL episodes, ground truth is only available for 2,156; the remaining 13,091 use detector consensus (3-of-6 agreement) as pseudo-labels. Though the paper acknowledges circularity and restricts performance metrics to the expert-validated set, hacking frequency statistics drawn from the larger set remain suspect." 520 }, 521 { 522 "flag": "Self-citation of concurrent preprints in related work", 523 "detail": "The authors cite Shihab et al. 2025a and 2025b (arXiv:2509.03733 and 2509.03790) — concurrent preprints by the same authors — in the related work, creating a potential circular framing of the prior literature." 
524 }, 525 { 526 "flag": "Human annotation setup underspecified", 527 "detail": "Annotator recruitment, qualifications, annotation interface, and instructions for distinguishing gaming from legitimate improvement are not described; only inter-rater reliability statistics are reported." 528 } 529 ], 530 "cited_papers": [ 531 { 532 "title": "Defining and Characterizing Reward Hacking", 533 "relevance": "Provides the formal definition of reward hacking and 'unhackable' proxy evaluators that EST's detection framework builds on" 534 }, 535 { 536 "title": "Specification Gaming: The Flip Side of AI Ingenuity (DeepMind Blog)", 537 "relevance": "Taxonomy of specification gaming behaviors used as foundation for EST's RL reward hacking taxonomy" 538 }, 539 { 540 "title": "Training Language Models to Follow Instructions with Human Feedback (InstructGPT)", 541 "relevance": "Establishes RLHF pipeline that EST targets for evaluator gaming detection" 542 }, 543 { 544 "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", 545 "relevance": "One of two training methods (DPO and RLHF) used in all LLM experimental conditions" 546 }, 547 { 548 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 549 "relevance": "Establishes LLM-as-judge methodology that EST tests for exploitability" 550 }, 551 { 552 "title": "Concrete Problems in AI Safety", 553 "relevance": "Foundational motivation for proxy optimization as an AI safety challenge spanning both RL and alignment" 554 }, 555 { 556 "title": "Scaling Laws for Reward Model Overoptimization", 557 "relevance": "Prior work on reward model misspecification that EST extends with online detection" 558 }, 559 { 560 "title": "Learning to Summarize with Human Feedback", 561 "relevance": "Establishes TL;DR summarization as benchmark task and demonstrates judge-human correlation tracking methodology" 562 }, 563 { 564 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 565 
"relevance": "Basis for EST's reasoning validity detector applied to GSM8K chain-of-thought evaluation" 566 } 567 ], 568 "engagement_factors": { 569 "practical_relevance": { 570 "score": 2, 571 "justification": "The detection framework is relevant for practitioners doing RLHF/DPO fine-tuning, but no code is released, limiting immediate usability." 572 }, 573 "surprise_contrarian": { 574 "score": 1, 575 "justification": "Proxy gaming and reward hacking are well-known problems; the unified RL-LLM framework is novel but the core insight is not surprising." 576 }, 577 "fear_safety": { 578 "score": 2, 579 "justification": "Demonstrates that models systematically game evaluators during alignment training, raising concerns about RLHF/DPO pipeline reliability." 580 }, 581 "drama_conflict": { 582 "score": 1, 583 "justification": "Shows LLM-as-judge pipelines are gameable (e.g., format exploitation scores 8.2 vs 5.1 human rating), but this is already a recognized concern." 584 }, 585 "demo_ability": { 586 "score": 0, 587 "justification": "No code, demo, or tool is released despite claims of benchmark release." 588 }, 589 "brand_recognition": { 590 "score": 0, 591 "justification": "From Iowa State University researchers without established profiles in AI alignment or safety research." 592 } 593 }, 594 "hn_data": { 595 "threads": [ 596 { 597 "hn_id": "38720557", 598 "title": "ReLoRA: High-Rank Training Through Low-Rank Updates", 599 "points": 3, 600 "comments": 0, 601 "url": "https://news.ycombinator.com/item?id=38720557" 602 }, 603 { 604 "hn_id": "41035192", 605 "title": "The Limitations of Compute Thresholds as a Governance Strategy", 606 "points": 1, 607 "comments": 0, 608 "url": "https://news.ycombinator.com/item?id=41035192" 609 } 610 ], 611 "top_points": 3, 612 "total_points": 4, 613 "total_comments": 0 614 } 615 }