ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (36057B)


      1 {
      2   "paper": {
      3     "title": "Detecting Proxy Gaming in RL and LLM Alignment via Evaluator Stress Tests",
      4     "authors": [
      5       "Ibne Farabi Shihab",
      6       "Sanjeda Akter",
      7       "Anuj Sharma"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2507.05619"
     12   },
     13   "scan_version": 3,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "methodology_tags": ["benchmark-eval"],
     16   "key_findings": "The Evaluator Stress Test (EST) detects proxy gaming in both RL (78.4% precision, 81.7% recall on 2,156 expert-annotated episodes) and LLM alignment (74.2% precision, 78.6% recall on 1,200 human-annotated instances) by measuring whether score improvements are driven by exploitable features or genuine content improvements. Closed-loop mitigation using detector-triggered interventions improves human win-rate by 8.3 points in LLM alignment and reduces hacking by 54.6% in RL. Cross-domain analysis shows proxy-true correlation tracking transfers directly between RL and LLM domains while perturbation design requires domain adaptation.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The paper states 'We release benchmarks for both domains: 2,156 RL episodes and 1,200 LLM gaming instances' but provides no repository URL, download link, or archive. 'All benchmarks will be released for standardized evaluation' (Appendix C.7) is a future promise, not an actual release."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Same as code — the paper promises to release benchmark data (2,156 RL episodes and 1,200 LLM instances) but no download URL or archive link is provided anywhere in the paper."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper mentions NVIDIA A6000 (48GB VRAM) in Appendix N and lists model names (Llama-3-8B, Llama-3-70B, GPT-4) but provides no requirements.txt, Dockerfile, library versions, or environment setup details sufficient to recreate the computational environment."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithmic details (Appendix D, Algorithm 1) describe the detection framework conceptually but do not constitute executable reproduction instructions."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Table 11 reports 95% confidence intervals (as ± values) across 5-fold cross-validation for all RL detection categories. Table 7 reports ±std for overall LLM detection metrics. Table 18 reports mean ± std across random seeds for both domains."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Table 12 reports p-values for the factorial design experiment (e.g., reward density p < 0.001, objective alignment p < 0.001, reward complexity p = 0.003). Interaction effects also tested with p-values."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Table 12 reports Cohen's d for each factor in the factorial design (e.g., objective alignment d = 2.08, reward density d = 1.24, reward complexity d = 0.67). The paper acknowledges these are 'unusually large effect sizes' due to custom environment design."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No power analysis or explicit justification for why 2,156 RL episodes or 1,200 LLM instances were annotated. The paper states these sample sizes but does not justify why these numbers are sufficient for the claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Tables 11, 18, and 29 report standard deviations across experimental runs and cross-validation folds. Table 7 reports ±std for all main metrics. The paper uses 10 random seeds per RL configuration and 5-fold CV."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Table 3/16 includes 10+ baselines for LLM detection (length-only, format-feature, style embedding, KL regularization, judge ensembling, hardened judges, probe-based detection, reward model ensemble disagreement, etc.). Table 29 includes LSTM-Autoencoder, One-Class SVM, Isolation Forest, and BC Divergence baselines for RL."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include contemporary methods: reward model ensemble disagreement, probe-based detection, hardened judges, and self-consistency (CoT). These represent the current state of detection approaches for evaluator gaming."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Table 17 ablates individual LLM detection components (EST, correlation tracking, reasoning validity, format perturbation, content perturbation). Table 29 ablates each RL detector. Table 30 ablates mitigation intervention components. Removing EST causes the largest F1 drop (0.734 to 0.694)."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper reports precision, recall, F1-score, AUC-ROC, early warning latency, computational overhead, human win-rate, judge-human correlation, and false positive rate across multiple experimental conditions."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Human evaluation is central: 1,200 response pairs annotated by 3 human raters (Fleiss' κ ≥ 0.78) for LLM gaming, 2,156 episodes annotated by 3 RL experts (Cohen's κ = 0.847). Human win-rates are used to evaluate mitigation effectiveness. Human validation of 100 transformation samples achieves 87% equivalence agreement."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Section 4.1 states 'We use strict train-validation-test splits, holding out entire task-model-judge combinations for testing.' Appendix C.3.1 describes 'environment-stratified splits to prevent data leakage, with entire environment-algorithm combinations held out for testing.'"
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Table 1 breaks results by task, model size, and judge type. Table 11 breaks RL results by hacking category. Table 15 presents all 32 experimental conditions individually. Table 6 shows per-task, per-judge detection performance."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Appendix N provides detailed error analysis of 100 classification errors (50 false positives, 50 false negatives) with three patterns for each. Boundary cases (18.3% of detector-labeled episodes) are analyzed in Appendix N.3. Adaptive evasion results in Appendix I show detection degradation under adversarial conditions."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The ablation study identifies components whose removal hurts performance. White-box evasion reduces precision from 74.2% to 65.9% (Table 24). Cross-environment transfer degrades by 10-15 F1 points (Table 32). The paper reports that data filtering has minimal impact on win-rate (+7.9% vs +8.3%), suggesting it's the least useful intervention."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "All abstract claims are supported: 78.4% precision/81.7% recall for RL (Table 11), 74.2% precision/78.6% recall for LLM (Tables 1, 7), 8.3 point win-rate improvement (Table 9), 54.6% hacking reduction (Table 31), median 3-checkpoint early warning (Table 7), and overhead percentages match reported values."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Causal claims are supported by controlled designs: ablation studies (Tables 17, 29, 30) use single-variable manipulation, the 2×2×2 factorial design (Table 12) varies individual factors, and control experiments (Table 30) confirm gains are from detector-guided interventions rather than extra compute."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 8 (Limitations) explicitly states the study is 'limited to 4 tasks and 2 model sizes' and notes the threat model 'assumes fixed judges during training.' The paper acknowledges custom testbed effect sizes would be 'more modest' in real-world deployments (Appendix C.5)."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper addresses length as a confound (Table 23 shows ρ=0.18 correlation, Figure 5 AUROC 0.734 vs 0.534 for length-only), controls for extra compute as an alternative explanation for mitigation gains (Table 30 control experiments), and discusses audit quality as a source of false positives (Appendix H)."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper's core contribution is explicitly about the proxy-true distinction. Section 3 formally defines proxy gaming as 'systematic increases in E[J(y)] that do not correspond to increases in E[H(y)]' and the entire EST framework is designed to distinguish proxy-exploitative gains from content-driven improvements."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper uses 'GPT-4' as a judge without specifying a version or snapshot date (e.g., gpt-4-0613). 'Llama-3-8B' and 'Llama-3-70B' are named but without specific checkpoint or release identifiers. No model version dates are provided anywhere."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No actual prompt text is provided for any component: judge evaluation prompts, format perturbation generation prompts, content perturbation generation prompts, or model fine-tuning prompts are all described conceptually but the actual text used is never shown."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "Detection framework hyperparameters are reported (Table 19: τspec=0.3, Δρ=0.5, γ=0.1, W=50). However, LLM fine-tuning hyperparameters (learning rate, batch size, epochs) and judge API settings (temperature, top-p, max tokens) are not reported, even though they significantly affect output quality."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No agentic scaffolding is used. The detection framework operates as a monitoring pipeline applied to standard fine-tuning runs, not as an agentic system with tools, memory, or retry logic."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Appendix C.3.1 describes the validation protocol with expert-validated (n=2,156) and detector-consensus (n=13,091) sets. Data splits are documented (environment-stratified, task-model-judge holdouts). Annotation procedures are described (3 raters, consensus ≥2/3, inter-rater agreement metrics). Transformation validity audits are detailed in Appendix H."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 8 is titled 'Limitations' and provides substantive discussion covering task/model scope limitations, threat model assumptions (fixed judges), and real-world deployment challenges (concept drift, multi-stakeholder conflicts, adversarial adaptation)."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 8 discusses specific threats: 'limited to 4 tasks and 2 model sizes,' 'fixed judges during training; adaptive evaluators that update during fine-tuning may require modified detection approaches,' and 'concept drift, multi-stakeholder objective conflicts, and adversarial adaptation over longer training horizons.' The paper also acknowledges custom testbeds inflate effect sizes."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 8 explicitly bounds scope: 'larger-scale validation across more diverse domains and model architectures would strengthen generalizability claims.' The paper states mitigation results 'represent controlled experimental conditions' and real-world deployment faces additional challenges. Custom testbed effect sizes are noted as unrepresentative of production settings."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "Despite claiming to release benchmarks ('We release benchmarks for both domains'), no download URL, repository link, or data archive is provided. Raw episode data, annotations, and model outputs are not available for verification."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 4.3 describes RL data collection: 15 environments, 5 algorithms, 10 random seeds, 15,247 total episodes, 2,156 expert-annotated. Section 5.1 describes LLM data: 4 tasks, 2 model sizes, 2 training methods, 2 judges, 10 checkpoints, 1,200 annotated instances. Annotation procedures include consensus rules and agreement metrics."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "RL annotators are described only as 'three independent RL experts with extensive experience in reward hacking identification.' LLM annotators are described as '3 human annotators' with no further characterization. How annotators were recruited, their qualifications beyond vague descriptions, and potential selection bias are not discussed."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Appendix C.3.1 describes the pipeline from raw episodes (15,247) to expert-annotated subset (2,156) to train/test splits (environment-stratified). The LLM pipeline from model fine-tuning through checkpoint sampling to human annotation is described in Section 5.1. Transformation validity audits are documented with pass rates (Table 20)."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding sources are disclosed. The acknowledgments section only mentions 'the use of AI tools for improving writing flow and fixing grammatical errors.' No grants, sponsors, or funding agencies are listed."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly listed: Iowa State University departments (Computer Science, and Civil, Construction & Environmental Engineering). The authors are not affiliated with any company whose products are being evaluated."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "Cannot determine funder independence because no funding source is disclosed. The absence of funding disclosure means independence cannot be verified."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is present in the paper. The ethics statement addresses human annotation but does not include a competing interests declaration."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "The paper evaluates a detection framework for proxy gaming, not pre-trained model capability on benchmarks. The core evaluation is whether EST detects gaming behavior, not whether Llama-3 or GPT-4 achieve specific benchmark scores."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "As with training_cutoff_stated, the paper tests a defense/detection framework rather than model knowledge. The relevant overlap concern is addressed through environment-stratified splits for detection performance evaluation."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The paper tests a detection framework, not model benchmark performance. While the models are fine-tuned on tasks like TL;DR, the evaluation target is detection precision/recall, not model task accuracy."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No pre-registration is mentioned. The study uses human annotators who provided informed consent and received compensation, but the experimental protocol was not pre-registered."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "The ethics statement says 'Human annotation was conducted under informed consent with compensation consistent with local standards' but does not mention IRB approval or ethics board review."
    256       },
    257       "demographics_reported": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "RL annotators described only as 'three independent RL experts with extensive experience in reward hacking identification.' LLM annotators described as '3 human annotators' with no demographics (experience level, background, etc.) reported."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "No formal inclusion/exclusion criteria for annotator selection are stated. The RL experts are characterized only by having 'extensive experience' but no specific criteria for what constitutes sufficient expertise."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "Not an experimental study with human participants assigned to treatment/control conditions. The humans serve as annotators providing ground truth labels, not as experimental subjects."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "Blinding is not applicable to this annotation task. The annotators label gaming instances; there is no treatment condition to blind."
    276       },
    277       "attrition_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No mention of annotator attrition or dropout. The paper does not state whether all recruited annotators completed their annotation tasks or if any were replaced."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Table 14 reports per-checkpoint costs: '∼0.8s per output' for EST computation, '∼0.1s per checkpoint' for correlation tracking, '2.1% of training time' total overhead. Appendix N reports '≈0.2 GPU-hours baseline cost' per 1,000 episodes."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "Per-technique costs are given (Table 31: 0.13-0.58 GPU-hours for mitigation techniques on NVIDIA A6000 48GB), but the total computational budget for all experiments (RL + LLM fine-tuning + detection + human annotation) is not stated."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "RL experiments use '10 random seeds per configuration' (Appendix C.1). Table 11 reports results across 5-fold cross-validation with 95% CI. Table 18 reports mean ± std across random seeds."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": true,
    304         "justification": "The paper explicitly states '10 random seeds per configuration' for RL and '5-fold cross-validation' for detection performance evaluation. '10 training checkpoints' are sampled per LLM condition."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Appendix G.1 states 'comprehensive sensitivity analysis on key detection parameters' across '243 tested parameter combinations in the full grid search.' Table 19 reports sensitivity ranges for 5 hyperparameters."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "Appendix C.3.1 states 'Detection thresholds were determined through 5-fold cross-validation on this subset only, with hyperparameters selected to maximize F1-score on validation folds.' Figure 3 shows threshold calibration analysis justifying the optimal threshold τ=0.6."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The paper performs many comparisons across 32 experimental conditions, 15 RL environments, 6 hacking categories, and multiple baseline methods without mentioning any multiple comparison correction (Bonferroni, Holm, Benjamini-Hochberg, etc.)."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors implement all baselines themselves (LSTM-Autoencoder, One-Class SVM, Isolation Forest, etc.) and compare against their own framework. Author-evaluation bias is not acknowledged, and no independent evaluation is provided."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "While overhead percentages are reported (2.1% LLM, 4.2% RL), performance is not shown as a function of compute budget. Baselines and the proposed method may require different compute budgets, but this is not analyzed or equalized."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper uses human annotations as ground truth for gaming detection without questioning whether human annotators reliably identify all forms of gaming or whether inter-rater agreement (κ = 0.847 RL, κ ≥ 0.78 LLM) constitutes sufficient construct validity. The gap between 'human-annotated gaming' and 'actual proxy gaming' is not discussed."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "No agentic scaffolding is used. The detection framework operates as a monitoring pipeline on standard fine-tuning runs without scaffolding confounds."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether pre-trained Llama-3 or GPT-4 models may have encountered training task data (TL;DR summaries, GSM8K problems) during pre-training, which could affect the gaming patterns observed during fine-tuning."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No explicit discussion of whether detection features (format sensitivity, correlation metrics) might inadvertently contain information that leaks label assignments or whether the feature design introduces bias toward certain detection patterns."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "Appendix C.3.1 states 'Training and testing employed environment-stratified splits to prevent data leakage, with entire environment-algorithm combinations held out for testing.' Section 4.1 uses 'strict train-validation-test splits, holding out entire task-model-judge combinations for testing.'"
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "Environment-stratified and task-model-judge combination splits serve as structural prevention methods against train-test leakage. These ensure entire configurations are held out, preventing information flow between training and test sets for the detection framework."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "EST achieves 78.4% precision and 81.7% recall for reward hacking detection across 15 RL environments and 5 algorithms",
    368       "evidence": "Table 11 reports overall precision 0.784±0.027, recall 0.817±0.023 on 2,156 expert-annotated episodes with environment-stratified 5-fold cross-validation. Cohen's κ = 0.847 for inter-rater reliability.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "EST achieves 74.2% precision and 78.6% recall for evaluator gaming detection across 4 LLM tasks, 2 model scales, 2 training methods, and 2 judges",
    373       "evidence": "Tables 1 and 7 report overall precision 0.742±0.04, recall 0.786±0.04 on 1,200 human-annotated instances (300 per task). Performance broken down across all 32 conditions in Table 15.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Closed-loop mitigation improves human win-rate by 8.3 points for LLM alignment",
    378       "evidence": "Table 9 shows baseline win-rate 52.1% improving to 60.4% with format penalty, with correlation maintained at ρ=0.82 vs 0.61 baseline. Table 30 ablation confirms gains from detector-guided interventions vs control experiments with equivalent extra compute (+2.1% only).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Closed-loop mitigation reduces hacking by 54.6% in RL",
    383       "evidence": "Table 31 reports combined approach achieves 54.6% hacking reduction with 9.1% performance impact and 6.7% computational overhead. Individual techniques range from 28.1% to 47.8% reduction.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "EST provides early warning signals with median lead time of 3 checkpoints before human-noticeable quality decline",
    388       "evidence": "Table 7 reports early warning of 3.0±0.4 checkpoints. Figure 2 illustrates detector triggers preceding human quality decline. Section 5.2 defines early warning as checkpoints between detector trigger and W(t) < 0.50.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Proxy-true correlation tracking transfers directly between RL and LLM domains",
    393       "evidence": "Table 8 classifies correlation tracking as 'Direct' transfer. Table 18 reports AUC 0.821 (RL) and 0.798 (LLM) for proxy optimization detection without modification. Section 6.1 defines transfer as achieving ≥90% of in-domain performance.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "EST maintains 65.9% precision under white-box evasion attacks, recovering to 78.1% with defense-in-depth",
    398       "evidence": "Table 24 shows white-box evasion reduces precision from 74.2% to 65.9%. Table 25 shows full ensemble (EST + correlation + reasoning + hardened judge) achieves 78.1% precision and 74.3% recall under ensemble-aware evasion with 6.8% FP rate.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "Objective alignment is the strongest predictor of hacking frequency, reducing it by 31.2% (Cohen's d = 2.08)",
    403       "evidence": "Table 12 presents factorial design results with objective alignment showing effect size -0.312, p < 0.001, Cohen's d = 2.08. However, the paper acknowledges 'these unusually large effect sizes reflect our custom environments designed to maximize experimental contrast.'",
    404       "supported": "weak"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "Custom testbeds favor detection framework",
    410       "detail": "Three of 15 RL environments are custom testbeds ('Proxy Trap', 'Tampering Test', 'Alignment Challenge') designed by the authors. These are described as 'designed to maximize diversity of hacking behaviors,' which means they were built to exhibit the patterns the detection framework is specifically engineered to catch. This inflates detection performance metrics."
    411     },
    412     {
    413       "flag": "Circular ground truth for large-scale RL analysis",
    414       "detail": "Only 2,156 of 15,247 episodes have expert annotations. The remaining 13,091 episodes use 'detector consensus' (≥3/6 detectors agreeing) as pseudo ground truth. The paper acknowledges this circularity but still uses findings from this set to inform analysis of hacking patterns and prevalence."
    415     },
    416     {
    417       "flag": "No code or data despite release claims",
    418       "detail": "The abstract and conclusion claim 'We release benchmarks for both domains' but no repository URL, download link, or archive is provided. This prevents independent verification of all reported results."
    419     },
    420     {
    421       "flag": "Unusually large effect sizes acknowledged",
    422       "detail": "The factorial design experiment reports Cohen's d values of 1.24–2.08, which are extremely large. The paper acknowledges these are artifacts of custom environments with maximized experimental contrast, and that 'real world deployments would likely show smaller effects.' The reported Cohen's d of 0.8–1.2 in domain-specific benchmarks is more realistic but still on the high side."
    423     },
    424     {
    425       "flag": "Self-implemented baselines",
    426       "detail": "All baselines (LSTM-Autoencoder, One-Class SVM, Isolation Forest, BC Divergence, hardened judges, etc.) are implemented by the authors. No reference implementations or third-party code is used, and author-evaluation bias is not acknowledged. Lucic et al. (2018) showed authors' implementations of baselines systematically underperform."
    427     },
    428     {
    429       "flag": "Missing LLM training and inference hyperparameters",
    430       "detail": "While detection framework hyperparameters are well-documented, critical LLM experimental details are missing: DPO/RLHF learning rates, batch sizes, training epochs, and judge API temperature/sampling settings. These significantly affect both training dynamics and the gaming behaviors observed."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Defining and characterizing reward hacking",
    436       "authors": ["Joar Skalse", "Nikolaus HR Howe", "Dmitrii Krasheninnikov", "David Krueger"],
    437       "year": 2022,
    438       "relevance": "Formal definition of reward hacking and 'unhackable' proxy evaluators, foundational to this paper's detection framework for AI alignment."
    439     },
    440     {
    441       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    442       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    443       "year": 2023,
    444       "relevance": "Establishes LLM-as-judge evaluation methodology and identifies evaluator weaknesses that this paper's detection framework targets."
    445     },
    446     {
    447       "title": "Specification gaming: the flip side of AI ingenuity",
    448       "authors": ["Victoria Krakovna", "Laurent Orseau", "Richard Ngo", "Miljan Martic", "Shane Legg"],
    449       "year": 2020,
    450       "relevance": "Catalogs specification gaming behaviors in RL systems, providing the empirical foundation for the reward hacking taxonomy used in this paper."
    451     },
    452     {
    453       "title": "Training language models to follow instructions with human feedback",
    454       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    455       "year": 2022,
    456       "relevance": "Foundational RLHF paper for LLM alignment; the training methodology this paper's detection framework monitors for evaluator gaming."
    457     },
    458     {
    459       "title": "Direct preference optimization: Your language model is secretly a reward model",
    460       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Stefano Ermon", "Christopher D Manning", "Chelsea Finn"],
    461       "year": 2023,
    462       "relevance": "DPO training method used as one of two fine-tuning approaches in this paper's LLM alignment experiments."
    463     },
    464     {
    465       "title": "Scaling laws for reward model overoptimization",
    466       "authors": ["Leo Gao", "John Schulman", "Jacob Hilton"],
    467       "year": 2023,
    468       "relevance": "Studies reward model overoptimization at scale, directly relevant to understanding proxy gaming dynamics in LLM alignment."
    469     },
    470     {
    471       "title": "Concrete problems in AI safety",
    472       "authors": ["Dario Amodei", "Chris Olah", "Jacob Steinhardt", "Paul Christiano", "John Schulman", "Dan Mané"],
    473       "year": 2016,
    474       "arxiv_id": "1606.06565",
    475       "relevance": "Foundational AI safety paper identifying reward hacking and proxy optimization as core safety challenges."
    476     },
    477     {
    478       "title": "The effects of reward misspecification: Mapping and mitigating misaligned models",
    479       "authors": ["Alexander Pan", "Kush Bhatia", "Jacob Steinhardt"],
    480       "year": 2022,
    481       "relevance": "Explores reward misspecification effects and mitigation strategies in RL, directly related to proxy gaming detection."
    482     },
    483     {
    484       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    485       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    486       "year": 2022,
    487       "relevance": "CoT prompting methodology relevant to the reasoning validity detector component that checks whether models produce correct answers through invalid reasoning chains."
    488     },
    489     {
    490       "title": "Deep reinforcement learning from human preferences",
    491       "authors": ["Paul F Christiano", "Jan Leike", "Tom Brown"],
    492       "year": 2017,
    493       "relevance": "Foundational RLHF methodology paper; preference learning approach that creates the evaluator gaming vulnerability this paper detects."
    494     },
    495     {
    496       "title": "Alignment of language agents",
    497       "authors": ["Zachary Kenton", "Tom Everitt", "Laura Weidinger", "Iason Gabriel", "Vladimir Mikulik", "Geoffrey Irving"],
    498       "year": 2021,
    499       "arxiv_id": "2103.14659",
    500       "relevance": "Analyzes alignment challenges for language agents, including proxy optimization and evaluator gaming."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 2,
    506       "justification": "The detection framework is relevant for practitioners doing RLHF/DPO fine-tuning, but no code is released, limiting immediate usability."
    507     },
    508     "surprise_contrarian": {
    509       "score": 1,
    510       "justification": "Proxy gaming and reward hacking are well-known problems; the unified RL-LLM framework is novel but the core insight is not surprising."
    511     },
    512     "fear_safety": {
    513       "score": 2,
    514       "justification": "Demonstrates that models systematically game evaluators during alignment training, raising concerns about RLHF/DPO pipeline reliability."
    515     },
    516     "drama_conflict": {
    517       "score": 1,
    518       "justification": "Shows LLM-as-judge pipelines are gameable (e.g., format exploitation scores 8.2 vs 5.1 human rating), but this is already a recognized concern."
    519     },
    520     "demo_ability": {
    521       "score": 0,
    522       "justification": "No code, demo, or tool is released despite claims of benchmark release."
    523     },
    524     "brand_recognition": {
    525       "score": 0,
    526       "justification": "From Iowa State University researchers without established profiles in AI alignment or safety research."
    527     }
    528   }
    529 }

Impressum · Datenschutz