ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (33009B)


      1 {
      2   "paper": {
      3     "title": "Inverse Reinforcement Learning with Dynamic Reward Scaling for LLM Alignment",
      4     "authors": [
      5       "Ruoxi Cheng",
      6       "Haoxuan Ma",
      7       "Weixin Wang",
      8       "Ranjie Duan",
      9       "Jiexi Liu",
     10       "Xiaoshuang Jia",
     11       "Simeng Qin",
     12       "Xiaochun Cao",
     13       "Yang Liu",
     14       "Xiaojun Jia"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv",
     18     "arxiv_id": "2503.18991"
     19   },
     20   "scan_version": 3,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "DR-IRL combines inverse reinforcement learning with dynamic reward scaling (based on data hardness and model responsiveness) to align LLMs, achieving the highest safety scores across StrongReject, XsTest, WildChat, and stereotype benchmarks on both Llama-3.1-8B and Qwen-2-7B while maintaining helpfulness. Ablation shows both data-level and model-level hardness coefficients contribute meaningfully, with their multiplicative combination outperforming additive alternatives. Per-category shadow reward models add only 20% compute overhead but yield consistent safety gains over a single reward model. The difficulty-weighting mechanism transfers to PPO and DPO, improving safety-utility tradeoffs across alignment families.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section C.4 references 'our code repository' but no URL, GitHub link, or archive is provided anywhere in the paper. The code is not actually released."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Evaluation benchmarks (StrongReject, XsTest, WildChat, Do-Not-Answer) are publicly available, but the paper's key contribution—the balanced CoD safety dataset of 7,000 instructions with refusal demonstrations—is not released. No download link or archive is provided."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Section C.3 mentions approximate versions: 'PyTorch (v2.0+)', 'Hugging Face Transformers (v4.x)', 'DeepSpeed (v0.10+)', 'CUDA 12.x'. These are version ranges, not exact specifications. No requirements.txt, Dockerfile, or conda environment file is provided."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided. The paper references a code repository (Section C.4) but provides no URL, README, or runnable commands."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All results in Tables 1, 2, 3, 6, 7, and 8 are single point estimates with no confidence intervals, error bars, or ± notation."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper claims DR-IRL 'significantly outperforms all state-of-the-art alignment methods' and 'outperforms all baselines' but no statistical significance tests (p-values, t-tests, etc.) are reported for any comparison."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Tables 1, 2, 3, and 8 report absolute performance numbers for all methods, allowing direct comparison of effect magnitudes. Table 2 explicitly reports improvements: '+1.79 pp' on StrongReject, '+2.73 pp' on WildChat, '+2.62 pp' on Stereotype."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The dataset uses 1,000 harmful instructions per category (7,000 total) with no justification for this number. No power analysis is provided. The shadow reward model evaluation (Table 7) also uses 1,000 prompts per category without justification."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or spread measure is reported across experimental runs. All results appear to be from single runs. Section C.3 mentions setting 'random seeds for model initialization and data shuffling to ensure reproducibility' but does not report results across multiple seeds."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 1 compares against 9 baselines: Base, CoT, SFT, DPO, SACPO, Self-Rewarding, STAIR, GRPO, and IRL (an ablated variant of their method)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include recent methods: SACPO (2024), Self-Rewarding (2024), STAIR (2025), GRPO (2024). The established baselines (DPO, SFT, PPO) are appropriate references."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Figure 3 ablates hardness coefficients (full DR-IRL vs w/o αD vs w/o αM vs No Hardness). Table 2 ablates per-category vs single reward model. Table 3 compares multiplicative vs additive combination rules. Table 5 ablates CoD vs CoT."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Eight evaluation metrics are used: StrongReject (goodness score), XsTest (refusal rate), WildChat (refusal rate), Stereotype (refusal rate), SimpleQA, AdvGLUE, GSM8k, and HHH, spanning both safety and helpfulness."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "All evaluation is automated. No human evaluation of the aligned model's outputs is performed. For a safety alignment paper where refusal quality matters (not just refusal rate), human evaluation would be valuable."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
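    103         "justification": "Evaluation uses established external benchmarks (StrongReject, XsTest, WildChat, GSM8k, etc.) that are separate from the training data. The training uses Do-Not-Answer and Safety-Prompts for CoD generation, while evaluation uses different splits/benchmarks. One caveat: the stereotype evaluation split is also drawn from Do-Not-Answer, and the paper does not discuss whether it overlaps with the instructions sampled for training."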
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Figure 2 shows per-category refusal rates across all 7 harmful categories (Insult, Unfairness, Crimes, Physical Harm, Mental Health, Privacy, Ethics) for both Llama and Qwen. Table 7 breaks down reward model accuracy per category."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No failure analysis is provided. The paper does not discuss where DR-IRL fails, what types of harmful prompts still succeed, or qualitative examples of failures. All presented results are positive."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Ablation studies (Figure 3) show degraded performance when components are removed—e.g., 'No Hardness lowers the StrongReject score by roughly 4 percentage points.' Table 3 shows the additive variant underperforms the multiplicative one."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims 'DR-IRL outperforms all baseline methods in safety alignment while maintaining usefulness.' Table 1 supports this: DR-IRL achieves highest StrongReject scores (0.9361 and 0.8798) and leading or competitive results on helpfulness benchmarks across both models."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal claims are made through ablation studies (Figure 3): 'removing either coefficient degrades harmlessness' and 'suppressing αD mainly hurts refusal precision, whereas dropping αM causes larger fluctuations in general-capability metrics.' The ablation design uses controlled single-variable manipulation, which is adequate for these claims."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title claims 'LLM Alignment' broadly and the abstract says 'across various benchmarks and LLMs.' In reality, experiments cover only 2 model families (Llama-3.1 and Qwen-2) at 3B/7B/8B scale, with no testing on larger models, closed-source models, or non-English safety scenarios. The contribution claims 'significantly outperforms all state-of-the-art alignment methods' without bounding this to the tested settings."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as whether improvements come from the CoD data format rather than IRL, whether the balanced dataset alone explains gains, or whether results would differ with other base model families."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper measures refusal rates and benchmark scores but frames these as 'safety alignment' and 'usefulness.' Refusal rate is only a proxy for safety—a refusal can itself be harmful, and a compliant answer can be safe. The gap between refusal rate and actual safety is not discussed. Similarly, GSM8k/AdvGLUE performance is framed as 'helpfulness' without acknowledging these are narrow proxies."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 1 headers specify 'Llama-3.1-8B-Instruct' and 'Qwen-2-7B-Instruct', which are specific open-source model checkpoints with defined architectures and weights. Section D also tests 'Llama-3.1-3B' and 'Qwen-2-3B'."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section B.2 provides the full base prompt: 'Think step by step, but only keep a minimum draft for each thinking step, with 5 words at most. Return the answer at the end of the response after a separator. Why did you refuse to answer {goal}? Provide your reasoning process for refusal.' Multiple concrete fill values (harmful goals) and their complete responses are shown in Sections B.2.1 and B.2.2."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Table 4 comprehensively reports: batch size (32), effective batch size (512), learning rates (2e-5/3e-5), warmup (500), weight decay (0.1), β (0.1/0.2), max sequence length (2048), precision (fp16), optimizer (AdamW), and scheduler. Section C.1 also details reward model training hyperparameters (lr 3×10⁻⁵, β=1.0, mini-batch 16)."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. DR-IRL is a training/fine-tuning method, not an agentic system."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section B.1 describes the CoD dataset construction: sampling harmful instructions from Do-Not-Answer and Safety-Prompts, generating CoD refusal responses, covering 7 categories with M=1000 per category. Section 4.1 describes training data for baselines. Section C.1 details tokenization and processing steps."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "There is no dedicated limitations section. The conclusion (Section 5) is three sentences summarizing the contribution with no mention of limitations. No 'Threats to Validity' or equivalent section exists."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No threats to validity are discussed anywhere in the paper. There is no analysis of what could undermine the conclusions."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No scope boundaries are stated. The paper does not specify what settings, models, or scenarios the results do NOT apply to. The claims are presented without qualification."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Neither the constructed CoD training dataset nor the raw experimental outputs are made available. Only aggregated results in tables and figures are shown."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 4.1 and B.1 describe data collection: 1,000 harmful instructions sampled per category from Do-Not-Answer and Safety-Prompts datasets, CoD demonstrations generated by the LLM. Baseline training uses UltraFeedback, PKU-SafeRLHF, and JailbreakV-28k."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data sources are standard public benchmarks and LLM-generated responses."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "The sampling procedure from Do-Not-Answer and Safety-Prompts to get exactly 1,000 per category is not described in detail—how were instructions selected when a category had more than 1,000? Were any filtered? The text splitting step for sub-sentences (Section 3.2) mentions prompting LLaMA-3 but does not show the splitting prompt or filtering criteria."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding sources are disclosed. There is no acknowledgments section mentioning grants or corporate sponsors. Multiple authors are from Alibaba Group, suggesting corporate funding, but this is not stated."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed on the first page: Alibaba Group, Southeast University, Duke University, Renmin University, Northeast University, Sun Yat-sen University, and Nanyang Technological University."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Multiple authors are from Alibaba Group, which has commercial interest in LLM alignment technology. Alibaba develops its own LLMs (Qwen series) and one of the two evaluated model families is Qwen. No statement about independence of the funder from the outcome is made."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is present. Authors from Alibaba Group may have financial interests related to LLM alignment technology, but this is not addressed."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The training data cutoff dates for Llama-3.1-8B and Qwen-2-7B base models are not stated. This is needed to assess whether evaluation benchmark data could appear in pre-training."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether evaluation benchmark data (StrongReject, XsTest, WildChat, GSM8k, etc.) may overlap with the base models' pre-training data."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "Several benchmarks used (GSM8k from 2021, AdvGLUE from 2021) were released well before the models' training cutoffs, so they could have been included in the pre-training data. XsTest (2023) and StrongReject (2024) could also have been seen during pre-training. No contamination analysis is provided."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference cost, latency, or per-example cost is reported for the aligned models. Table 5 reports token counts and latency for CoD vs CoT data generation, but not for the final aligned model's inference."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Table 2 reports '≈120 GPU h' for 7 reward models and '≈100 GPU h' for a single reward model on LLaMA. Section 4.1 states '4 NVIDIA A100 GPUs with 80GB memory.' Section C.3 describes 'up to eight NVIDIA A100 GPUs (80 GB each).'"
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No multi-seed results are reported. Section C.3 mentions 'We set random seeds for model initialization and data shuffling to ensure reproducibility' but does not test sensitivity across different seeds."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is never stated. All results appear to be from single runs with no indication of repetition."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "For the additive combination baseline (Section D), the paper describes sweeping wD ∈{0.0, 0.1, ..., 1.0}. However, the overall hyperparameter search budget for the main DR-IRL method (including β, learning rates, etc.) is not reported."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Table 4 reports final hyperparameter values but does not explain how they were selected. The text says β was 'tuned on a validation set' (Section C.2) but does not describe the selection procedure for other hyperparameters."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper compares DR-IRL against 9 baselines across 8 metrics (72+ comparisons) with no statistical tests at all, let alone multiple comparison corrections."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors implement and evaluate their own method against their own implementations of baselines. The paper states baselines follow Zhang et al. [59]'s corpus, but no acknowledgment of self-comparison bias is provided."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Table 2 explicitly compares '7RW (≈120 GPU h)' vs 'RW (≈100 GPU h)' showing the cost-performance tradeoff of per-category reward models. This directly addresses whether the compute overhead justifies the performance gain."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether refusal rate truly measures safety, or whether StrongReject/XsTest scores correspond to real-world safety. The paper uses these benchmarks without questioning their construct validity."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No agentic scaffolding is involved. DR-IRL is a training method evaluated directly on the fine-tuned models."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of temporal leakage. Benchmarks like GSM8k (2021) and AdvGLUE (2021) were released well before the models' training cutoffs and could therefore appear in the pre-training data, but this is not addressed."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the evaluation setup leaks information. For example, the CoD training data is generated using the same base model that is being fine-tuned, which could introduce circular information, but this is not addressed."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of independence between training and test data. The harmful instructions in the CoD training set come from Do-Not-Answer and Safety-Prompts, and Do-Not-Answer's stereotype split is also used for evaluation, creating potential overlap."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention method is applied (no canary strings, membership inference, decontamination, or overlap analysis)."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "DR-IRL achieves highest StrongReject scores among all methods (0.9361 on Llama-3.1-8B, 0.8798 on Qwen-2-7B)",
    375       "evidence": "Table 1 shows DR-IRL's StrongReject scores exceed all 9 baselines on both models. Next-best is STAIR at 0.8798 (Llama) and 0.8486 (Qwen).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "DR-IRL maintains or improves helpfulness while enhancing safety alignment",
    380       "evidence": "Table 1 shows DR-IRL achieves highest AdvGLUE (70.71%, 75.15%), competitive GSM8k (88.10%, 89.70%), and highest HHH (86.16%, 90.71%) on both models. SimpleQA is competitive (6.64%, 4.47%).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Both data-level (αD) and model-level (αM) hardness coefficients are necessary for best performance",
    385       "evidence": "Figure 3 shows removing either coefficient degrades harmlessness on Llama-3.1-8B. No Hardness lowers StrongReject by ~4pp. Section 4.3 notes αD primarily enforces safety while αM stabilizes usefulness.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Per-category reward models outperform a single reward model with only 20% compute overhead",
    390       "evidence": "Table 2 shows 7 RMs (≈120 GPU h) improve StrongReject by +1.79pp, WildChat by +2.73pp, and Stereotype by +2.62pp over a single RM (≈100 GPU h).",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "DR-IRL is more robust against jailbreak attacks (GCG, AutoDAN, DRA) than baselines",
    395       "evidence": "Table 6 shows DR-IRL achieves 59.00%/96.98%/64.92% refusal rates vs Base (55.75%/56.53%/26.15%) and STAIR (58.75%/91.28%/41.97%) on Llama-3.1-8B.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "The shadow reward model achieves 91.1% pairwise accuracy, outperforming OpenAI RM and Anthropic RM in every harm category",
    400       "evidence": "Table 7 shows improvements of +9.3 to +10.7pp over OpenAI RM across 7 categories on Llama-3.1-8B.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "Difficulty-weighted updates improve PPO, DPO, and GRPO uniformly",
    405       "evidence": "Table 8 shows DPO-S, PPO-S, and DR-IRL (GRPO with weighting) consistently outperform their unweighted counterparts on StrongReject, XsTest, and WildChat for both Llama and Qwen.",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "red_flags": [
    410     {
    411       "flag": "No error bars or variance across runs",
    412       "detail": "All results are single-run point estimates for a training method where stochastic elements (random seeds, sampling, SGD) likely produce non-trivial variance. Without multi-seed results, it is impossible to assess whether reported differences are within normal variation."
    413     },
    414     {
    415       "flag": "No statistical significance tests despite strong comparative claims",
    416       "detail": "The paper claims DR-IRL 'significantly outperforms all state-of-the-art alignment methods' (contribution 3) and 'outperforms all baseline methods' (abstract), but no statistical tests support any comparison. Differences could be within noise."
    417     },
    418     {
    419       "flag": "No limitations section",
    420       "detail": "The paper contains no limitations, threats to validity, or scope boundaries discussion whatsoever. The 3-sentence conclusion presents only positive framing."
    421     },
    422     {
    423       "flag": "Potential train-test overlap with Do-Not-Answer",
    424       "detail": "The CoD training dataset samples harmful instructions from Do-Not-Answer (Section 4.1), and the stereotype-related evaluation split also comes from Do-Not-Answer (Section 4.1). The paper does not discuss whether these overlap."
    425     },
    426     {
    427       "flag": "Alibaba conflict of interest undisclosed",
    428       "detail": "Three authors are from Alibaba Group, which develops the Qwen model family. One of the two evaluated models is Qwen-2-7B. No conflict of interest statement or funding disclosure addresses this."
    429     },
    430     {
    431       "flag": "Heavy self-citation",
    432       "detail": "At least 7 of 64 references (refs 2, 5, 6, 7, 10, 45, 61) share authors with the current paper (Ruoxi Cheng, Ranjie Duan, etc.), and several appear tangential to the contribution."
    433     },
    434     {
    435       "flag": "Code promised but not provided",
    436       "detail": "Section C.4 references 'our code repository' implying code exists, but no URL is provided. This makes the 'code available' claim unverifiable."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "Direct preference optimization: Your language model is secretly a reward model",
    442       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D Manning", "Stefano Ermon", "Chelsea Finn"],
    443       "year": 2023,
    444       "relevance": "Key baseline method for reward-free LLM alignment via preference learning as classification."
    445     },
    446     {
    447       "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models",
    448       "authors": ["Zhihong Shao", "Peiyi Wang", "Qihao Zhu"],
    449       "year": 2024,
    450       "arxiv_id": "2402.03300",
    451       "relevance": "Introduces GRPO (Group Relative Policy Optimization), the base RL algorithm that DR-IRL extends."
    452     },
    453     {
    454       "title": "Training language models to follow instructions with human feedback",
    455       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    456       "year": 2022,
    457       "relevance": "Foundational RLHF paper establishing the reward-model + PPO pipeline for LLM alignment."
    458     },
    459     {
    460       "title": "Proximal policy optimization algorithms",
    461       "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal", "Alec Radford", "Oleg Klimov"],
    462       "year": 2017,
    463       "arxiv_id": "1707.06347",
    464       "relevance": "PPO is the standard RL algorithm for LLM alignment that DR-IRL's GRPO-based approach improves upon."
    465     },
    466     {
    467       "title": "A StrongReject for empty jailbreaks",
    468       "authors": ["Alexandra Souly", "Qingyuan Lu", "Dillon Bowen"],
    469       "year": 2024,
    470       "arxiv_id": "2402.10260",
    471       "relevance": "Primary safety evaluation benchmark used in DR-IRL experiments, testing robustness against jailbreak attacks."
    472     },
    473     {
    474       "title": "XsTest: A test suite for identifying exaggerated safety behaviours in large language models",
    475       "authors": ["Paul Röttger", "Hannah Rose Kirk", "Bertie Vidgen"],
    476       "year": 2023,
    477       "arxiv_id": "2308.01263",
    478       "relevance": "Benchmark for detecting over-refusal in aligned LLMs, used to evaluate DR-IRL's calibration between safety and helpfulness."
    479     },
    480     {
    481       "title": "STAIR: Improving safety alignment with introspective reasoning",
    482       "authors": ["Yichi Zhang", "Siyuan Zhang", "Yao Huang"],
    483       "year": 2025,
    484       "arxiv_id": "2502.02384",
    485       "relevance": "Strong baseline method for safety alignment using process rewards, directly compared against DR-IRL."
    486     },
    487     {
    488       "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset",
    489       "authors": ["Jiaming Ji", "Mickel Liu", "Josef Dai"],
    490       "year": 2023,
    491       "relevance": "PKU-SafeRLHF dataset used in baseline training, providing safety-oriented human preference data."
    492     },
    493     {
    494       "title": "Universal and transferable adversarial attacks on aligned language models",
    495       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    496       "year": 2023,
    497       "arxiv_id": "2307.15043",
    498       "relevance": "GCG adversarial attack used to evaluate DR-IRL's jailbreak robustness."
    499     },
    500     {
    501       "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models",
    502       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"],
    503       "year": 2023,
    504       "arxiv_id": "2310.04451",
    505       "relevance": "Jailbreak attack method used in DR-IRL robustness evaluation."
    506     },
    507     {
    508       "title": "Getting more juice out of the SFT data: Reward learning from human demonstration improves SFT for LLM alignment",
    509       "authors": ["Jiaxiang Li", "Siliang Zeng", "Hoi-To Wai", "Chenliang Li", "Alfredo Garcia", "Mingyi Hong"],
    510       "year": 2024,
    511       "relevance": "Core prior work showing IRL-based reward learning from demonstrations outperforms preference-based approaches, directly motivating DR-IRL's design."
    512     },
    513     {
    514       "title": "Safety-tuned llamas: Lessons from improving the safety of large language models that follow instructions",
    515       "authors": ["Federico Bianchi", "Mirac Suzgun", "Giuseppe Attanasio"],
    516       "year": 2023,
    517       "arxiv_id": "2309.07875",
    518       "relevance": "Demonstrates that LLMs can generate effective safety datasets for training, motivating DR-IRL's self-generated CoD dataset."
    519     },
    520     {
    521       "title": "Meta-rewarding language models: Self-improving alignment with LLM-as-a-meta-judge",
    522       "authors": ["Tianhao Wu", "Weizhe Yuan", "Olga Golovneva"],
    523       "year": 2024,
    524       "arxiv_id": "2407.19594",
    525       "relevance": "Self-rewarding baseline method that generates its own preference data for alignment."
    526     },
    527     {
    528       "title": "Is DPO superior to PPO for LLM alignment? A comprehensive study",
    529       "authors": ["Shusheng Xu", "Wei Fu", "Jiaxuan Gao"],
    530       "year": 2024,
    531       "arxiv_id": "2404.10719",
    532       "relevance": "Comprehensive comparison of reward-free vs reward-based alignment showing reward-based pipelines remain robust, motivating DR-IRL's approach."
    533     }
    534   ],
    535   "engagement_factors": {
    536     "practical_relevance": {
    537       "score": 2,
    538       "justification": "The method is applicable to practitioners doing LLM safety alignment but requires multi-GPU training infrastructure and per-category reward model training, limiting immediate adoption."
    539     },
    540     "surprise_contrarian": {
    541       "score": 1,
    542       "justification": "Dynamic reward scaling improving on static rewards is an incremental advance confirming existing intuitions rather than challenging conventional wisdom."
    543     },
    544     "fear_safety": {
    545       "score": 1,
    546       "justification": "Focuses on improving LLM safety defenses, which is relevant to AI safety but does not demonstrate novel attacks or reveal alarming vulnerabilities."
    547     },
    548     "drama_conflict": {
    549       "score": 0,
    550       "justification": "No controversial claims, no challenges to existing benchmarks or other labs' results."
    551     },
    552     "demo_ability": {
    553       "score": 0,
    554       "justification": "No code, demo, or tool is released, even though the paper references a code repository."
    555     },
    556     "brand_recognition": {
    557       "score": 1,
    558       "justification": "Alibaba Group is well-known in tech but is not a top-tier AI safety research lab; the work is not about a flagship product."
    559     }
    560   }
    561 }

Impressum · Datenschutz