ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25039B)


      1 {
      2   "paper": {
      3     "title": "Alignment and Safety in Large Language Models: Safety Mechanisms, Training Paradigms, and Emerging Challenges",
      4     "authors": [
      5       "Haoran Lu",
      6       "Luyang Fang",
      7       "Ruidong Zhang",
      8       "Xinliang Li",
      9       "Jiazhang Cai",
     10       "Huimin Cheng",
     11       "Lin Tang",
     12       "Ziyu Liu",
     13       "Zeliang Sun",
     14       "Tao Wang",
     15       "Yingchuan Zhang",
     16       "Arif Hassan Zidan",
     17       "Jinwen Xu",
     18       "Jincheng Yu",
     19       "Meizhi Yu",
     20       "Hanqi Jiang",
     21       "Xilin Gong",
     22       "Weidi Luo",
     23       "Bolun Sun",
     24       "Yongkai Chen",
     25       "Terry Ma",
     26       "Shushan Wu",
     27       "Yifan Zhou",
     28       "Junhao Chen",
     29       "Haotian Xiang",
     30       "Jing Zhang",
     31       "Afrar Jahin",
     32       "Wei Ruan",
     33       "Ke Deng",
     34       "Yi Pan",
     35       "Peilong Wang",
     36       "Jiahui Li",
     37       "Zhengliang Liu",
     38       "Lu Zhang",
     39       "Xiaobo Li",
     40       "Lin Zhao",
     41       "Wei Liu",
     42       "Dajiang Zhu",
     43       "Xin Xing",
     44       "Fei Dou",
     45       "Wei Zhang",
     46       "Chao Huang",
     47       "Rongjie Liu",
     48       "Mengrui Zhang",
     49       "Yiwen Liu",
     50       "Xiaoxiao Sun",
     51       "Qin Lu",
     52       "Zhen Xiang",
     53       "Wenxuan Zhong",
     54       "Tianming Liu",
     55       "Ping Ma"
     56     ],
     57     "year": 2025,
     58     "venue": "arXiv",
     59     "arxiv_id": "2507.19672"
     60   },
     61   "checklist": {
     62     "artifacts": {
     63       "code_released": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No source code, repository URL, or analysis scripts are provided or referenced anywhere in the paper. As a survey, the authors could have released code for any systematic analysis or figure generation, but none is provided."
     67       },
     68       "data_released": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No dataset, corpus of reviewed papers, or extracted data tables are released. The paper does not provide a downloadable list of surveyed works or any structured dataset used in the analysis."
     72       },
     73       "environment_specified": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "This is a survey paper that does not run any experiments or computational analyses requiring an environment specification."
     77       },
     78       "reproduction_instructions": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No instructions are provided for reproducing the survey methodology, such as search queries, database sources, or inclusion/exclusion workflow steps. The paper does not describe a reproducible systematic review protocol."
     82       }
     83     },
     84     "statistical_methodology": {
     85       "confidence_intervals_or_error_bars": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a narrative survey paper that does not conduct original experiments or statistical analyses. No original quantitative results are reported."
     89       },
     90       "significance_tests": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No original experiments or comparative analyses are performed. The paper reviews others' results but does not perform its own statistical testing."
     94       },
     95       "effect_sizes_reported": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No original experiments are conducted. The paper describes findings from reviewed works but does not generate its own effect size measurements."
     99       },
    100       "sample_size_justified": {
    101         "applies": false,
    102         "answer": false,
    103         "justification": "No original experimental data is collected. The paper is a narrative literature review without sample-based analysis."
    104       },
    105       "variance_reported": {
    106         "applies": false,
    107         "answer": false,
    108         "justification": "No original experiments are run. The paper reviews prior work but does not report its own variance or dispersion metrics."
    109       }
    110     },
    111     "evaluation_design": {
    112       "baselines_included": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper does not compare its survey methodology or coverage against prior surveys on the same topic. While it mentions prior work on alignment, it does not systematically benchmark itself against other survey papers."
    116       },
    117       "baselines_contemporary": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "Not applicable since no baselines or comparative evaluations are performed."
    121       },
    122       "ablation_study": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "This is a survey paper. Ablation studies are structurally inapplicable."
    126       },
    127       "multiple_metrics": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "This is a survey paper that does not conduct experiments or evaluate systems against metrics."
    131       },
    132       "human_evaluation": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "This is a survey paper. Human evaluation of system outputs is not applicable."
    136       },
    137       "held_out_test_set": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "No experiments are conducted. There is no test set of any kind."
    141       },
    142       "per_category_breakdown": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper organizes its review across multiple categories: SFT techniques, RLHF methods, DPO/reward-free methods, efficient fine-tuning approaches, brain-inspired methods, and alignment uncertainty. Sections 4-12 each provide structured breakdowns of methods within their category."
    146       },
    147       "failure_cases_discussed": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The paper discusses failure modes extensively: reward hacking (Sec 5.4), jailbreak attacks and adversarial vulnerabilities (Sec 3.1), fake alignment/deceptive alignment (Sec 3.1.4), SFT limitations (Sec 4.4), and limitations of each technique reviewed."
    151       },
    152       "negative_results_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The paper reports negative findings including: SFT alone is insufficient for robust alignment (Sec 4.4), RLHF pipelines are vulnerable to reward hacking (Sec 5.4), current alignment is brittle to adversarial attacks (Sec 3.1), and brain-inspired approaches have not scaled to production (Sec 9.3)."
    156       }
    157     },
    158     "claims_and_evidence": {
    159       "abstract_claims_supported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The abstract claims the survey provides 'a comprehensive overview of practical alignment techniques, training protocols, and empirical findings' and discusses trade-offs between objectives. The paper's 13 sections deliver on these claims with detailed coverage of SFT, RLHF, DPO, evaluation benchmarks, efficient fine-tuning, brain-inspired methods, uncertainty quantification, and alignment strategies across leading AI labs."
    163       },
    164       "causal_claims_justified": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "The paper is a survey that synthesizes existing work. It does not make original causal claims about its own experiments. The causal claims it attributes to reviewed papers (e.g., 'InstructGPT showed 1.3B parameter model was preferred over 175B GPT-3') are properly attributed to their original sources."
    168       },
    169       "generalization_bounded": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The title claims broad coverage of 'Alignment and Safety in Large Language Models' but the scope boundaries are never explicitly stated. The paper does not define inclusion/exclusion criteria for what counts as 'alignment' research, does not specify a time window for reviewed literature, and does not acknowledge what topics are excluded from coverage. The survey reads as comprehensive but does not bound its claims to a specific subset of the field."
    173       },
    174       "alternative_explanations_discussed": {
    175         "applies": false,
    176         "answer": false,
    177         "justification": "As a survey paper with no original empirical results, alternative explanations for observed results are not applicable. The paper does discuss competing interpretations of alignment techniques (e.g., SFT vs RLHF equivalence conditions in Sec 6.2)."
    178       }
    179     },
    180     "setup_transparency": {
    181       "model_versions_specified": {
    182         "applies": false,
    183         "answer": false,
    184         "justification": "This is a survey paper that does not run any models. Model versions discussed are those from the reviewed literature."
    185       },
    186       "prompts_provided": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "This is a survey paper that does not use prompting for any experiments."
    190       },
    191       "hyperparameters_reported": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No experiments are conducted. The paper reviews hyperparameter choices from the literature but does not report its own."
    195       },
    196       "scaffolding_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No agentic scaffolding is used. This is a narrative survey."
    200       },
    201       "data_preprocessing_documented": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The paper provides no description of a systematic literature search or review methodology. There are no search queries, database sources (e.g., Scopus, Web of Science, Semantic Scholar), date ranges, inclusion/exclusion criteria, or PRISMA-style flow diagrams documenting how papers were selected for review. The filtering pipeline for selecting which works to include is completely undocumented."
    205       }
    206     },
    207     "limitations_and_scope": {
    208       "limitations_section_present": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "There is no dedicated limitations section for the survey itself. The paper discusses limitations of individual alignment techniques (e.g., Sec 4.4 'Limitations of SFT Alone', Sec 5.4 'Challenges of RLHF', Sec 9.3 'Challenges and Limitations of Brain-Inspired LLM Alignments') but never addresses the limitations of the survey's own methodology or coverage."
    212       },
    213       "threats_to_validity_specific": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No threats to the validity of the survey itself are discussed. There is no acknowledgment of potential selection bias in the reviewed literature, recency bias, or the limitations of the narrative (non-systematic) review approach."
    217       },
    218       "scope_boundaries_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper does not explicitly state what is in and out of scope. There is no definition of what counts as 'alignment' research for this survey, no time window is specified, and no explicit exclusion criteria are given. The conclusion (Sec 13) mentions open challenges but does not articulate what the survey did NOT cover."
    222       }
    223     },
    224     "data_integrity": {
    225       "raw_data_available": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No raw data is available. Neither the list of reviewed papers, nor the selection criteria, nor any extracted metadata is provided in downloadable form."
    229       },
    230       "data_collection_described": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper does not describe how reviewed papers were collected. There are no search queries, database names, or collection procedures. The survey appears to be a narrative review rather than a systematic one, but this is never stated."
    234       },
    235       "recruitment_methods_described": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants are involved. This is a literature survey."
    239       },
    240       "data_pipeline_documented": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No data pipeline is documented. The paper does not describe how papers were identified, screened, selected, or categorized. There are no filtering stages, no counts of papers at each stage, and no systematic review protocol."
    244       }
    245     },
    246     "conflicts_of_interest": {
    247       "funding_disclosed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors."
    251       },
    252       "affiliations_disclosed": {
    253         "applies": true,
    254         "answer": true,
    255         "justification": "Author affiliations are listed in detail on page 1: University of Georgia (Statistics and Computing), Boston University, Augusta University, University of Arizona, Northwestern University, Harvard University, Carnegie Mellon University, UT Arlington, Mayo Clinic, Indiana University, NJIT, Virginia Tech, and Stanford University. All are academic or academic medical institutions."
    256       },
    257       "funder_independent_of_outcome": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure does not equate to absence of funding."
    261       },
    262       "financial_interests_declared": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "No competing interests or financial interests statement is provided. There is no declaration regarding patents, equity, consulting relationships, or other potential conflicts."
    266       }
    267     },
    268     "contamination": {
    269       "training_cutoff_stated": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "This is a survey paper. No pre-trained models are evaluated on benchmarks."
    273       },
    274       "train_test_overlap_discussed": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "This is a survey paper. No models are evaluated on benchmarks."
    278       },
    279       "benchmark_contamination_addressed": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "This is a survey paper. No models are evaluated on benchmarks."
    283       }
    284     },
    285     "human_studies": {
    286       "pre_registered": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants are involved. This is a literature survey."
    290       },
    291       "irb_or_ethics_approval": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants are involved."
    295       },
    296       "demographics_reported": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants are involved."
    300       },
    301       "inclusion_exclusion_criteria": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "No human participants are involved."
    305       },
    306       "randomization_described": {
    307         "applies": false,
    308         "answer": false,
    309         "justification": "No human participants are involved."
    310       },
    311       "blinding_described": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "No human participants are involved."
    315       },
    316       "attrition_reported": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "No human participants are involved."
    320       }
    321     },
    322     "cost_and_practicality": {
    323       "inference_cost_reported": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "This is a survey paper. No method is proposed or evaluated, so inference cost of the paper's own method is not applicable."
    327       },
    328       "compute_budget_stated": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "This is a survey paper that does not perform any computation or experiments."
    332       }
    333     }
    334   },
    335   "claims": [
    336     {
    337       "claim": "SFT enables basic instruction-following but preference-based methods (RLHF) offer more flexibility for aligning with nuanced human intent",
    338       "evidence": "Section 4.4 documents SFT limitations (dependence on data coverage, lack of preference awareness, vulnerability to label noise). Section 5 covers RLHF's ability to optimize for comparative human preferences. InstructGPT (1.3B parameters with RLHF) was preferred over 175B GPT-3 (Sec 5, citing Ouyang et al. 2022).",
    339       "supported": "strong"
    340     },
    341     {
    342       "claim": "DPO provides an elegant mathematical insight eliminating the complex RL pipeline while maintaining or improving alignment quality compared to RLHF",
    343       "evidence": "Section 7.1 describes DPO as formulating alignment as a supervised contrastive learning problem that 'significantly reduces computational complexity while maintaining and in some cases improving the alignment quality compared to RLHF' (citing Rafailov et al. 2023). Meta's LLaMA 3 replaced PPO with DPO for computational efficiency (Sec 12.5).",
    344       "supported": "moderate"
    345     },
    346     {
    347       "claim": "Current alignment methods exhibit brittleness to adversarial attacks and may not scale to superhuman capabilities",
    348       "evidence": "Section 3.1 documents extensive jailbreak attack categories (logic-based, low-resource, community-driven, fake alignment) with specific examples. Section 13.2 identifies scalable oversight as an open challenge. However, these are synthesized observations from the literature rather than original empirical findings.",
    349       "supported": "strong"
    350     },
    351     {
    352       "claim": "GRPO eliminates the critic network and uses group-based reward normalization, improving training efficiency while maintaining alignment performance",
    353       "evidence": "Section 7.4 provides the formal GRPO objective (Eq. 14-15, citing Shao et al. 2024b). Table 5 lists multiple GRPO variants with speedup metrics (e.g., CPPO achieves 8.3x speedup on GSM8K). Section 5.3.2 compares GRPO with PPO and other actor-only methods.",
    354       "supported": "moderate"
    355     },
    356     {
    357       "claim": "Constitutional AI reduces dependence on human annotation while improving harmlessness",
    358       "evidence": "Section 7.2.1 describes CAI (Bai et al. 2022a) in detail, including the self-critique SFT and RLAIF phases. Section 12.3 explains Anthropic's implementation. The claim that CAI 'yields assistants that are rated safer and less evasive than those produced by standard RLHF' is cited to the original paper but not independently verified in this survey.",
    359       "supported": "moderate"
    360     }
    361   ],
    362   "methodology_tags": [
    363     "meta-analysis",
    364     "qualitative"
    365   ],
    366   "key_findings": "This 80-page survey provides a comprehensive narrative review of LLM alignment research, covering supervised fine-tuning, RLHF (including reward modeling and policy optimization), DPO and reward-free methods, Constitutional AI, brain-inspired approaches, alignment uncertainty quantification, and alignment strategies at major AI labs (OpenAI, DeepSeek, Anthropic, Google DeepMind, Meta, xAI). The paper documents the progression from SFT-only to preference-based methods, identifies key tradeoffs between helpfulness, harmlessness, and honesty, and catalogs safety evaluation benchmarks across general, domain-specific, and code safety categories. A notable contribution is the formalization of alignment uncertainty quantification (AUQ) as a distinct research direction, including a decision-theoretic framework for the alignment gap.",
    367   "red_flags": [
    368     {
    369       "flag": "No systematic review methodology",
    370       "detail": "The paper presents itself as a 'comprehensive survey' but follows no documented systematic review protocol. There are no search queries, database sources, PRISMA flow diagrams, or inclusion/exclusion criteria. The selection of papers to review appears entirely ad hoc, making it impossible to assess completeness or bias in coverage."
    371     },
    372     {
    373       "flag": "No quality assessment of reviewed works",
    374       "detail": "The survey summarizes alignment techniques without any structured quality assessment of the primary studies. Papers with strong experimental validation are treated equivalently to those with weaker evidence. Without quality filtering, the survey may blur the distinction between well-supported and weakly supported findings in its sources."
    375     },
    376     {
    377       "flag": "Extremely large author list without clear contribution",
    378       "detail": "The paper lists 51 authors from 17 institutions. No author contribution statement is provided, making it unclear what each person contributed. This raises questions about authorship standards."
    379     },
    380     {
    381       "flag": "Missing funding and competing interests disclosures",
    382       "detail": "No funding sources are acknowledged and no competing interests statement is provided, despite the paper covering commercial AI products from companies that might have relationships with academic institutions."
    383     },
    384     {
    385       "flag": "Scope boundaries undefined",
    386       "detail": "The survey covers an extremely broad range of topics (SFT, RLHF, DPO, brain-inspired methods, uncertainty quantification, regulatory policy, AGI safety) without defining clear scope boundaries or acknowledging what is excluded. The breadth may come at the cost of depth in any single area."
    387     },
    388     {
    389       "flag": "Some citations appear fabricated or inaccurate",
    390       "detail": "Several citations use generic author names (e.g., 'Smith and Doe, 2024', 'Doe and Bloggs, 2023', 'Johnson and Kumar, 2024', 'Lee and Kim, 2024a/b') that appear to be placeholder or fabricated references rather than real publications. This casts doubt on the accuracy of the reference list."
    391     }
    392   ],
    393   "cited_papers": [
    394     {
    395       "title": "Training language models to follow instructions with human feedback",
    396       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    397       "year": 2022,
    398       "relevance": "Foundational RLHF work on InstructGPT that established the three-stage alignment pipeline used widely in practice."
    399     },
    400     {
    401       "title": "Constitutional AI: Harmlessness from AI Feedback",
    402       "authors": ["Yuntao Bai"],
    403       "year": 2022,
    404       "relevance": "Introduced Constitutional AI and RLAIF, a core alignment method reducing dependence on human annotation for safety."
    405     },
    406     {
    407       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    408       "authors": ["Rafael Rafailov"],
    409       "year": 2023,
    410       "relevance": "Introduced DPO, eliminating the need for separate reward models and RL, a major simplification of alignment pipelines."
    411     },
    412     {
    413       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    414       "authors": ["DeepSeek-AI"],
    415       "year": 2025,
    416       "relevance": "Demonstrates pure RL training for reasoning and the benefits of combining SFT with RL for alignment."
    417     },
    418     {
    419       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    420       "authors": ["Zhihong Shao"],
    421       "year": 2024,
    422       "relevance": "Introduced GRPO, an actor-only RL method that eliminates the critic network for more efficient alignment."
    423     },
    424     {
    425       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    426       "authors": ["Evan Hubinger"],
    427       "year": 2024,
    428       "relevance": "Demonstrated that deceptive alignment can persist through safety training, a critical alignment failure mode."
    429     },
    430     {
    431       "title": "Proximal Policy Optimization Algorithms",
    432       "authors": ["John Schulman"],
    433       "year": 2017,
    434       "relevance": "PPO is the standard policy optimization algorithm used in most RLHF pipelines for LLM alignment."
    435     },
    436     {
    437       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    438       "authors": ["Hugo Touvron"],
    439       "year": 2023,
    440       "relevance": "Open-source LLM demonstrating multi-stage alignment (SFT + RLHF) with separate helpfulness and safety reward models."
    441     },
    442     {
    443       "title": "Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned",
    444       "authors": ["Deep Ganguli"],
    445       "year": 2022,
    446       "relevance": "Established systematic red-teaming methodology for identifying LLM safety vulnerabilities."
    447     },
    448     {
    449       "title": "Concrete problems in AI safety",
    450       "authors": ["Dario Amodei", "Chris Olah", "Jacob Steinhardt"],
    451       "year": 2016,
    452       "relevance": "Foundational paper identifying persistent technical challenges in AI safety including reward hacking and distributional robustness."
    453     },
    454     {
    455       "title": "Alignment faking in large language models",
    456       "authors": ["Ryan Greenblatt"],
    457       "year": 2024,
    458       "relevance": "Demonstrates that LLMs can strategically fake alignment during evaluation, a critical concern for safety assessment."
    459     },
    460     {
    461       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    462       "authors": ["Edward Hu"],
    463       "year": 2022,
    464       "relevance": "Introduced parameter-efficient fine-tuning that enables scalable deployment of alignment techniques."
    465     }
    466   ]
    467 }

Impressum · Datenschutz