ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19552B)


      1 {
      2   "paper": {
      3     "title": "AI Alignment: A Comprehensive Survey",
      4     "authors": ["Jiaming Ji", "Tianyi Qiu", "Boyuan Chen", "Borong Zhang", "Hantao Lou", "Kaile Wang", "Yawen Duan", "Zhonghao He", "Lukas Vierling", "Donghai Hong", "Jiayi Zhou", "Zhaowei Zhang", "Fanzhi Zeng", "Juntao Dai", "Xuehai Pan", "Kwan Yee Ng", "Aidan O'Gara", "Hua Xu", "Brian Tse", "Jie Fu", "Stephen McAleer", "Yaodong Yang", "Yizhou Wang", "Song-Chun Zhu", "Yike Guo", "Wen Gao"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2310.19852"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code repository or analysis scripts are mentioned. The paper references a website (www.alignmentsurvey.com) for tutorials and resources, but no code or data artifacts."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset of surveyed papers, extracted data tables, or structured corpus is released. The survey website provides curated resources but not the underlying survey data."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a survey paper with no computational experiments requiring environment specification."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No instructions for reproducing the survey methodology (search queries, databases used, inclusion/exclusion process) are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "Survey paper with no quantitative experiments or meta-analysis."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Survey paper with no statistical comparisons."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Survey paper with no quantitative experiments."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Survey paper with no experiments requiring sample size justification."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Survey paper with no experimental runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares its alignment cycle framework against the inner/outer alignment decomposition (Section 1.2.1, 'Comparison with Inner/Outer Decomposition'), and compares RICE principles against alternatives like Asimov's Laws, FATE, and 3H standards (Section 1.2.2)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The comparisons reference contemporary frameworks like the inner/outer alignment decomposition (Hubinger et al., 2019), FATE principles, and the 3H standard from Anthropic (Askell et al., 2021)."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "Survey paper with no system components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Survey paper with no quantitative evaluation metrics."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Survey paper; human evaluation of its outputs is not applicable."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Survey paper with no test sets."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The survey provides detailed breakdowns by category: Table 1 maps research directions to RICE objectives, and the paper is organized into detailed subsections (Learning from Feedback, Distribution Shift, Assurance, Governance) with further sub-categorization."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper extensively discusses failure modes including reward hacking, goal misgeneralization, deceptive alignment, and limitations of current alignment techniques throughout Sections 1.1.2, 6.1, and 6.2."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that general alignment techniques (SFT, RLHF, Adversarial Training) 'fail to eradicate certain deceptive and backdoor behaviors' (Section 6.1), and discusses multiple limitations and open problems."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims to provide a comprehensive overview decomposed into forward and backward alignment with RICE principles. The paper delivers on this structure throughout its 63+ pages."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper is a survey/review and does not make its own causal claims requiring experimental validation. It reports causal claims from cited papers."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly acknowledges its scope boundaries: 'we recognize that boundaries of alignment are often vague and subject to debate' (Section 6), discusses what falls outside alignment (Section 1.2.3), and notes the survey 'needs to be a long-term endeavor that is continually reviewed and updated.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "As a survey paper presenting no empirical results of its own, alternative explanations for observed results are not applicable."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "Survey paper that does not run any models."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "Survey paper that does not use prompting."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Survey paper with no experiments requiring hyperparameters."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "Survey paper with no agentic scaffolding."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not describe its survey methodology: no search queries, databases, inclusion/exclusion criteria, or filtering pipeline are documented. It is unclear how papers were selected for inclusion."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations section. The conclusion (Section 6) discusses future directions and open problems but does not explicitly discuss limitations of the survey itself."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed. There is no acknowledgment of potential selection bias in surveyed papers, coverage gaps, or author perspective bias."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 1.2.3 ('Discussion on the Boundaries of Alignment') explicitly discusses what falls within and outside the scope, including malicious use and collective action problems. The paper also acknowledges 'boundaries of alignment are often vague and subject to debate.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (e.g., list of all surveyed papers, search results, screening decisions) is made available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The paper does not describe how papers were identified, selected, or screened for inclusion in the survey."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; this is a literature survey."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No documentation of the pipeline from initial literature search to final paper selection and categorization."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information is disclosed anywhere in the paper. There is an acknowledgments section thanking reviewers but no funding sources are mentioned."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Peking University, University of Cambridge, University of Oxford, Carnegie Mellon University, Hong Kong University of Science and Technology, University of Southern California."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Funding is not disclosed, so independence cannot be assessed. The absence of a funding disclosure statement is itself a concern."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this survey paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this survey paper."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this survey paper."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this survey paper."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this survey paper."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this survey paper."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this survey paper."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Survey paper with no method whose cost would need reporting."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Survey paper with no computational experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "AI alignment can be decomposed into forward alignment (alignment training) and backward alignment (alignment refinement), forming an iterative alignment cycle.",
    286       "evidence": "Section 1.2.1 presents the alignment cycle framework with Figure 2, decomposing it into Learning from Feedback, Learning under Distribution Shift, Assurance, and Governance.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The RICE principles (Robustness, Interpretability, Controllability, Ethicality) capture the key objectives of AI alignment.",
    291       "evidence": "Section 1.2.2 presents RICE with Figure 3 and Table 1 mapping research directions to principles. Comparison with alternatives (Asimov's Laws, FATE, 3H) is provided.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "General alignment techniques (SFT, RLHF, Adversarial Training) fail to eradicate certain deceptive and backdoor behaviors.",
    296       "evidence": "Section 6.1 cites Hubinger et al. (2024) for this claim, noting that these failures are 'possibly leading to a misleading sense of safety.'",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Current RLHF approaches suffer from limitations including inconsistent human feedback, reward model inadequacies, and scalability challenges for superhuman systems.",
    301       "evidence": "Section 2 and subsections discuss these limitations extensively, citing Casper et al. (2023b), Bowman et al. (2022), and others.",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["meta-analysis", "theoretical"],
    306   "key_findings": "This survey proposes the RICE framework (Robustness, Interpretability, Controllability, Ethicality) as organizing principles for AI alignment objectives. It decomposes alignment into forward alignment (learning from feedback, learning under distribution shift) and backward alignment (assurance, governance), forming an iterative alignment cycle. The paper covers RLHF, scalable oversight, adversarial and cooperative training, interpretability, safety evaluation, and AI governance across 800+ references. Key open challenges identified include learning human intent from rich modalities, building trustworthy assurance tools against deceptive alignment, and value elicitation for diverse populations.",
    307   "red_flags": [
    308     {
    309       "flag": "No survey methodology documented",
    310       "detail": "The paper does not describe how papers were identified, selected, or filtered for inclusion. No search queries, databases, inclusion/exclusion criteria, or PRISMA-style flow diagram are provided. This makes it impossible to assess whether the coverage is comprehensive or biased toward certain communities."
    311     },
    312     {
    313       "flag": "No structured quality assessment of surveyed papers",
    314       "detail": "The survey summarizes and categorizes existing work but does not assess the methodological quality of the papers it reviews. This risks masking differences in evidence quality across its sources — weak and strong evidence are presented with equal weight."
    315     },
    316     {
    317       "flag": "No funding or conflict of interest disclosure",
    318       "detail": "The paper has no funding statement and no competing interests declaration despite being authored by researchers at multiple major institutions."
    319     }
    320   ],
    321   "cited_papers": [
    322     {
    323       "title": "Concrete problems in AI safety",
    324       "authors": ["Dario Amodei", "Chris Olah", "Jacob Steinhardt", "Paul Christiano", "John Schulman", "Dan Mané"],
    325       "year": 2016,
    326       "arxiv_id": "1606.06565",
    327       "relevance": "Foundational paper defining concrete safety problems in AI systems, directly relevant to AI safety methodology."
    328     },
    329     {
    330       "title": "Deep reinforcement learning from human preferences",
    331       "authors": ["Paul F Christiano", "Jan Leike", "Tom Brown", "Miljan Martic", "Shane Legg", "Dario Amodei"],
    332       "year": 2017,
    333       "relevance": "Seminal RLHF paper establishing the preference-based learning paradigm central to current alignment approaches."
    334     },
    335     {
    336       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    337       "authors": ["Yuntao Bai"],
    338       "year": 2022,
    339       "arxiv_id": "2204.05862",
    340       "relevance": "Key Anthropic paper on RLHF for safety, introducing the helpful-harmless training paradigm."
    341     },
    342     {
    343       "title": "Open problems and fundamental limitations of reinforcement learning from human feedback",
    344       "authors": ["Stephen Casper"],
    345       "year": 2023,
    346       "relevance": "Comprehensive analysis of RLHF limitations relevant to evaluating methodology quality in alignment research."
    347     },
    348     {
    349       "title": "Foundational challenges in assuring alignment and safety of large language models",
    350       "authors": ["Usman Anwar"],
    351       "year": 2024,
    352       "arxiv_id": "2404.09932",
    353       "relevance": "Identifies foundational challenges in LLM alignment assurance, directly relevant to survey scope."
    354     },
    355     {
    356       "title": "Frontier AI regulation: Managing emerging risks to public safety",
    357       "authors": ["Markus Anderljung"],
    358       "year": 2023,
    359       "arxiv_id": "2307.03718",
    360       "relevance": "Proposes regulatory frameworks for frontier AI development, relevant to AI governance methodology."
    361     },
    362     {
    363       "title": "Model evaluation for extreme risks",
    364       "authors": ["Toby Shevlane"],
    365       "year": 2023,
    366       "relevance": "Framework for evaluating dangerous capabilities in AI systems, directly relevant to safety evaluation methodology."
    367     },
    368     {
    369       "title": "Towards monosemanticity: Decomposing language models with dictionary learning",
    370       "authors": ["Trenton Bricken", "Adly Templeton"],
    371       "year": 2023,
    372       "relevance": "Key mechanistic interpretability work using sparse autoencoders, relevant to AI safety tooling."
    373     },
    374     {
    375       "title": "Weak-to-strong generalization: Eliciting strong capabilities with weak supervision",
    376       "authors": ["Collin Burns"],
    377       "year": 2023,
    378       "arxiv_id": "2312.09390",
    379       "relevance": "Explores scalable oversight via weak-to-strong generalization, a key challenge in alignment."
    380     },
    381     {
    382       "title": "Constitutional AI: Harmlessness from AI feedback",
    383       "authors": ["Yuntao Bai"],
    384       "year": 2022,
    385       "arxiv_id": "2212.08073",
    386       "relevance": "Introduces RLAIF/Constitutional AI approach to alignment without human feedback, relevant to scalable oversight."
    387     },
    388     {
    389       "title": "Goal misgeneralization in deep reinforcement learning",
    390       "authors": ["Lauro Langosco Di Langosco"],
    391       "year": 2022,
    392       "relevance": "Defines and demonstrates goal misgeneralization as a key alignment failure mode."
    393     },
    394     {
    395       "title": "Towards guaranteed safe AI: A framework for ensuring robust and reliable AI systems",
    396       "authors": ["David Dalrymple"],
    397       "year": 2024,
    398       "arxiv_id": "2405.06624",
    399       "relevance": "Proposes formal framework for guaranteed safe AI, relevant to safety assurance methodology."
    400     }
    401   ]
    402 }

Impressum · Datenschutz