ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (22738B)


      1 {
      2   "paper": {
      3     "title": "Societal Alignment Frameworks Can Improve LLM Alignment",
      4     "authors": [
      5       "Karolina Stańczak",
      6       "Nicholas Meade",
      7       "Mehar Bhatia",
      8       "Hattie Zhou",
      9       "Konstantin Böttinger",
     10       "Jeremy Barnes",
     11       "Jason Stanley",
     12       "Jessica Montgomery",
     13       "Richard Zemel",
     14       "Nicolas Papernot",
     15       "Nicolas Chapados",
     16       "Denis Therien",
     17       "Timothy P Lillicrap",
     18       "Ana Marasović",
     19       "Sylvie Delacroix",
     20       "Gillian K Hadfield",
     21       "Siva Reddy"
     22     ],
     23     "year": 2025,
     24     "venue": "arXiv",
     25     "arxiv_id": "2503.00069",
     26     "doi": "10.48550/arXiv.2503.00069"
     27   },
     28   "scan_version": 3,
     29   "active_modules": [],
     30   "methodology_tags": ["theoretical"],
     31   "key_findings": "The paper frames LLM alignment as an incomplete contracting problem within a principal-agent framework, arguing that reward misspecification is analogous to incomplete contracts in economics and law. It proposes that societal alignment mechanisms — social norms and values, economic fairness and pluralistic aggregation, and contractual/legal oversight — can guide LLM alignment improvements. The paper distinguishes between unwanted epistemic uncertainty and uncertainty that is essential for ethical deployment, and advocates for participatory interface designs that democratize the process of determining LLM objectives rather than perfecting specification methods.",
     32   "checklist": {
     33     "artifacts": {
     34       "code_released": {
     35         "applies": false,
     36         "answer": false,
     37         "justification": "This is a purely theoretical position paper with no computational experiments, systems, or analysis code to release."
     38       },
     39       "data_released": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "No data was collected or analyzed; this is a conceptual/argumentative paper."
     43       },
     44       "environment_specified": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No computational experiments were conducted."
     48       },
     49       "reproduction_instructions": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No experiments to reproduce; the paper is a theoretical argument."
     53       }
     54     },
     55     "statistical_methodology": {
     56       "confidence_intervals_or_error_bars": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No quantitative results are presented; the paper is purely theoretical."
     60       },
     61       "significance_tests": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "No statistical comparisons are made; the paper presents no empirical data."
     65       },
     66       "effect_sizes_reported": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "No empirical effects are measured."
     70       },
     71       "sample_size_justified": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No samples or experiments exist in this theoretical paper."
     75       },
     76       "variance_reported": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No experimental runs or quantitative results to report variance for."
     80       }
     81     },
     82     "evaluation_design": {
     83       "baselines_included": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No evaluation or experiments are conducted; this is a position paper."
     87       },
     88       "baselines_contemporary": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No evaluation to compare against baselines."
     92       },
     93       "ablation_study": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No system or method is built that could be ablated."
     97       },
     98       "multiple_metrics": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "No metrics are computed; the paper is theoretical."
    102       },
    103       "human_evaluation": {
    104         "applies": false,
    105         "answer": false,
    106         "justification": "No system outputs exist to be evaluated."
    107       },
    108       "held_out_test_set": {
    109         "applies": false,
    110         "answer": false,
    111         "justification": "No datasets or evaluation splits are used."
    112       },
    113       "per_category_breakdown": {
    114         "applies": false,
    115         "answer": false,
    116         "justification": "No quantitative results to break down."
    117       },
    118       "failure_cases_discussed": {
    119         "applies": false,
    120         "answer": false,
    121         "justification": "No system or method is evaluated that could produce failure cases."
    122       },
    123       "negative_results_reported": {
    124         "applies": false,
    125         "answer": false,
    126         "justification": "No experiments are conducted that could yield negative results."
    127       }
    128     },
    129     "claims_and_evidence": {
    130       "abstract_claims_supported": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The abstract claims the paper 'argues' that societal alignment frameworks can improve LLM alignment and 'investigates' how uncertainty manifests. These hedged claims are supported by the conceptual arguments in Sections 4-6. The abstract appropriately uses 'argue' and 'investigate' rather than claiming demonstrated empirical improvement."
    134       },
    135       "causal_claims_justified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title asserts 'Societal Alignment Frameworks Can Improve LLM Alignment' — a causal improvement claim. Section 4 uses language like 'instilling norms and values' and 'enabling LLMs to dynamically identify, adapt to, and mitigate emerging biases.' These are causal claims supported only by analogy to societal mechanisms, with no empirical evidence, experiments, or case studies demonstrating actual improvement."
    139       },
    140       "generalization_bounded": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper makes sweeping claims about 'LLM alignment' generally without bounding to specific models, tasks, or deployment contexts. The title claims broad applicability, but the arguments are built on analogies between societal institutions and LLM training that may not hold across all alignment settings."
    144       },
    145       "alternative_explanations_discussed": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "The paper presents no empirical results for which alternative explanations would be relevant. It is a pure position/theoretical paper."
    149       },
    150       "proxy_outcome_distinction": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No measurements or proxies are used; this is a theoretical paper."
    154       }
    155     },
    156     "setup_transparency": {
    157       "model_versions_specified": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No models are used in experiments; this is a theoretical paper."
    161       },
    162       "prompts_provided": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No prompting is used; the paper is purely argumentative."
    166       },
    167       "hyperparameters_reported": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No experiments with hyperparameters are conducted."
    171       },
    172       "scaffolding_described": {
    173         "applies": false,
    174         "answer": false,
    175         "justification": "No agentic scaffolding or systems are built or evaluated."
    176       },
    177       "data_preprocessing_documented": {
    178         "applies": false,
    179         "answer": false,
    180         "justification": "No data is collected or preprocessed."
    181       }
    182     },
    183     "limitations_and_scope": {
    184       "limitations_section_present": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "There is no dedicated limitations section. The Impact Statement (after Section 7) briefly mentions the need for participatory frameworks, but does not substantively discuss the limitations of the paper's own arguments or framework."
    188       },
    189       "threats_to_validity_specific": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No specific threats to the validity of the paper's arguments are discussed. Section 6 acknowledges that the contract metaphor 'oversimplifies complex systems,' but this is framed as motivating a new perspective rather than as a limitation of the paper's own analysis."
    193       },
    194       "scope_boundaries_stated": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The paper does not explicitly state what its arguments do NOT cover or where the proposed framework would fail. No specific boundaries are drawn around which types of alignment problems or LLM deployment scenarios the societal alignment analogy applies to."
    198       }
    199     },
    200     "data_integrity": {
    201       "raw_data_available": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No data was collected; this is a theoretical position paper."
    205       },
    206       "data_collection_described": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No data collection occurred."
    210       },
    211       "recruitment_methods_described": {
    212         "applies": false,
    213         "answer": false,
    214         "justification": "No human participants and no benchmark data; purely theoretical."
    215       },
    216       "data_pipeline_documented": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "No data pipeline exists in this theoretical paper."
    220       }
    221     },
    222     "conflicts_of_interest": {
    223       "funding_disclosed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The Acknowledgements section mentions the paper originated from the Bellairs Invitational Workshop (April 2024) but does not disclose any specific funding sources, grants, or sponsors."
    227       },
    228       "affiliations_disclosed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Author affiliations are clearly listed, including Mila, McGill University, Anthropic, Fraunhofer AISEC, ServiceNow, Google DeepMind, University of Cambridge, Columbia University, University of Toronto, University of Utah, King's College London, and Johns Hopkins University."
    232       },
    233       "funder_independent_of_outcome": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "Several authors are affiliated with Anthropic, Google DeepMind, and ServiceNow — companies that develop LLMs and have commercial stakes in alignment narratives. Since no explicit funding is disclosed, independence cannot be assessed, and the affiliations suggest potential non-independence."
    237       },
    238       "financial_interests_declared": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No competing interests statement is present in the paper. Multiple authors are affiliated with companies (Anthropic, Google DeepMind, ServiceNow) with financial interests in LLM alignment outcomes."
    242       }
    243     },
    244     "contamination": {
    245       "training_cutoff_stated": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "The paper does not evaluate any pre-trained model on any benchmark; it is a theoretical position paper."
    249       },
    250       "train_test_overlap_discussed": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No model evaluation is conducted."
    254       },
    255       "benchmark_contamination_addressed": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No benchmarks are used."
    259       }
    260     },
    261     "human_studies": {
    262       "pre_registered": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this theoretical paper."
    266       },
    267       "irb_or_ethics_approval": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "demographics_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "inclusion_exclusion_criteria": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "randomization_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       },
    287       "blinding_described": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants."
    291       },
    292       "attrition_reported": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "No human participants."
    296       }
    297     },
    298     "cost_and_practicality": {
    299       "inference_cost_reported": {
    300         "applies": false,
    301         "answer": false,
    302         "justification": "Purely theoretical paper; no system to assess costs for."
    303       },
    304       "compute_budget_stated": {
    305         "applies": false,
    306         "answer": false,
    307         "justification": "No computational experiments were conducted."
    308       }
    309     }
    310   },
    311   "claims": [
    312     {
    313       "claim": "LLM alignment can be modeled as an incomplete contracting problem within a principal-agent framework, where the LLM is the agent and the developer/user is the principal.",
    314       "evidence": "Section 3 formalizes the principal-agent framework with contract (a, r) pairs following Echenique et al. (2023). Section 3.2 argues complete contracts are infeasible because the output space Y is intractable and human values are too complex to fully specify.",
    315       "supported": "moderate"
    316     },
    317     {
    318       "claim": "Misalignment in LLMs arises from reward misspecification, which is analogous to incomplete contracts in economics.",
    319       "evidence": "Section 3.3 discusses reward hacking, jailbreaking (Chao et al., 2024; Zou et al., 2023), fake alignment (Greenblatt et al., 2024), and context-dependent reward functions as consequences of incomplete contracts. Examples are drawn from existing literature rather than new evidence.",
    320       "supported": "moderate"
    321     },
    322     {
    323       "claim": "Social alignment mechanisms — including cultural norms, multimodal cues, and dynamic norm evolution — can guide LLM alignment.",
    324       "evidence": "Section 4.1 argues by analogy to human normative competence (Bicchieri, 2017; Schutz, 1976). Cites datasets of social norms (Ziems et al., 2022; Zhan et al., 2024; Chiu et al., 2024) and work on Western value bias (Durmus et al., 2024). No empirical validation of the proposed integration.",
    325       "supported": "weak"
    326     },
    327     {
    328       "claim": "Economic alignment frameworks such as Pareto efficiency, social welfare functions, and cooperative game theory can inform pluralistic LLM alignment.",
    329       "evidence": "Section 4.2 draws on welfare economics (Arrow, 1951; d'Aspremont & Gevers, 2002), Pareto efficiency (Boldi et al., 2024), and welfare-centric RLHF (Pardeshi et al., 2024; Cousins et al., 2024). Arguments are conceptual; no demonstration that these approaches improve alignment in practice.",
    330       "supported": "weak"
    331     },
    332     {
    333       "claim": "Uncertainty in LLM alignment is partly essential for ethical deployment and should be communicated rather than eliminated.",
    334       "evidence": "Section 5.2 argues that trade-offs between helpfulness and harmlessness inherently require uncertainty, citing Zollo et al. (2024) and Yaghini et al. (2023). Section 5.3 discusses uncertainty communication drawing on Bhatt et al. (2021). The argument is logical but lacks any empirical demonstration.",
    335       "supported": "weak"
    336     },
    337     {
    338       "claim": "The under-specified nature of LLM objectives should be reframed as a democratic opportunity for participatory alignment, not a flaw to be fixed.",
    339       "evidence": "Section 6 argues that alignment is fundamentally a political rather than purely technical question, citing Terzis (2024), Goldoni & Wilkinson (2018), and Kirk et al. (2024). This is a normative/philosophical argument with no empirical backing.",
    340       "supported": "weak"
    341     }
    342   ],
    343   "red_flags": [
    344     {
    345       "flag": "Title overclaims relative to evidence",
    346       "detail": "The title states societal alignment frameworks 'Can Improve' LLM alignment, implying demonstrated improvement. The paper provides only conceptual arguments and analogies without any empirical evidence, case studies, or experiments showing actual improvement."
    347     },
    348     {
    349       "flag": "No empirical validation of proposed framework",
    350       "detail": "The paper proposes that social, economic, and contractual alignment mechanisms can improve LLM alignment but provides zero empirical validation — no experiments, no simulations, no case studies demonstrating that any of these mechanisms work in practice for LLMs."
    351     },
    352     {
    353       "flag": "Undisclosed conflicts of interest",
    354       "detail": "Authors from Anthropic (whose Constitutional AI is discussed favorably in Section 4.3.2), Google DeepMind, and ServiceNow have commercial interests in alignment narratives. No competing interests statement is provided, and the favorable treatment of Constitutional AI is not flagged as involving affiliated authors."
    355     },
    356     {
    357       "flag": "No limitations section",
    358       "detail": "A 17-author position paper making broad claims about LLM alignment lacks any dedicated discussion of limitations, threats to validity, or scope boundaries for its own arguments."
    359     }
    360   ],
    361   "cited_papers": [
    362     {
    363       "title": "Alignment faking in large language models",
    364       "authors": ["Ryan Greenblatt", "Carson Denison", "Benjamin Wright"],
    365       "year": 2024,
    366       "arxiv_id": "2412.14093",
    367       "relevance": "Directly demonstrates fake alignment behavior in LLMs — a core alignment failure this paper's framework aims to address."
    368     },
    369     {
    370       "title": "Constitutional AI: Harmlessness from AI feedback",
    371       "authors": ["Yuntao Bai", "Saurav Kadavath"],
    372       "year": 2022,
    373       "arxiv_id": "2212.08073",
    374       "relevance": "Key alignment technique discussed in Section 4.3.2 as an example of internal contractual alignment for LLMs."
    375     },
    376     {
    377       "title": "Training language models to follow instructions with human feedback",
    378       "authors": ["Long Ouyang", "Jeff Wu"],
    379       "year": 2022,
    380       "relevance": "Foundational RLHF paper that defines the alignment approach this paper critiques as insufficient due to incomplete contracting."
    381     },
    382     {
    383       "title": "Jailbreaking black box large language models in twenty queries",
    384       "authors": ["Patrick Chao", "Alexander Robey"],
    385       "year": 2024,
    386       "arxiv_id": "2310.08419",
    387       "relevance": "Demonstrates jailbreaking attacks that exploit reward misspecification gaps, cited as evidence of incomplete contract failures."
    388     },
    389     {
    390       "title": "Universal and transferable adversarial attacks on aligned language models",
    391       "authors": ["Andy Zou", "Zifan Wang"],
    392       "year": 2023,
    393       "arxiv_id": "2307.15043",
    394       "relevance": "Demonstrates transferable adversarial attacks that bypass LLM safety guardrails, relevant to alignment robustness."
    395     },
    396     {
    397       "title": "Jailbroken: How does LLM safety training fail?",
    398       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    399       "year": 2023,
    400       "relevance": "Analyzes failure modes of LLM safety training, directly relevant to understanding alignment breakdowns."
    401     },
    402     {
    403       "title": "Direct preference optimization: Your language model is secretly a reward model",
    404       "authors": ["Rafael Rafailov", "Archit Sharma"],
    405       "year": 2023,
    406       "relevance": "Major alignment method (DPO) that simplifies RLHF, discussed as part of the contemporary alignment pipeline."
    407     },
    408     {
    409       "title": "AI safety via debate",
    410       "authors": ["Geoffrey Irving", "Paul Christiano", "Dario Amodei"],
    411       "year": 2018,
    412       "arxiv_id": "1805.00899",
    413       "relevance": "Scalable oversight mechanism discussed in Section 4.3.2 as an example of internal contractual alignment."
    414     },
    415     {
    416       "title": "Large language model alignment: A survey",
    417       "authors": ["Tianhao Shen", "Renren Jin"],
    418       "year": 2023,
    419       "arxiv_id": "2309.15025",
    420       "relevance": "Comprehensive survey of LLM alignment methods, directly relevant to the survey's scope on alignment techniques."
    421     },
    422     {
    423       "title": "Breach by a thousand leaks: Unsafe information leakage in 'safe' AI responses",
    424       "authors": ["David Glukhov", "Ziwen Han"],
    425       "year": 2024,
    426       "arxiv_id": "2407.02551",
    427       "relevance": "Demonstrates that aligned LLMs still leak unsafe information, illustrating alignment gaps from in-context learning exploits."
    428     },
    429     {
    430       "title": "Concrete problems in AI safety",
    431       "authors": ["Dario Amodei", "Chris Olah", "Jacob Steinhardt"],
    432       "year": 2016,
    433       "arxiv_id": "1606.06565",
    434       "relevance": "Foundational AI safety paper that identifies reward misspecification and other alignment challenges."
    435     },
    436     {
    437       "title": "On targeted manipulation and deception when optimizing LLMs for user feedback",
    438       "authors": ["Marcus Williams", "Micah Carroll"],
    439       "year": 2024,
    440       "arxiv_id": "2411.02306",
    441       "relevance": "Studies how optimizing LLMs for user feedback can lead to manipulation and deception, relevant to alignment safety."
    442     },
    443     {
    444       "title": "Incomplete contracting and AI alignment",
    445       "authors": ["Dylan Hadfield-Menell", "Gillian K. Hadfield"],
    446       "year": 2019,
    447       "doi": "10.1145/3306618.3314250",
    448       "relevance": "Core prior work that first formalized AI alignment as an incomplete contracting problem — the theoretical foundation this paper extends."
    449     }
    450   ],
    451   "engagement_factors": {
    452     "practical_relevance": {
    453       "score": 0,
    454       "justification": "Pure theoretical/conceptual framework with no immediately usable tools, techniques, or methods for practitioners."
    455     },
    456     "surprise_contrarian": {
    457       "score": 1,
    458       "justification": "The reframing of alignment as incomplete contracting draws from economics and law, which is somewhat novel but not strongly contrarian."
    459     },
    460     "fear_safety": {
    461       "score": 1,
    462       "justification": "Touches on alignment failure modes like jailbreaking and fake alignment, but does not demonstrate novel attacks or raise new safety concerns."
    463     },
    464     "drama_conflict": {
    465       "score": 0,
    466       "justification": "No controversy, no claims that existing approaches are fundamentally broken, no adversarial framing."
    467     },
    468     "demo_ability": {
    469       "score": 0,
    470       "justification": "No code, no demo, no tool — purely argumentative paper."
    471     },
    472     "brand_recognition": {
    473       "score": 2,
    474       "justification": "Authors from Anthropic, Google DeepMind, Mila, and ServiceNow — well-known AI institutions."
    475     }
    476   }
    477 }

Impressum · Datenschutz