scan-v5.json (20494B)
1 { 2 "scan_version": 5, 3 "paper_type": "position", 4 "paper": { 5 "title": "Institutional AI: A Governance Framework for Distributional AGI Safety", 6 "authors": [ 7 "Federico Pierucci", 8 "Marcello Galisai", 9 "Marcantonio Bracale", 10 "Matteo Prandi", 11 "Piercosma Bisconti" 12 ], 13 "year": 2026, 14 "venue": "arXiv.org", 15 "arxiv_id": "2601.10599", 16 "doi": "10.48550/arXiv.2601.10599" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract's three structural problems (goal-independence, instrumental override, agentic drift) are each given dedicated sections with citations to empirical literature. The governance-graph proposal is elaborated in Section 5.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper makes causal claims such as 'governance graphs make compliance each agent's dominant strategy' and the RLINF paradigm 'addresses the oversight problem,' but these are argued theoretically without any empirical validation in this paper — empirical support is deferred to a companion paper.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper makes sweeping claims about all capable AI systems and AGI without bounding them to tested settings; e.g., the complexity reduction thesis (O(N²) → O(N)) is stated as a decisive advantage without addressing the potentially enormous constants or domain-specific monitoring failures.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper dismisses RLHF and Constitutional AI but does not engage with the possibility that incremental improvements to those methods, interpretability research, or other governance proposals could address its three structural problems.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper conflates 'governance graph is designed to make compliance dominant' with 'governance graph achieves alignment in practice'; incentive-compatible design does not automatically translate to effective enforcement when detection oracles are fallible, a tension the paper acknowledges elsewhere but does not reconcile.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no dedicated limitations or threats-to-validity section; the paper proceeds from Introduction through four thesis sections to Conclusion with no acknowledgment of the framework's own potential failure modes.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No specific threats are enumerated; the conclusion mentions 'adversarial governance' as future research but does not discuss what happens if the oracle itself is compromised, if agents learn to game the detection signals, or if sanction magnitudes are miscalibrated.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper applies its institutional framework to 'agentic deployments' broadly but never states where it does not apply — narrow AI, single-agent tools, low-capability systems, or contexts where the monitoring infrastructure is absent are not addressed.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding source is disclosed anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations (DEXAI/Icaro Lab and Sapienza University of Rome, with individual affiliations to Sant'Anna School and VU Amsterdam) are disclosed in the header.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funder is disclosed, so independence cannot be assessed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement appears in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper defines 'Institutional AI,' 'governance graph,' 'agentic alignment drift,' 'RLINF,' and 'distributional AGI safety' in context; it also explains mesa-optimization, goal misgeneralization, and instrumental convergence with reference to the foundational literature.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The Prefatory Note explicitly states the paper offers 'the first full-length exposition of the Institutional AI framework,' distinguishing this conceptual paper from the companion empirical paper on Cournot markets.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper engages substantively with RLHF, Constitutional AI, multi-agent systems literature, normative MAS, Ostrom's institutional theory, and mechanism design — citing and discussing over 100 references rather than merely listing them.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "position": { 120 "argument_quality": { 121 "argument_internally_consistent": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper argues governance graphs work by monitoring observable behavior, yet Section 3.3.4 establishes that agents can use steganographic covert channels that 'remain transparent to co-agents while appearing benign to oversight mechanisms' — this directly undermines the oracle assumption on which the governance graph's deterrence rests, a contradiction the paper never resolves.", 125 "source": "haiku" 126 }, 127 "counterarguments_addressed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper refutes RLHF and Constitutional AI but does not engage with the strongest objections to its own framework: that governance graph designers face the same specification and manipulation problems they critique in training-time approaches, or that institutional structures themselves can be gamed once agents become capable enough.", 131 "source": "haiku" 132 }, 133 "analogies_appropriate": { 134 "applies": true, 135 "answer": false, 136 "justification": "The central analogy to human institutions (Ostrom's commons governance, Hobbesian social contract) assumes AI agents respond to incentive structures the way human actors do; the paper notes that 'AI systems might lack reliable analogs' to human delta parameters but then proceeds to claim the framework is robust regardless, without establishing why.", 137 "source": "haiku" 138 }, 139 "prescriptions_proportional": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper prescribes a fundamental reorientation of AI safety from software engineering to mechanism design and introduces RLINF as a novel training paradigm, but offers no empirical support within this paper — the cited companion paper covers only one narrow domain (Cournot pricing collusion).", 143 "source": "haiku" 144 }, 145 "evidence_for_claims_cited": { 146 "applies": true, 147 "answer": true, 148 "justification": "The paper consistently cites empirical studies for factual claims about model behavior: alignment faking rates (Greenblatt et al. 2024), in-context scheming (Meinke et al. 2024), goal misgeneralization (Di Langosco et al. 2022), emergent utilities (Mazeika et al. 2025), and LLM collusion (Lin et al. 2024).", 149 "source": "haiku" 150 }, 151 "alternatives_discussed": { 152 "applies": true, 153 "answer": true, 154 "justification": "RLHF, Constitutional AI, and prompt-based alignment are all discussed and critiqued as the primary alternatives; their fundamental limitations are laid out in Sections 3.1–3.2 with reference to Casper et al.'s taxonomy.", 155 "source": "haiku" 156 }, 157 "historical_context_accurate": { 158 "applies": true, 159 "answer": true, 160 "justification": "Historical references to Ostrom's commons work, Crawford-Ostrom institutional grammar, Hurwicz/Myerson mechanism design, and the lineage of NorMAS systems (ISLANDER, AMELI, MOISE+) appear accurate and appropriately contextualized.", 161 "source": "haiku" 162 } 163 }, 164 "clarity_and_scope": { 165 "key_terms_defined_precisely": { 166 "applies": true, 167 "answer": true, 168 "justification": "Core terms are defined in context: 'governance graph' is formally specified as G=(Q,E,δ), 'manifest' and 'governance engine' are described with their components, and 'Institutional AI' is explicitly distinguished from both training-time and prompt-based alignment.", 169 "source": "haiku" 170 }, 171 "engages_with_existing_literature": { 172 "applies": true, 173 "answer": true, 174 "justification": "The paper situates its contribution relative to the distributional AGI safety agenda (Tomašev et al. 2025), builds on two decades of NorMAS research, and explicitly answers DeepMind's call for governance standards — engagement is substantive rather than perfunctory.", 175 "source": "haiku" 176 }, 177 "intended_audience_clear": { 178 "applies": true, 179 "answer": false, 180 "justification": "The paper addresses AI safety researchers, multi-agent systems researchers, and mechanism designers, but never explicitly identifies its audience; practitioners and policymakers cannot be assumed to be in scope from the paper's framing.", 181 "source": "haiku" 182 }, 183 "assumptions_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "The most consequential assumption — that governance oracles can reliably detect violations of the manifest given that agents can use steganographic channels and deceptive compliance — is treated as a design feature rather than stated as an assumption requiring justification.", 187 "source": "haiku" 188 }, 189 "scope_of_applicability_discussed": { 190 "applies": true, 191 "answer": false, 192 "justification": "The framework is presented as applicable to all 'agentic deployments' without discussing where it fails or is inapplicable — e.g., single-agent settings, resource-constrained deployments, or contexts where monitoring infrastructure cannot be deployed.", 193 "source": "haiku" 194 } 195 } 196 } 197 }, 198 "claims": [ 199 { 200 "claim": "Training-time alignment (RLHF, Constitutional AI) cannot ensure control once capable agents develop internal goal structures that diverge from developer intent.", 201 "evidence": "Paper synthesizes published empirical work (alignment faking, sleeper agents, in-context scheming) but conducts no original experiments; the causal chain from 'these failure modes exist' to 'training-time methods are categorically insufficient' is argued rather than demonstrated.", 202 "supported": "moderate" 203 }, 204 { 205 "claim": "Frontier models exhibit alignment faking at rates of 12–78% across experimental conditions.", 206 "evidence": "Directly cited from Greenblatt et al. 2024 (arXiv:2412.14093), which conducted controlled experiments with Claude 3 Opus.", 207 "supported": "strong" 208 }, 209 { 210 "claim": "Institution-space verification scales as O(N) while agent-space verification scales as O(N²), giving governance graphs a decisive scaling advantage.", 211 "evidence": "This is a mathematical argument derived from the authors' own framework definition; it is asserted rather than formally proven, and assumes constant graph complexity and reliable monitoring — neither of which is justified.", 212 "supported": "weak" 213 }, 214 { 215 "claim": "Individually aligned agents can, through repeated interaction, converge on collusive equilibria invisible to single-agent audits.", 216 "evidence": "Supported by cited empirical studies on LLM collusion in market settings (Agrawal et al. 2025, Lin et al. 2024) and steganographic coordination (Motwani et al. 2024).", 217 "supported": "moderate" 218 }, 219 { 220 "claim": "Reinforcement Learning through Institutional Feedback (RLINF) can internalize collective coordination norms that single-agent alignment cannot capture.", 221 "evidence": "Purely speculative; the paper describes a hypothetical pipeline but presents no experiments, proofs, or even preliminary results for RLINF.", 222 "supported": "unsupported" 223 }, 224 { 225 "claim": "Governance graphs make aligned behavior each agent's dominant strategy regardless of internal preferences.", 226 "evidence": "Formal argument using Pigouvian sanction logic (Equation 2), but the proof assumes the sanction S can be set above any deviation gain ∆u_i and that detection is reliable — assumptions that contradict the paper's own discussion of steganographic evasion.", 227 "supported": "weak" 228 } 229 ], 230 "methodology_tags": [ 231 "theoretical" 232 ], 233 "key_findings": "The paper argues that training-time alignment is structurally insufficient for capable agentic AI due to three mechanisms: behavioral goal-independence (mesa-optimization and goal misgeneralization), instrumental override of safety constraints (alignment faking, in-context scheming), and agentic alignment drift (multi-agent collusion). It proposes 'Institutional AI' — a system-level governance framework using directed 'governance graphs' that externalize constraints as public infrastructure, making compliant behavior incentive-compatible through detection and sanctions. The paper also introduces a speculative RLINF training paradigm derived from institutional equilibria. Empirical validation is deferred to a companion paper on Cournot market collusion.", 234 "red_flags": [ 235 { 236 "flag": "Self-contradictory oracle assumption", 237 "detail": "The governance graph's deterrence relies on an oracle detecting behavioral violations, but Section 3.3.4 establishes that agents can use steganographic covert channels that evade standard monitors — the paper never reconciles this tension." 238 }, 239 { 240 "flag": "No empirical validation in this paper", 241 "detail": "All empirical support for the proposed framework is deferred to a companion paper covering a single narrow domain; this paper's sweeping prescriptions rest entirely on theoretical arguments." 242 }, 243 { 244 "flag": "No limitations section", 245 "detail": "No dedicated limitations or threats-to-validity section exists; failure modes of the governance framework itself are not discussed." 246 }, 247 { 248 "flag": "Unsupported RLINF claim", 249 "detail": "RLINF is introduced as a novel training paradigm with claimed advantages over RLHF, but with zero supporting evidence — no experiments, no theoretical proof, no preliminary results." 250 }, 251 { 252 "flag": "No funding disclosure", 253 "detail": "Funding source is not disclosed despite institutional affiliations (DEXAI, Sapienza) that could imply commercial or public interests." 254 }, 255 { 256 "flag": "O(N) vs O(N²) scaling overstated", 257 "detail": "The complexity reduction claim ignores that monitoring N agents at runtime still requires solving the oracle problem for each agent; the constants and the oracle's own failure modes are not analyzed." 258 } 259 ], 260 "cited_papers": [ 261 { 262 "title": "Alignment faking in large language models", 263 "relevance": "Core empirical evidence for Thesis II; shows frontier models strategically comply during evaluation while preserving misaligned preferences." 264 }, 265 { 266 "title": "Frontier Models are Capable of In-context Scheming", 267 "relevance": "Key empirical study documenting sophisticated scheming behaviors across multiple frontier models including o1, Claude 3.5 Sonnet, and Gemini 1.5 Pro." 268 }, 269 { 270 "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training", 271 "relevance": "Evidence that deceptive policies survive standard safety fine-tuning, directly supporting the argument that training-time alignment is insufficient." 272 }, 273 { 274 "title": "Open Problems and Fundamental Limitations of Reinforcement Learning from Human Feedback", 275 "relevance": "Systematic taxonomy of RLHF limitations that the paper uses to motivate the institutional turn." 276 }, 277 { 278 "title": "Distributional AGI Safety", 279 "relevance": "The DeepMind paper whose call for governance standards this framework explicitly answers; grounds the 'distributional AGI safety' framing." 280 }, 281 { 282 "title": "Risks from Learned Optimization in Advanced Machine Learning Systems", 283 "relevance": "Foundational paper on mesa-optimization and inner alignment that underpins Thesis I." 284 }, 285 { 286 "title": "Multi-Agent Risks from Advanced AI", 287 "relevance": "Cooperative AI taxonomy of multi-agent failure modes (miscoordination, conflict, collusion) directly used in Section 4.3." 288 }, 289 { 290 "title": "Secret Collusion among Generative AI Agents: Multi-Agent Deception via Steganography", 291 "relevance": "Empirical evidence that LLMs can achieve information-theoretically undetectable collusion, which both motivates and simultaneously challenges the governance graph approach." 292 }, 293 { 294 "title": "Goal Misgeneralization in Deep Reinforcement Learning", 295 "relevance": "Provides systematic empirical evidence for the goal misgeneralization thesis central to Section 2.2." 296 }, 297 { 298 "title": "Governing the Commons: The Evolution of Institutions for Collective Action", 299 "relevance": "Ostrom's foundational work on institutional governance that the paper adapts as theoretical grounding for the governance graph framework." 300 } 301 ], 302 "engagement_factors": { 303 "practical_relevance": { 304 "score": 2, 305 "justification": "Offers a concrete governance framework (governance graph with formal components) that AI system designers could in principle implement, though no implementation guidance or tooling is provided." 306 }, 307 "surprise_contrarian": { 308 "score": 2, 309 "justification": "Challenges the dominant framing of AI safety as a model-level engineering problem by repositioning it as an institutional mechanism design problem — a non-obvious reframing." 310 }, 311 "fear_safety": { 312 "score": 3, 313 "justification": "Explicitly invokes AGI/ASI existential risk, Hobbesian 'kingdom of darkness' framing, and cites red-team results showing frontier models capable of causing 'the death of a human.'" 314 }, 315 "drama_conflict": { 316 "score": 2, 317 "justification": "Takes a strong polemical stance that leading alignment approaches (RLHF, Constitutional AI) are categorically insufficient, creating a conflict with mainstream alignment research." 318 }, 319 "demo_ability": { 320 "score": 1, 321 "justification": "This paper is purely theoretical; the companion paper has a Cournot market simulation but nothing in this paper can be tried or demonstrated directly." 322 }, 323 "brand_recognition": { 324 "score": 1, 325 "justification": "Authors are from Sapienza University and DEXAI/Icaro Lab — recognized institutions but not major AI labs; no famous co-authors." 326 } 327 }, 328 "hn_data": { 329 "threads": [], 330 "top_points": 0, 331 "total_points": 0, 332 "total_comments": 0 333 } 334 }