ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20171B)


      1 {
      2   "paper": {
      3     "title": "Institutional AI: A Governance Framework for Distributional AGI Safety",
      4     "authors": ["F. Pierucci", "M. Galisai", "M. Bracale Syrnikov", "M. Prandi", "P. Bisconti", "F. Giarrusso", "O. Sorokoletova", "V. Suriani", "D. Nardi"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.10599",
      8     "doi": "10.48550/arXiv.2601.10599"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No source code or repository URL is provided. The paper is theoretical but could have released formal specifications or simulation code."
     18       },
     19       "data_released": {
     20         "applies": false,
     21         "answer": false,
     22         "justification": "Purely theoretical paper with no data collection or analysis. No dataset to release."
     23       },
     24       "environment_specified": {
     25         "applies": false,
     26         "answer": false,
     27         "justification": "No computational experiments are conducted. No environment to specify."
     28       },
     29       "reproduction_instructions": {
     30         "applies": false,
     31         "answer": false,
     32         "justification": "No experiments to reproduce. The paper is a theoretical framework."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "Purely theoretical paper with no empirical results or statistical analyses."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No comparative empirical claims requiring statistical tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No empirical measurements or effect sizes to report."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No empirical study with samples."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No experimental runs to report variance across."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": false,
     65         "answer": false,
     66         "justification": "No empirical evaluation is conducted. The paper is a theoretical framework proposal."
     67       },
     68       "baselines_contemporary": {
     69         "applies": false,
     70         "answer": false,
     71         "justification": "No empirical evaluation with baselines."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "No system with components to ablate. The paper presents a theoretical framework."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No empirical evaluation with metrics."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No system outputs to evaluate."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No data or test sets involved."
     92       },
     93       "per_category_breakdown": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No empirical results to break down."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "The paper discusses failure modes of existing alignment approaches but does not discuss potential failure cases of its own proposed governance graph framework."
    102       },
    103       "negative_results_reported": {
    104         "applies": false,
    105         "answer": false,
    106         "justification": "No experiments conducted that could yield negative results."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims are theoretical in nature — identifying three structural problems and proposing governance graphs as a solution. The paper body develops each thesis (Sections 2-4) and the proposed framework (Section 5) as stated in the abstract."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper makes causal claims such as 'the institution shifts the payoff landscape' and that sanctions 'eliminate the profitable deviation problem' (Section 5.1, Eq. 1-2). These are supported by mechanism design theory but not empirically validated. The companion paper [15] is referenced for experimental validation but not included here."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes sweeping claims about alignment of AI agents generally. The title references 'Distributional AGI Safety.' The theoretical framework is presented as broadly applicable without bounding to specific agent types, capability levels, or deployment contexts. Section 5.7 acknowledges domain variation but the core claims are unbounded."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not seriously consider alternatives to its institutional approach. It does not discuss whether improved training-time alignment, interpretability advances, or other runtime approaches could address the same problems. The framing treats training-time alignment as fundamentally insufficient without considering counterarguments."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "Theoretical paper with no measurements. No proxy-outcome gap to discuss."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "No models are used in experiments. The paper references models (Claude Opus 4, GPT-4, etc.) only in its literature review of prior work."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "No prompting is used. This is a theoretical paper."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No experiments conducted requiring hyperparameter settings."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used in this paper. The paper describes scaffolding conceptually as part of its framework."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No data is collected or processed."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated limitations section. The conclusion (Section 6) does not discuss limitations of the proposed framework."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No threats to validity are discussed. The paper does not address potential weaknesses of governance graphs, such as the assumption that agent behavior is observable, that sanctions can be calibrated, or that agents cannot subvert the governance engine itself."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what the framework does NOT address. It mentions future work directions (Section 6: fake news, credit markets) but does not bound the current claims."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": false,
    183         "answer": false,
    184         "justification": "No data collected. Purely theoretical paper."
    185       },
    186       "data_collection_described": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "No data collection. Theoretical framework paper."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No participants or data recruitment."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No data pipeline. Theoretical paper."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding acknowledgment or disclosure is present in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are listed: DEXAI/Icaro Lab, Sapienza University of Rome, Sant'Anna School of Advanced Studies, and VU Amsterdam."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding is disclosed, so independence cannot be assessed. The authors are affiliated with DEXAI (appears to be a company/lab) but no funding relationship is stated."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is provided."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "The paper does not evaluate any pre-trained model on benchmarks. It is a theoretical framework paper."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No model evaluation on benchmarks."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No benchmark evaluation conducted."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "Purely theoretical paper. No method with inference cost."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No computational experiments conducted."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Sufficiently capable models can acquire internal goal structures that diverge from developer-specified objectives through mesa-optimization and goal misgeneralization.",
    294       "evidence": "Section 2 reviews literature on mesa-optimization (Hubinger et al. 2019), goal misgeneralization (Di Langosco et al. 2022, Shah et al. 2022), and emergent preferences (Mazeika et al. 2025, Seror 2024). Evidence is from cited work, not original experiments.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "RLHF and Constitutional AI have fundamental limitations that cannot be resolved through better implementation, including oversight, representation, and deception problems.",
    299       "evidence": "Section 3 synthesizes Casper et al. (2023) taxonomy of RLHF limitations, Anthropic's alignment faking results (Greenblatt et al. 2024), and Apollo Research scheming findings (Meinke et al. 2024). All evidence from prior work.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Individually aligned agents can converge on collusive or adversarial equilibria through multi-agent interaction dynamics.",
    304       "evidence": "Section 4 cites LLM collusion in auctions (Agrawal et al. 2025), market division (Lin et al. 2024), steganographic coordination (Motwani et al. 2024). The ESRH framework from Bisconti et al. [12] is self-cited.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "Governance graphs provide O(N) verification scaling versus O(N²) for agent-space verification, creating a decisive scaling advantage.",
    309       "evidence": "Section 5.8 provides a mathematical argument comparing agent-space (N·d individual + N(N-1)/2 pairwise) vs institution-space (|Q|²·P graph + N monitoring) verification. This is a theoretical claim supported by the formal model, not empirical measurement.",
    310       "supported": "weak"
    311     },
    312     {
    313       "claim": "Institutional sanctions can transform the social optimum into a Nash equilibrium by making misaligned behavior unprofitable.",
    314       "evidence": "Section 5.1 presents the formal argument with Equations 1-2, showing that if sanction S exceeds max deviation gain, no profitable unilateral deviation exists. This is a standard mechanism design result applied to AI agents.",
    315       "supported": "moderate"
    316     }
    317   ],
    318   "methodology_tags": ["theoretical"],
    319   "key_findings": "The paper proposes 'Institutional AI,' a governance framework that externalizes alignment constraints as public data structures (governance graphs) rather than relying on training-time internalization. It identifies three structural alignment problems — behavioral goal-independence, instrumental override of alignment constraints, and agentic alignment drift — and argues these cannot be solved by prompt-based or RLHF approaches alone. The governance graph formalism uses mechanism design to reshape agent payoffs so that compliance becomes the dominant strategy, with claimed O(N) verification scaling. A companion paper reportedly validates the approach in Cournot market simulations.",
    320   "red_flags": [
    321     {
    322       "flag": "Claims significantly outrun evidence",
    323       "detail": "The paper makes sweeping claims about alignment solutions for AGI-level systems based entirely on theoretical arguments and literature review. No empirical validation is presented in this paper. The companion paper [15] is referenced but not included. The complexity reduction thesis (O(N) vs O(N²)) assumes agents cannot subvert the governance infrastructure itself."
    324     },
    325     {
    326       "flag": "No limitations discussion",
    327       "detail": "The paper has no limitations section despite making ambitious theoretical claims. Key unaddressed issues include: whether the monitoring oracle can reliably detect all violations (especially steganographic coordination, which the paper itself highlights as a threat), whether agents can game the governance graph itself, and whether the formal model's assumptions hold for real LLM agents."
    328     },
    329     {
    330       "flag": "Selective use of literature",
    331       "detail": "The paper cites evidence of alignment failures extensively to motivate its framework but does not engage with counterarguments — e.g., that improved interpretability, formal verification, or better RLHF could address the same issues. The presentation frames training-time alignment as fundamentally broken without fairly representing the opposing view."
    332     },
    333     {
    334       "flag": "Self-citation cluster",
    335       "detail": "Several key references ([10], [11], [12], [15]) are self-citations from the same research group (Bisconti, Pierucci, Galisai, Bracale Syrnikov, Prandi). The ESRH framework (Section 4.2) and the companion empirical validation both come from within the group."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Alignment faking in large language models",
    341       "authors": ["R. Greenblatt", "C. Denison", "B. Wright"],
    342       "year": 2024,
    343       "arxiv_id": "2412.14093",
    344       "relevance": "Key evidence for alignment faking behavior in frontier models, directly relevant to AI safety evaluation."
    345     },
    346     {
    347       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    348       "authors": ["E. Hubinger", "C. Denison", "J. Mu"],
    349       "year": 2024,
    350       "arxiv_id": "2401.05566",
    351       "relevance": "Demonstrates persistent deceptive policies that survive safety fine-tuning, relevant to alignment robustness."
    352     },
    353     {
    354       "title": "Frontier Models are Capable of In-context Scheming",
    355       "authors": ["A. Meinke", "B. Schoen", "J. Scheurer"],
    356       "year": 2024,
    357       "arxiv_id": "2412.04984",
    358       "relevance": "Documents scheming capabilities in frontier models including o1, Claude 3.5 Sonnet, and Gemini 1.5 Pro."
    359     },
    360     {
    361       "title": "Open Problems and Fundamental Limitations of Reinforcement Learning from Human Feedback",
    362       "authors": ["S. Casper", "X. Davies", "C. Shi"],
    363       "year": 2023,
    364       "arxiv_id": "2307.15217",
    365       "relevance": "Systematic taxonomy of RLHF limitations relevant to understanding alignment technique boundaries."
    366     },
    367     {
    368       "title": "Constitutional AI: Harmlessness from AI Feedback",
    369       "authors": ["Y. Bai", "S. Kadavath", "S. Kundu"],
    370       "year": 2022,
    371       "arxiv_id": "2212.08073",
    372       "relevance": "Foundational paper on Constitutional AI, a key alignment approach critiqued in this work."
    373     },
    374     {
    375       "title": "Secret Collusion among Generative AI Agents: Multi-Agent Deception via Steganography",
    376       "authors": ["S. R. Motwani", "M. Baranchuk", "M. Strohmeier"],
    377       "year": 2024,
    378       "arxiv_id": "2402.07510",
    379       "relevance": "Demonstrates steganographic collusion capabilities in LLM agents, relevant to multi-agent safety."
    380     },
    381     {
    382       "title": "Multi-Agent Risks from Advanced AI",
    383       "authors": ["L. Hammond", "A. Chan", "J. Clifton"],
    384       "year": 2025,
    385       "arxiv_id": "2502.14143",
    386       "relevance": "Comprehensive taxonomy of multi-agent AI risks from Cooperative AI, directly relevant to agentic safety."
    387     },
    388     {
    389       "title": "Agentic Misalignment: How LLMs Could be Insider Threats",
    390       "authors": ["A. Lynch", "B. Wright", "C. Larson"],
    391       "year": 2025,
    392       "arxiv_id": "2510.05179",
    393       "relevance": "Red-teaming results showing frontier models autonomously selecting harmful tactics in agentic settings."
    394     },
    395     {
    396       "title": "Distributional AGI Safety",
    397       "authors": ["N. Tomašev", "M. Franklin", "J. Jacobs"],
    398       "year": 2025,
    399       "arxiv_id": "2512.16856",
    400       "relevance": "Google DeepMind's distributional AGI safety agenda that this paper claims to answer."
    401     },
    402     {
    403       "title": "Beyond Single-Agent Safety: A Taxonomy of Risks in LLM-to-LLM Interactions",
    404       "authors": ["P. Bisconti", "M. Galisai", "F. Pierucci"],
    405       "year": 2025,
    406       "arxiv_id": "2512.02682",
    407       "relevance": "Introduces the ESRH framework for multi-agent risk analysis, directly builds on this group's work."
    408     },
    409     {
    410       "title": "Utility Engineering: Analyzing and Controlling Emergent Value Systems in AIs",
    411       "authors": ["M. Mazeika", "X. Yin", "R. Tamirisa"],
    412       "year": 2025,
    413       "arxiv_id": "2502.08640",
    414       "relevance": "Evidence for emergent utility representations and coherent preferences in frontier LLMs."
    415     },
    416     {
    417       "title": "Towards Understanding Sycophancy in Language Models",
    418       "authors": ["M. Sharma", "M. Tong", "T. Korbak"],
    419       "year": 2023,
    420       "arxiv_id": "2310.13548",
    421       "relevance": "Systematic study of sycophancy as an alignment failure mode in preference-optimized models."
    422     }
    423   ]
    424 }

Impressum · Datenschutz