scan.json (26841B)
1 { 2 "paper": { 3 "title": "Institutional AI: Governing LLM Collusion in Multi-Agent Cournot Markets via Public Governance Graphs", 4 "authors": ["Marcantonio Bracale", "Federico Pierucci", "Marcello Galisai", "Matteo Prandi", "Piercosma Bisconti", "F. Giarrusso", "O. Sorokoletova", "V. Suriani", "D. Nardi"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2601.11369", 8 "doi": "10.48550/arXiv.2601.11369" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Institutional governance (governance-graph-based enforcement) substantially reduces LLM collusion in Cournot markets: mean collusion tier drops from 3.1 to 1.8 (Cohen's d=1.28), and severe-collusion incidence drops from 50% to 5.6% across 6 model configurations (N=90 runs/condition). Prompt-only Constitutional baselines yield no reliable improvement over ungoverned conditions, suggesting declarative prohibitions do not bind under optimization pressure. The effect holds across homogeneous and heterogeneous model pairs including cross-provider configurations.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions 'results artifacts' containing emitted manifests and logs but provides no download link or public repository." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or library version listing is provided. Models are named but no environment setup details given." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. The methodology section describes the pipeline conceptually but lacks executable reproduction details." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Table 6 reports mean ± SD for all metrics across 90 runs per condition." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "Two-sided Welch t-tests with p-values reported in Table 6; two-proportion z-tests for tier shares; paired sign-flip permutation tests (p=0.0312) for cross-configuration inference (Section 5.5, Section 7)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Cohen's d reported for all continuous endpoints in Table 6 (d=1.28, 1.05, 1.51, etc.), with explicit magnitude interpretation ('large by conventional standards')." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "N=90 runs/condition (5 runs/label/batch/condition × 6 labels × 3 batches) is stated but no power analysis or justification for why 5 runs per cell is sufficient." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Standard deviations reported across runs in Table 6 (e.g., '3.10 ± 1.06'). IQR also reported for median tier (Section 7)." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Two baselines: Ungoverned (replication of Lin et al. 2024) and Constitutional (prompt-only prohibition). Both clearly defined in Table 2." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Constitutional baseline draws on Palla et al. (2025) and Hua et al. (2024) policy-as-prompt approaches. Ungoverned replicates Lin et al. (2024). These are contemporary references." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "Section 6.2.3 mentions a 'small factorial ablation matrix around the selected institution to isolate the marginal contribution of core governance levers (fine salience, credit timing, tier persistence, and credit budget)' but results of this ablation are not reported in the paper." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics: collusion tier, HHI excess, CV excess (max and mean), tier share percentages (≥3, ≥4), profit excess, total profit. All reported in Table 6." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a computational experiment with LLM agents in a simulated market. Human evaluation of outputs is not relevant to the claims." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is not a prediction task with train/test splits. It is a simulation experiment comparing governance regimes." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 7 and Figure 5 provide per-model-configuration breakdowns of mean collusion tier across all 6 configurations and 3 conditions." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 7 notes Constitutional baseline sometimes increases collusion (GPT-5 Mini: 2.93→3.60). Section 9 discusses governance brittleness and Goodhart risk. 5.6% severe collusion remains under Institutional regime." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Constitutional baseline yields no reliable improvement — this is a key negative finding. GPT-5 Mini Constitutional tier exceeding Ungoverned is reported (Table 7). 244 suspension requests denied is also reported." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims: mean tier 3.1→1.8 (d=1.28), severe collusion 50%→5.6%, Constitutional yields no reliable improvement. All supported by Table 6 and Section 7." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about Institutional governance suppressing collusion. The experimental design (3 conditions × 6 configurations × 3 batches, randomized) with controlled single-variable manipulation (governance regime) supports causal inference within the simulated setting." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 9 explicitly states limitations: 'two-firm Cournot abstracts away contracts, asymmetric information, richer strategic instruments, and endogenous entry/exit.' The abstract uses 'may benefit from' and 'these results suggest' language. Title specifies 'Cournot Markets.'" 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 9 discusses governance brittleness (Goodhart risk), that fixed proxy thresholds may be gamed. Section 8 considers whether compliance is incentive-based vs norm-following. The paper also addresses whether model heterogeneity drives results (it doesn't; Section 7)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly defines collusion through market-structure proxies (CV excess, HHI excess) in Section 5.3, derives them from Cournot-Nash benchmarks, and maps them to a discrete tier system (Table 1). The proxy relationship between these metrics and actual collusion is discussed, with normalization relative to Nash equilibrium." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models listed as 'GPT-5 Mini', 'Grok-4 Fast', 'Gemini 2.5 Flash' — marketing names without snapshot dates or API versions. Lin et al. (2024) specified 'GPT-4o-2024-08-06' but the present paper does not follow this practice for its own models." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Appendix A provides the full agent decision prompt template with actual structure. Appendix B provides the full Constitutional text. Appendix C provides verbatim Institutional notice examples. Appendix D references the manifest. The prompts are substantially complete." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Lin et al. (2024) used temperature=1, but this paper does not state temperature or sampling parameters for its own model configurations. Market parameters (α, β, κ) are stated but LLM API hyperparameters are not." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 6.1 describes the agent architecture in detail: rolling 30-round market trace, PLANS/INSIGHTS memory, structured JSON output with retries, feasibility enforcement. Section 6.2 details the Oracle/Controller governance engine. Figure 1 shows the pipeline." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 6.3 describes the full execution flow: environment reset, manifest emission, per-round cycle (notice → proposal → Oracle/Controller → market clearing → memory update). Post-horizon processing (Nash computation via SLSQP, excess ratio derivation, tier assignment) is specified." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 9 'Limitations and Further Research' provides substantive discussion of limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 9 identifies specific threats: two-firm Cournot is narrow (no contracts, asymmetric information, entry/exit); fixed proxy thresholds may be Goodharted; no inter-firm communication tested; limited model zoo." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 9 explicitly states what was not tested: n≥3 firms, explicit inter-firm communication, multilingual models, broader coordination domains. Section 1 states the paper focuses on 'implementation, operational definitions, and empirical results' not conceptual foundations." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper mentions 'results artifacts' and 'full emitted manifests and logs are included in the results artifacts' (Appendix D) but provides no download link or public access." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 5.1 and 6.3 describe the experimental setup: 6 model configs × 3 conditions × 3 batches × 5 runs, with fixed market parameters and the execution pipeline fully specified." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data comes from LLM agent simulations using standard public API models." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 6.3 documents the full pipeline: environment initialization → per-round execution (governance notice → agent proposal → Oracle/Controller → market clearing → memory update) → post-horizon computation of reference solutions and metrics." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding statement, acknowledgments section, or grant disclosures found in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations listed: DEXAI–Icaro Lab, Sapienza University of Rome, Sant'Anna School of Advanced Studies, VU Amsterdam." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information disclosed, so independence cannot be assessed. Authors are affiliated with DEXAI–Icaro Lab which may have interests in the governance framework being proposed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial disclosure statement found in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "This paper does not evaluate model capability on a benchmark. It tests governance regimes' effects on LLM agent behavior in a simulated market. Training cutoff is irrelevant — the agents are making strategic decisions, not recalling memorized answers." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Not a benchmark evaluation of model knowledge. The Cournot game is a novel interactive setting, not a static test set." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Not applicable — no static benchmark is used. The experimental setting is a dynamic multi-round game." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants. Experiment is a computational simulation with LLM agents." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No API costs, tokens consumed, or per-run cost reported despite running 270 runs (90/condition × 3 conditions) each with 50 rounds of LLM calls across 6 different models." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total API spend, compute hours, or hardware specifications reported." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Results are aggregated over 3 independent batches of 5 runs each per label/condition (N=90/condition), with SD reported. This effectively tests sensitivity across runs, though 'seed' is not explicitly used since LLM APIs handle randomness internally." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Explicitly stated: '5 runs/label/batch/condition; N = 90 runs/condition' across 'three independent batches' (Section 5.5)." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Section 6.2.3 mentions 'a low-replication screening sweep over a small candidate set' and 'a small factorial ablation matrix' for parameter selection, but the number of configurations tried and compute spent on search are not reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Section 6.2.3 describes a staged procedure: screening sweep → robustness check on second model family → factorial ablation → manifest locked by semantic digest before final runs. Selection criterion (eliminate configurations failing to reduce severe outcomes) is stated." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Multiple statistical tests are reported (Welch t-tests on 4 continuous metrics, z-tests on 2 tier shares, permutation tests) without mention of Bonferroni, Holm, or other family-wise error rate corrections." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors designed the Institutional governance system and evaluate it against baselines. No discussion of author-evaluation bias or independent evaluation." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": false, 325 "answer": false, 326 "justification": "All three regimes use the same underlying LLM calls per round. The Institutional regime adds deterministic Oracle/Controller computation which is negligible. Compute differences are negligible across conditions." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 5.3 explicitly defines collusion through market-structure metrics (HHI, CV) with theoretical grounding in Cournot-Nash equilibrium benchmarks. The proxy nature of the tier system is discussed. Section 5.2 justifies Cournot competition as a testbed." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": true, 336 "justification": "All models use the same agent architecture (Section 6.1): same prompt structure, same memory system, same output schema. The governance regime modifies only the MARKET GOVERNANCE prompt block. Cross-model comparisons use identical scaffolding." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "Not a benchmark evaluation of model knowledge. The Cournot game is a novel interactive simulation without pre-existing solutions to memorize." 344 }, 345 "feature_leakage_addressed": { 346 "applies": false, 347 "answer": false, 348 "justification": "Not a prediction task. Agents make strategic decisions in a simulated market; there is no answer to leak." 349 }, 350 "non_independence_addressed": { 351 "applies": false, 352 "answer": false, 353 "justification": "Not applicable — no train/test split. Each run is an independent simulation." 354 }, 355 "leakage_detection_method": { 356 "applies": false, 357 "answer": false, 358 "justification": "Not a benchmark evaluation with leakage risk. The experimental setting is a dynamic game, not a static test set." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Institutional governance reduces mean collusion tier from 3.1 to 1.8 (Cohen's d = 1.28) relative to Ungoverned baseline.", 365 "evidence": "Table 6: mean tier U=3.10±1.06 vs Inst=1.82±0.93, Welch p=4.67e-15, d=1.28. N=90 runs/condition across 6 model configs and 3 batches.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Severe-collusion incidence drops from 50% to 5.6% under Institutional governance.", 370 "evidence": "Table 6: Tier≥4 rate U=50.0% vs Inst=5.6%, two-proportion z-test p=2.81e-11.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Prompt-only Constitutional baseline yields no reliable improvement over Ungoverned.", 375 "evidence": "Table 6: Constitutional tier=3.02±1.05 vs Ungoverned=3.10±1.06. Table 7 shows Constitutional sometimes increases collusion (GPT-5 Mini: 2.93→3.60).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "The institutional effect holds across all 6 model configurations including heterogeneous cross-provider pairs.", 380 "evidence": "Table 7: Institutional reduces tier in every configuration. Permutation test on 6 labels: p=0.0312.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Model heterogeneity is not a first-order driver of collusion in this setting.", 385 "evidence": "Table 7: heterogeneous pairs show broadly similar tiers to homogeneous duopolies. Section 7 states no evidence that cross-provider heterogeneity disrupts collusion.", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Authors evaluate their own system", 392 "detail": "The authors designed the Institutional AI framework and governance graph, then evaluate it against baselines. No independent replication or third-party evaluation. The parameter selection procedure (Section 6.2.3) involves tuning the institution to work well before final evaluation, creating researcher degrees of freedom." 393 }, 394 { 395 "flag": "No code or data release", 396 "detail": "Despite 270 runs producing detailed governance logs and market traces, no code repository or data archive is provided. The paper mentions 'results artifacts' but gives no public access." 397 }, 398 { 399 "flag": "Missing API hyperparameters", 400 "detail": "Temperature, top-p, and other sampling parameters for the 6 model configurations are not reported. LLM behavior in strategic settings is highly sensitive to temperature, making this a significant omission." 401 }, 402 { 403 "flag": "No cost reporting", 404 "detail": "270 runs × 50 rounds × 2 agents = 27,000 LLM calls minimum across 6 different commercial models. Total API cost is unreported." 405 }, 406 { 407 "flag": "Ablation results not shown", 408 "detail": "Section 6.2.3 mentions 'a small factorial ablation matrix' was run to isolate governance lever contributions, but the results are not reported. The reader cannot assess which components of the institution drive the effect." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "Strategic collusion of LLM agents: Market division in multi-commodity competitions", 414 "authors": ["R. Y. Lin", "S. M. Ojha", "K. Cai", "M. F. Chen"], 415 "year": 2024, 416 "arxiv_id": "2410.00031", 417 "relevance": "Primary replication target; demonstrates LLM agents collude in Cournot markets without explicit instructions." 418 }, 419 { 420 "title": "Alignment faking in large language models", 421 "authors": ["R. Greenblatt", "C. Denison", "B. Wright"], 422 "year": 2024, 423 "arxiv_id": "2412.14093", 424 "relevance": "Shows LLMs can fake alignment, motivating external governance over internal alignment." 425 }, 426 { 427 "title": "AI control: Improving safety despite intentional subversion", 428 "authors": ["R. Greenblatt", "B. Shlegeris", "K. Sachan", "F. Roger"], 429 "year": 2024, 430 "arxiv_id": "2312.06942", 431 "relevance": "Control-oriented approach to AI safety; separates detection from enforcement, related to Oracle/Controller design." 432 }, 433 { 434 "title": "Multi-agent risks from advanced AI", 435 "authors": ["L. Hammond", "A. Chan", "J. Clifton"], 436 "year": 2025, 437 "arxiv_id": "2502.14143", 438 "relevance": "Risk taxonomy for multi-agent AI including collusion, miscoordination, and conflict failure modes." 439 }, 440 { 441 "title": "Beyond single-agent safety: A taxonomy of risks in LLM-to-LLM interactions", 442 "authors": ["P. Bisconti", "M. Galisai", "F. Pierucci", "M. Bracale", "M. Prandi"], 443 "year": 2025, 444 "arxiv_id": "2512.02682", 445 "relevance": "Taxonomy of systemic risks in multi-agent LLM systems from the same research group." 446 }, 447 { 448 "title": "Sycophancy to subterfuge: Investigating reward-tampering in large language models", 449 "authors": ["C. Denison", "M. MacDiarmid", "F. Barez"], 450 "year": 2024, 451 "arxiv_id": "2406.10162", 452 "relevance": "Demonstrates reward tampering in LLMs, motivating external governance over reward-based alignment." 453 }, 454 { 455 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 456 "authors": ["E. Hubinger", "C. Denison", "J. Mu"], 457 "year": 2024, 458 "arxiv_id": "2401.05566", 459 "relevance": "Shows deceptive policies can survive safety training, motivating runtime governance." 460 }, 461 { 462 "title": "Hidden in plain text: Emergence and mitigation of steganographic collusion in LLMs", 463 "authors": ["Y. Mathew", "O. Matthews", "R. McCarthy"], 464 "year": 2024, 465 "arxiv_id": "2410.03768", 466 "relevance": "Demonstrates steganographic collusion channels between LLM agents." 467 }, 468 { 469 "title": "Policy-as-prompt: Rethinking content moderation in the age of large language models", 470 "authors": ["K. Palla", "J. L. R. García", "C. Hauff"], 471 "year": 2025, 472 "relevance": "Evaluates prompt-based policy enforcement for LLMs; Constitutional baseline in this paper draws on this approach." 473 }, 474 { 475 "title": "Why do multi-agent LLM systems fail?", 476 "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"], 477 "year": 2025, 478 "arxiv_id": "2503.13657", 479 "relevance": "MAST taxonomy of 14 failure modes in multi-agent LLM systems with 1600+ annotated traces." 480 }, 481 { 482 "title": "Artificial intelligence, algorithmic pricing, and collusion", 483 "authors": ["E. Calvano", "G. Calzolari", "V. Denicolò", "S. Pastorello"], 484 "year": 2020, 485 "relevance": "Foundational work showing Q-learning agents converge to supra-competitive prices without explicit communication." 486 }, 487 { 488 "title": "Constitutional AI: Harmlessness from AI feedback", 489 "authors": ["Y. Bai", "S. Kadavath", "S. Kundu"], 490 "year": 2022, 491 "arxiv_id": "2212.08073", 492 "relevance": "Constitutional AI training technique; this paper's RLINF proposal is positioned as an institutional analogue." 493 } 494 ] 495 }