ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (20365B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Insured Agents: A Decentralized Trust Insurance Mechanism for Agentic Economy",
      6     "authors": [
      7       "Botao 'Amber' Hu",
      8       "Bangdao Chen"
      9     ],
     10     "year": 2025,
     11     "venue": "AAMAS 2026",
     12     "arxiv_id": "2512.08737",
     13     "doi": "10.48550/arXiv.2512.08737"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "Abstract claims that LLM agents are unreliable (refs [5,52,55,24,22]), that insurance enables verification (Section 3.3), and that the mechanism achieves incentive compatibility (Theorem 3.1). All core claims are supported by content.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Causal claims (e.g., 'insurance calibrates stakes') are justified via game-theoretic analysis. Theorem 3.1 proves the mechanism achieves honest equilibrium under stated conditions. Claims are theoretical, not empirically tested, but formally justified.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Paper frames contribution as a 'design pattern rather than fully specified protocol' with 'minimal incentive analysis' (Abstract, Section 3). Scope is bounded to open agent economies under the stated assumptions (rational agents, deterministic verifier).",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Paper briefly reviews agents-at-stake and reputation systems (Sections 1, 2.1) but does not systematically compare why insurance is superior to pure slashing or reputation-based accountability. Only one interpretation is presented.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Paper claims to achieve 'incentive compatibility' and measures this via equilibrium analysis (Theorem 3.1). The claimed outcome (honest behavior in equilibrium) matches what is measured.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Section 4 titled 'Open Challenges and Research Agenda' outlines future work but is NOT a limitations section discussing what this paper does NOT show. No dedicated threats-to-validity section exists.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Specific threats not discussed. The model assumes perfect verifier (V reveals misbehavior without error), persistent identities, rational agents, and privacy-preserving TEE access. Real-world violations of these assumptions are not addressed.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Paper states it's a 'design pattern' with 'minimal incentive analysis,' bounding scope somewhat, but does not explicitly list what it does NOT address (e.g., 'We do not model distribution shift in agent behavior' or 'We do not address underwriting risk calibration algorithms').",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding acknowledgment or statement of funding sources. Absence of disclosure = NO.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors list affiliations: Botao Hu (University of Oxford) and Bangdao Chen (University College Oxford Blockchain Research Center). Affiliations are disclosed.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No funder mentioned, so independence cannot be assessed. Assuming unfunded independent work.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement. No mention of patents, equity stakes, consulting, or financial relationships to insurance protocols, agentic systems, or blockchain platforms.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Key terms partially defined: Service Agent, Insurer, Verifier are defined mechanically (Section 3.1). However, critical terms like 'misbehavior,' 'trustlessness,' and 'protocol-native' are used but not formally defined. Section 4 provides examples (violate constraints, leak secrets) but not formal definitions.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Contribution is stated clearly: 'insured agents as a design pattern...competitive insurers post slashable stake, price risk through premiums, and provide privacy-preserving verification via TEE audit access.' Positioned as mechanism design + research agenda.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Paper reviews traditional trust mechanisms (Section 2.1: escrow, reputation, slashing) and MAS literature (Section 2.2: mechanism design, reputation engineering). Positions insurance as novel relative to pure slashing. However, deep comparative analysis (why insurance > reputation?) is limited.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "theoretical": {
    117       "formal_quality": {
    118         "assumptions_stated_explicitly": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Section 3.5 explicitly lists: players rational/risk-neutral with common knowledge, verifier V is deterministic/error-free, solvency S_I ≥ L. Main assumptions are explicit, though some (identity persistence, stable agent behavior) are implicit in the model.",
    122           "source": "haiku"
    123         },
    124         "proofs_complete_or_sketched": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Theorem 3.1 includes complete proof via backward induction. Proof works through all four game stages and derives equilibrium conditions clearly. Proof is rigorous but relatively straightforward.",
    128           "source": "haiku"
    129         },
    130         "bounds_tight_or_discussed": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "Theorem 3.1 provides conditions (Eqs. 1-3) for equilibrium but does not discuss tightness. Are these bounds necessary and sufficient? Can conditions be weakened? No discussion.",
    134           "source": "haiku"
    135         },
    136         "counterexamples_explored": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Paper does not explore edge cases or counterexamples: e.g., simultaneous claims against one insurer, insurer insolvency, heterogeneous agent verifiability, or model drift. Boundaries of the mechanism are not tested.",
    140           "source": "haiku"
    141         },
    142         "notation_consistent": {
    143           "applies": true,
    144           "answer": true,
    145           "justification": "Notation is consistent throughout: A, I, U, V for roles; S_I, S_A, L, G, B, F, R, V_future for parameters. No overloading or inconsistencies.",
    146           "source": "haiku"
    147         },
    148         "constructive_vs_existence_noted": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Theorem proves existence of equilibrium but does not provide constructive algorithm to compute it. Section 4 identifies 'underwriting and risk calibration' as open challenges, implicitly acknowledging that computation is not addressed.",
    152           "source": "haiku"
    153         }
    154       },
    155       "connections": {
    156         "connection_to_practice_discussed": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Section 3.4 discusses hierarchical insurers for domain specialization. Section 4 outlines practical implementation challenges: evidence design, verifiable misbehavior, TEE privacy, governance, standardization. Clearly connects theory to what must be built.",
    160           "source": "haiku"
    161         },
    162         "relationship_to_prior_work_clear": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Paper positions itself vs. escrow/reputation/PoS-slashing (Section 2.1) and MAS mechanism design (Section 2.2). Claims insurance enables competitive underwriting and privacy-preserving verification. Relationship is clear but comparative depth is limited.",
    166           "source": "haiku"
    167         },
    168         "computational_complexity_discussed": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No discussion of computational complexity. How to set premiums? How to conduct efficient audits at scale? How do TEEs scale? Section 4 identifies these as research questions but provides no complexity analysis.",
    172           "source": "haiku"
    173         },
    174         "limitations_of_formal_model_stated": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Abstract notes 'minimal incentive analysis,' bounding scope, but specific model limitations are not detailed. Key assumptions (deterministic verifier, identity stability, rational agents) should be discussed as limitations of the model.",
    178           "source": "haiku"
    179         }
    180       }
    181     }
    182   },
    183   "claims": [
    184     {
    185       "claim": "LLM agents are unreliable, prone to hallucination, manipulable, and vulnerable to prompt injection and tool abuse",
    186       "evidence": "References [5] Why Do Multi-Agent LLM Systems Fail, [52] Hallucination Is Inevitable, [55] Universal Adversarial Attacks, [24] Prompt Injection Attack, [22] Prompt Infection, [6] LLM Agents Are Hypersensitive to Nudges, [16] Sleeper Agents.",
    187       "supported": "strong"
    188     },
    189     {
    190       "claim": "Current agent protocols operate under 'nothing-at-stake' dynamics, enabling misbehavior without cost",
    191       "evidence": "Section 1 motivation; comparison to PoS slashing mechanisms (Section 2.1).",
    192       "supported": "moderate"
    193     },
    194     {
    195       "claim": "Insurance mechanism enables incentive-compatible equilibrium where agents act honestly and insurers pay valid claims",
    196       "evidence": "Theorem 3.1 with complete proof via backward induction (Section 3.6). Equilibrium requires S_A + V_future > G (deterrence), 2L + B > F (access to justice), S_I ≥ L (solvency).",
    197       "supported": "strong"
    198     },
    199     {
    200       "claim": "Hierarchical insurer markets can specialize verification (e.g., Safety Insurer, Financial Insurer) and aggregate trust via Master Insurer",
    201       "evidence": "Section 3.4 describes the hierarchical structure and gives examples. Mechanism design, no empirical validation.",
    202       "supported": "moderate"
    203     },
    204     {
    205       "claim": "Privacy-preserving verification via TEE audit access enables agents to disclose selectively to chosen insurers without global public disclosure",
    206       "evidence": "Section 3.3 argues agents voluntarily grant audit access in exchange for coverage, creating 'voluntary verification market rather than centralized surveillance.' Proposed mechanism, not implemented or tested.",
    207       "supported": "moderate"
    208     },
    209     {
    210       "claim": "Insurance is superior to pure 'agents-at-stake' slashing for heterogeneous agentic tasks because verification is decentralized via competitive underwriting",
    211       "evidence": "Section 1 motivation contrasts with PoS slashing; abstract claims insurance 'decentralizes verification via competitive underwriting.' Asserted but not rigorously compared with slashing alternatives.",
    212       "supported": "weak"
    213     }
    214   ],
    215   "methodology_tags": [
    216     "theoretical"
    217   ],
    218   "key_findings": "The paper proposes insured agents as a mechanism design pattern for open agent economies: specialized insurer agents post slashable collateral on behalf of service agents, earning premiums and receiving privileged TEE audit access for verification. Game-theoretic analysis (Theorem 3.1) proves incentive-compatible equilibrium exists under three conditions (access to justice, solvency, deterrence), yielding honest agent behavior and accurate insurer claims adjudication in equilibrium. The mechanism addresses heterogeneous verification challenges by enabling hierarchical, domain-specialized insurers rather than universal on-chain proofs. The contribution is a design pattern and research agenda, not an implemented system.",
    219   "red_flags": [
    220     {
    221       "flag": "Perfect verifier assumption",
    222       "detail": "Theorem assumes V is deterministic and always reveals truth without error. Real verification is imperfect, partial, and subject to gaming. No discussion of what happens if V is wrong."
    223     },
    224     {
    225       "flag": "Rational agents assumption",
    226       "detail": "Equilibrium proof assumes rational, risk-neutral agents with common knowledge. LLM agents are demonstrably irrational (hallucinate, get adversarially manipulated). Mechanism breaks if agents don't follow assumptions."
    227     },
    228     {
    229       "flag": "No empirical validation",
    230       "detail": "Paper is purely theoretical with no experiments, simulations, toy examples, or proof-of-concept. Cannot assess whether the mechanism works in practice."
    231     },
    232     {
    233       "flag": "Parameter setting unclear",
    234       "detail": "How are S_I, S_A, and premiums P actually determined? Section 4 identifies 'underwriting and risk calibration' as open challenges, suggesting the practical mechanism is unspecified."
    235     },
    236     {
    237       "flag": "Identity stability not addressed",
    238       "detail": "Mechanism requires persistent agent identities that can be slashed. But LLM agents can reset, update models, change prompts, change tools. This undermines collateral binding (mentioned in Section 1, not solved)."
    239     },
    240     {
    241       "flag": "Insurer collusion not analyzed",
    242       "detail": "Game analyzes individual insurer incentives but does not address what happens if insurers collude to deny valid claims or charge monopoly premiums. Cartelization risk ignored."
    243     },
    244     {
    245       "flag": "Scalability of verification not discussed",
    246       "detail": "How do TEE audits scale to thousands/millions of agents? What is the computational and privacy cost? No complexity analysis."
    247     },
    248     {
    249       "flag": "No analysis of moral hazard and adverse selection",
    250       "detail": "Section 4 mentions these as 'classic problems' but paper does not analyze them. High-risk agents may seek coverage; insured agents may take greater risks. Design space is unaddressed."
    251     },
    252     {
    253       "flag": "Limited comparison with alternatives",
    254       "detail": "Paper does not rigorously compare insurance with pure slashing, reputation systems, or hybrid approaches. Why insurance specifically? Only assertion."
    255     },
    256     {
    257       "flag": "No limitations section",
    258       "detail": "Section 4 lists open challenges and research agenda, not limitations of what THIS paper does NOT show. No explicit discussion of what the mechanism fails to address."
    259     }
    260   ],
    261   "cited_papers": [
    262     {
    263       "title": "Why Do Multi-Agent LLM Systems Fail?",
    264       "authors": "Cemri et al.",
    265       "year": 2025,
    266       "relevance": "Documents empirical failures of multi-agent LLM systems; motivates need for accountability mechanisms."
    267     },
    268     {
    269       "title": "Internet of Agents: Fundamentals, Applications, and Challenges",
    270       "authors": "Wang et al.",
    271       "year": 2025,
    272       "relevance": "Defines vision of agentic web with autonomous agents; context for decentralized agent protocols."
    273     },
    274     {
    275       "title": "Hallucination Is Inevitable: An Innate Limitation of Large Language Models",
    276       "authors": "Xu et al.",
    277       "year": 2025,
    278       "relevance": "Demonstrates fundamental unreliability of LLM agents; motivates trustlessness assumption."
    279     },
    280     {
    281       "title": "ERC-8004: Trustless Agents—Discover agents and establish trust through reputation and validation",
    282       "authors": "De Rossi et al.",
    283       "year": 2025,
    284       "relevance": "Emerging Ethereum standard for agent validation and reputation; directly relevant to protocol standardization agenda."
    285     },
    286     {
    287       "title": "Agentic Web: Weaving the Next Web with AI Agents",
    288       "authors": "Yang et al.",
    289       "year": 2025,
    290       "relevance": "Vision paper on distributed agent coordination; context for open agent economies."
    291     },
    292     {
    293       "title": "The Agentic Economy",
    294       "authors": "Rothschild et al.",
    295       "year": 2025,
    296       "relevance": "Economic framework for multi-agent systems; foundational for insurance-as-market-mechanism framing."
    297     },
    298     {
    299       "title": "Agent2Agent Protocol (A2A): A New Era of Agent Interoperability",
    300       "authors": "Surapaneni et al.",
    301       "year": 2025,
    302       "relevance": "Agent communication protocol; interoperability requirement for insurance mechanism deployment."
    303     },
    304     {
    305       "title": "Model Context Protocol (MCP): Landscape, Security Threats, and Future Research Directions",
    306       "authors": "Hou et al.",
    307       "year": 2025,
    308       "relevance": "Tool-use protocol for agents; relevant to verifiable misbehavior and constraint violations in Section 4."
    309     }
    310   ],
    311   "engagement_factors": {
    312     "practical_relevance": {
    313       "score": 1,
    314       "justification": "Proposes a concrete mechanism but no prototype, implementation, or feasibility demonstration. Purely theoretical design pattern."
    315     },
    316     "surprise_contrarian": {
    317       "score": 2,
    318       "justification": "Insurance as trust mechanism for agents is relatively novel and contrarian to pure slashing. Reframes trust as market rather than reputation, which is somewhat surprising."
    319     },
    320     "fear_safety": {
    321       "score": 1,
    322       "justification": "Does not advance novel safety claims. Frames insurance as solution to known agent unreliability problems, not new risks."
    323     },
    324     "drama_conflict": {
    325       "score": 0,
    326       "justification": "No controversial or conflict angles. Technical mechanism design paper with no provocative claims or disagreements with prior work."
    327     },
    328     "demo_ability": {
    329       "score": 0,
    330       "justification": "No implementation, prototype, or simulation. Cannot run or demo the mechanism."
    331     },
    332     "brand_recognition": {
    333       "score": 1,
    334       "justification": "Oxford affiliation is prestigious, but individual authors not prominent in agent/AI communities. Published at AAMAS 2026 (good venue but not top-tier like NeurIPS)."
    335     }
    336   },
    337   "hn_data": {
    338     "threads": [
    339       {
    340         "hn_id": "45890174",
    341         "title": "Orbital Characterization of a Newly Discovered Small Satellite Around Quaoar",
    342         "points": 3,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=45890174",
    345         "created_at": "2025-11-11T17:26:07Z"
    346       },
    347       {
    348         "hn_id": "38340183",
    349         "title": "Containerisation for High Performance Computing Systems: Survey and Prospects",
    350         "points": 3,
    351         "comments": 0,
    352         "url": "https://news.ycombinator.com/item?id=38340183",
    353         "created_at": "2023-11-20T00:09:54Z"
    354       },
    355       {
    356         "hn_id": "42959152",
    357         "title": "SmolLM2: When Smol Goes Big – Data-Centric Training of a Small Language Model",
    358         "points": 1,
    359         "comments": 0,
    360         "url": "https://news.ycombinator.com/item?id=42959152",
    361         "created_at": "2025-02-06T04:41:22Z"
    362       },
    363       {
    364         "hn_id": "42739555",
    365         "title": "Enhancing CGRA Efficiency Through Aligned Compute and Communication Provisioning",
    366         "points": 1,
    367         "comments": 0,
    368         "url": "https://news.ycombinator.com/item?id=42739555",
    369         "created_at": "2025-01-17T16:20:18Z"
    370       }
    371     ],
    372     "top_points": 3,
    373     "total_points": 8,
    374     "total_comments": 0
    375   }
    376 }

Impressum · Datenschutz