scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28770B)
      1 {
      2   "paper": {
      3     "title": "Agent Contracts: A Formal Framework for Resource-Bounded Autonomous AI Systems",
      4     "authors": [
      5       "Qing Ye",
      6       "Jing Tan"
      7     ],
      8     "year": 2026,
      9     "venue": "arXiv",
     10     "arxiv_id": "2601.08815"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper provides a GitHub URL: https://github.com/flyersworder/agent-contracts (footnote 3, Section 8.1). The authors state 'Implementation available at https://github.com/flyersworder/agent-contracts. The framework is under active development; we provide experiment code and data for reproducibility.'"
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper uses publicly available benchmarks: LiveCodeBench (arXiv:2403.07974) and OpenR1 logic puzzles. The authors also state they provide experiment data via the GitHub repository for reproducibility."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper mentions using Google ADK and LiteLLM as frameworks, but does not provide requirements.txt, Dockerfile, or specific library versions. Mentioning 'Google ADK' and 'LiteLLM' without version numbers is insufficient to recreate the environment."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "While code is linked on GitHub with a note about reproducibility, the paper itself contains no step-by-step instructions for reproducing results. The paper describes experiment designs but not the commands or scripts needed to replicate them."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper uses 'bootstrap confidence intervals (10,000 resamples) with BCa correction' (Section 8). Results tables report statistical significance with p-values, and the Research Pipeline experiment reports variance reduction with a 'Bayesian probability' (88.5%)."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Statistical significance tests are used: paired t-test for token usage (p=0.0007), iteration counts (p<0.0001), LLM calls (p<0.0001), and crisis communication token reduction (p=0.005). The non-significant 7.1pp success rate difference is also tested (p=0.13)."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are reported with baseline context throughout: '90% token reduction' (from 34,606 to 3,461 tokens), '525× lower variance,' '23% token reduction,' and '70%→86% success rate' with explicit absolute and relative differences."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Sample sizes (n=70, n=50, n=50, n=24) are stated but not justified. No power analysis is provided, and there is no acknowledgment of whether these sample sizes are sufficient for the claims made—particularly the 24-problem Crisis Communication experiment."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Variance reduction is a key result: '525× lower variance (5.29B vs 10.1M)' for Code Review and '26.7× lower variance (σ: 1.75 vs 9.07)' for Research Pipeline. Variance is explicitly reported alongside mean results."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Each experiment uses a within-subjects or between-conditions comparison: CONTRACTED vs UNCONTRACTED baselines in Code Review, Research Pipeline, and Crisis Communication; URGENT vs ECONOMICAL vs BALANCED modes in Strategy Modes."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The UNCONTRACTED condition is a natural baseline (no contract enforcement) rather than a prior system. This is appropriate for isolating the framework's own contribution, as opposed to benchmarking it against competing systems. The comparison is contemporary and relevant."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No ablation study isolates individual components of the Agent Contract framework (e.g., what happens with resource constraints but no success criteria Φ, or temporal constraints T but no resource constraints R). The four experiments test different scenarios but not systematic component removal."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics are used across experiments: token usage, variance, iteration counts, LLM calls, success rate, timeout rate, reasoning tokens, and average time. Code Review alone reports 5 metrics."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The Research Pipeline experiment uses 'multi-judge LLM evaluation following best practices for rating indeterminacy' (Section 8.3)—LLMs evaluate quality, not humans. No human evaluation of system outputs is performed in any experiment."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The Code Review experiment uses LiveCodeBench problems 'released post-February 2025, after model cutoff' and the Strategy Modes experiment uses OpenR1 problems 'released February 2025, after model cutoff.' These are post-training data and function as true held-out evaluation sets."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The Code Review experiment breaks down results by difficulty: 'medium-difficulty problems show 92% token savings versus 76% for easy' (Section 8.2). The Research Pipeline uses 'five categories (technology, science, business, health, society)' though category-level results are not shown in detail."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper discusses failure modes: one UNCONTRACTED agent 'failed entirely—stuck in an evaluation loop without submitting output' (Section 8.2), and the Research Pipeline detected 'a runaway agent that exceeded its 40K token budget (56K consumed)' (Section 8.3)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper explicitly reports that contracted execution has a 7.1 percentage point lower success rate (52.9% vs 60.0%, p=0.13 NS) in Code Review—a cost of governance. Section 7 ('Fundamental Limitations') honestly describes what contracts cannot guarantee."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims of '90% token reduction,' '525× lower variance,' 'zero conservation violations,' and 'measurable quality-resource tradeoffs' are all directly supported by results in Section 8. The claims accurately reflect the experimental findings."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper uses a within-subjects design (same problems run in CONTRACTED and UNCONTRACTED conditions) for Code Review, enabling controlled causal inference. The design allows attributing token reduction to the contract mechanism directly, not confounding factors."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title 'A Formal Framework for Resource-Bounded Autonomous AI Systems' claims general applicability. Empirical results are limited to Gemini 2.5 Flash/Flash-Lite on LiveCodeBench and OpenR1 with Google ADK/LiteLLM. The conclusion states 'formal governance becomes essential... Agent Contracts provide one such foundation' without bounding this to the tested models. Per schema guidance, a broad title backed by results on only a single model family and two benchmarks is scored NO."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "Section 7 discusses enforcement limitations (single-call budget overruns cannot be prevented) and Section 8.3 provides a sensitivity analysis for the outlier. However, the core alternative explanation — that simple budget-aware prompting or iteration limits alone could produce equivalent results without the formal contract framework — is not discussed. The paper does not consider whether the observed token reduction and variance reduction are attributable to the formal framework versus the simpler mechanisms it bundles together."
    129       }
    130     },
    131     "setup_transparency": {
    132       "model_versions_specified": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper states 'Gemini 2.5 Flash and Flash-Lite (knowledge cutoff: January 2025)' (Section 8) but does not provide specific API version identifiers or snapshot dates. Marketing names like 'Gemini 2.5 Flash' without an API version or snapshot date do not meet the criterion."
    136       },
    137       "prompts_provided": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper describes prompting strategies (e.g., 'budget-aware prompts' and 'dynamic status updates showing both token consumption and iteration progress') and mentions 'Budget: {used}/{total}' as an example, but does not provide full prompt text. A description of what the prompts do, rather than the actual prompt text, counts as NO."
    141       },
    142       "hyperparameters_reported": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "No hyperparameters (temperature, top-p, max tokens, or other sampling settings) are reported for the LLM calls. The paper reports resource constraints like token budgets and iteration limits, but not LLM inference hyperparameters."
    146       },
    147       "scaffolding_described": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The agentic scaffolding is described in detail: Google ADK with DelegatingAdkAgent, Coder↔Reviewer pipeline mechanics, tool descriptions (test_code, web search), budget-aware prompting mechanism, runtime monitoring, and conservation law enforcement (Sections 8.2–8.4)."
    151       },
    152       "data_preprocessing_documented": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The paper states which benchmarks were used (LiveCodeBench, OpenR1) and their difficulty categories, but does not describe how problems were selected, filtered, or preprocessed. For example, the selection of '31 easy and 39 medium difficulty' LiveCodeBench problems lacks filtering criteria."
    156       }
    157     },
    158     "limitations_and_scope": {
    159       "limitations_section_present": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 7 ('Fundamental Limitations and Practical Enforcement') is a dedicated section discussing limitations. It covers single-call enforcement constraints, enforcement capabilities, and future infrastructure requirements needed for hard guarantees."
    163       },
    164       "threats_to_validity_specific": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The paper identifies specific threats: token consumption is only known after LLM call completion (Section 7.1), contracts provide 'best-effort enforcement' not hard guarantees, the Research Pipeline outlier analysis discusses how one catastrophic UNCONTRACTED failure skews variance results, and that same experiment (Section 8.3) relies on LLM-as-judge evaluation with acknowledged 'rating indeterminacy.'"
    168       },
    169       "scope_boundaries_stated": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 7.1 explicitly states 'contracts cannot prevent a single expensive call from exceeding budget,' and Section 7.3 states 'Until then, contracts remain most valuable for multi-call and multi-agent scenarios.' These are specific statements about what the framework does NOT provide."
    173       }
    174     },
    175     "data_integrity": {
    176       "raw_data_available": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper states 'we provide experiment code and data for reproducibility' via the GitHub repository (footnote 3). The benchmarks used (LiveCodeBench, OpenR1) are publicly available, enabling independent verification."
    180       },
    181       "data_collection_described": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The benchmarks and their sources are described: LiveCodeBench (arXiv:2403.07974, post-February 2025 problems), OpenR1 (released February 2025), and research topics across five categories for the Research Pipeline experiment."
    185       },
    186       "recruitment_methods_described": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "No human participants involved. Data comes from public benchmarks (LiveCodeBench, OpenR1) and synthetically generated research topics. NA for recruitment."
    190       },
    191       "data_pipeline_documented": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The selection criteria for specific problems within LiveCodeBench (31 easy, 39 medium from 70 total) and OpenR1 (50 medium-difficulty) are not explained. How 'medium-difficulty' was defined or identified is not documented."
    195       }
    196     },
    197     "conflicts_of_interest": {
    198       "funding_disclosed": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "Both authors are identified as 'Independent Researcher' with personal email addresses. There is no acknowledgments section and no funding disclosure anywhere in the paper."
    202       },
    203       "affiliations_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Authors are listed as 'Independent Researcher' with personal emails (yeqi519@gmail.com, jtan@live.de), indicating no institutional affiliation with the evaluated systems (Google ADK, Gemini). Their independence from evaluated vendors is implicit."
    207       },
    208       "funder_independent_of_outcome": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No funding is disclosed or apparent. Both authors are independent researchers with no stated funding source."
    212       },
    213       "financial_interests_declared": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "There is no competing interests statement in the paper. The authors do not declare whether they hold equity or financial interests related to the Agent Contracts framework or related products. Absence of disclosure counts as NO."
    217       }
    218     },
    219     "contamination": {
    220       "training_cutoff_stated": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "The paper explicitly states 'Gemini 2.5 Flash and Flash-Lite (knowledge cutoff: January 2025)' in Section 8. This is used to select benchmarks released after the training cutoff."
    224       },
    225       "train_test_overlap_discussed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "The paper directly addresses contamination: Code Review uses 'LiveCodeBench problems released post-February 2025, after model cutoff' and Strategy Modes uses 'OpenR1 (released February 2025, after model cutoff).' The selection of post-cutoff benchmarks is an explicit contamination mitigation strategy."
    229       },
    230       "benchmark_contamination_addressed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "The paper explicitly chose benchmarks released after the January 2025 model training cutoff (LiveCodeBench post-February 2025, OpenR1 February 2025) specifically to avoid contamination. This is a direct and appropriate mitigation strategy."
    234       }
    235     },
    236     "human_studies": {
    237       "pre_registered": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No human participants. All experiments use LLM agents on benchmark problems."
    241       },
    242       "irb_or_ethics_approval": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants. All experiments use LLM agents on benchmark problems."
    246       },
    247       "demographics_reported": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants. All experiments use LLM agents on benchmark problems."
    251       },
    252       "inclusion_exclusion_criteria": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. All experiments use LLM agents on benchmark problems."
    256       },
    257       "randomization_described": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. All experiments use LLM agents on benchmark problems."
    261       },
    262       "blinding_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants. All experiments use LLM agents on benchmark problems."
    266       },
    267       "attrition_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants. All experiments use LLM agents on benchmark problems."
    271       }
    272     },
    273     "cost_and_practicality": {
    274       "inference_cost_reported": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "The paper reports tokens consumed per task (34,606 vs 3,461 in Code Review, Section 8.2), wall-clock time per task (6.9s, 12.5s, 16.9s in Strategy Modes, Section 8.4), reasoning tokens (0, 718, 1519), and LLM call counts. The schema lists 'tokens consumed' and 'wall-clock time' as sufficient indicators of inference cost/latency."
    278       },
    279       "compute_budget_stated": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No total computational budget or GPU hours are stated for the experiments. The paper reports per-task token usage but not total experiment compute or API spend across all 244 experimental runs."
    283       }
    284     }
    285   },
    286   "claims": [
    287     {
    288       "claim": "Agent Contracts achieve 90% token reduction with 525× lower variance in iterative coding workflows compared to uncontracted execution.",
    289       "evidence": "Code Review experiment (n=70 LiveCodeBench problems): CONTRACTED 3,461 tokens vs UNCONTRACTED 34,606 tokens (p=0.0007, paired t-test); variance 10.1M vs 5.29B (Section 8.2, Table).",
    290       "supported": "strong"
    291     },
    292     {
    293       "claim": "Conservation laws for multi-agent delegation achieve zero violations across all tested trials.",
    294       "evidence": "Research Pipeline experiment (n=50): zero conservation violations across all 50 trials; one runaway agent detected and halted at 56K tokens (exceeding 40K budget) as evidence of enforcement (Section 8.3).",
    295       "supported": "strong"
    296     },
    297     {
    298       "claim": "Contract modes produce measurable quality-resource tradeoffs: BALANCED mode achieves 86% success versus 70% for URGENT.",
    299       "evidence": "Strategy Modes experiment (n=50 OpenR1 logic puzzles): BALANCED 86%, ECONOMICAL 76%, URGENT 70%, with BALANCED vs URGENT difference p≈0.05 (Section 8.4, Table).",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Contracted execution achieves equivalent or better agent reliability while constraining resources, with iteration governance preventing agent failures.",
    304       "evidence": "Crisis Communication experiment (n=24): 23% token reduction (p=0.005) with statistically equivalent quality (p=0.32); one UNCONTRACTED agent failed entirely while CONTRACTED succeeded (Section 8.2).",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "Existing agent frameworks (LangGraph, AutoGen, CrewAI, etc.) lack formal governance mechanisms for resource budgets, success criteria, and conservation laws.",
    309       "evidence": "Table 1 survey of 8 frameworks showing 'Agent Contract: –' (none) for all, with operational controls but no cost budgets, temporal deadlines, success criteria, or conservation laws (Section 3.4).",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "The 7.1 percentage point success rate reduction from contracting (60.0% to 52.9%) is not statistically significant.",
    314       "evidence": "Code Review experiment: success rate difference p=0.13 (NS) from paired t-test (Section 8.2, Table). The paper honestly reports this trade-off.",
    315       "supported": "strong"
    316     }
    317   ],
    318   "methodology_tags": [
    319     "theoretical",
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "The paper introduces Agent Contracts, a formal framework (the tuple C = (I, O, S, R, T, Φ, Ψ)) for resource-bounded autonomous AI, and validates it across four experiments showing 90% token reduction and 525× lower variance in iterative coding workflows, zero conservation violations in multi-agent delegation, and clear quality-resource tradeoffs across contract modes. A key honest finding is that contracted execution lowers the task success rate by 7.1 percentage points (60.0% to 52.9%; p=0.13, not statistically significant), representing the governance cost. The framework addresses a genuine gap: no existing multi-agent framework (LangGraph, AutoGen, CrewAI, OpenAI Agents SDK, Google ADK, Bedrock, LlamaIndex, smolagents) provides formal resource governance with conservation laws. A notable limitation acknowledged in the paper is that contracts provide best-effort rather than hard enforcement because LLM token consumption is unknowable until after a call completes.",
    323   "red_flags": [
    324     {
    325       "flag": "LLM-as-judge for quality assessment",
    326       "detail": "The Research Pipeline experiment uses 'multi-judge LLM evaluation' for quality scoring, which introduces circularity: the same type of model being evaluated (LLMs) is also the evaluator. While the paper cites Guerdan et al. (2025) on 'rating indeterminacy,' this does not eliminate the concern that LLM quality scores may not reflect true output quality."
    327     },
    328     {
    329       "flag": "Model version underspecification",
    330       "detail": "The paper uses 'Gemini 2.5 Flash and Flash-Lite' without API version or snapshot dates. Model behavior changes across versions, making exact reproduction impossible. The training cutoff (January 2025) is stated for contamination purposes but is insufficient for reproducibility."
    331     },
    332     {
    333       "flag": "Small sample for Crisis Communication experiment",
    334       "detail": "The Crisis Communication experiment uses only 24 scenarios, which is underpowered for detecting small effects. With n=24, the 23% token reduction (p=0.005) is credible, but the single agent failure event (1/24 UNCONTRACTED vs 0/24 CONTRACTED) is anecdotal at this sample size."
    335     },
    336     {
    337       "flag": "Prompts not provided",
    338       "detail": "Budget-aware prompts are central to the enforcement mechanism (Section 5.2), yet the actual prompt text is not provided. The paper shows 'Budget: {used}/{total}' as an example template but not the complete prompts. This limits reproducibility of the core mechanism."
    339     },
    340     {
    341       "flag": "No hyperparameters reported",
    342       "detail": "Temperature and other LLM sampling settings are not disclosed. These significantly affect output variance and quality, yet they are absent from all four experiments. The reported variance reduction may partly reflect the effect of the contracted iteration limits rather than the formal framework per se."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "Why do multi-agent LLM systems fail?",
    348       "authors": [
    349         "Cemri, M.",
    350         "Pan, M.Z.",
    351         "Yang, S.",
    352         "Agrawal, L.A."
    353       ],
    354       "year": 2025,
    355       "arxiv_id": "2503.13657",
    356       "relevance": "Analyzes failure modes of multi-agent LLM systems, directly relevant to understanding governance challenges in agentic AI."
    357     },
    358     {
    359       "title": "Token-budget-aware LLM reasoning",
    360       "authors": [
    361         "Han, T.",
    362         "Wang, Z.",
    363         "Fang, C.",
    364         "Zhao, S.",
    365         "Ma, S.",
    366         "Chen, Z."
    367       ],
    368       "year": 2024,
    369       "arxiv_id": "2412.18547",
    370       "relevance": "Introduces token-budget-aware reasoning achieving 68% token reduction, a key related work for resource-bounded agent evaluation."
    371     },
    372     {
    373       "title": "Reasoning in token economies: Budget-aware evaluation of LLM reasoning strategies",
    374       "authors": [
    375         "Wang, J.",
    376         "Jain, S.",
    377         "Zhang, D.",
    378         "Ray, B.",
    379         "Kumar, V.",
    380         "Athiwaratkun, B."
    381       ],
    382       "year": 2024,
    383       "doi": "10.18653/v1/2024.emnlp-main.1106",
    384       "relevance": "Demonstrates that compute-equalized comparison changes conclusions about LLM reasoning strategies, foundational for understanding resource-aware evaluation."
    385     },
    386     {
    387       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    388       "authors": [
    389         "Jain, N.",
    390         "Han, K.",
    391         "Gu, A.",
    392         "Li, W.D."
    393       ],
    394       "year": 2024,
    395       "arxiv_id": "2403.07974",
    396       "relevance": "The code benchmark used in the Code Review experiment; relevant as a contamination-resistant benchmark for LLM code evaluation."
    397     },
    398     {
    399       "title": "Validating LLM-as-a-judge systems under rating indeterminacy",
    400       "authors": [
    401         "Guerdan, L.",
    402         "Barocas, S.",
    403         "Holstein, K.",
    404         "Wallach, H.",
    405         "Wu, Z.S.",
    406         "Chouldechova, A."
    407       ],
    408       "year": 2025,
    409       "relevance": "Provides methodology for LLM-as-judge evaluation used in the Research Pipeline experiment; relevant for evaluating LLM evaluation reliability."
    410     },
    411     {
    412       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    413       "authors": [
    414         "Hong, S.",
    415         "Zhuge, M.",
    416         "Chen, J."
    417       ],
    418       "year": 2023,
    419       "arxiv_id": "2308.00352",
    420       "relevance": "Key multi-agent LLM framework integrated with human workflow patterns; relevant for comparing coordination approaches."
    421     },
    422     {
    423       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation",
    424       "authors": [
    425         "Wu, Q.",
    426         "Bansal, G.",
    427         "Zhang, J.",
    428         "Wu, Y."
    429       ],
    430       "year": 2023,
    431       "arxiv_id": "2308.08155",
    432       "relevance": "Foundational multi-agent LLM framework; relevant for understanding multi-agent coordination patterns and governance gaps."
    433     },
    434     {
    435       "title": "ReAct: Synergizing reasoning and acting in language models",
    436       "authors": [
    437         "Yao, S.",
    438         "Zhao, J.",
    439         "Yu, D.",
    440         "Du, N."
    441       ],
    442       "year": 2023,
    443       "arxiv_id": "2210.03629",
    444       "relevance": "Introduced the reasoning-acting paradigm for LLM agents; foundational work for understanding agentic AI behavior and resource consumption patterns."
    445     },
    446     {
    447       "title": "Multi-agent collaboration mechanisms: A survey of LLMs",
    448       "authors": [
    449         "Tran, K.T.",
    450         "Dao, D.",
    451         "Nguyen, M.D.",
    452         "Pham, Q.V.",
    453         "O'Sullivan, B.",
    454         "Nguyen, H.D."
    455       ],
    456       "year": 2025,
    457       "arxiv_id": "2501.06322",
    458       "relevance": "Surveys collaboration mechanisms across multi-agent LLM systems; directly relevant for understanding the landscape Agent Contracts addresses."
    459     },
    460     {
    461       "title": "Formalizing the safety, security, and functional properties of agentic AI systems",
    462       "authors": [
    463         "Allegrini, E.",
    464         "Shreekumar, A.",
    465         "Celik, Z.B."
    466       ],
    467       "year": 2025,
    468       "arxiv_id": "2510.14133",
    469       "relevance": "Proposes formal verification framework for agentic AI with temporal logic properties; complementary work to Agent Contracts for formal AI governance."
    470     },
    471     {
    472       "title": "Budget-aware tool-use enables effective agent scaling",
    473       "authors": [
    474         "Liu, T.",
    475         "Wang, Z.",
    476         "Miao, J."
    477       ],
    478       "year": 2025,
    479       "arxiv_id": "2511.17006",
    480       "relevance": "Demonstrates that explicit budget awareness enables effective tool-use scaling in agents; directly related to resource governance in agentic systems."
    481     },
    482     {
    483       "title": "Self-resource allocation in multi-agent LLM systems",
    484       "authors": [
    485         "Amayuelas, A.",
    486         "Yang, J.",
    487         "Agashe, S."
    488       ],
    489       "year": 2025,
    490       "arxiv_id": "2504.02051",
    491       "relevance": "Studies LLMs as resource allocators in multi-agent systems; relevant for understanding self-governance capabilities vs. formal contract enforcement."
    492     }
    493   ]
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs