scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21124B)
      1 {
      2   "paper": {
      3     "title": "Reliable agent engineering should integrate machine-compatible organizational principles",
      4     "authors": [
      5       "R. Patrick Xian",
      6       "Garry A. Gabison",
      7       "Ahmed Alaa",
      8       "Christoph Riedl",
      9       "Grigorios G. Chrysos"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2512.07665",
     14     "doi": "10.48550/arXiv.2512.07665"
     15   },
     16   "scan_version": 2,
     17   "active_modules": [],
     18   "methodology_tags": ["theoretical"],
     19   "key_findings": "The paper proposes three organizational principles for AI agent engineering drawn from organization science: (1) balancing agency and capabilities in agent design via structural differentiation, (2) weighing performance benefits against engineering overheads in agent scaling using economies/diseconomies frameworks, and (3) leveraging internal (configuration, self-reflection) and external (finetuning, verification, human feedback) mechanisms for agent management. These remain unvalidated theoretical proposals supported by analogy to human organizations, not empirical evidence.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": false,
     24         "answer": false,
     25         "justification": "Purely theoretical/position paper with no computational experiments or code artifacts to release."
     26       },
     27       "data_released": {
     28         "applies": false,
     29         "answer": false,
     30         "justification": "No data collection or analysis was performed. The paper's tables (1-4) are conceptual comparison frameworks, not empirical data."
     31       },
     32       "environment_specified": {
     33         "applies": false,
     34         "answer": false,
     35         "justification": "No computational experiments to specify an environment for."
     36       },
     37       "reproduction_instructions": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "No computational experiments to reproduce. The work is a theoretical argument."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No quantitative experiments or empirical results are presented."
     48       },
     49       "significance_tests": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No comparative empirical claims requiring statistical tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No empirical measurements reported."
     58       },
     59       "sample_size_justified": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "Theoretical paper with no experimental samples."
     63       },
     64       "variance_reported": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No experimental runs to report variance across."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No experiments or empirical evaluation conducted."
     75       },
     76       "baselines_contemporary": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No experimental baselines to evaluate."
     80       },
     81       "ablation_study": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No system or experiment to ablate."
     85       },
     86       "multiple_metrics": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No evaluation metrics used."
     90       },
     91       "human_evaluation": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No system outputs to evaluate."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "No datasets or evaluation splits."
    100       },
    101       "per_category_breakdown": {
    102         "applies": false,
    103         "answer": false,
    104         "justification": "No experimental results to break down."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": false,
    108         "answer": false,
    109         "justification": "No experimental evaluation producing failure cases. The paper discusses failure modes conceptually but this question targets experimental evaluation."
    110       },
    111       "negative_results_reported": {
    112         "applies": false,
    113         "answer": false,
    114         "justification": "No experiments that could produce negative results."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims three preliminary accounts of organizational principles for agent engineering (design, scaling, management). Sections 2, 3, and 4 deliver on each of these with detailed discussion, examples, and remarks. The abstract's claims are conceptual and appropriately hedged ('preliminary accounts')."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper uses causal language throughout: Eq. 1 formally claims organizational principles 'improve' system performance (E_T[R(S;H)] > E_T[R(S;H0)]). Remarks 1-3 claim organizational principles improve reliability. These improvement claims are supported only by analogy to human organizations, not by any empirical validation or causal identification strategy."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper's title and framing suggest broad applicability to all AI agent engineering. While it scopes to 'LLM-based' agents (Table 1) and 'for-profit' organizations as the analogy source, it does not explicitly state where its organizational principles would NOT apply or what types of agentic systems the framework is unsuited for."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "Pure theoretical paper presenting no empirical results. Alternative explanations for observed results are not applicable."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "Theoretical paper with no measurements. No proxy-outcome gap to discuss."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No models used in experiments. The paper discusses LLM agents conceptually."
    149       },
    150       "prompts_provided": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No prompting used. This is a theoretical paper."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No experiments or model usage requiring hyperparameters."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding implemented. The paper discusses scaffolding conceptually (e.g., tool-use architectures in Fig. 2) but does not build or test any."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "No data collected or preprocessed."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "Section 5 ('Discussion and conclusion') discusses limitations of existing approaches to agent reliability ('robustness in deep learning is brittle', 'agentic systems are more extensive than LLMs'), but the paper does not include a dedicated limitations section for its own contributions. The limitations of the proposed organizational framework itself (e.g., whether organization-science analogies actually transfer to AI systems, lack of empirical validation) are not substantively discussed."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No specific threats to the validity of the paper's own framework are discussed. The paper acknowledges that 'an agentic system cannot simply adopt the structure of the human organization it aims to emulate' (Section 5) but does not discuss threats to the validity of its analogical reasoning approach or framework."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper implicitly scopes to LLM-based agents and for-profit organizations (Table 1) but does not explicitly state what is out of scope or what the proposed principles do NOT address. No explicit 'what we do not claim' statements."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No data collected. Purely theoretical paper."
    193       },
    194       "data_collection_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No data collection performed."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants and no data collection from subjects."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No data pipeline. Purely theoretical paper."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "The acknowledgments thank the Cooperative AI Foundation for 'inspiring discussions' and a Columbia professor for 'helpful exchanges on the initial idea' but no funding source is explicitly disclosed. Multi-author academic paper from five universities with no mention of grants or sponsorship."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Northeastern University (two departments), Queen Mary University of London, UC Berkeley, UCSF, and University of Wisconsin-Madison."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding source is disclosed, so independence cannot be assessed. The Cooperative AI Foundation is acknowledged for discussions but it is unclear if they provided funding, and they have a stake in AI coordination research."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No model evaluation on any benchmark. Theoretical paper."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No benchmark evaluation performed."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No benchmark evaluation performed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants or experimental study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants or experimental study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "Purely theoretical paper with no method to cost."
    291       },
    292       "compute_budget_stated": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "No computational experiments performed."
    296       }
    297     }
    298   },
    299   "claims": [
    300     {
    301       "claim": "Organizational principles can be constructive for agentic systems, i.e., they improve system performance over baseline principles (formalized in Eq. 1).",
    302       "evidence": "Section 1 defines constructive organizational principles via Eq. 1 (E_T[R(S;H)] > E_T[R(S;H0)]) and argues from organization science literature that these principles have worked for human organizations. No empirical evidence for AI agents is provided.",
    303       "supported": "unsupported"
    304     },
    305     {
    306       "claim": "Single agents with highly centralized intelligence exacerbate safety and misalignment concerns compared to distributed multi-agent systems.",
    307       "evidence": "Section 2.2 cites Mitchell et al. (2025) and Greenblatt (2025) for the theoretical argument. Example 2.1 argues that distributing tool-use across provider-bundled agents 'simplifies the requirements in base model quality.' No original empirical evidence.",
    308       "supported": "weak"
    309     },
    310     {
    311       "claim": "Multiagent debate exhibits a conflict-performance relationship analogous to organizational conflict, with functional and dysfunctional zones.",
    312       "evidence": "Example 3.2 and Fig. 3b draw the parallel to intragroup conflict theory (Jehn & Bendersky, 2003). The paper cites Wynn et al. (2025) and Zhang et al. (2025a) as showing this relationship in AI agents. The parallel is argued by analogy, with references to external empirical work, but the paper presents no original data.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "High reliability (HR) principles from organization science can be implemented in coding agents to improve reliability.",
    317       "evidence": "Table 4 and Appendix C propose specific implementations of five HR principles (Weick & Sutcliffe, 2015) in coding agents: preoccupation with failure via reward change, reluctance to simplify via execution-gated operations, sensitivity to operations via multi-agent debate, commitment to resilience via bug-detection agents, deference to expertise via negotiation-enabled subagents. These are proposals without empirical validation.",
    318       "supported": "weak"
    319     },
    320     {
    321       "claim": "Agent scaling can be decomposed into four regimes (structure, interaction, resource, capability scaling) analogous to organizational scaling.",
    322       "evidence": "Table 2 (Section 3.2) maps agent scaling categories to organizational analogues. References Chen et al. (2024b) for compound AI scaling properties. The decomposition is conceptual and the analogy is not empirically validated.",
    323       "supported": "weak"
    324     },
    325     {
    326       "claim": "Agentic systems need organizational principles because organization science distills principles from human organizations that explain efficiency, productivity, and resilience.",
    327       "evidence": "Section 1 argues from the organization science literature (March & Simon, 1993; Haveman, 2022; McEvily et al., 2003) that these principles generalize beyond human organizations. The transfer to AI agents is argued by analogy, supported by Table 1's comparison.",
    328       "supported": "weak"
    329     }
    330   ],
    331   "red_flags": [
    332     {
    333       "flag": "No empirical validation",
    334       "detail": "The paper proposes three organizational principle frameworks for AI agents but provides zero empirical evidence that these principles actually improve agent reliability or performance. The formal definition in Eq. 1 (constructive principle improving E_T[R]) is never tested. All support comes from analogy to human organizations."
    335     },
    336     {
    337       "flag": "Claims outrun evidence",
    338       "detail": "The paper makes strong prescriptive claims ('should integrate machine-compatible organizational principles') supported only by analogical reasoning. The gap between human organizational behavior and LLM agent behavior is acknowledged (Section 5, Table 1), but the proposed principles are presented as if the analogy were sufficient to justify adoption."
    339     },
    340     {
    341       "flag": "Missing limitations of own framework",
    342       "detail": "Section 5 discusses limitations of existing approaches (robustness, guardrails) but does not discuss limitations of the paper's own framework: whether organizational analogies actually transfer, under what conditions the framework fails, or what would falsify the proposed principles."
    343     },
    344     {
    345       "flag": "Anthropomorphic reasoning risk",
    346       "detail": "The paper draws parallels between human organizations and AI systems (e.g., multiagent debate as intragroup conflict, agent reward as worker motivation) without rigorous analysis of when these analogies break down. Section 5 briefly acknowledges that LLM agents are 'not reliable simulacra of human behavior', yet the paper still builds its entire framework on organizational parallels."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "Why Do Multiagent Systems Fail?",
    352       "authors": ["M. Z. Pan", "M. Cemri", "L. A. Agrawal"],
    353       "year": 2025,
    354       "relevance": "Empirical analysis of failure modes in multiagent AI systems, directly relevant to agent reliability assessment."
    355     },
    356     {
    357       "title": "Commercial LLM Agents Are Already Vulnerable to Simple Yet Dangerous Attacks",
    358       "authors": ["A. Li", "Y. Zhou", "V. C. Raghuram"],
    359       "year": 2025,
    360       "arxiv_id": "2502.08586",
    361       "relevance": "Demonstrates security vulnerabilities in commercial LLM agents, relevant to agent safety and reliability evaluation."
    362     },
    363     {
    364       "title": "Security Challenges in AI Agent Deployment: Insights from a Large Scale Public Competition",
    365       "authors": ["A. Zou", "M. Lin", "E. Jones"],
    366       "year": 2025,
    367       "arxiv_id": "2507.20526",
    368       "relevance": "Large-scale empirical study of AI agent security challenges from deployment competition."
    369     },
    370     {
    371       "title": "τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    372       "authors": ["S. Yao", "N. Shinn", "P. Razavi", "K. R. Narasimhan"],
    373       "year": 2025,
    374       "relevance": "Benchmark for evaluating tool-using AI agents in real-world domain interactions."
    375     },
    376     {
    377       "title": "Are More LLM Calls All You Need? Towards the Scaling Properties of Compound AI Systems",
    378       "authors": ["L. Chen", "J. Q. Davis", "B. Hanin"],
    379       "year": 2024,
    380       "relevance": "Empirical study of scaling properties in compound AI systems, relevant to agent scaling analysis."
    381     },
    382     {
    383       "title": "Gorilla: Large Language Model Connected with Massive APIs",
    384       "authors": ["S. G. Patil", "T. Zhang", "X. Wang", "J. E. Gonzalez"],
    385       "year": 2024,
    386       "relevance": "Tool-use benchmark for LLMs connecting to APIs, relevant to agent tool-use evaluation."
    387     },
    388     {
    389       "title": "AI safety via debate",
    390       "authors": ["G. Irving", "P. Christiano", "D. Amodei"],
    391       "year": 2018,
    392       "arxiv_id": "1805.00899",
    393       "relevance": "Foundational work on multiagent debate for AI safety, a key mechanism discussed in the paper."
    394     },
    395     {
    396       "title": "Let's Verify Step by Step",
    397       "authors": ["H. Lightman", "V. Kosaraju", "Y. Burda"],
    398       "year": 2023,
    399       "relevance": "Process reward models for step-by-step verification, relevant to agent reward design."
    400     },
    401     {
    402       "title": "Fully Autonomous AI Agents Should Not be Developed",
    403       "authors": ["M. Mitchell", "A. Ghosh", "A. S. Luccioni", "G. Pistilli"],
    404       "year": 2025,
    405       "arxiv_id": "2502.02649",
    406       "relevance": "Position paper on risks of fully autonomous AI agents, relevant to agent governance and safety."
    407     },
    408     {
    409       "title": "Executable code actions elicit better LLM agents",
    410       "authors": ["X. Wang", "Y. Chen", "L. Yuan"],
    411       "year": 2024,
    412       "relevance": "Demonstrates that code execution improves LLM agent performance, relevant to agent architecture design."
    413     },
    414     {
    415       "title": "AgentBreeder: Mitigating the AI Safety Risks of Multi-Agent Scaffolds via Self-Improvement",
    416       "authors": ["J. Rosser", "J. N. Foerster"],
    417       "year": 2025,
    418       "relevance": "Addresses safety risks in multi-agent scaffolding through self-improvement, relevant to agent scaling and safety."
    419     },
    420     {
    421       "title": "GuardAgent: Safeguard LLM Agents via Knowledge-Enabled Reasoning",
    422       "authors": ["Z. Xiang", "L. Zheng", "Y. Li"],
    423       "year": 2025,
    424       "relevance": "Implements safety guardrails for LLM agents using specialized guard agents, relevant to agent management mechanisms."
    425     }
    426   ]
    427 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs