scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26267B)
      1 {
      2   "paper": {
      3     "title": "Artificial Organisations",
      4     "authors": ["William Waites"],
      5     "year": 2026,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2602.13275"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "Section 8.1 provides source code at https://codeberg.org/wwaites/persevere."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Section 8.1 states: 'All notes, drafts, composition project artefacts and so forth underlying this document are available at https://codeberg.org/wwaites/persevere-data.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. The paper mentions using Claude 4.5 models but provides no dependency or environment setup details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The code and data repositories are linked but no README or reproduction guide is described."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Section 6 reports 95% confidence intervals for all key metrics: absolute improvement [20.32, 29.64], relative improvement [57.04%, 102.73%], per-iteration improvement [11.75, 17.72], and iterations to convergence [3.75, 4.86]."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., quality improvement over iterations, fabrication detection rates) but does not report any statistical significance tests (no p-values, t-tests, or similar)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Section 6 reports effect sizes with baseline context: mean absolute improvement of 24.91 points, mean relative improvement of 78.85%, per-iteration improvement of 14.70 points. The absolute point values with CIs provide magnitude context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The sample of 474 projects (98 with complete review cycles) is described but never justified. No power analysis or reasoning for why this sample size is adequate for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 6 reports confidence intervals for all key metrics, and Figure 2 shows the full distribution of score improvements and iteration counts. The wide CI for relative improvement [57.04%, 102.73%] explicitly reflects high variability."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper reports only its own system's performance. No comparison against a single-agent baseline, policy-based compartmentalisation, or any alternative architecture. Section 8 explicitly identifies controlled architectural comparison as future work."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines are included at all, so there are no contemporary baselines to assess and the criterion is unmet. The paper acknowledges this gap in Section 8."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is performed. Section 8 explicitly proposes ablation studies as future work: 'controlled architectural comparison through ablation studies would determine whether compartmentalisation actually matters.'"
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 6 reports multiple metrics: fabrication detection rate (52%), project completion rate (69%), absolute quality improvement (24.91 points), relative improvement (78.85%), iterations to convergence (4.30), and cost per project ($0.29)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "All evaluation is automated through the system's own Corroborator and Critic agents. No independent human evaluation of output quality is reported. The paper makes claims about quality improvement but quality is assessed only by the system's own LLM-based Critic."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a benchmark evaluation paper. The system processes composition tasks; there is no train/test split or held-out evaluation set."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-category breakdown of verdict distribution (passed/failed/mixed). Section 6 breaks down costs by outcome (completed $0.29, failed $0.20, aborted $0.59). Figure 2 shows distributions of improvement and iteration counts."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4 documents six consecutive FABRICATED verdicts during self-documentation. Section 5 documents a case where the system was given an impossible task. Section 6 reports 18.1% failure rate and 12.7% abort rate with discussion of causes."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports a 31% non-completion rate (18.1% failed + 12.7% aborted), three projects with negative improvement (Figure 2 caption), and the honest refusal case where the system could not complete the assigned task. Section 7 acknowledges the system cannot operate without human oversight."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims (52% fabrication detection, 79% quality improvement, 4.3 mean iterations, progression toward honest refusal) are all supported by Section 6 results and Section 5 case study. The abstract uses hedged language ('patterns consistent with the institutional hypothesis', 'findings motivate controlled investigation')."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper implies causal claims throughout (e.g., 'architectural enforcement produces reliable collective behaviour', 'adversarial feedback loop drove this progression') but the study design is observational with no controlled comparisons. Section 7 acknowledges: 'we cannot definitively isolate architectural effects from instruction-following.' However, the language regularly implies causation without adequate causal identification."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 7 explicitly bounds generalisability: 'All results were obtained using Claude 4.5 models during January-February 2026; generalisability to other large language model architectures remains unknown.' Section 5 states: 'It does not support claims about other models, architectures, domains, or systems lacking similar capacities.' The paper consistently qualifies scope throughout."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 7 discusses the reflexivity confound: 'did honest refusal emerge from architectural constraints enabling detection and iteration, or from theoretical exposure to refusal frameworks?' Section 5 discusses confounded variables preventing causal isolation. The paper explicitly considers whether results reflect model-specific behaviour rather than architectural principles."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Section 7 states 'Claude 4.5 models (Haiku, Sonnet, Opus)' but does not specify exact API versions, snapshot dates, or model IDs. 'Claude 4.5' is a marketing name without sufficient version specificity."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No actual prompt text is provided in the paper. Agent roles are described in natural language (Section 3.2) but the actual system prompts, instructions, and tool definitions given to each agent are not included."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters are reported. Temperature, top-p, max tokens, and other LLM API settings are not mentioned. The convergence threshold tau=85 is specified (Section 3.4) but LLM inference parameters are absent."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Sections 3.1-3.5 provide detailed description of the agentic scaffolding: seven agent roles with specific information access profiles, three-layer architecture, workflow orchestrator (PerseveranceGraph), state management (GraphState), iteration logic, convergence threshold, and document visibility system with six levels."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not describe how the 474 projects were selected for analysis, what filtering was applied, or how the 98 projects with 'complete review cycles' were identified from the full corpus. The narrowing from 474 total projects to 167 with verdicts to 98 with complete review cycles is not fully explained."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 is a dedicated 'Limitations' section spanning approximately one full page with substantive discussion of multiple specific limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7 discusses specific threats: reflexivity confound ('the system documented the honest refusal case whilst simultaneously experiencing the phenomenon'), single model family limitation, unfunded research constraints, human-in-the-loop requirement preventing extrapolation to unsupervised deployment. These are specific to this study, not generic."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 explicitly states scope boundaries: 'We cannot extrapolate these findings to unsupervised deployment, adversarial settings, or different domains.' Section 5 states: 'It does not support claims about other models, architectures, domains, or systems lacking similar capacities for articulated reasoning about constraints.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 8.1 provides a link to all composition project artefacts at https://codeberg.org/wwaites/persevere-data. Case studies reference specific project identifiers (e.g., 'pouncing-siamese-deluxe-reggae') enabling verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6 describes the data source: 474 projects processed during development of the paper, with outcome categories (327 completed, 86 failed, 60 aborted, 1 active). The Corroborator evaluated 733 documents. The operational context is described."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited. The data comes from the system's own operational logs during composition tasks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not document how operational logs were transformed into the reported statistics. The filtering from 474 projects to 167 with verdicts to 98 with complete review cycles is not fully explained with criteria at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 7 explicitly states: 'This research was unfunded, conducted at personal expense with total operational cost of $149.22 across 474 projects.' The absence of funding is disclosed."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author's affiliation (University of Southampton) is listed on the first page. The paper evaluates the author's own system (PCE), and the Author Contributions section transparently describes this relationship."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "The research is explicitly unfunded, conducted at personal expense. NA for unfunded work."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interests declaration is present in the paper. The author evaluates their own system, which could have commercial implications, but no formal declaration is made."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It evaluates a multi-agent system architecture for document composition, not model knowledge on test sets."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No benchmark evaluation is performed. The system processes novel composition tasks, not pre-existing test sets."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is performed. The operational corpus consists of unique composition tasks, not standardised benchmarks."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study. The paper analyses automated system logs from multi-agent composition tasks."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. The study analyses the operational behaviour of a software system."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. The single user (the author) directed the system but was not a research subject."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants were involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants and no experimental conditions requiring randomisation."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. The architectural compartmentalisation between agents is described but this is system design, not blinding of human subjects."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. Project completion/failure/abort rates are reported but these are system outcomes, not participant attrition."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section 6 reports: '$0.29 in operational costs' per completed project, total costs of $149.22 across 474 projects, and 222,079,538 total tokens consumed. Cost breakdowns by outcome category are provided."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 6 reports total operational costs of $149.22 across 474 projects, with 222,079,538 tokens consumed (83% from cache). Footnote 3 notes a more realistic estimate excluding labour is $500. Section 8 estimates validation would require approximately two person-years plus substantial compute."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The verification agent classified 52% of submitted drafts as fabricated, requiring iterative revision toward full substantiation.",
    286       "evidence": "Section 6: 'Of these, 352 documents (48.0%) received substantiated verdicts... while 381 documents (52.0%) received fabricated verdicts' across 733 documents evaluated.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Quality scores improved by 78.85% on average from initial submission to final acceptance.",
    291       "evidence": "Section 6, Table 3: 'Mean relative improvement reached 78.85% [95% CI: 57.04%, 102.73%]' across 98 projects with complete review cycles.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Projects required a mean of 4.3 iterations to reach convergence.",
    296       "evidence": "Section 6, Table 3: 'Mean iterations to convergence 4.30 [95% CI: 3.75, 4.86].'",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "When assigned an impossible task, the system progressed from fabrication toward honest refusal with alternative proposals.",
    301       "evidence": "Section 5, Table 1: Five iterations showing progression from fabricated thematic categories (iter 1) to operationalised refusal framework scoring 92/100 (iter 5). Referenced by project identifier pouncing-siamese-deluxe-reggae.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Architectural enforcement of information compartmentalisation produces reliable collective behaviour from unreliable components.",
    306       "evidence": "This is the central thesis. Evidence includes the 474-project corpus, but no controlled comparison against alternative architectures is provided. Section 8 explicitly acknowledges this requires future ablation studies.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "Mean cost per completed project was $0.29, demonstrating economic feasibility of structured verification.",
    311       "evidence": "Section 6: 'Projects reaching successful completion averaged $0.29 in operational costs. Total operational costs across all 474 projects amounted to $149.22.'",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": ["case-study", "observational"],
    316   "key_findings": "The Perseverance Composition Engine, a multi-agent system with architecturally enforced information compartmentalisation, processed 474 document composition tasks with a 69% completion rate; its verification agent flagged 52% of evaluated drafts as fabricated, and quality scores showed a 79% mean relative improvement over an average of 4.3 iterations at $0.29 per completed project. Two case studies illustrate specific mechanisms: verification rigour during self-documentation (six consecutive fabrication rejections) and progression from fabrication toward honest refusal under impossible task constraints. The paper frames these observations as consistent with organisational theory applied to multi-agent AI safety, while explicitly acknowledging the absence of controlled experiments comparing alternative architectures.",
    317   "red_flags": [
    318     {
    319       "flag": "Author evaluating own system",
    320       "detail": "The single author designed, implemented, and evaluated the Perseverance Composition Engine. All quality assessments are produced by the system's own LLM-based agents (Corroborator and Critic), not independent evaluators. There is no external validation of output quality."
    321     },
    322     {
    323       "flag": "Self-referential evaluation",
    324       "detail": "The paper was composed by the system it describes (acknowledged in Author Contributions). The system analysed its own operational logs to produce the reported statistics. This reflexivity creates attribution confounds the paper acknowledges but cannot resolve."
    325     },
    326     {
    327       "flag": "No baselines or controlled comparisons",
    328       "detail": "All results report only the system's own performance. No comparison against single-agent baselines, policy-based compartmentalisation, or alternative architectures. The paper explicitly identifies ablation studies as future work, meaning the central claim about architectural enforcement is unsupported by comparative evidence."
    329     },
    330     {
    331       "flag": "Quality metrics are self-assessed",
    332       "detail": "The 78.85% quality improvement metric is based entirely on the system's own Critic agent scores (0-100). No independent human evaluation validates whether these scores reflect actual quality improvement. The Critic is an LLM with no demonstrated correlation to human quality judgments."
    333     },
    334     {
    335       "flag": "Causal language without causal design",
    336       "detail": "Despite careful hedging in some passages, the paper regularly uses causal language ('produces', 'drove this progression', 'enables') for observational findings. The study design cannot distinguish architectural effects from model capabilities, prompt engineering, or other confounds."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Constitutional AI: Harmlessness from AI Feedback",
    342       "authors": ["Y. Bai", "S. Kadavath", "S. Kundu"],
    343       "year": 2022,
    344       "arxiv_id": "2212.08073",
    345       "relevance": "Foundational work on AI alignment through training, the individual-level approach this paper contrasts with institutional design."
    346     },
    347     {
    348       "title": "AI Safety via Debate",
    349       "authors": ["G. Irving", "P. Christiano", "D. Amodei"],
    350       "year": 2018,
    351       "arxiv_id": "1805.00899",
    352       "relevance": "Proposes debate between AI agents as a scalable oversight mechanism, a key theoretical foundation for adversarial verification in multi-agent systems."
    353     },
    354     {
    355       "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate",
    356       "authors": ["Y. Du", "S. Li", "A. Torralba", "J. B. Tenenbaum", "I. Mordatch"],
    357       "year": 2023,
    358       "arxiv_id": "2305.14325",
    359       "relevance": "Demonstrates that multi-agent debate improves factual accuracy in LLMs, directly relevant to adversarial verification approaches."
    360     },
    361     {
    362       "title": "Sleeper Agents: Training Deceptive LLMs that Persist through Safety Training",
    363       "authors": ["E. Hubinger", "C. Denison", "J. Mu"],
    364       "year": 2024,
    365       "arxiv_id": "2401.05566",
    366       "relevance": "Demonstrates model organisms methodology for studying AI safety failures, which this paper extends to collective/institutional safety."
    367     },
    368     {
    369       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    370       "authors": ["Q. Wu", "G. Bansal", "J. Zhang"],
    371       "year": 2023,
    372       "arxiv_id": "2308.08155",
    373       "relevance": "Major multi-agent LLM framework used as a reference point for contemporary approaches to agent coordination."
    374     },
    375     {
    376       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    377       "authors": ["S. Hong", "X. Zheng", "J. Chen"],
    378       "year": 2023,
    379       "arxiv_id": "2308.00352",
    380       "relevance": "Multi-agent framework assigning organisational roles to LLM agents, directly comparable to the organisational approach of this paper."
    381     },
    382     {
    383       "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Language Model Society",
    384       "authors": ["G. Li", "H. A. A. K. Hammoud", "H. Itani"],
    385       "year": 2023,
    386       "arxiv_id": "2303.17760",
    387       "relevance": "Role-playing multi-agent framework relevant to how agent coordination and role assignment affect LLM outputs."
    388     },
    389     {
    390       "title": "Large Language Models Miss the Multi-Agent Mark",
    391       "authors": ["E. La Malfa", "G. La Malfa", "S. Marro"],
    392       "year": 2025,
    393       "relevance": "Identifies gaps in LLM multi-agent systems research including social agency and coordination protocols, which this paper claims to address."
    394     },
    395     {
    396       "title": "MultiAgentBench: Evaluating the Collaboration and Competition of LLM Agents",
    397       "authors": ["K. Zhu", "H. Du", "Z. Hong"],
    398       "year": 2025,
    399       "doi": "10.18653/v1/2025.acl-long.421",
    400       "relevance": "Standardised benchmark for evaluating multi-agent LLM systems, relevant to evaluation methodology in the multi-agent space."
    401     },
    402     {
    403       "title": "Debate, Deliberate, Decide (D3): A Cost-Aware Adversarial Framework for Reliable and Interpretable LLM Evaluation",
    404       "authors": ["A. Harrasse", "C. Bandi", "H. Bandi"],
    405       "year": 2026,
    406       "relevance": "Multi-agent evaluation framework using structured debate and role specialisation, directly comparable to the adversarial verification approach."
    407     },
    408     {
    409       "title": "Risks from Learned Optimization in Advanced Machine Learning Systems",
    410       "authors": ["E. Hubinger", "C. van Merwijk", "V. Mikulik"],
    411       "year": 2019,
    412       "arxiv_id": "1906.01820",
    413       "relevance": "Foundational work on deceptive alignment risks in individual AI systems, the individual-level safety problem that institutional design aims to complement."
    414     },
    415     {
    416       "title": "Scalable Agent Alignment via Reward Modeling: A Research Direction",
    417       "authors": ["J. Leike", "D. Krueger", "T. Everitt"],
    418       "year": 2018,
    419       "arxiv_id": "1811.07871",
    420       "relevance": "Research direction on scalable alignment through reward modeling, relevant to the broader alignment landscape this paper positions against."
    421     }
    422   ]
    423 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs