scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19840B)
      1 {
      2   "paper": {
      3     "title": "From Fluent to Verifiable: Claim-Level Auditability for Deep Research Agents",
      4     "authors": ["Razeen A Rasheed", "Somnath Banerjee", "Animesh Mukherjee", "Rima Hazra"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.13855"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "methodology_tags": ["theoretical"],
     12   "key_findings": "The paper argues that deep research agents suffer from three systematic failure modes — objective drift, transient constraints, and unverifiable inference chains — that make their outputs scientifically unauditable. It proposes the Auditable Autonomous Research (AAR) standard with four metrics: provenance coverage, provenance soundness, contradiction transparency, and audit effort. The paper advocates semantic provenance graphs with protocolized validation as the architectural substrate for auditable research agents. Failure statistics are drawn entirely from prior evaluations (e.g., 44.2% of multi-agent system failures arising during planning, and 42% of The AI Scientist's proposed experiments failing to execute).",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code repository or implementation is referenced. The paper proposes a framework but provides no implementation artifacts."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset or supplementary data is released. The paper draws on secondary sources but provides no novel data."
     24       },
     25       "environment_specified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "Purely theoretical/position paper with no computational experiments requiring an environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": false,
     32         "answer": false,
     33         "justification": "No experiments to reproduce. The contribution is a conceptual framework and measurement definitions."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "No original experiments. All statistics cited (e.g., 44.2% of failures arising during planning, 42% of experiments failing to execute) are from prior work."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No comparative claims based on original data. This is a position paper citing others' results."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No original measurements are made. All quantitative claims reference prior evaluations."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No original data collection or sampling. Theoretical paper."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No original experiments with runs to report variance over."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No empirical evaluation is conducted. The paper proposes a framework without experimental validation."
     68       },
     69       "baselines_contemporary": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No baselines — no experiments."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No system to ablate. This is a conceptual framework paper."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No empirical evaluation. The AAR metrics (PCov, PSnd, CTran, AEff) are proposed but not measured on any system."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No evaluation of any kind is conducted."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No data splits — no experiments."
     93       },
     94       "per_category_breakdown": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "No empirical results to break down."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 3 provides a detailed taxonomy of failure cases from prior work: planning failures (objective drift, metric misalignment, novelty verification), execution failures (cross-validation bug, transient memory), and synthesis failures (citation decorrelation, unverifiable inference). Figure 2 summarizes these."
    103       },
    104       "negative_results_reported": {
    105         "applies": false,
    106         "answer": false,
    107         "justification": "No experiments that could produce positive or negative results."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims are appropriately scoped as a 'perspective' paper. It proposes the AAR standard and identifies failure modes, both of which are developed in the body. No unsupported empirical claims in the abstract."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes implicit causal claims (e.g., 'cosine similarity... making it mathematically incapable of representing entailment or contradiction' as a cause of citation decorrelation, Section 3.3) without controlled experiments. These are argued from first principles rather than demonstrated experimentally."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes broad claims about 'deep research agents' but its failure analysis draws primarily from evaluations of The AI Scientist and ChemCrow. The generalization from these specific systems to all deep research agents is not explicitly bounded."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 6 ('Alternative views and objections') addresses four counterarguments: bigger models will solve this, graphs are too expensive, logs are sufficient, and validation adds prohibitive latency. Each is engaged substantively."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "Theoretical paper with no measurements. The proposed metrics (PCov, PSnd, CTran, AEff) are defined but not measured."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "No models are used. This is a theoretical/position paper."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompting is used in this paper."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No experiments with hyperparameters."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used or built in this paper."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No data preprocessing — no data collected or processed."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No dedicated limitations section. Section 6 addresses objections but does not discuss limitations of the authors' own proposal (e.g., feasibility of building the proposed provenance graphs, scalability of the AAR metrics)."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats-to-validity discussion. The paper does not acknowledge that its failure taxonomy is derived from a small number of evaluated systems, nor that the proposed metrics are untested."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what its framework does NOT address. It presents the AAR standard as broadly applicable without stating scope limitations."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No original data collected. All evidence comes from published prior work."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper draws statistics from prior work (e.g., '44.2% of failures' from [19], '42% of experiments failed' from [32]) but does not systematically describe how these sources were identified or selected. The literature coverage appears opportunistic rather than systematic."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants were recruited and no benchmark data was sourced. The only inputs are statistics taken from published literature."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No data pipeline — this is a position paper synthesizing existing literature."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly stated: Indian Institute of Science, IIT Kharagpur, Cisco Systems, TCG CREST."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding disclosed, so independence cannot be assessed. One author is affiliated with Cisco Systems but there is no disclosure of whether this creates a conflict."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "No pre-trained model is evaluated on any benchmark."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No model evaluation on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No benchmark evaluation."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Theoretical paper with no method that incurs inference cost."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No computation performed."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "44.2% of multi-agent system failures arise from specification errors, task misinterpretation, and improper decomposition during planning.",
    295       "evidence": "Cited from [19] (Cemri et al. 2025), analysis of ~1,642 multi-agent system traces. Section 2.1 and Figure 1.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "Execution failure rates in multi-agent systems range from 41% to 86.7% depending on system architecture and task complexity.",
    300       "evidence": "Cited from [19] across multi-agent traces. Section 2.1.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "42% of The AI Scientist's proposed experiments failed to execute due to coding errors, yet the system produced manuscripts mischaracterizing results.",
    305       "evidence": "Cited from [8] (Beel & Kan 2025) and [32] (evaluation of AI Scientist). Section 1 and 3.2.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Cosine similarity is mathematically incapable of representing entailment or contradiction because it is symmetric and blending.",
    310       "evidence": "Argued from first principles in Section 3.3, citing [16] (Bowman et al. 2015) and [69] (Wang et al. 2024). This is a theoretical argument, not empirically demonstrated in this paper.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "Current deep research agents provide no reconstructible trace linking generated claims to supporting evidence through explicit reasoning steps.",
    315       "evidence": "Argued based on architectural analysis in Section 3.3, with references to [62] (PROV-AGENT) and [12]. No original empirical verification.",
    316       "supported": "weak"
    317     },
    318     {
    319       "claim": "Verification effort must be substantially lower than generation effort for autonomous systems to provide practical value.",
    320       "evidence": "Stated as a design principle in Section 3.3, referencing [26] and [37]. Formalized as the auditability invariant in Definition 1 (Section 4.1).",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "red_flags": [
    325     {
    326       "flag": "No empirical validation of proposed framework",
    327       "detail": "The AAR standard (PCov, PSnd, CTran, AEff) is proposed with formal definitions and a worked example but never applied to any real system. The paper does not demonstrate that these metrics are measurable, discriminative, or actionable in practice."
    328     },
    329     {
    330       "flag": "Narrow evidence base generalized broadly",
    331       "detail": "The failure taxonomy draws heavily from evaluations of The AI Scientist (refs [8], [32]) and to a lesser extent ChemCrow [17]. These are generalized to all 'deep research agents' without acknowledging the narrow empirical base."
    332     },
    333     {
    334       "flag": "No limitations section",
    335       "detail": "The paper engages objections (Section 6) but does not discuss limitations of its own proposal, such as the feasibility of implementing semantic provenance graphs at scale, the cost of NLI-based entailment checking, or whether the AAR metrics have construct validity."
    336     },
    337     {
    338       "flag": "Selective citation of failure statistics",
    339       "detail": "Failure rates are presented from prior work without discussing the conditions under which they were measured or whether they generalize. For example, '28.6-91.4% made-up information' in Figure 1 is attributed without context on which systems or tasks produced these rates."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Why Do Multi-Agent LLM Systems Fail?",
    345       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang", "Lakshya A. Agrawal"],
    346       "year": 2025,
    347       "arxiv_id": "2503.13657",
    348       "relevance": "Empirical analysis of 1,642 multi-agent system traces identifying systematic failure modes — directly relevant to agentic AI reliability."
    349     },
    350     {
    351       "title": "The AI Scientist: Towards fully automated open-ended scientific discovery",
    352       "authors": ["Chris Lu", "Cong Lu", "Robert Tjarko Lange", "Jakob Foerster", "Jeff Clune", "David Ha"],
    353       "year": 2024,
    354       "arxiv_id": "2408.06292",
    355       "relevance": "Primary autonomous research agent system evaluated throughout this paper; key subject of failure analysis."
    356     },
    357     {
    358       "title": "Evaluating Sakana's AI Scientist: Bold Claims, Mixed Results, and a Promising Future?",
    359       "authors": ["Joeran Beel", "Min-Yen Kan"],
    360       "year": 2025,
    361       "arxiv_id": "2502.14297",
    362       "relevance": "Independent evaluation of The AI Scientist finding 42% experiment execution failure and methodological weaknesses."
    363     },
    364     {
    365       "title": "AI Scientists Fail Without Strong Implementation Capability",
    366       "authors": ["Minjun Zhu", "Qiujie Xie", "Yixuan Weng", "Jian Wu"],
    367       "year": 2025,
    368       "arxiv_id": "2506.01372",
    369       "relevance": "PaperBench evaluation showing 100% of agent-generated papers had methodological weaknesses; Claude 3.5 Sonnet achieved only 1.8% task completion."
    370     },
    371     {
    372       "title": "PROV-AGENT: Unified Provenance for Tracking AI Agent Interactions in Agentic Workflows",
    373       "authors": ["Renan Souza", "Amal Gueroudji"],
    374       "year": 2025,
    375       "relevance": "Adapts W3C provenance standards to track decision lineage in agentic workflows — directly related to auditability infrastructure."
    376     },
    377     {
    378       "title": "DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents",
    379       "authors": ["Mingxuan Du", "Benfeng Xu"],
    380       "year": 2025,
    381       "arxiv_id": "2506.11763",
    382       "relevance": "Benchmark for deep research agents measuring long-horizon research task performance."
    383     },
    384     {
    385       "title": "ResearchRubrics: Auditing the Citation Integrity of Deep Research Agents",
    386       "authors": ["Anjali Sharma", "Christopher Lin"],
    387       "year": 2025,
    388       "arxiv_id": "2511.07685",
    389       "relevance": "Directly audits citation integrity in deep research settings, closely related to provenance soundness."
    390     },
    391     {
    392       "title": "DeepTRACE: Auditing Deep Research AI Systems for Tracking Reliability Across Citations and Evidence",
    393       "authors": ["Pranav Narayanan Venkit", "Philippe Laban"],
    394       "year": 2026,
    395       "relevance": "Shows deep research agents generate one-sided answers with 40-80% citation accuracy — key empirical evidence for the auditability problem."
    396     },
    397     {
    398       "title": "Agentic Misalignment: How LLMs Could Be Insider Threats",
    399       "authors": ["Anthropic"],
    400       "year": 2025,
    401       "relevance": "Documents how AI agents bypass stated constraints to pursue alternative goals — relevant to objective drift analysis."
    402     },
    403     {
    404       "title": "AI models collapse when trained on recursively generated data",
    405       "authors": ["Ilia Shumailov", "Zakhar Shumaylov", "Yiren Zhao", "Yarin Gal", "Nicolas Papernot", "Ross Anderson"],
    406       "year": 2024,
    407       "doi": "10.1038/s41586-024-07566-y",
    408       "relevance": "Model collapse from recursive AI-generated training data — motivates why unauditable agent outputs contaminate the scientific pipeline."
    409     },
    410     {
    411       "title": "Protecting scientific integrity in an age of generative AI",
    412       "authors": ["Wolfgang Blau", "Vinton G. Cerf"],
    413       "year": 2024,
    414       "doi": "10.1073/pnas.2407886121",
    415       "relevance": "PNAS perspective on scientific integrity threats from generative AI — closely related to this paper's motivation."
    416     }
    417   ]
    418 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs