scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22581B)
      1 {
      2   "paper": {
      3     "title": "Does It Tie Out? Towards Autonomous Legal Agents in Venture Capital",
      4     "authors": ["Pierre Colombo", "Malik Boudiaf", "Allyn Sweet", "Michael Desa", "Hongxi Wang", "Kevin Candra", "Syméon del Marmol"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.18658",
      8     "doi": "10.48550/arXiv.2512.18658"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "case-study"],
     13   "key_findings": "The paper formalizes capitalization table tie-out as a verification problem and evaluates three approaches on 4 anonymized datarooms (Seed to Series B). The proposed Equall system (eager world model construction via an Event Graph) achieves 85.1% F1 vs 42.1% for agentic+structured and 29.0% for pure agentic baselines. Equall shows a 22x per-check speed advantage over agentic approaches due to amortized reasoning costs. The paper also characterizes empirical complexity scaling: verification steps nearly triple from Seed (~2,700) to Series B (~8,000).",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository, GitHub link, or archive is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The four datarooms are anonymized and proprietary. No dataset download link or supplementary data is provided."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, dependency lists, or hardware details are mentioned."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions or scripts are provided. The system is proprietary."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Figure 11 shows 95% error bars for time comparisons, but the main F1 results in Figure 8 have no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are used. Claims like 'significantly outperforming' (Section 5.1) are based on raw number comparisons without any statistical test."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Absolute F1 scores and differences are reported with baseline context: Equall 85.1% vs agentic+structured 42.1% vs pure agentic 29.0%. Speed comparison provides concrete ratios (22x per check)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Only 4 datarooms are used with no justification for this sample size and no acknowledgment that N=4 may be too small for generalizable conclusions."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported for the F1 results. Each dataroom appears evaluated once."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Two baselines are included: a pure agentic RAG baseline (GPT-5.1 with iterative RAG) and an ablation (agentic + Equall's structured representation)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The agentic baseline uses GPT-5.1, a contemporary model. However, no other commercial legal AI systems are compared."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The 'Agentic + Structured Repr.' baseline serves as an ablation, isolating the effect of Equall's Event Graph (Stage 2) from the full pipeline."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Precision, recall, and F1 are reported per flag category (Figure 8). Speed/latency comparisons are also provided (Figure 9)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Ground-truth flags were annotated by experienced legal professionals (Section 5). The evaluation compares system output against expert human annotations."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "With only 4 datarooms, there is no held-out test set. All datarooms appear to be used for both development and evaluation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Figure 8 breaks down precision, recall, and F1 across four flag categories: Data Discrepancy, Issuance Missing, Board Approval Missing, and Cap Table Missing."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.1 discusses where agentic baselines fail: 'Missing Documentation' and 'Missing from Cap Table' categories requiring global reasoning. Equall's limitations on these categories are also visible in the results."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No negative results for Equall are reported. Every experiment shows Equall outperforming baselines. No failed approaches or configurations are mentioned."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims that existing agentic systems fail at tie-out and proposes a world model architecture. The results (29% F1 for agentic vs 85% for Equall) support these claims."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal claims about why Equall outperforms ('pre-computed knowledge graph turns these complex reasoning chains into reliable graph queries') but the study design (4 datarooms, no controlled manipulation of individual components beyond one ablation) is insufficient for strong causal inference."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Venture Capital' broadly but results are from only 4 US-focused datarooms (Seed to Series B). No discussion of whether results generalize to other jurisdictions, deal types, or company sizes."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations are discussed. The Equall system may benefit from domain-specific engineering advantages not shared with the generic agentic baseline, but this is not addressed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures F1 on anomaly detection against expert annotations and frames this as 'tie-out automation,' but does not discuss whether matching expert flags is the same as producing legally reliable tie-out (experts may disagree, annotations may be incomplete)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The agentic baseline uses 'GPT-5.1' but no snapshot date or API version is given. The LLM used inside Equall's extraction pipeline is not specified at all."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No prompts or system instructions are provided for any of the LLM components (agentic baseline, Equall's extractors, or classifiers)."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters (temperature, top-p, max tokens, chunk size for RAG, etc.) are reported for any system."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 4 describes both paradigms in detail: the agentic RAG pipeline (query generation → retrieval → reasoning → verification) and Equall's three-stage pipeline (foundational extraction → inductive event modeling → neuro-symbolic verification)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No description of how raw dataroom documents were preprocessed (OCR, text extraction, chunking for RAG, etc.). Section 3 mentions OCR quality issues but does not describe the actual preprocessing pipeline."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no limitations or threats-to-validity section. The paper moves directly from results to conclusion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed anywhere in the paper."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries. The paper does not state what the results do NOT show or what settings were excluded."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data is available. The datarooms are proprietary and anonymized. No supplementary data files are provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3 describes the four datarooms with statistics (pages, documents, securities, shareholders) across financing stages. The dataroom composition is characterized by document type distribution (Figures 2, 5)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No description of how the four companies/datarooms were selected. Were they Equall customers? Were they chosen to be representative? Selection criteria are absent."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from raw documents to ground-truth annotations is not documented. How legal experts annotated ground-truth flags, inter-annotator agreement, and annotation guidelines are all absent."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors list Equall email addresses ({firstname}@equall.com), making the company affiliation clear."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "All authors are from Equall, the company whose product is being evaluated. Equall has a direct financial interest in the system appearing to outperform alternatives."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement. The authors presumably hold equity or employment stakes in Equall but this is not declared."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff is stated for GPT-5.1 or any other model used."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the models could have seen similar legal documents during training."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No contamination analysis. Although the datarooms are proprietary (reducing contamination risk), this is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. The study evaluates automated systems against expert-annotated ground truth."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Figure 9 reports per-check inference time (45 sec agentic vs 2 sec Equall) and total times for 100 and 500 checks. Figure 11 reports end-to-end time for Equall-assisted workflows (64m to 300m)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget (GPU hours, API costs, total spend) is stated. Only relative timing comparisons are provided."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No multiple-seed experiments are reported. Each system appears to be run once per dataroom."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is never stated. Results appear to be single-run."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported for any system."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No description of how configurations were selected for any of the three systems."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own product (Equall) against baselines they implemented. No acknowledgment of author-evaluation bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 9 explicitly compares speed (a compute proxy) vs. paradigm, showing the trade-off between eager (15 min indexing, 2 sec/check) and lazy (2 min indexing, 45 sec/check) approaches."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether matching expert-annotated flags constitutes valid measurement of tie-out quality. Inter-annotator agreement is not reported."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The agentic baseline uses a generic RAG scaffold while Equall uses a highly engineered domain-specific pipeline. The performance difference could be due to engineering quality rather than architectural paradigm, but this confound is not discussed."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether GPT-5.1 may have seen similar legal documents or patterns in training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup provides information not available in real usage scenarios."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Independence between evaluation instances is not discussed, even though all instances are drawn from only 4 datarooms."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is described."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Equall achieves 85.1% average F1 on flag detection, significantly outperforming agentic+structured (42.1%) and pure agentic (29.0%) baselines.",
    365       "evidence": "Figure 8, Section 5.1. Precision, recall, and F1 broken down by flag category across three systems.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Equall provides a 22x speed advantage per verification check over the agentic baseline (2 sec vs 45 sec).",
    370       "evidence": "Figure 9, Section 5.1. Speed comparison table on a 300-document dataroom.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "The evidentiary burden in tie-out scales super-linearly with document volume, with securities increasing 7x while documents only double from Seed to Series B.",
    375       "evidence": "Figure 4, Section 3. Statistics across four companies at different financing stages.",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "Verification workload nearly triples from ~2,700 steps at Seed to ~8,000 at Series B.",
    380       "evidence": "Figure 7, Section 3. Verification step counts across four companies.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "Equall-assisted workflow reduces tie-out time from 5-27 hours (manual) to 64-300 minutes.",
    385       "evidence": "Figure 11. Time comparison with 95% intervals from customer/partner data vs internal benchmarks.",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Company evaluating its own product",
    392       "detail": "All authors are Equall employees evaluating Equall's product against baselines they implemented. No independent evaluation or third-party replication. This is the classic self-evaluation bias documented by Lucic et al."
    393     },
    394     {
    395       "flag": "Tiny sample size (N=4)",
    396       "detail": "Only 4 datarooms are used for evaluation. This is far too small to draw generalizable conclusions about architectural paradigm superiority. No sample size justification is provided."
    397     },
    398     {
    399       "flag": "No limitations section",
    400       "detail": "The paper has no limitations, threats to validity, or discussion of what the results do NOT show. This is a significant omission for a paper making strong architectural claims."
    401     },
    402     {
    403       "flag": "Unfair baseline comparison",
    404       "detail": "The agentic baseline is a generic RAG system, while Equall is a heavily engineered domain-specific pipeline with specialized parsers and a custom knowledge graph. The comparison conflates architectural paradigm with engineering investment."
    405     },
    406     {
    407       "flag": "No statistical rigor",
    408       "detail": "Claims of 'significant' outperformance are made without any statistical tests, confidence intervals (for F1), or multi-run experiments, and rest on a sample of only N=4 datarooms."
    409     },
    410     {
    411       "flag": "Proprietary and unreproducible",
    412       "detail": "No code, data, prompts, hyperparameters, or ground-truth annotations are released. The entire evaluation is unverifiable."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Why do multi-agent LLM systems fail?",
    418       "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"],
    419       "year": 2025,
    420       "arxiv_id": "2503.13657",
    421       "relevance": "Directly relevant to understanding failure modes of agentic LLM systems, which this paper claims to address."
    422     },
    423     {
    424       "title": "Evaluating AI for Law: Bridging the Gap with Open-Source Solutions",
    425       "authors": ["Rohan Bhambhoria", "Samuel Dahan", "Jonathan Li", "Xiaodan Zhu"],
    426       "year": 2024,
    427       "arxiv_id": "2404.12349",
    428       "relevance": "Evaluates AI capabilities for legal tasks, relevant to understanding LLM performance on legal reasoning."
    429     },
    430     {
    431       "title": "LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models",
    432       "authors": ["Neel Guha", "Julian Nyarko", "Daniel E Ho"],
    433       "year": 2023,
    434       "arxiv_id": "2308.11462",
    435       "relevance": "Major legal AI benchmark that this paper positions against as insufficient for real-world legal workflows."
    436     },
    437     {
    438       "title": "SaulLM-54B & SaulLM-141B: Scaling Up Domain Adaptation for the Legal Domain",
    439       "authors": ["Pierre Colombo", "Telmo Pires", "Malik Boudiaf"],
    440       "year": 2024,
    441       "relevance": "Domain-adapted legal LLM by the same group, relevant to understanding domain-specific LLM development."
    442     },
    443     {
    444       "title": "GPT-4 Passes the Bar Exam",
    445       "authors": ["Daniel Martin Katz", "Michael James Bommarito", "Shang Gao", "Pablo Arredondo"],
    446       "year": 2023,
    447       "relevance": "Landmark evaluation of LLM legal reasoning capability on professional bar examination."
    448     },
    449     {
    450       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    451       "authors": ["Zhihong Shao", "Peiyi Wang", "Qihao Zhu"],
    452       "year": 2024,
    453       "arxiv_id": "2402.03300",
    454       "relevance": "Referenced for RL-driven training methodology that the paper proposes applying to legal verification."
    455     }
    456   ]
    457 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs