scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30749B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Does It Tie Out? Towards Autonomous Legal Agents in Venture Capital",
      6     "authors": [
      7       "Pierre Colombo",
      8       "Malik Boudiaf",
      9       "Allyn Sweet",
     10       "Michael Desa",
     11       "Hongxi Wang",
     12       "Kevin Candra",
     13       "Symeon del Marmol"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2512.18658",
     18     "doi": "10.48550/arXiv.2512.18658"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All abstract claims are substantiated: multi-document reasoning requirement shown in Section 3, evidence traceability discussed throughout, deterministic output requirement proven by baseline failures in Section 5, world model architecture detailed in Section 4.2.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Paper claims architectural superiority (eager vs lazy construction causes performance gain) but lacks rigorous ablation studies. Only one ablation (Agentic + Structured Repr.) tested; no systematic component isolation to prove individual causal factors. Different baselines may differ in implementation quality/effort rather than architectural merit.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Claims bounded to tie-out (Seed to Series B VC context) are appropriate, but conclusion overgeneralizes: 'robust foundational substrate suitable for a wider array of downstream legal applications' and 'Applied Legal Intelligence' framing extend far beyond tested scope without evidence. Only one task evaluated.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Paper attributes performance differences to architectural design (world model vs RAG) but doesn't discuss alternative explanations: implementation quality differences, prompt engineering quality, model version effects, or whether Equall received more development effort. No discussion of these potential confounds.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Measurement (F1 on anomaly detection with correct type and evidence traceability) directly matches claimed outcome (reliable tie-out automation). Ground truth provided by legal professionals; no measurement granularity mismatch.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No dedicated limitations or threats-to-validity section. Paper has Introduction, Background, Complexity Analysis, Methods, Experiments, and Conclusion, but no explicit limitations discussion.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Specific threats not discussed: no inter-rater reliability for ground truth annotations, no representativeness justification for 4 companies, no discussion of OCR quality handling despite identifying it as a challenge, no failure mode analysis, no evaluation of annotation quality.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Scope to tie-out in Seed-Series B VC is stated implicitly, but conclusion overextends: claims about generalization to 'a wider array of downstream legal applications' and 'legal intelligence' broadly without evidence. Boundaries are stated for the core task but then violated in claims.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding statement provided. All authors list @equall.com email addresses; paper evaluates Equall's own product, but there is no explicit funding source disclosure or acknowledgment of their employer's financial interest in positive results.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations clearly disclosed via @equall.com email addresses; all work for the company whose system (Equall) is being evaluated.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Authors' employer (Equall) is entirely dependent on the outcome. This is a company evaluating its own product against baselines. The funder has direct financial/reputational interest in demonstrating superiority of Equall.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial disclosure statement provided. No mention of potential patents, equity, or consulting arrangements related to Equall or the described approach.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms precisely defined: Dataroom (Section 2.1), Cap Table (Section 2.2), Tie-out (Section 2.3 with formal mathematical notation), World Model (Section 4.2), Event Graph (Section 4.2), Anomaly types (Section 2.3 with taxonomy).",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Three contributions explicitly stated in abstract and introduction: (1) characterize tie-out as real-world benchmark, (2) analyze existing agentic systems, (3) propose world model architecture (Equall). Each is developed and evaluated in paper.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "Limited prior work engagement. Introduction cites legal AI benchmarks ([7,5,6,11,10]) but lacks dedicated Related Work section. No substantive discussion of how this relates to existing legal NLP, knowledge graphs, or agentic systems literature. Citations present but not synthesized.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No code released. No mention of open-source implementation, GitHub repository, or promise of future release.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Data consists of 'four anonymized datarooms' from real VC companies. Anonymized data is not publicly available; appears under NDA. No release mentioned.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Model mentioned (GPT-5.1) but no environment specifications: no requirements.txt, Dockerfile, Python version, CUDA requirements, or dependency list provided. Implementation details absent.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions. Paper describes architecture and approach conceptually but does not provide actionable instructions to reproduce results.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Main performance results (Figure 8: F1 scores per flag type; Figure 10: F1 vs dataroom size) lack error bars or confidence intervals. Only Figure 11 (time measurements) includes 95% error bars.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests reported. Claims like 'significantly outperforming' (p.9) use the word colloquially but lack p-values, t-tests, or other statistical tests comparing the three approaches.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Effect sizes reported: F1 scores (85% vs 42% vs 29% overall; per-category F1 in Figure 8), speedup (22× faster per check), and time reduction (27h → 5h at Series B). Magnitude of differences quantified.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Evaluation on only 4 companies without justification. No power analysis, no discussion of sample size adequacy, no statistical reasoning for why 4 companies (1 Seed, 1 Series A, 2 Series B) is sufficient.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "F1 performance metrics (Figures 8, 10) reported without variance/std dev/confidence intervals. Baselines tested once per company with single F1 reported. Only time measurements (Figure 11) include variance.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Three systems compared: Agentic Baseline (lazy RAG), Agentic + Structured Repr. (ablation), and Equall (proposed). Baselines represent alternative architectural paradigms.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Baselines use GPT-5.1 (contemporary agentic architecture with RAG), representing state-of-the-art LLM approaches for the task. Comparison uses same underlying model versions across approaches.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": false,
    196           "justification": "Only one ablation tested: Agentic + Structured Repr. (removes Event Graph layer). No ablation of individual components within Equall: no test of symbolic-only verification, no isolation of Stage 1 vs Stage 2 extraction, no sensitivity analysis.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Multiple metrics reported: (1) Precision/Recall/F1 per flag category, (2) F1 across companies/stages, (3) inference speed per check, (4) total workflow time, (5) scaling robustness. Comprehensive evaluation.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "Ground truth annotated by 'experienced legal professionals' but no user study or qualitative evaluation of system. No human evaluation of system outputs, usability testing, or feedback from actual legal practitioners using Equall.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": false,
    214           "justification": "Not explicitly stated whether test data was held out from system development. Paper says evaluation on 'four anonymized datarooms' with ground truth from professionals, but hold-out procedure and train/dev/test split not described.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Figure 8 breaks results by flag type (Data Discrepancy, Missing Documentation, Missing from Cap Table) with precision/recall/F1 for each. Figure 10 shows F1 across companies (Seed, Series A, Series B). Multi-level breakdowns provided.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "No failure cases shown or discussed. Results only report aggregate F1 scores. No examples of false positives, false negatives, or specific anomalies the system misses.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "Only positive results reported. Across the three systems evaluated, Equall outperforms both baselines across metrics. No cases where Equall underperforms, no scenarios where agentic approaches succeed, no discussion of when the approach fails.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "Models mentioned (GPT-5.1) but without version snapshots, release dates, or exact configurations. Unclear if Equall uses GPT-5.1 or different model. No fine-tuning details provided.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "No example prompts or system instructions shown. Paper describes the pipeline conceptually but does not provide actual prompts used to elicit LLM behavior for document classification, extraction, or verification.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "No hyperparameters reported for LLM calls: no temperature, top-p, max_tokens, frequency penalties, or other generation parameters specified for any stage of the pipeline.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": false,
    258           "justification": "High-level architecture described (document classification, low-level node extraction, event graph synthesis, neuro-symbolic verification) but not implementation-level scaffolding. No example prompts, reasoning traces, or detailed agent instructions.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": false,
    264           "justification": "Paper identifies preprocessing challenges (OCR quality, document format variance, near-duplicates) but does not document the actual preprocessing steps taken. No description of cleaning, tokenization, or data handling procedures.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Raw data consists of real VC company datarooms marked 'anonymized' and not publicly released. Data appears restricted by confidentiality/NDA agreements with companies.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": false,
    278           "justification": "Paper states 'four anonymized datarooms presented in Section 3' but does not describe how these were collected, whether they're representative, or sampling methodology. No collection procedure documented.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "N/A - no human participants recruited. Evaluation uses pre-existing corporate datarooms, not participant-generated data.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": false,
    290           "justification": "Overall pipeline architecture described (Stages 1-3 in Section 4.2) but specific data handling not documented: no documentation of PDF extraction, OCR handling, duplicate removal, or data validation steps.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": false,
    297           "answer": false,
    298           "justification": "N/A - not evaluating pre-trained models on public benchmarks; evaluating a bespoke tie-out verification system on real datarooms.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": false,
    303           "answer": false,
    304           "justification": "N/A - not evaluating benchmark contamination; this is a specialized legal task with proprietary ground truth.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": false,
    309           "answer": false,
    310           "justification": "N/A - not evaluating on public benchmarks; tie-out is a novel task specific to this work.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "N/A - no human participants studied; evaluation uses ground truth from professional annotators but no human subjects experiment.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "N/A - no human participants; no IRB approval needed.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "N/A - no human participant demographics; annotators mentioned only as 'experienced legal professionals'.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "N/A - no human participants; evaluation on corporate datarooms only.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "N/A - not a randomized experiment; fixed set of 4 companies evaluated.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "N/A - not applicable to system evaluation.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "N/A - no human participants to have attrition.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Latency reported (2 sec vs 45 sec per check, Figure 9) and total workflow time (Figure 11), but no API costs, compute costs, or monetary expense disclosed.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No total computational budget, GPU/TPU requirements, or model costs stated. Timing provided but not resource constraints or infrastructure requirements.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "Capitalization tie-out requires multi-document reasoning, strict evidence traceability, and deterministic outputs that current agentic LLM approaches fail to deliver reliably",
    377       "evidence": "Section 4.1 analyzes limitations of agentic RAG (retrieval failure indistinguishable from absence, exponential error compounding); Figure 10 shows agentic F1 drops from 55% to 28% with complexity; Figure 8 shows agentic baselines achieve 29% F1 overall",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Equall's eager world model architecture achieves 85% F1 on anomaly detection, substantially outperforming lazy agentic approaches (42%, 29%)",
    382       "evidence": "Figure 8 shows F1 across flag categories with Equall at 85%, Agentic+Structured at 42%, Agentic Baseline at 29%",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Performance gap widens dramatically with dataroom complexity: Equall maintains 95% F1 on large Series B datarooms while agentic baseline drops to 28%",
    387       "evidence": "Figure 10 tracks F1 vs dataroom size (2,081 to 6,721 pages): Equall 55%→95%, Agentic Baseline 55%→28% across scale",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Equall achieves 22× speedup per verification check (2 seconds vs 45 seconds for agentic) due to upfront world model construction",
    392       "evidence": "Figure 9 shows inference time: agentic 45 sec/check vs Equall 2 sec/check; 15 min indexing cost amortizes over 500+ checks",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Equall-assisted workflow reduces tie-out time from 27 hours to 5 hours at Series B scale while maintaining 81.4% efficiency",
    397       "evidence": "Figure 11 shows manual tie-out: 27 hours at Series B vs Equall-assisted: 5 hours; assisted efficiency 81.4% (error bars shown)",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Verification workload explodes super-linearly with company maturity: rises 3× from Seed to Series B (2,710 to 7,923 verification steps)",
    402       "evidence": "Figure 7 documents verification step count growth; Section 3 shows securities increase 7× while documents only 2.5× from Seed to Series B",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "The Event Graph world model is a generalizable foundation suitable for broader downstream legal applications beyond tie-out",
    407       "evidence": "Conclusion states model captures 'legally operative events as structured, temporally ordered state transitions' applicable across legal domains; no empirical validation on other tasks",
    408       "supported": "weak"
    409     },
    410     {
    411       "claim": "Anomalies shift from informal omissions (Seed) to complex document inconsistencies (Series B) requiring different verification strategies",
    412       "evidence": "Figure 5 shows Seed: 41.5% Board Consents vs Series B: 63.8%; Figure 6 shows 'Missing Information' remains top issue category across stages, contradicting the claim",
    413       "supported": "moderate"
    414     }
    415   ],
    416   "methodology_tags": [
    417     "benchmark-eval",
    418     "case-study"
    419   ],
    420   "key_findings": "The paper demonstrates that explicit world model construction substantially outperforms standard agentic LLM approaches on complex multi-document legal reasoning tasks. By building a structured Event Graph before verification (eager construction), the Equall system achieves 85% F1 on anomaly detection compared to 42% and 29% for baseline agentic approaches, with performance advantages widening dramatically as dataroom complexity increases from Seed to Series B stage. The approach enables 22× speedup per verification and reduces manual tie-out time from 27 hours to 5 hours with human-in-the-loop assistance, suggesting that neuro-symbolic architectures with explicit world models could be foundational for reliable autonomous legal reasoning at scale.",
    421   "red_flags": [
    422     {
    423       "flag": "Undisclosed financial conflict of interest",
    424       "detail": "All authors work for Equall (@equall.com) and are evaluating their own product against baselines. No competing interests statement or funding disclosure provided. Direct financial incentive for positive results."
    425     },
    426     {
    427       "flag": "No statistical significance testing",
    428       "detail": "Performance differences (85% vs 42% F1) reported without p-values, confidence intervals, or significance tests on main metrics. Only timing measurements (Figure 11) include error bars."
    429     },
    430     {
    431       "flag": "Insufficient sample size",
    432       "detail": "Evaluation on only 4 companies (Seed, Series A, 2× Series B) without justification or power analysis. No discussion of representativeness for the broader VC financing population."
    433     },
    434     {
    435       "flag": "Minimal ablation studies",
    436       "detail": "Only one ablation tested (Agentic + Structured Repr.). No isolation of individual Equall components: no symbolic-only verification, no Stage 1-only extraction, no sensitivity analysis of design choices."
    437     },
    438     {
    439       "flag": "No failure case analysis",
    440       "detail": "Results show only positive outcomes. No discussion of false negatives, false positives, or specific anomaly types where the system struggles. No error analysis."
    441     },
    442     {
    443       "flag": "Annotation quality unreported",
    444       "detail": "Ground truth provided by 'experienced legal professionals' but no inter-rater reliability, Cohen's kappa, or annotation agreement reported. Single-annotator labels assumed perfect."
    445     },
    446     {
    447       "flag": "Reproducibility impossible",
    448       "detail": "Code not released, data anonymized and proprietary, no prompts or hyperparameters disclosed. Implementation details absent: no requirements.txt, Dockerfile, or model snapshots. Cannot reproduce."
    449     },
    450     {
    451       "flag": "Limited related work engagement",
    452       "detail": "No dedicated related work section. Legal AI literature engagement limited to citations without synthesis. No comparison to knowledge graph, neuro-symbolic, or legal reasoning systems literature."
    453     },
    454     {
    455       "flag": "Overgeneralized conclusions",
    456       "detail": "Tested on single task (tie-out); conclusion claims 'foundation for applied legal intelligence' and 'wider array of downstream legal applications' without any evidence from other domains."
    457     },
    458     {
    459       "flag": "Model version ambiguity",
    460       "detail": "References 'GPT-5.1' without clarity on release date, availability, or whether this is a real/hypothetical model. Unclear what models Equall components use."
    461     }
    462   ],
    463   "cited_papers": [
    464     {
    465       "title": "Saullm-54b & saullm-141b: Scaling up domain adaptation for the legal domain",
    466       "relevance": "Legal LLM domain adaptation; directly relevant to building specialized legal reasoning systems"
    467     },
    468     {
    469       "title": "Saullm-7b: A pioneering large language model for law",
    470       "relevance": "Legal LLM development; foundation model for legal domain reasoning"
    471     },
    472     {
    473       "title": "LegalBench: A collaboratively built benchmark for measuring legal reasoning in large language models",
    474       "relevance": "Legal reasoning benchmarking; directly comparable to tie-out as legal AI evaluation task"
    475     },
    476     {
    477       "title": "Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities",
    478       "relevance": "Agentic LLM capabilities; represents state-of-the-art reasoning and multi-step planning systems"
    479     },
    480     {
    481       "title": "Neural legal judgment prediction in english",
    482       "relevance": "Legal NLP and judgment prediction; foundational work in legal AI classification"
    483     },
    484     {
    485       "title": "GPT-4 passes the bar exam",
    486       "relevance": "LLM legal reasoning capabilities; demonstrates capability ceiling on legal domain benchmarks"
    487     },
    488     {
    489       "title": "Why do multi-agent llm systems fail?",
    490       "relevance": "Directly cited for multi-agent agentic system failures; relevant to understanding agentic limitations"
    491     },
    492     {
    493       "title": "Developing artificially intelligent justice",
    494       "relevance": "Legal AI applications and justice system automation; broader context for legal reasoning systems"
    495     }
    496   ],
    497   "engagement_factors": {
    498     "practical_relevance": {
    499       "score": 2,
    500       "justification": "System targets a real VC workflow (tie-out) with demonstrated efficiency gains (27h → 5h); however, deployment is limited to Equall's customers, and there is no known real-world validation outside the authors' company."
    501     },
    502     "surprise_contrarian": {
    503       "score": 1,
    504       "justification": "Finding that structured world models outperform agentic RAG aligns with intuition from complexity analysis; somewhat expected given the problem formulation rather than surprising or contrarian."
    505     },
    506     "fear_safety": {
    507       "score": 0,
    508       "justification": "No AI safety, alignment, or risk concerns discussed. Legal automation raises general legal reasoning reliability questions but paper doesn't engage with these."
    509     },
    510     "demo_ability": {
    511       "score": 1,
    512       "justification": "System demonstrated on real datarooms but not publicly available or accessible. No sandbox, demo, or open-source artifact allowing readers to try the approach."
    513     },
    514     "brand_recognition": {
    515       "score": 1,
    516       "justification": "Equall is a specialized legal AI startup, not a household name. Authors have prior work on legal LLMs (SaulLM) showing domain expertise but limited brand visibility."
    517     },
    518     "drama_conflict": {
    519       "score": 1,
    520       "justification": "Evaluating own product introduces conflict of interest but not positioned as dramatic or controversial. More of a methodological concern than attention-grabbing narrative."
    521     }
    522   },
    523   "hn_data": {
    524     "threads": [
    525       {
    526         "hn_id": "42550783",
    527         "title": "Gamma-ray bursts: what do we know today that we did not know 10 years ago?",
    528         "points": 16,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=42550783",
    531         "created_at": "2024-12-30T16:30:18Z"
    532       },
    533       {
    534         "hn_id": "43777601",
    535         "title": "Assistance or Disruption? Evaluating the Design of Proactive AI Programming",
    536         "points": 2,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=43777601",
    539         "created_at": "2025-04-23T23:02:02Z"
    540       },
    541       {
    542         "hn_id": "42566642",
    543         "title": "1.58-Bit Flux",
    544         "points": 2,
    545         "comments": 1,
    546         "url": "https://news.ycombinator.com/item?id=42566642",
    547         "created_at": "2025-01-01T15:38:39Z"
    548       },
    549       {
    550         "hn_id": "43265832",
    551         "title": "Evaluating Intelligence via Trial and Error",
    552         "points": 2,
    553         "comments": 0,
    554         "url": "https://news.ycombinator.com/item?id=43265832",
    555         "created_at": "2025-03-05T12:51:05Z"
    556       },
    557       {
    558         "hn_id": "43280105",
    559         "title": "Evaluating Intelligence via Trial and Error",
    560         "points": 1,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=43280105",
    563         "created_at": "2025-03-06T13:45:25Z"
    564       }
    565     ],
    566     "top_points": 16,
    567     "total_points": 23,
    568     "total_comments": 1
    569   }
    570 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs