scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28639B)
      1 {
      2   "paper": {
      3     "title": "eSapiens: A Platform for Secure and Auditable Retrieval-Augmented Generation",
      4     "authors": [
      5       "Isaac Shi",
      6       "Zeyuan Li",
      7       "Fan Liu",
      8       "Wenli Wang",
      9       "Lewei He",
     10       "Yang Yang",
     11       "Tianyu Shi"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2507.09588",
     16     "doi": "10.48550/arXiv.2507.09588"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval", "case-study"],
     21   "key_findings": "eSapiens is a commercial RAG platform evaluated on legal QA retrieval (four LegalBench subsets) and generation quality (TRACe metrics across five LLMs). The retrieval experiment finds chunk size 1000 generally outperforms 500 for recall, while the generation experiment shows eSapiens improves Context Relevance over a FAISS baseline but at the cost of higher hallucination rates and lower completeness. The paper's own data shows the FAISS baseline outperforms eSapiens on 3 of 5 TRACe metrics, undermining the paper's positive framing.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No source code repository is provided. The paper links to https://www.esapiens.ai/ which is the commercial product website, not open-source code."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The evaluation uses publicly available datasets: four LegalBench subsets (PrivacyQA, CUAD, MAUD, ContractNLI) listed in Table 2, and RAGtruth for the generation quality evaluation (Table 5)."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions technology components (Elasticsearch 8.x, LangChain, FastAPI, OpenAIEmbeddings) in Section 6 but provides no requirements.txt, Dockerfile, or versioned dependency list sufficient to recreate the environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No reproduction instructions are provided. The evaluation setup is described at a high level but there are no steps, scripts, or commands to replicate the experiments."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Tables 3, 4, and 5 report only point estimates. No confidence intervals, error bars, or uncertainty measures appear anywhere in the paper."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims eSapiens 'outperforms' the FAISS baseline and shows 'up to 23% improvement' but no statistical significance tests are applied to any comparison."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The abstract mentions 'up to 23% improvement in factual alignment' but this is cherry-picked from the best-case model (GPT-4o-mini Context Relevance). No systematic effect size reporting across models or metrics. For DeepSeek R1, eSapiens Context Relevance is actually 24.8% worse than FAISS."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The generation quality experiment uses '100 random questions from RAGtruth' (Table 5 caption). No justification is given for why 100 is sufficient. The retrieval benchmark uses existing dataset sizes (712–3950 Q-A pairs) without power analysis."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance, standard deviation, or spread measures are reported for any experiment. Results appear to be single-run numbers."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Appendix B compares eSapiens against a 'baseline FAISS-based DEREK pipeline' across five LLMs on TRACe metrics (Table 5). Appendix A compares two chunk sizes."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "The sole baseline is the authors' own simpler FAISS-based pipeline ('faiss+top-2+short'). No comparison against contemporary external RAG systems, frameworks, or academic baselines."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The system has multiple components (DEREK, THOR, hybrid retrieval, reranking, CO-STAR prompting, LangGraph validation) but no ablation study isolates their individual contributions. The chunk size comparison (500 vs 1000) tests only one parameter."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The retrieval experiment reports Recall@k and Precision@k at multiple k values (Tables 3–4). The generation experiment reports five TRACe metrics: Completeness, Utilization, Context Relevance, pc_hallucinated, and Accuracy (Table 5)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The Accuracy metric in Table 5 is described as 'Human-graded alignment with ground truth' (Section B.1), indicating human evaluation of system outputs."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No mention of train/validation/test splits. The chunk size comparison (500 vs 1000) appears to be evaluated directly on the test data with no held-out validation set for parameter selection."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Retrieval results are broken down per dataset (PrivacyQA, CUAD, MAUD, ContractNLI) in Tables 3–4. Generation results are broken down per model in Table 5."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Appendix B discusses where eSapiens underperforms: higher hallucination rates than FAISS ('FAISS baseline consistently achieves lower hallucination rates'), lower completeness ('FAISS baseline yields higher completeness scores overall'), and explains why (e.g., 'eSapiens sometimes applies structural formatting or abstraction, leading to partial omission')."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Table 5 and the analysis in Appendix B honestly report that FAISS outperforms eSapiens on hallucination and completeness metrics. The paper notes the baseline 'yields stricter factual alignment with fewer hallucinations.'"
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The abstract claims 'chunk size of 512 tokens yields the highest retrieval precision (Top-3 accuracy: 91.3%)' but the experimental tables show chunk sizes of 500 and 1000 (not 512), and the number 91.3% does not appear in any results table. This is a significant discrepancy between abstract and presented data."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper claims eSapiens 'delivers more context-consistent outputs with up to 23% improvement in factual alignment.' This implies a causal effect of the platform, but eSapiens and the FAISS baseline differ in multiple ways (retrieval strategy, reranking, prompting) with no controlled isolation of which component drives the difference."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper claims effectiveness 'for high-stakes domains like legal and finance' but tests only on legal QA datasets and RAGtruth. Finance, healthcare, and insurance are invoked repeatedly (Sections 1, 2, 3, 4) with no supporting evaluation. The title claims 'Secure and Auditable' but security is not experimentally tested."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No alternative explanations are discussed for the observed results. For example, the Context Relevance improvement could be due to the hybrid retrieval rather than the full platform, but this is not explored."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The abstract calls Context Relevance improvements 'factual alignment' and frames retrieval metrics as evidence of 'trustworthy, auditable AI workflows.' The gap between the proxy (TRACe metrics on 100 questions) and the claimed outcome (trustworthy enterprise workflows) is not acknowledged."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Section 6 lists 'GPT-4o mini, GPT-4o, GPT-4.1, Claude 3.7 Sonnet, Gemini 1.5 Pro, DeepSeek-R1 and V3.' These are marketing names without specific API versions or snapshot dates (e.g., no 'gpt-4o-2024-05-13')."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "Section 5.6 mentions using the 'CO-STAR prompt format' and that GPT-4o rewrites queries, but the actual prompt text is never provided. Appendix C shows generated SQL queries but not the prompts used to generate them."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "Chunk sizes (500/1000 tokens) and overlap (150 tokens) are stated. However, no LLM hyperparameters are reported: temperature, top-p, max tokens, or any generation parameters for the five models tested."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Sections 5.6–5.7 and 7.2–7.4 describe the multi-agent architecture in detail: supervisor agent routing, DEREK retrieval pipeline (query refinement → hybrid retrieval → answer generation → LangGraph validation), THOR SQL generation pipeline (orchestration → SQL generation → self-correction → result interpretation), with workflow diagrams in Figures 1 and 2."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "Document chunking parameters are stated (Section 5.6: 1000 tokens, 150 overlap), but no detail on how the evaluation datasets were preprocessed: how the '100 random questions' were selected from RAGtruth, how the LegalBench corpora were ingested and indexed, or any filtering steps."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "There is no limitations, threats-to-validity, or discussion section in the paper. The paper ends with Appendix C (SQL examples) with no reflection on limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed anywhere in the paper."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit scope boundaries are stated. The paper broadly claims applicability to 'finance, healthcare, legal services' and 'high-stakes domains' without bounding what the evaluation does and does not show."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No raw experimental data is released. The evaluation datasets are public benchmarks, but the eSapiens outputs, human evaluation scores, and retrieval results are not made available."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "Table 2 lists the four LegalBench datasets with Q-A pair counts, and Table 5 mentions '100 random questions from RAGtruth.' However, no detail is given on how random selection was performed, what subset of RAGtruth was used, or how human grading for the Accuracy metric was conducted."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The Accuracy metric is described as 'Human-graded alignment with ground truth' but the paper does not describe who the human evaluators were, how many there were, or how they were recruited."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No documentation of the data pipeline from benchmark datasets to final results. The paper does not explain how documents were indexed, how retrieval was performed at evaluation time, or how the TRACe scores were computed."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding disclosure, acknowledgments section, or grant information appears anywhere in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All authors are listed under 'eSapiens Team' with the company URL https://www.esapiens.ai/. The affiliation with the evaluated product is clearly visible."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The authors are employees of the company whose product is being evaluated. The eSapiens company has a direct financial interest in positive evaluation results."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present. The authors clearly have financial interests (as employees/founders of eSapiens) but this is not explicitly declared."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the five LLMs used. While the evaluation is RAG-based, the models could have memorized benchmark answers."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether the LegalBench or RAGtruth benchmark content appeared in any model's training data."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "LegalBench and its subsets (CUAD, MAUD, ContractNLI, PrivacyQA) are publicly available benchmarks released before the evaluated models were trained, so their content may have appeared in the models' training data. The paper does not discuss this contamination risk."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants as research subjects. The human grading of Accuracy is an evaluation methodology, not a human subjects study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants as research subjects."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants as research subjects."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants as research subjects."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants as research subjects."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants as research subjects."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants as research subjects."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference cost, latency, or token consumption is reported for any experiment despite using five commercial LLM APIs."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No total computational budget, API spend, or hardware specifications are stated for the evaluation experiments."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of random seeds or sensitivity analysis. Results appear to be from single runs."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is not stated for any experiment."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Only two chunk sizes (500, 1000) are compared. No search budget, search method, or other configurations tried are reported."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The paper recommends chunk=1000 for production use based on test set performance without a separate validation set or justification for the selection process."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Multiple models and metrics are compared across two pipelines but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The eSapiens team evaluates their own platform against their own FAISS baseline. No acknowledgment of self-evaluation bias or independent evaluation."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "eSapiens uses hybrid retrieval with reranking while the FAISS baseline uses simpler top-2 retrieval. The compute cost difference is not discussed or controlled for."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "No discussion of whether LegalBench subsets or RAGtruth adequately measure what eSapiens claims to deliver (trustworthy, auditable enterprise workflows). The benchmarks test retrieval/generation quality but the claims extend far beyond."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "eSapiens uses hybrid retrieval + reranking + CO-STAR prompting while the FAISS baseline uses simple top-2 retrieval with a restrictive prompt. Multiple scaffold differences are confounded but the performance difference is attributed to 'eSapiens' as a whole."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the LLMs' training data includes the benchmark content. LegalBench subsets (CUAD from 2021, ContractNLI from 2021) predate most models tested."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks information. The RAG pipeline provides retrieved context that may contain answer-adjacent information."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of potential overlap or similarity between questions used for tuning the retrieval pipeline and the 100 test questions from RAGtruth."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention method is applied."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Chunk size of 512 tokens yields the highest retrieval precision with Top-3 accuracy of 91.3%",
    373       "evidence": "Abstract states this claim, but experimental tables (Tables 3 and 4) show chunk sizes of 500 and 1000 — not 512. The number 91.3% does not appear in any results table.",
    374       "supported": "unsupported"
    375     },
    376     {
    377       "claim": "eSapiens delivers up to 23% improvement in factual alignment compared to FAISS baseline",
    378       "evidence": "Table 5 Context Relevance: eSapiens-gpt4o-mini (0.3785) vs faiss-gpt4o-mini (0.3090) = ~22.5% improvement. However, this is the best-case model. For DeepSeek R1, eSapiens is 24.8% worse (0.2581 vs 0.3430). Average improvement across models is much smaller.",
    379       "supported": "weak"
    380     },
    381     {
    382       "claim": "eSapiens outperforms traditional FAISS-based methods in context relevance, utilization, accuracy, and naturalness",
    383       "evidence": "Table 5 shows eSapiens wins on Context Relevance for 4/5 models and Accuracy for 4/5 models. However, FAISS wins on Completeness (5/5 models), has the lower hallucination rate (5/5 models), and wins on Utilization (3/5 models). The paper's own data contradicts its framing of general superiority.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "Chunk size 1000 is better aligned with production needs, offering broader coverage and lower information loss",
    388       "evidence": "Tables 3–4 and Appendix A analysis show chunk=1000 outperforms 500 on recall for CUAD, MAUD, and ContractNLI at high k values, while chunk=500 is better for PrivacyQA. Reasonable conclusion for 3/4 datasets.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Monthly financial reporting time fell from two hours to twelve minutes",
    393       "evidence": "Section 1 (Executive Summary) states this as an early adopter report. No methodology, sample size, measurement procedure, or data is provided.",
    394       "supported": "unsupported"
    395     },
    396     {
    397       "claim": "Development costs slashed by up to 60 percent",
    398       "evidence": "Section 1 (Executive Summary) states this as a benefit. No data, methodology, or supporting evidence is provided.",
    399       "supported": "unsupported"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "Company evaluating its own product",
    405       "detail": "All seven authors are from the 'eSapiens Team' and evaluate the eSapiens commercial platform. No independent evaluation or acknowledgment of self-evaluation bias. The paper functions partly as marketing material with extensive product description."
    406     },
    407     {
    408       "flag": "Abstract claims do not match presented data",
    409       "detail": "The abstract claims 'chunk size of 512 tokens yields the highest retrieval precision (Top-3 accuracy: 91.3%)' but the experimental tables show chunk sizes of 500 and 1000, and 91.3% appears nowhere in the results. This is a factual discrepancy between the abstract and the paper's own data."
    410     },
    411     {
    412       "flag": "Marketing claims without evidence",
    413       "detail": "The Executive Summary claims '60% cost reduction,' 'reporting time from 2 hours to 12 minutes,' '40% accuracy improvement in ticket categorization,' and 'double-digit' lead-to-deal velocity gains. None of these are supported by any presented data, methodology, or even citations."
    414     },
    415     {
    416       "flag": "Cherry-picked 'up to' improvements",
    417       "detail": "The '23% improvement in factual alignment' comes from the single best model (GPT-4o-mini on Context Relevance). For DeepSeek R1, eSapiens is actually 24.8% worse on the same metric. The 'up to' framing hides that the FAISS baseline outperforms eSapiens on 3 of 5 TRACe metrics across most models."
    418     },
    419     {
    420       "flag": "Selective framing of results",
    421       "detail": "The FAISS baseline achieves lower hallucination rates (consistently, across all 5 models) and higher completeness (all 5 models), yet the paper concludes eSapiens shows 'clear gains' and 'effectiveness in enabling trustworthy, auditable AI workflows.' The word 'trustworthy' is undercut by higher hallucination rates."
    422     },
    423     {
    424       "flag": "No error bars or statistical tests on any comparison",
    425       "detail": "All results are single-point estimates with no uncertainty quantification. Claims of superiority rest on comparing raw numbers without any significance testing, despite small margins on some metrics."
    426     },
    427     {
    428       "flag": "No limitations section",
    429       "detail": "The paper contains no limitations, discussion, or threats-to-validity section. It acknowledges no weaknesses in the evaluation design or scope of claims."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    435       "authors": ["Patrick Lewis"],
    436       "year": 2020,
    437       "relevance": "Foundational RAG paper establishing the retrieval-augmented generation paradigm that eSapiens builds upon."
    438     },
    439     {
    440       "title": "Distilling Knowledge from Reader to Retriever for Question Answering",
    441       "authors": ["Gautier Izacard", "Edouard Grave"],
    442       "year": 2021,
    443       "relevance": "FiD architecture for tightly coupling retrievers with generators, cited as early RAG research prototype."
    444     },
    445     {
    446       "title": "LangChain: Language Models in Chains",
    447       "authors": ["Harrison Chase"],
    448       "year": 2022,
    449       "relevance": "LLM orchestration framework used as the backbone of eSapiens' prompt orchestration and agent workflows."
    450     },
    451     {
    452       "title": "LlamaIndex (GPT Index)",
    453       "authors": ["Jerry Liu"],
    454       "year": 2022,
    455       "relevance": "Modular toolchain for building LLM-powered applications, cited as comparable developer-centric framework."
    456     },
    457     {
    458       "title": "Gorilla: Large Language Model Connected with Massive APIs",
    459       "authors": ["Chinmay H. Patil"],
    460       "year": 2023,
    461       "arxiv_id": "2305.15334",
    462       "relevance": "Demonstrates autonomous agent behaviors and tool use capabilities relevant to LLM-based agentic workflows."
    463     },
    464     {
    465       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    466       "authors": ["Timo Schick"],
    467       "year": 2023,
    468       "arxiv_id": "2302.04761",
    469       "relevance": "Demonstrates LLM tool-use capabilities relevant to agentic AI systems and API-calling workflows."
    470     },
    471     {
    472       "title": "ChatLaw: Open-Source Legal Large Language Model Trained on Chinese Legal Documents",
    473       "authors": ["Xiaofei Gao"],
    474       "year": 2023,
    475       "arxiv_id": "2305.14251",
    476       "relevance": "Domain-specific legal LLM relevant to evaluating AI in high-stakes professional domains."
    477     },
    478     {
    479       "title": "Lawyer LLM: An Expert-Level Chinese Legal Large Language Model",
    480       "authors": ["Weicheng Fan"],
    481       "year": 2023,
    482       "arxiv_id": "2310.10472",
    483       "relevance": "Another domain-specific legal AI system, relevant for understanding vertical AI evaluation in professional settings."
    484     }
    485   ],
    486   "engagement_factors": {
    487     "practical_relevance": {
    488       "score": 2,
    489       "justification": "Enterprise RAG platform with clear practical use cases (legal QA, SQL generation), but it is a commercial product — not open-source or freely usable."
    490     },
    491     "surprise_contrarian": {
    492       "score": 0,
    493       "justification": "Confirms standard expectations about RAG improving LLM outputs with no surprising or contrarian findings."
    494     },
    495     "fear_safety": {
    496       "score": 0,
    497       "justification": "No AI risk or security concerns raised; the paper emphasizes security as a selling point rather than exposing vulnerabilities."
    498     },
    499     "drama_conflict": {
    500       "score": 0,
    501       "justification": "No controversy or conflict; straightforward product evaluation paper."
    502     },
    503     "demo_ability": {
    504       "score": 1,
    505       "justification": "Commercial website exists (esapiens.ai) with demos mentioned, but no open-source code or free trial evident from the paper."
    506     },
    507     "brand_recognition": {
    508       "score": 0,
    509       "justification": "eSapiens is an unknown startup; none of the authors are widely recognized in the AI research community."
    510     }
    511   }
    512 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs