scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33301B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "eSapiens: A Platform for Secure and Auditable Retrieval-Augmented Generation",
      6     "authors": [
      7       "Isaac Shi",
      8       "Zeyuan Li",
      9       "Fan Liu",
     10       "Wenli Wang",
     11       "Lewei He"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2507.09588",
     16     "doi": "10.48550/arXiv.2507.09588"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract claims 'chunk size of 512 tokens yields the highest retrieval precision (Top-3 accuracy: 91.3%)' but the experimental tables show chunk sizes of 500 and 1000 (not 512), and the number 91.3% does not appear in any results table. This is a significant discrepancy between abstract and presented data.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims eSapiens 'delivers more context-consistent outputs with up to 23% improvement in factual alignment.' This implies a causal effect of the platform, but eSapiens and the FAISS baseline differ in multiple ways (retrieval strategy, reranking, prompting) with no controlled isolation of which component drives the difference.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper claims effectiveness 'for high-stakes domains like legal and finance' but tests only on legal QA datasets and RAGtruth. Finance, healthcare, and insurance are invoked repeatedly (Sections 1, 2, 3, 4) with no supporting evaluation. The title claims 'Secure and Auditable' but security is not experimentally tested.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are discussed for the observed results. For example, the Context Relevance improvement could be due to the hybrid retrieval rather than the full platform, but this is not explored.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The abstract calls Context Relevance improvements 'factual alignment' and frames retrieval metrics as evidence of 'trustworthy, auditable AI workflows.' The gap between the proxy (TRACe metrics on 100 questions) and the claimed outcome (trustworthy enterprise workflows) is not acknowledged.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no limitations, threats-to-validity, or discussion section in the paper. The paper ends with Appendix C (SQL examples) with no reflection on limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed anywhere in the paper.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit scope boundaries are stated. The paper broadly claims applicability to 'finance, healthcare, legal services' and 'high-stakes domains' without bounding what the evaluation does and does not show.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding disclosure, acknowledgments section, or grant information appears anywhere in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are listed under 'eSapiens Team' with the company URL https://www.esapiens.ai/. The affiliation with the evaluated product is clearly visible.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The authors are employees of the company whose product is being evaluated. The eSapiens company has a direct financial interest in positive evaluation results.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present. The authors clearly have financial interests (as employees/founders of eSapiens) but this is not explicitly declared.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "TRACe (the primary evaluation framework) is never defined or cited; 'eSapiens-claude-3.7-extended' is described as 'fine-tuned for long-context reasoning' without explanation of how or where Claude 3.7 was fine-tuned.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper is explicitly positioned as a platform system description: an enterprise RAG platform combining secure data access, hybrid retrieval, and no-code workflow orchestration.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 3 situates eSapiens against RAG literature (Lewis 2020, Izacard 2021), orchestration frameworks (LangChain, LlamaIndex), and domain-specific tools (ChatLaw, Lawyer-LLM).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No source code repository is provided. The paper links to https://www.esapiens.ai/ which is the commercial product website, not open-source code.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The evaluation uses publicly available datasets: four LegalBench subsets (PrivacyQA, CUAD, MAUD, ContractNLI) listed in Table 2, and RAGtruth for the generation quality evaluation (Table 5).",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions technology components (Elasticsearch 8.x, LangChain, FastAPI, OpenAIEmbeddings) in Section 6 but provides no requirements.txt, Dockerfile, or versioned dependency list sufficient to recreate the environment.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No reproduction instructions are provided. The evaluation setup is described at a high level but there are no steps, scripts, or commands to replicate the experiments.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables 3, 4, and 5 report only point estimates. No confidence intervals, error bars, or uncertainty measures appear anywhere in the paper.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims eSapiens 'outperforms' the FAISS baseline and shows 'up to 23% improvement' but no statistical significance tests are applied to any comparison.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The abstract mentions 'up to 23% improvement in factual alignment' but this is cherry-picked from the best-case model (GPT-4o-mini Context Relevance). No systematic effect size reporting across models or metrics. For DeepSeek R1, eSapiens Context Relevance is actually 24.8% worse than FAISS.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The generation quality experiment uses '100 random questions from RAGtruth' (Table 5 caption). No justification is given for why 100 is sufficient. The retrieval benchmark uses existing dataset sizes (712–3950 Q-A pairs) without power analysis.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread measures are reported for any experiment. Results appear to be single-run numbers.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Appendix B compares eSapiens against a 'baseline FAISS-based DEREK pipeline' across five LLMs on TRACe metrics (Table 5). Appendix A compares two chunk sizes.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The sole baseline is the authors' own simpler FAISS-based pipeline ('faiss+top-2+short'). No comparison against contemporary external RAG systems, frameworks, or academic baselines.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "The system has multiple components (DEREK, THOR, hybrid retrieval, reranking, CO-STAR prompting, LangGraph validation) but no ablation study isolates their individual contributions. The chunk size comparison (500 vs 1000) tests only one parameter.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The retrieval experiment reports Recall@k and Precision@k at multiple k values (Tables 3–4). The generation experiment reports five TRACe metrics: Completeness, Utilization, Context Relevance, pc_hallucinated, and Accuracy (Table 5).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "The Accuracy metric in Table 5 is described as 'Human-graded alignment with ground truth' (Section B.1), indicating human evaluation of system outputs.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "No mention of train/validation/test splits. The chunk size comparison (500 vs 1000) appears to be evaluated directly on the test data with no held-out validation set for parameter selection.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Retrieval results are broken down per dataset (PrivacyQA, CUAD, MAUD, ContractNLI) in Tables 3–4. Generation results are broken down per model in Table 5.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Appendix B discusses where eSapiens underperforms: higher hallucination rates than FAISS ('FAISS baseline consistently achieves lower hallucination rates'), lower completeness ('FAISS baseline yields higher completeness scores overall'), and explains why (e.g., 'eSapiens sometimes applies structural formatting or abstraction, leading to partial omission').",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Table 5 and the analysis in Appendix B honestly report that FAISS outperforms eSapiens on hallucination and completeness metrics. The paper notes the baseline 'yields stricter factual alignment with fewer hallucinations.'",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Section 6 lists 'GPT-4o mini, GPT-4o, GPT-4.1, Claude 3.7 Sonnet, Gemini 1.5 Pro, DeepSeek-R1 and V3.' These are marketing names without specific API versions or snapshot dates (e.g., no 'gpt-4o-2024-05-13').",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Section 5.6 mentions using the 'CO-STAR prompt format' and that GPT-4o rewrites queries, but the actual prompt text is never provided. Appendix C shows generated SQL queries but not the prompts used to generate them.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Chunk sizes (500/1000 tokens) and overlap (150 tokens) are stated. However, no LLM hyperparameters are reported: temperature, top-p, max tokens, or any generation parameters for the five models tested.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Sections 5.6–5.7 and 7.2–7.4 describe the multi-agent architecture in detail: supervisor agent routing, DEREK retrieval pipeline (query refinement → hybrid retrieval → answer generation → LangGraph validation), THOR SQL generation pipeline (orchestration → SQL generation → self-correction → result interpretation), with workflow diagrams in Figures 1 and 2.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "Document chunking parameters are stated (Section 5.6: 1000 tokens, 150 overlap), but no detail on how the evaluation datasets were preprocessed: how the '100 random questions' were selected from RAGtruth, how the LegalBench corpora were ingested and indexed, or any filtering steps.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw experimental data is released. The evaluation datasets are public benchmarks, but the eSapiens outputs, human evaluation scores, and retrieval results are not made available.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": false,
    276           "justification": "Table 2 lists the four LegalBench datasets with Q-A pair counts, and Table 5 mentions '100 random questions from RAGtruth.' However, no detail is given on how random selection was performed, what subset of RAGtruth was used, or how human grading for the Accuracy metric was conducted.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "The Accuracy metric is described as 'Human-graded alignment with ground truth' but the paper does not describe who the human evaluators were, how many there were, or how they were recruited.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "No documentation of the data pipeline from benchmark datasets to final results. The paper does not explain how documents were indexed, how retrieval was performed at evaluation time, or how the TRACe scores were computed.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the five LLMs used. While the evaluation is RAG-based, the models could have memorized benchmark answers.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether the LegalBench or RAGtruth benchmark content appeared in any model's training data.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "LegalBench and its subsets (CUAD, MAUD, ContractNLI, PrivacyQA) are publicly available benchmarks that predate the models' training. No contamination risk is discussed.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants as research subjects. The human grading of Accuracy is an evaluation methodology, not a human subjects study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants as research subjects.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants as research subjects.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants as research subjects.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants as research subjects.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants as research subjects.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants as research subjects.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or token consumption is reported for any experiment despite using five commercial LLM APIs.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget, API spend, or hardware specifications are stated for the evaluation experiments.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No mention of random seeds or sensitivity analysis. Results appear to be from single runs.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is not stated for any experiment.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "Only two chunk sizes (500, 1000) are compared. No search budget, search method, or other configurations tried are reported.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "The paper recommends chunk=1000 for production use based on test set performance without a separate validation set or justification for the selection process.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "Multiple models and metrics are compared across two pipelines but no statistical tests are performed at all, let alone corrections for multiple comparisons.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The eSapiens team evaluates their own platform against their own FAISS baseline. No acknowledgment of self-evaluation bias or independent evaluation.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "eSapiens uses hybrid retrieval with reranking while the FAISS baseline uses simpler top-2 retrieval. The compute cost difference is not discussed or controlled for.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "No discussion of whether LegalBench subsets or RAGtruth adequately measure what eSapiens claims to deliver (trustworthy, auditable enterprise workflows). The benchmarks test retrieval/generation quality but the claims extend far beyond.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": true,
    421           "answer": false,
    422           "justification": "eSapiens uses hybrid retrieval + reranking + CO-STAR prompting while the FAISS baseline uses simple top-2 retrieval with a restrictive prompt. Multiple scaffold differences are confounded but the performance difference is attributed to 'eSapiens' as a whole.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of whether the LLMs' training data includes the benchmark content. LegalBench subsets (CUAD from 2021, ContractNLI from 2021) predate most models tested.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the evaluation setup leaks information. The RAG pipeline provides retrieved context that may contain answer-adjacent information.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of potential overlap or similarity between questions used for tuning the retrieval pipeline and the 100 test questions from RAGtruth.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention method is applied.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "Chunk size of 512 tokens yields the highest retrieval precision with Top-3 accuracy of 91.3%",
    457       "evidence": "Tables 3 and 4 test chunk sizes 500 and 1000 only, not 512; 91.3% appears nowhere in the data; ALL-dataset Recall@4 for chunk=500 is 20.40%",
    458       "supported": "unsupported"
    459     },
    460     {
    461       "claim": "eSapiens delivers up to 23% improvement in factual alignment over FAISS baseline",
    462       "evidence": "Table 5 Context Relevance: eSapiens-gpt4o 0.3648 vs faiss-gpt4o 0.3294 (~10.7%); DeepSeek R1 is worse on eSapiens (0.2581 vs 0.3430); 23% not derivable from any table",
    463       "supported": "unsupported"
    464     },
    465     {
    466       "claim": "Monthly financial reporting time fell from two hours to twelve minutes for early adopters",
    467       "evidence": "Stated in Executive Summary as an anecdotal customer report with no methodology, sample size, controls, or attribution",
    468       "supported": "unsupported"
    469     },
    470     {
    471       "claim": "eSapiens achieves higher human-rated accuracy than FAISS baseline for most models",
    472       "evidence": "Table 5 Accuracy: eSapiens-claude-3.7=4.05 vs faiss-claude-3.7=3.75; eSapiens-gemini=4.0 vs faiss-gemini=3.75; eSapiens-gpt4o=3.85 vs faiss-gpt4o=3.55; though rater methodology not described",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "FAISS baseline achieves lower hallucination rates than eSapiens",
    477       "evidence": "Table 5: faiss+gpt4o pc_hallucinated=0.0875 vs eSapiens-gpt4o=0.1823; faiss+claude=0.0860 vs eSapiens-claude=0.1403; consistently observed and honestly reported",
    478       "supported": "strong"
    479     },
    480     {
    481       "claim": "Chunk size 1000 outperforms 500 in recall for most datasets at high k",
    482       "evidence": "Table 4 vs Table 3 at k=50: chunk=1000 outperforms chunk=500 on CUAD (62.30% vs 55.66%) but underperforms on ContractNLI (39.78% vs 46.90%); results are mixed",
    483       "supported": "weak"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval",
    488     "case-study"
    489   ],
    490   "key_findings": "eSapiens is a commercial enterprise RAG platform combining hybrid retrieval, no-code orchestration, and multi-agent validation. Retrieval benchmarks on legal corpora show chunk size 1000 generally outperforms 500 in recall at high k for most datasets. Generation quality experiments on 100 RAGtruth questions show eSapiens achieves higher human-rated accuracy and context relevance than a FAISS baseline but with substantially higher hallucination rates. The paper's headline abstract claims (512-token chunks, 91.3% Top-3 accuracy, 23% factual alignment improvement) cannot be verified from the presented data, and the paper's anecdotal business performance claims lack any methodology.",
    491   "red_flags": [
    492     {
    493       "flag": "Abstract-data mismatch",
    494       "detail": "Abstract claims '512 tokens' and '91.3% Top-3 accuracy' but experiments only test 500 and 1000 token chunks; the 91.3% figure appears nowhere in Tables 3 or 4."
    495     },
    496     {
    497       "flag": "Unverifiable headline improvement",
    498       "detail": "'Up to 23% improvement in factual alignment' cannot be derived from Table 5; actual Context Relevance differences are ~10% for favorable models and negative for DeepSeek R1."
    499     },
    500     {
    501       "flag": "Authors evaluate own commercial product",
    502       "detail": "The eSapiens Team evaluates their own commercial SaaS platform with no independent evaluation, third-party verification, or conflict-of-interest disclosure."
    503     },
    504     {
    505       "flag": "Implausible fine-tuning claim",
    506       "detail": "'eSapiens-claude-3.7-extended, a customized variant fine-tuned for long-context reasoning' — Anthropic does not offer fine-tuning access for Claude 3.7 Sonnet; this claim is unexplained and potentially misleading."
    507     },
    508     {
    509       "flag": "Anecdotal business claims",
    510       "detail": "Claims like '60% development cost reduction,' '40% ticket accuracy improvement,' and 'two hours to twelve minutes' are stated as facts without any methodology, controls, sample size, or attribution."
    511     },
    512     {
    513       "flag": "No limitations section",
    514       "detail": "Paper has no limitations, threats-to-validity, or scope boundaries section despite making broad enterprise AI claims across finance, healthcare, and legal domains."
    515     },
    516     {
    517       "flag": "No statistical rigor",
    518       "detail": "No confidence intervals, significance tests, or variance reporting; all results are single-run point estimates with no indication of reproducibility."
    519     },
    520     {
    521       "flag": "TRACe evaluation framework undefined",
    522       "detail": "TRACe is the primary evaluation framework used throughout Appendix B but is never defined, cited, or described sufficiently to understand what the metrics measure."
    523     }
    524   ],
    525   "cited_papers": [
    526     {
    527       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    528       "relevance": "Foundational RAG paper (Lewis et al. 2020) that eSapiens builds upon for knowledge retrieval architecture"
    529     },
    530     {
    531       "title": "Distilling Knowledge from Reader to Retriever for Question Answering (FiD)",
    532       "relevance": "Izacard & Grave 2021 — early retrieval-augmented generation approach compared as prior art"
    533     },
    534     {
    535       "title": "LangChain: Language Models in Chains",
    536       "relevance": "Core orchestration framework used in eSapiens implementation for prompt templating and tool calling"
    537     },
    538     {
    539       "title": "LlamaIndex (GPT Index)",
    540       "relevance": "Comparable modular LLM application framework discussed as prior work in the product comparison"
    541     },
    542     {
    543       "title": "Gorilla: Large Language Model Connected with Massive APIs",
    544       "relevance": "Paper on autonomous agent behavior and tool use, compared to the eSapiens approach in related work"
    545     },
    546     {
    547       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    548       "relevance": "Tool-use in LLMs, related to eSapiens multi-agent orchestration design"
    549     },
    550     {
    551       "title": "ChatLaw: Open-Source Legal Large Language Model Trained on Chinese Legal Documents",
    552       "relevance": "Domain-specific legal AI system compared as vertical SaaS prior work"
    553     },
    554     {
    555       "title": "Lawyer-LLM: An Expert-Level Chinese Legal Large Language Model",
    556       "relevance": "Domain-specific legal LLM compared in related work as a narrow vertical solution"
    557     }
    558   ],
    559   "engagement_factors": {
    560     "practical_relevance": {
    561       "score": 2,
    562       "justification": "Enterprise RAG platform with clear practical use cases (legal QA, SQL generation), but it is a commercial product — not open-source or freely usable."
    563     },
    564     "surprise_contrarian": {
    565       "score": 0,
    566       "justification": "Confirms standard expectations about RAG improving LLM outputs with no surprising or contrarian findings."
    567     },
    568     "fear_safety": {
    569       "score": 0,
    570       "justification": "No AI risk or security concerns raised; the paper emphasizes security as a selling point rather than exposing vulnerabilities."
    571     },
    572     "drama_conflict": {
    573       "score": 0,
    574       "justification": "No controversy or conflict; straightforward product evaluation paper."
    575     },
    576     "demo_ability": {
    577       "score": 1,
    578       "justification": "Commercial website exists (esapiens.ai) with demos mentioned, but no open-source code or free trial evident from the paper."
    579     },
    580     "brand_recognition": {
    581       "score": 0,
    582       "justification": "eSapiens is an unknown startup; none of the authors are widely recognized in the AI research community."
    583     }
    584   },
    585   "hn_data": {
    586     "threads": [
    587       {
    588         "hn_id": "41039213",
    589         "title": "Planck stars, White Holes, Remnants and Planck-mass quasi-particles",
    590         "points": 62,
    591         "comments": 32,
    592         "url": "https://news.ycombinator.com/item?id=41039213"
    593       },
    594       {
    595         "hn_id": "43708789",
    596         "title": "Eccfrog512ck2: An Enhanced 512-Bit Weierstrass Elliptic Curve [pdf]",
    597         "points": 45,
    598         "comments": 16,
    599         "url": "https://news.ycombinator.com/item?id=43708789"
    600       },
    601       {
    602         "hn_id": "43701195",
    603         "title": "Reasoning Models Can Be Effective Without Thinking",
    604         "points": 21,
    605         "comments": 2,
    606         "url": "https://news.ycombinator.com/item?id=43701195"
    607       },
    608       {
    609         "hn_id": "32097013",
    610         "title": "A Study of HTTP/2’s Server Push Performance Potential",
    611         "points": 21,
    612         "comments": 2,
    613         "url": "https://news.ycombinator.com/item?id=32097013"
    614       },
    615       {
    616         "hn_id": "44607842",
    617         "title": "BeePL: Correct-by-Compilation Kernel Extensions",
    618         "points": 4,
    619         "comments": 0,
    620         "url": "https://news.ycombinator.com/item?id=44607842"
    621       },
    622       {
    623         "hn_id": "44755879",
    624         "title": "TinyTroupe: An LLM-Powered Multiagent Persona Simulation Toolkit (OSS Paper)",
    625         "points": 3,
    626         "comments": 1,
    627         "url": "https://news.ycombinator.com/item?id=44755879"
    628       },
    629       {
    630         "hn_id": "44639814",
    631         "title": "Automated Hypothesis Validation with Agentic Sequential Falsifications",
    632         "points": 3,
    633         "comments": 0,
    634         "url": "https://news.ycombinator.com/item?id=44639814"
    635       },
    636       {
    637         "hn_id": "43935110",
    638         "title": "ZeroSearch: Incentivize the Search Capability of LLMs Without Searching",
    639         "points": 2,
    640         "comments": 0,
    641         "url": "https://news.ycombinator.com/item?id=43935110"
    642       },
    643       {
    644         "hn_id": "43175116",
    645         "title": "Maximizing Energy Efficiency in Subthreshold RISC-V Cores",
    646         "points": 2,
    647         "comments": 0,
    648         "url": "https://news.ycombinator.com/item?id=43175116"
    649       },
    650       {
    651         "hn_id": "44583158",
    652         "title": "TinyTroupe: An LLM-Powered Multiagent Persona Simulation Toolkit",
    653         "points": 1,
    654         "comments": 0,
    655         "url": "https://news.ycombinator.com/item?id=44583158"
    656       }
    657     ],
    658     "top_points": 62,
    659     "total_points": 164,
    660     "total_comments": 53
    661   }
    662 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs