scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19590B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Featurized-Decomposition Join: Low-Cost Semantic Joins with Guarantees",
      6     "authors": [
      7       "Sepanta Zeighami",
      8       "Shreya Shankar",
      9       "Aditya G. Parameswaran"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2512.05399",
     14     "doi": "10.48550/arXiv.2512.05399"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All major abstract claims are substantiated: the 10x cost reduction is backed by Table 3 (Movies: BARGAIN 69.9% vs FDJ 6.70%), statistical guarantees are proven in Theorem 7.1, and embedding limitations are demonstrated in Section 8.4 with controlled experiments.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims that featurized decomposition reduces cost are supported by controlled experiments comparing the same quality targets across 6 datasets; the mechanism (replacing quadratic LLM pair comparisons with linear feature extraction) is formally specified and measured.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper categorizes datasets into three performance tiers (Section 8.2) and explicitly states conditions on theorems (r ≤ 1/(1-T), k+ > 1/(1-T)); claims are bounded to the tested setting and dataset types.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 8.4 uses synthetic controlled experiments to systematically investigate why embedding-based approaches fail (multiple attribute values, irrelevant text length), presenting the causal mechanism rather than just the performance gap.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper clearly distinguishes cost ratio (monetary LLM token cost, the measurement) from quality (precision/recall, the claim); it explicitly acknowledges using false positive rate as a cost proxy and discusses extensions to more fine-grained cost models in Appendix F.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No dedicated limitations or threats-to-validity section exists; limitations are scattered across the paper (conditions on theorems, dataset-dependent performance) but not consolidated into a named section.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No threats-to-validity section exists; the paper discusses performance variation by dataset type and conditions on theorems, but does not systematically address threats such as LLM non-determinism, cost simulation vs. actual API calls, or sample size sensitivity.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Explicit scope boundaries are not stated as such; conditions on theorems are noted and dataset performance categories are discussed, but the paper does not explicitly delineate what the results do NOT apply to.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment or disclosure appears anywhere in the paper, despite the work involving a real-world police records project and substantial commercial API usage.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors list UC Berkeley as their affiliation on the title page with corresponding emails.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement or declaration of financial interests appears in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms receive formal mathematical definitions: 'semantic join' (Section 2), 'featurized decomposition,' 'featurized predicate,' 'logical scaffold,' 'featurized clause,' and 'featurization' are all precisely defined; Figure 3 provides a terminology summary.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The Contributions section explicitly lists four contributions: featurized decomposition as a new mechanism, the FDJ algorithm, novel high-dimensional statistical results generalizing prior 1D bounds, and experimental results across real-world datasets.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 9 engages substantively with LOTUS, BARGAIN, SUPG, entity resolution literature, and LLM-powered data management, explaining how FDJ generalizes, outperforms, or is orthogonal to each prior approach.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "theoretical": {
    118       "formal_quality": {
    119         "assumptions_stated_explicitly": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Assumptions are explicitly stated: LLM behavior for the NP-hardness proof (specific responses to two prompts, Appendix H.1), conditions on Theorem 6.1 (r ≤ 1/(1-T) and k+ > 1/(1-T)), and the rank-normalized dataset assumptions for Lemma H.4.",
    123           "source": "haiku"
    124         },
    125         "proofs_complete_or_sketched": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Appendix H contains complete proofs: NP-hardness via polynomial-time Set Cover reduction (H.1), Lemma 6.2 via incremental swap argument (H.2-H.2.3), Theorem 6.1 (H.3), and Lemma D.1 (H.4); the body provides proof sketches with explicit appendix references.",
    129           "source": "haiku"
    130         },
    131         "bounds_tight_or_discussed": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Tightness is explicitly proven: Lemma 6.2 identifies the worst-case dataset D* that maximizes failure probability, and the paper describes its analysis as a 'tight theoretical analysis'; the minimum adjusted target problem (Eq. 6) seeks the smallest valid T', proving the bound is not loose.",
    135           "source": "haiku"
    136         },
    137         "counterexamples_explored": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Section 8.4 uses controlled synthetic experiments to stress-test limits: increasing number of attribute values (Fig. 10a) and increasing irrelevant text length (Fig. 10b) systematically probe where the approach and its competitors break down.",
    141           "source": "haiku"
    142         },
    143         "notation_consistent": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Notation is introduced systematically in Section 2 and Figure 3, and used consistently throughout; the paper explicitly flags notation abuse (e.g., 'We abuse notation and refer to the featurization and its inference function interchangeably').",
    147           "source": "haiku"
    148         },
    149         "constructive_vs_existence_noted": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "FDJ (Algorithm 6) is explicitly constructive; the NP-hardness result is an existence result about the minimum cost, and the paper explicitly motivates the greedy Algorithm 4 as an approximation because the optimal solution is intractable to compute.",
    153           "source": "haiku"
    154         }
    155       },
    156       "connections": {
    157         "connection_to_practice_discussed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The police records matching application (California Police Records Access Project) is used as a running example throughout; Section 8 evaluates on 6 real-world domains; cost model uses actual OpenAI pricing; practical parameter settings are discussed in Appendix E.",
    161           "source": "haiku"
    162         },
    163         "relationship_to_prior_work_clear": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "The paper explicitly states it generalizes SUPG/BARGAIN's 1D threshold bounds to high dimensions; Section 9 positions FDJ relative to LOTUS, BARGAIN, entity resolution methods, and cost-efficient LLM processing, specifying what is extended vs. what is orthogonal.",
    167           "source": "haiku"
    168         },
    169         "computational_complexity_discussed": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Theorem 4.2 proves MCFD is NP-hard; Proposition E.1 gives the token cost complexity O(k+k'+|Y_hat|+|Phi|(|L|+|R|)); exhaustive vs. greedy threshold search is analyzed, with the greedy approach justified by the NP-hardness of finding the optimal solution.",
    173           "source": "haiku"
    174         },
    175         "limitations_of_formal_model_stated": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "The formal model's limitations — using false positive rate as a cost proxy, conditions that restrict the number of clauses, Monte Carlo approximation for probability estimation — are mentioned in passing but not systematically discussed as limitations of the formal model.",
    179           "source": "haiku"
    180         }
    181       }
    182     }
    183   },
    184   "claims": [
    185     {
    186       "claim": "FDJ reduces semantic join cost by up to 10x compared to state-of-the-art BARGAIN",
    187       "evidence": "Table 3: Movies dataset BARGAIN 69.9% cost ratio vs FDJ 6.70% (0.09x); Citations BARGAIN 28.0% vs FDJ 6.80% (0.24x)",
    188       "supported": "strong"
    189     },
    190     {
    191       "claim": "FDJ provides statistical guarantees on recall and precision (Theorem 7.1)",
    192       "evidence": "Theorem 7.1 is stated and proven in Appendix H.3; Table 2 shows FDJ meets 90% recall target with 7% failure rate vs. LOTUS 100% failure rate",
    193       "supported": "strong"
    194     },
    195     {
    196       "claim": "Minimum Cost Featurized Decomposition (MCFD) is NP-hard",
    197       "evidence": "Theorem 4.2 proven via polynomial-time reduction from Set Cover in Appendix H.1",
    198       "supported": "strong"
    199     },
    200     {
    201       "claim": "Embedding-based approaches fail when records contain multiple attribute values or irrelevant text",
    202       "evidence": "Figure 10a: optimal cascade cost ratio rises from ~0 to 0.6 as attribute count increases from 1 to 5; Figure 10b: optimal cascade degrades with even 2 additional sentences while FDJ stays stable",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "FDJ is up to 8x cheaper than the optimal cascade (lower bound on all cascade approaches)",
    207       "evidence": "Table 3: Movies optimal cascade 52.5% vs FDJ 6.70%; Citations optimal cascade 19.1% vs FDJ 6.80%",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "Iterative LLM featurization generation converges within 50 positive samples across all datasets",
    212       "evidence": "Appendix E states 'we observed that LLMs will stop creating new featurizations after observing at most 50 positive samples across all datasets' — empirical observation without formal bound",
    213       "supported": "moderate"
    214     },
    215     {
    216       "claim": "The threshold adjustment function provides tight statistical bounds generalizing 1D cascade bounds to high dimensions",
    217       "evidence": "Lemma 6.2 proves D* is the worst-case dataset maximizing failure probability; Theorem 6.1 formalizes the guarantee with explicit conditions; minimum adjusted target problem (Eq. 6) minimizes T'",
    218       "supported": "strong"
    219     }
    220   ],
    221   "methodology_tags": [
    222     "theoretical",
    223     "benchmark-eval"
    224   ],
    225   "key_findings": "FDJ introduces featurized decomposition — automatically constructed logical expressions in CNF over extracted text features — as an alternative to embedding-based model cascades for semantic joins, reducing LLM cost by up to 10x over BARGAIN while maintaining statistical guarantees on recall. The minimum cost featurized decomposition problem is proven NP-hard via reduction from Set Cover, motivating a greedy approximation with iterative LLM-guided feature generation. The paper provides novel tight statistical bounds for multi-dimensional threshold selection (generalizing prior 1D SUPG/BARGAIN bounds to r dimensions), proving the worst-case dataset has minimally correlated features. Empirically, gains are largest when join conditions depend on specific extractable features (date, location, names) and minimal for complex classification tasks where features are not clearly separable.",
    226   "red_flags": [
    227     {
    228       "flag": "No limitations section",
    229       "detail": "No dedicated limitations or threats-to-validity section; the near-zero improvement on BioDEX (0.99x vs BARGAIN) is described but not framed as a limitation of the approach."
    230     },
    231     {
    232       "flag": "No funding disclosure",
    233       "detail": "No acknowledgment of funding sources despite the work involving a real-world police records project (BIDS Berkeley), use of commercial OpenAI APIs (GPT-4.1, O3), and three UC Berkeley researchers."
    234     },
    235     {
    236       "flag": "Cost simulation rather than real LLM calls",
    237       "detail": "Experiments simulate LLM calls by using ground-truth labels and computing token counts from prompt construction; real-world latency, API variability, and actual LLM accuracy are not evaluated."
    238     },
    239     {
    240       "flag": "Theorem conditions restrict practical scope",
    241       "detail": "Theorem 6.1 requires r ≤ 1/(1-T) (e.g., at most 10 clauses for T=0.9) and k+ > 1/(1-T); the paper claims 'no practical need' to enforce these but acknowledges the constraint limits theoretical coverage."
    242     }
    243   ],
    244   "cited_papers": [
    245     {
    246       "title": "LOTUS: Enabling Semantic Queries with LLMs over Tables of Unstructured and Structured Data",
    247       "relevance": "Primary baseline for semantic joins with LLMs using model cascades; FDJ is positioned as outperforming LOTUS's cascade approach; LOTUS is excluded from experiments due to statistical guarantee failures"
    248     },
    249     {
    250       "title": "Cut Costs, Not Accuracy: LLM-Powered Data Processing with Guarantees (BARGAIN)",
    251       "relevance": "State-of-the-art baseline for model cascades with statistical guarantees; primary comparison throughout; FDJ extends BARGAIN's 1D threshold bounds to high dimensions"
    252     },
    253     {
    254       "title": "Approximate Selection with Guarantees using Proxies (SUPG)",
    255       "relevance": "Foundational work on statistical guarantees for proxy-based selection; FDJ generalizes SUPG's 1D bounds to multi-dimensional threshold setting"
    256     },
    257     {
    258       "title": "DocETL: Agentic Query Rewriting and Evaluation for Complex Document Processing",
    259       "relevance": "Related work on LLM-powered document processing that supports semantic joins; target system for FDJ integration"
    260     },
    261     {
    262       "title": "Deep Learning for Entity Matching: A Design Space Exploration",
    263       "relevance": "Entity resolution baseline; Products dataset is from this work; ER is framed as a special case of semantic joins"
    264     },
    265     {
    266       "title": "On the Theoretical Limitations of Embedding-Based Retrieval",
    267       "relevance": "Provides theoretical support for why embedding similarity fails for complex join conditions with multiple relevant features"
    268     },
    269     {
    270       "title": "BioDEX: Large-scale Biomedical Adverse Drug Event Extraction",
    271       "relevance": "One of the 6 evaluation datasets; multi-label classification task representing semantic joins at scale"
    272     },
    273     {
    274       "title": "LePaRD: A Large-scale Dataset of Judicial Citations to Precedent",
    275       "relevance": "Source of the Citations dataset used in experiments (legal argument self-join)"
    276     }
    277   ],
    278   "engagement_factors": {
    279     "practical_relevance": {
    280       "score": 3,
    281       "justification": "Directly addresses LLM cost reduction in production data systems (Snowflake, Databricks, AlloyDB are cited as deployers) with statistical guarantees; 10x cost reduction is commercially significant."
    282     },
    283     "surprise_contrarian": {
    284       "score": 2,
    285       "justification": "Challenges the embedding-similarity paradigm dominant in semantic join systems by proving its failure mode and showing feature extraction outperforms it by up to 8x even vs. an oracle cascade."
    286     },
    287     "fear_safety": {
    288       "score": 0,
    289       "justification": "No AI safety or risk concerns; this is a cost optimization paper for database query processing."
    290     },
    291     "drama_conflict": {
    292       "score": 1,
    293       "justification": "Shows LOTUS fails to meet recall targets 100% of the time and BARGAIN provides minimal gains on 80% of pairs for police records, which challenges published benchmarks of these systems."
    294     },
    295     "demo_ability": {
    296       "score": 2,
    297       "justification": "Source code is referenced (prompts available in source code, Appendix I); approach requires OpenAI API access; the police records running example provides a concrete applicable scenario."
    298     },
    299     "brand_recognition": {
    300       "score": 1,
    301       "justification": "UC Berkeley is well-known; Shankar and Parameswaran are recognized in the data management/LLM systems community (LOTUS, DocETL) but the paper is not from a major industry lab."
    302     }
    303   },
    304   "hn_data": {
    305     "threads": [
    306       {
    307         "hn_id": "45529216",
    308         "title": "DeepMind's paper reveals Google's new direction on RAG: In-Context Retreival",
    309         "points": 6,
    310         "comments": 1,
    311         "url": "https://news.ycombinator.com/item?id=45529216",
    312         "created_at": "2025-10-09T15:38:33Z"
    313       },
    314       {
    315         "hn_id": "42418821",
    316         "title": "Specifications: The missing link to make development of LLM an eng discipline",
    317         "points": 2,
    318         "comments": 0,
    319         "url": "https://news.ycombinator.com/item?id=42418821",
    320         "created_at": "2024-12-14T19:07:39Z"
    321       },
    322       {
    323         "hn_id": "33980774",
    324         "title": "Graph algorithms for predicting subcellular localization at the pathway level",
    325         "points": 1,
    326         "comments": 0,
    327         "url": "https://news.ycombinator.com/item?id=33980774",
    328         "created_at": "2022-12-14T06:51:58Z"
    329       },
    330       {
    331         "hn_id": "33453848",
    332         "title": "The friendship paradox in real and model networks (2020)",
    333         "points": 1,
    334         "comments": 0,
    335         "url": "https://news.ycombinator.com/item?id=33453848",
    336         "created_at": "2022-11-03T16:52:43Z"
    337       },
    338       {
    339         "hn_id": "29539537",
    340         "title": "Internet, on the Ground by Nick Merrill",
    341         "points": 1,
    342         "comments": 0,
    343         "url": "https://news.ycombinator.com/item?id=29539537",
    344         "created_at": "2021-12-13T13:49:47Z"
    345       }
    346     ],
    347     "top_points": 6,
    348     "total_points": 11,
    349     "total_comments": 1
    350   }
    351 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs