scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (18276B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Featurized-Decomposition Join: Low-Cost Semantic Joins with Guarantees",
      6     "authors": [
      7       "Sepanta Zeighami",
      8       "Shreya Shankar",
      9       "Aditya G. Parameswaran"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2512.05399",
     14     "doi": "10.48550/arXiv.2512.05399"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims 'up to 10 times reduction in cost' — supported by Table 3 (Movies: 6.70 vs 69.9 = ~10x). 'providing the same quality guarantees' — supported by Theorem 7.1 and Table 2 empirical validation.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper's causal claims (FDJ reduces cost) are supported by controlled experimental comparison with baselines on 6 datasets, theoretical analysis with proofs (Theorems 4.2, 6.1, 7.1), and systematic analysis of data characteristics (Sec 8.4).",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper bounds claims: 'up to 10 times' (not always), discusses when gains are limited (classification tasks, Sec 8.2), and systematically analyzes factors affecting performance (number of attributes, text length, Sec 8.4).",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper explains why FDJ works and why embeddings fail but does not consider alternative explanations for observed results, such as whether dataset-specific tuning or the choice of LLM/embedding model could explain the performance differences.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures cost ratio and recall/precision directly — these exactly match the claimed contributions (cost reduction with quality guarantees). No proxy gap between measurement and claims.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No dedicated limitations section. The paper proceeds from experiments (Sec 8) to related work (Sec 9) to conclusion (Sec 10) without discussing limitations.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No threats-to-validity discussion. No mention of threats specific to this study's design or evaluation methodology.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not explicitly state what the results do NOT show. While Sec 8.2 discusses dataset categories where FDJ is less effective, there are no explicit scope boundary statements.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding information or acknowledgments section appears in the paper.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors list UC Berkeley affiliations and email addresses. They are not employees of OpenAI (whose models are used in evaluation).",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence of funder cannot be verified.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial disclosure statement appears in the paper. Two authors (Shankar, Parameswaran) are co-authors of LOTUS [46], a baseline system.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Semantic join, featurized decomposition, featurized predicate, featurized clause, logical scaffold, featurization, and cost ratio are all formally defined with mathematical notation in Sections 2–3.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 contains an explicit four-bullet contributions list: featurized decomposition as a concept, FDJ as an algorithm, novel multi-dimensional threshold statistical results, and experimental validation.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 9 situates FDJ against LOTUS, BARGAIN, SUPG, traditional entity resolution, and deep learning ER methods, explaining mechanistically why FDJ differs from and outperforms each approach.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "theoretical": {
    118       "formal_quality": {
    119         "assumptions_stated_explicitly": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Theorem 6.1 explicitly states conditions (r ≤ 1/(1−T) and k+ > 1/(1−T)); the NP-hardness proof in Appendix H.1 states specific LLM behavioral assumptions and notes they were empirically verified.",
    123           "source": "haiku"
    124         },
    125         "proofs_complete_or_sketched": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Full proofs for all theorems and lemmas are in Appendix H; the NP-hardness reduction from Set Cover (H.1), the worst-case dataset lemma (H.2), and the guarantee theorem (H.3) are all given with complete arguments.",
    129           "source": "haiku"
    130         },
    131         "bounds_tight_or_discussed": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Tightness is explicitly claimed and demonstrated: Lemma 6.2 identifies the worst-case dataset D* that achieves the maximum in the minimum adjusted target problem, and the paper states 'tight theoretical analysis' as a contribution.",
    135           "source": "haiku"
    136         },
    137         "counterexamples_explored": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Section 8.4 systematically constructs synthetic datasets to isolate failure conditions for embedding-based approaches; the BioDEX result (0.99× gain) represents an empirical near-counterexample that is explained mechanistically.",
    141           "source": "haiku"
    142         },
    143         "notation_consistent": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Notation is defined in Figure 3 (terminology summary) and used consistently: L, R, p, T, δ, Π, κ, π, φ, θ are all introduced once and applied uniformly across algorithms and proofs.",
    147           "source": "haiku"
    148         },
    149         "constructive_vs_existence_noted": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "The NP-hardness proof establishes that an optimal solution exists but cannot be found in polynomial time unless P = NP; Algorithms 1–6 provide constructive (greedy) approximations, and the distinction is made explicit in Section 4.2.",
    153           "source": "haiku"
    154         }
    155       },
    156       "connections": {
    157         "connection_to_practice_discussed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The paper motivates the work with a real-world police records matching project (California Police Records Access Project), and Section 8 validates on six real-world datasets including biomedical and legal domains.",
    161           "source": "haiku"
    162         },
    163         "relationship_to_prior_work_clear": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Section 3 explicitly frames FDJ as generalizing BARGAIN's 1D threshold selection to high dimensions; the paper traces theoretical lineage through SUPG [28], LOTUS [46], and BARGAIN [65] with precise comparisons.",
    167           "source": "haiku"
    168         },
    169         "computational_complexity_discussed": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Theorem 4.2 proves NP-hardness of optimal featurized decomposition; Proposition E.1 gives token cost complexity O(κt(k+k'+ |Ŷ| + |Φ|(|L|+|R|))); tractability of the greedy approximation is discussed in Appendix G.",
    173           "source": "haiku"
    174         },
    175         "limitations_of_formal_model_stated": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Appendix F acknowledges that the false-positive-rate cost proxy is a simplification and provides a more accurate token-based cost model; Theorem 6.1 conditions acknowledge the sample size requirement is an artifact of the proof technique.",
    179           "source": "haiku"
    180         }
    181       }
    182     }
    183   },
    184   "claims": [
    185     {
    186       "claim": "FDJ achieves up to 10× cost reduction over BARGAIN (state-of-the-art) while maintaining the same statistical quality guarantees",
    187       "evidence": "Table 3: Movies dataset shows BARGAIN cost ratio 69.9 vs FDJ 6.70 (0.09×); across 6 datasets FDJ improves on BARGAIN in all cases",
    188       "supported": "strong"
    189     },
    190     {
    191       "claim": "FDJ achieves up to 8× cost reduction over the optimal embedding-based cascade (an oracle lower bound on any cascade approach)",
    192       "evidence": "Table 3: Movies optimal cascade 52.5 vs FDJ 6.70 (0.75× vs 0.09× of BARGAIN's 69.9); Citations optimal cascade 19.1 vs FDJ 6.80",
    193       "supported": "strong"
    194     },
    195     {
    196       "claim": "Finding the minimum-cost featurized decomposition is NP-hard",
    197       "evidence": "Theorem 4.2 proved via polynomial-time reduction from Set Cover in Appendix H.1, with explicit construction of the reduction",
    198       "supported": "strong"
    199     },
    200     {
    201       "claim": "FDJ provides provable statistical guarantees on recall (and precision) of the join result",
    202       "evidence": "Theorem 7.1 formally proves P(R(Ȳ) < T) ≤ δ; the adjusted target function is proved tight via Lemma 6.2 identifying the worst-case dataset",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "Embedding-based semantic similarity is a poor proxy for join outcome when records contain multiple attribute values or irrelevant text",
    207       "evidence": "Figure 10: optimal cascade cost ratio degrades from ~0 to ~0.4 as persons mentioned increases from 1 to 5; FDJ maintains near-zero cost across all settings",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "LOTUS (using SUPG) fails to meet recall targets on finite datasets, failing up to 75% of the time",
    212       "evidence": "Table 2: LOTUS fails to meet 90% recall target on every run on BioDEX (average recall below 80%); cites [65] reporting similar results",
    213       "supported": "moderate"
    214     }
    215   ],
    216   "methodology_tags": [
    217     "benchmark-eval",
    218     "theoretical"
    219   ],
    220   "key_findings": "FDJ proposes featurized decomposition — automatically extracting domain-specific features from text records and combining them into CNF logical expressions to prune non-matching pairs before expensive LLM evaluation. The core theoretical contribution is a tight multi-dimensional generalization of existing 1D cascade threshold bounds, identifying the worst-case dataset that maximizes the probability of the adjusted target being too conservative. Experiments on 6 real-world datasets show 2–10× cost reduction over BARGAIN and up to 8× over the oracle optimal cascade, with the largest gains on tasks where a small number of features accurately characterize the join condition (entity matching) and minimal gains on complex multi-label classification (BioDEX: 0.99×). The NP-hardness proof establishes that optimal decomposition is intractable, justifying the greedy construction.",
    221   "red_flags": [
    222     {
    223       "flag": "No limitations section",
    224       "detail": "The paper has no dedicated limitations or threats-to-validity discussion; failure modes, scope boundaries, and generalizability concerns are absent."
    225     },
    226     {
    227       "flag": "Simulated LLM calls",
    228       "detail": "Experiments simulate LLM calls by returning known ground-truth labels and computing token costs analytically (Section 8.1). This avoids actual LLM variance, latency, and prompt-sensitivity that would affect real deployments."
    229     },
    230     {
    231       "flag": "No funding disclosure",
    232       "detail": "No funding sources or acknowledgments appear anywhere in the paper; potential industrial affiliations or grants cannot be assessed."
    233     },
    234     {
    235       "flag": "Small calibration sample for featurization",
    236       "detail": "Only 50 positive samples are used for featurization generation and scaffold construction (Section 8.1), while 200 total are used for threshold setting; sensitivity to this choice is not fully analyzed."
    237     },
    238     {
    239       "flag": "Up-to framing obscures typical case",
    240       "detail": "The headline '10× reduction' reflects the best-case dataset (Movies); two of six datasets show minimal improvement (BioDEX 0.99×, Categorize 0.83×), which receives less emphasis in the abstract."
    241     }
    242   ],
    243   "cited_papers": [
    244     {
    245       "title": "LOTUS: Enabling Semantic Queries with LLMs over Tables of Unstructured and Structured Data",
    246       "relevance": "Primary comparison baseline; FDJ directly improves on LOTUS's model cascade approach for semantic joins"
    247     },
    248     {
    249       "title": "Cut Costs, Not Accuracy: LLM-Powered Data Processing with Guarantees (BARGAIN)",
    250       "relevance": "State-of-the-art baseline FDJ compares against; shares the statistical guarantees framework that FDJ generalizes"
    251     },
    252     {
    253       "title": "Approximate Selection with Guarantees using Proxies (SUPG)",
    254       "relevance": "Foundational 1D threshold selection method that FDJ's multi-dimensional bounds generalize"
    255     },
    256     {
    257       "title": "DocETL: Agentic Query Rewriting and Evaluation for Complex Document Processing",
    258       "relevance": "Related LLM-powered data processing system that semantic joins (as optimized by FDJ) could accelerate"
    259     },
    260     {
    261       "title": "Decomposed Prompting: A Modular Approach for Solving Complex Tasks",
    262       "relevance": "Best practices for multi-step LLM pipelines that FDJ's featurization generation procedure follows"
    263     },
    264     {
    265       "title": "FrugalGPT: How to Use Large Language Models while Reducing Cost and Improving Performance",
    266       "relevance": "Related work on cost-efficient LLM usage via model cascades"
    267     },
    268     {
    269       "title": "Deep Learning for Entity Matching: A Design Space Exploration",
    270       "relevance": "Key prior work on entity resolution that FDJ's semantic join framework generalizes"
    271     },
    272     {
    273       "title": "On the Theoretical Limitations of Embedding-based Retrieval",
    274       "relevance": "Theoretical support for FDJ's central empirical observation that embeddings fail to capture multi-attribute join conditions"
    275     }
    276   ],
    277   "engagement_factors": {
    278     "practical_relevance": {
    279       "score": 3,
    280       "justification": "Directly applicable to any LLM-powered data system with semantic joins; integrates with existing systems listed in the paper (Snowflake, Databricks, AlloyDB) and reduces costs up to 10× on real tasks."
    281     },
    282     "surprise_contrarian": {
    283       "score": 2,
    284       "justification": "Challenges the assumption that embedding similarity is a good proxy for semantic join quality, showing empirically and theoretically that feature extraction dominates in practice."
    285     },
    286     "fear_safety": {
    287       "score": 0,
    288       "justification": "No AI safety or risk concerns; the paper is about cost optimization in data systems."
    289     },
    290     "drama_conflict": {
    291       "score": 1,
    292       "justification": "Directly competes with and outperforms LOTUS/BARGAIN (recent publications), but the framing is collegial rather than confrontational."
    293     },
    294     "demo_ability": {
    295       "score": 2,
    296       "justification": "Source code is referenced in Appendix I and the police records use case is a real deployed project; practitioners could apply this to real entity matching tasks."
    297     },
    298     "brand_recognition": {
    299       "score": 1,
    300       "justification": "UC Berkeley is a respected academic institution but not a major AI lab; no famous product or company involvement."
    301     }
    302   },
    303   "hn_data": {
    304     "threads": [
    305       {
    306         "hn_id": "45529216",
    307         "title": "DeepMind's paper reveals Google's new direction on RAG: In-Context Retreival",
    308         "points": 6,
    309         "comments": 1,
    310         "url": "https://news.ycombinator.com/item?id=45529216",
    311         "created_at": "2025-10-09T15:38:33Z"
    312       },
    313       {
    314         "hn_id": "42418821",
    315         "title": "Specifications: The missing link to make development of LLM an eng discipline",
    316         "points": 2,
    317         "comments": 0,
    318         "url": "https://news.ycombinator.com/item?id=42418821",
    319         "created_at": "2024-12-14T19:07:39Z"
    320       },
    321       {
    322         "hn_id": "33980774",
    323         "title": "Graph algorithms for predicting subcellular localization at the pathway level",
    324         "points": 1,
    325         "comments": 0,
    326         "url": "https://news.ycombinator.com/item?id=33980774",
    327         "created_at": "2022-12-14T06:51:58Z"
    328       },
    329       {
    330         "hn_id": "33453848",
    331         "title": "The friendship paradox in real and model networks (2020)",
    332         "points": 1,
    333         "comments": 0,
    334         "url": "https://news.ycombinator.com/item?id=33453848",
    335         "created_at": "2022-11-03T16:52:43Z"
    336       },
    337       {
    338         "hn_id": "29539537",
    339         "title": "Internet, on the Ground by Nick Merrill",
    340         "points": 1,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=29539537",
    343         "created_at": "2021-12-13T13:49:47Z"
    344       }
    345     ],
    346     "top_points": 6,
    347     "total_points": 11,
    348     "total_comments": 1
    349   }
    350 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs