scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27244B)
      1 {
      2   "paper": {
      3     "title": "Featurized-Decomposition Join: Low-Cost Semantic Joins with Guarantees",
      4     "authors": [
      5       "Sepanta Zeighami",
      6       "Shreya Shankar",
      7       "Aditya Parameswaran"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2512.05399",
     12     "doi": "10.48550/arXiv.2512.05399"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval", "theoretical"],
     17   "key_findings": "FDJ reduces semantic join costs by up to 10x over BARGAIN by automatically constructing featurized decompositions — logical expressions in CNF that extract and compare features from text records instead of relying on embedding similarity. The method provides statistical guarantees on precision and recall. The approach is most effective when join conditions can be decomposed into discrete features (e.g., names, dates, locations), but offers limited gains on classification-style joins where the mapping is complex. The paper also proves that finding the minimum-cost featurized decomposition is NP-hard and provides novel statistical results for multi-dimensional threshold selection.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper states 'Detailed prompts for all our LLM powered functions are available in our source code' (Appendix I) but provides no repository URL or archive link in the paper."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Most datasets used are publicly available: Products [33,42], BioDEX [14,46], Citations [38], Categorize [40,56], IMDB [23]. The paper references these public sources directly."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper mentions using OpenAI models but gives no environment specification."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided in the paper."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Main results in Table 3 and Figures 7-10 report point estimates only. No confidence intervals or error bars on cost ratios or recall values."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims FDJ 'improves over BARGAIN across all datasets' (Sec 8.2) based on comparing numbers in Table 3 without any statistical tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Table 3 reports cost ratios with reduction factors in parentheses (e.g., '6.80 (0.24×)' for Citations, '6.70 (0.09×)' for Movies), providing both absolute and relative magnitudes with baseline context."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper justifies system parameters (250 positive pairs, 50 for featurizations) but provides no power analysis or justification for the number of datasets or experimental runs in the evaluation."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Table 2 shows 'Avg. % Failed' implying multiple runs, and Sec 8.1 mentions 'every run' for LOTUS. However, no standard deviations, IQR, or spread measures are reported for any results."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares against BARGAIN [65] (state-of-the-art model cascade), optimal cascade (oracle lower bound), and discusses LOTUS [46] (excluded for failing guarantees). See Table 3."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "BARGAIN [65] is from SIGMOD 2026 (to appear), LOTUS [46] from 2024. These represent the current state-of-the-art for semantic joins with guarantees."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "Fig 9 provides cost breakdowns and Sec 8.3-8.4 study parameter sensitivity. However, no ablation removes or modifies individual FDJ components (e.g., iterative feature generation, CNF formulation, threshold selection) to measure their contribution."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper reports cost ratio, observed recall, and failure rate (Table 2). Precision is guaranteed at 100% by design. Cost breakdowns are provided (Fig 9)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of join quality. All evaluation uses ground-truth labels: 'instead of actually performing the LLM call, we simulate it by returning the known ground-truth' (Sec 8.1)."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "For Products: 'we evaluate the join condition on the test set and use the training set to create the featurized decomposition' (Sec 8.1). For other datasets, FDJ uses internal samples S and S' (Alg. 6) with fresh samples for threshold setting, evaluating on the full cross product."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 3 provides per-dataset results. Sec 8.2 categorizes datasets into three types with analysis of each. Fig 9 shows per-component cost breakdowns. Sec 8.4 provides per-characteristic analysis."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Sec 8.2 discusses where FDJ provides limited gains: 'Categorize and BioDEX are classification tasks. In such datasets the join relationship may not be easily decomposable into a set of featurizations.' On BioDEX, FDJ's cost is 0.99× that of BARGAIN, i.e., essentially no improvement."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "BioDEX shows FDJ at 73.8 vs BARGAIN at 73.9 (a 0.99× cost ratio — essentially no improvement). LOTUS is shown to miss its recall target on every run (Table 2). On Categorize, FDJ's cost is 0.83× that of BARGAIN — only a modest reduction."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims 'up to 10 times reduction in cost' — supported by Table 3 (Movies: 6.70 vs 69.9 = ~10x). 'providing the same quality guarantees' — supported by Theorem 7.1 and Table 2 empirical validation."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper's causal claims (FDJ reduces cost) are supported by controlled experimental comparison with baselines on 6 datasets, theoretical analysis with proofs (Theorems 4.2, 6.1, 7.1), and systematic analysis of data characteristics (Sec 8.4)."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper bounds claims: 'up to 10 times' (not always), discusses when gains are limited (classification tasks, Sec 8.2), and systematically analyzes factors affecting performance (number of attributes, text length, Sec 8.4)."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper explains why FDJ works and why embeddings fail but does not consider alternative explanations for observed results, such as whether dataset-specific tuning or the choice of LLM/embedding model could explain the performance differences."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures cost ratio and recall/precision directly — these exactly match the claimed contributions (cost reduction with quality guarantees). No proxy gap between measurement and claims."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper specifies 'GPT-4.1 for feature extraction and performing join, O3 for generating candidate featurizations; and text-embedding-large as the embedding model' (Sec 8.1) but provides no snapshot dates or API versions."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Appendix I provides summaries of prompts for LLM functions, but the paper states 'our actual prompts include additional content for passing examples and defining output format and instruction' — the full prompts are only in the source code (no URL provided)."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "System parameters are reported (250 positive pairs, γ=0.05, T_R=90%, T_P=100%, δ=10%, β=0) but LLM hyperparameters (temperature, top-p, max tokens) for GPT-4.1 and O3 are not stated."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. FDJ is an algorithmic pipeline, not an agent-based system."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Sec 8.1 describes dataset preparation: subsampling procedures for BioDEX/Categorize/Citations ('sampled 20,000 ground-truth pairs'), Movies dataset creation ('crawling Wikipedia pages'), Products train/test setup, and synthetic data generation (Sec 8.4)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No dedicated limitations section. The paper proceeds from experiments (Sec 8) to related work (Sec 9) to conclusion (Sec 10) without discussing limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats-to-validity discussion. No mention of threats specific to this study's design or evaluation methodology."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The paper does not explicitly state what the results do NOT show. While Sec 8.2 discusses dataset categories where FDJ is less effective, there are no explicit scope boundary statements."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "Most source datasets are public, but the specific subsamples, extracted features, and experimental artifacts used in evaluation are not released for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Each dataset's source and preparation is described in Sec 8.1: original references for public datasets, subsampling procedures, Movies dataset creation from IMDB/Wikipedia, and Police Records from the Police Records Access Project [1]."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard public benchmarks and established datasets."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The paper documents data preparation: subsampling 20,000 ground-truth pairs from larger datasets, creating join prompts (Appendix I), cost simulation methodology, and synthetic data generation process (Sec 8.4)."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding information or acknowledgments section appears in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All three authors list UC Berkeley affiliations and email addresses. They are not employees of OpenAI (whose models are used in evaluation)."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, so independence of funder cannot be verified."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial disclosure statement appears in the paper. All three authors are also the authors of BARGAIN [65], the primary baseline system."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper evaluates a system (FDJ) for optimizing semantic joins, not a pre-trained model's capability on benchmarks. LLM calls are simulated using ground-truth labels."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The paper does not evaluate a pre-trained model's knowledge on benchmarks. The evaluation measures FDJ system performance using simulated LLM calls."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Benchmark contamination is not applicable — the evaluation tests the FDJ algorithm's cost-quality tradeoff, not whether the LLM has seen the test data."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Cost ratio is the primary metric throughout: Table 3 reports cost ratios for all datasets, Fig 9 shows cost breakdowns, and Sec 2 defines the cost model based on monetary cost of LLM/embedding tokens."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "The paper reports relative cost ratios but does not state absolute compute budgets (total API spend, GPU hours). Prop E.1 gives cost complexity in terms of tokens but no actual numbers."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Main results in Table 3 are single-run numbers. The algorithm involves random sampling (Alg. 6, Lines 1 and 5) but sensitivity to randomness is not reported."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Table 2 reports 'Avg. % Failed' and the text mentions 'every run' for LOTUS, implying multiple runs. But the exact number of runs is never stated."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "System parameters (k, k', γ, β) are set to fixed values justified by reasoning, but no hyperparameter search was conducted and no search budget is reported."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "System parameters are justified: '250 positive pairs' based on statistical significance needs, γ=0.05 'to avoid adding featurizations that have marginal difference' (Sec 6.2), '50 positive samples' based on empirical observation that LLMs stop generating new featurizations (Appx E)."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors evaluate their own system (FDJ) and implement the baselines. The primary baseline, BARGAIN [65], is prior work by the same three authors, and LOTUS [46] is excluded from the main comparison. No discussion of author-evaluation bias."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Fig 8 shows cost ratio vs target recall across datasets — performance as a function of quality target, which is the cost-quality tradeoff. Fig 9 breaks down costs. Methods are compared at matched quality targets."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper categorizes datasets into three types (Sec 8.2) but does not formally discuss whether the benchmarks are representative of real-world semantic join workloads or whether the evaluation setup (simulated LLM calls) affects validity."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. FDJ is an algorithmic pipeline, not an agent-based system with scaffolding."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "LLM calls are simulated with ground truth (Sec 8.1), but the feature extraction and featurization generation steps use real GPT-4.1/O3 calls. No discussion of whether the LLM's training data could affect feature extraction quality on these public datasets."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks information. The featurization generation process uses labeled samples from the same dataset being evaluated on, but the statistical framework is designed to account for this. No explicit discussion of feature leakage."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Samples for construction (S, S') are drawn from the same cross product being evaluated on. While the statistical guarantees account for this, independence between construction and evaluation data is not explicitly discussed."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is applied. The paper relies on its statistical framework for guarantees but does not use canary strings, membership inference, or decontamination."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "FDJ provides up to 10x cost reduction over BARGAIN, the state-of-the-art model cascade method with statistical guarantees.",
    369       "evidence": "Table 3: On the Citations dataset, FDJ costs 6.80 vs BARGAIN's 28.0 (0.24×). On Movies, 6.70 vs 69.9 (0.09×, approximately 10x). Averaged across all 6 datasets, FDJ's cost is ~50% of BARGAIN's.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "FDJ provides up to 8x cost reduction over the optimal cascade, which represents a lower bound on any cascade-based approach.",
    374       "evidence": "Table 3: On Movies dataset, FDJ costs 6.70 vs optimal cascade's 52.5 (approximately 8x). Optimal cascade cheats by using all ground-truth pairs to set thresholds.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Finding the minimum-cost featurized decomposition (MCFD) is NP-hard.",
    379       "evidence": "Theorem 4.2 with full proof in Appendix H.1 via reduction from Set Cover.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "FDJ provides statistical guarantees on recall: P(R(Y) < T) ≤ δ.",
    384       "evidence": "Theorem 7.1 with proof via Theorem 6.1 and Lemma 6.2. Empirically validated in Table 2: FDJ meets 90% recall target with only 7% failure rate (target δ=10%).",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Embedding-based approaches fail when text records contain multiple attribute values or irrelevant information.",
    389       "evidence": "Fig 10: Optimal cascade cost increases from ~0 to ~0.5 as number of persons mentioned grows from 1 to 5. Cost jumps from ~0.1 to ~0.4 when just 2 additional sentences are added to text.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "LOTUS fails to provide statistical guarantees in practice, missing the 90% recall target on every run.",
    394       "evidence": "Table 2: LOTUS has 100% failure rate and average recall of only 75.4% at T_R=90%. This comparison is limited to the BioDEX dataset.",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No error bars on main results",
    401       "detail": "Table 3, the central results table, reports single-point cost ratios with no uncertainty quantification. The FDJ algorithm involves random sampling (Alg 6 Lines 1, 5) which should produce variance across runs, but this is not reported."
    402     },
    403     {
    404       "flag": "LLM calls simulated with ground truth",
    405       "detail": "Sec 8.1: 'for every invocation of Lp, instead of actually performing the LLM call, we simulate it by returning the known ground-truth and calculate the cost by creating the prompt.' This means the evaluation does not capture real LLM failure modes (hallucinations, inconsistencies) that could affect FDJ's performance in practice."
    406     },
    407     {
    408       "flag": "Main comparison limited to the authors' own prior baseline",
    409       "detail": "LOTUS [46] is excluded from the main experiments. While the exclusion is justified (LOTUS fails guarantees per Table 2), this means the only non-oracle baseline in the main evaluation is BARGAIN [65], which was written by the same three authors as this paper."
    410     },
    411     {
    412       "flag": "No limitations section",
    413       "detail": "The paper has no dedicated limitations section discussing threats to validity, boundary conditions, or potential failure modes beyond what is shown in the experiments."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Lotus: Enabling semantic queries with llms over tables of unstructured and structured data",
    419       "authors": ["Liana Patel", "Siddharth Jha", "Carlos Guestrin", "Matei Zaharia"],
    420       "year": 2024,
    421       "arxiv_id": "2407.11418",
    422       "relevance": "Core baseline system for LLM-powered semantic queries including joins, directly compared against in this paper."
    423     },
    424     {
    425       "title": "Cut Costs, Not Accuracy: LLM-Powered Data Processing with Guarantees",
    426       "authors": ["Sepanta Zeighami", "Shreya Shankar", "Aditya Parameswaran"],
    427       "year": 2026,
    428       "relevance": "State-of-the-art model cascade method (BARGAIN) with statistical guarantees for LLM-powered data processing, primary baseline."
    429     },
    430     {
    431       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    432       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    433       "year": 2023,
    434       "arxiv_id": "2305.05176",
    435       "relevance": "Proposes model cascade framework for cost-efficient LLM usage, foundational work for the cascade approach that FDJ builds upon."
    436     },
    437     {
    438       "title": "DocETL: Agentic Query Rewriting and Evaluation for Complex Document Processing",
    439       "authors": ["Shreya Shankar", "Tristan Chambers", "Tarak Shah", "Aditya G Parameswaran", "Eugene Wu"],
    440       "year": 2024,
    441       "arxiv_id": "2410.12189",
    442       "relevance": "Agentic system for processing unstructured documents with LLMs, supports semantic operations including joins."
    443     },
    444     {
    445       "title": "Approximate Selection with Guarantees using Proxies",
    446       "authors": ["Daniel Kang", "Edward Gan", "Peter Bailis", "Tatsunori Hashimoto", "Matei Zaharia"],
    447       "year": 2020,
    448       "relevance": "SUPG framework providing statistical guarantees for proxy-based selection, theoretical foundation that FDJ generalizes to higher dimensions."
    449     },
    450     {
    451       "title": "Can foundation models wrangle your data?",
    452       "authors": ["Avanika Narayan", "Ines Chami", "Laurel Orr", "Simran Arora", "Christopher Ré"],
    453       "year": 2022,
    454       "arxiv_id": "2205.09911",
    455       "relevance": "Evaluates foundation models for data wrangling tasks including entity matching, relevant to the LLM-powered data processing paradigm."
    456     },
    457     {
    458       "title": "ThriftLLM: On Cost-Effective Selection of Large Language Models for Classification Queries",
    459       "authors": ["Keke Huang", "Yimin Shi", "Dujian Ding"],
    460       "year": 2025,
    461       "arxiv_id": "2501.04901",
    462       "relevance": "Cost-effective LLM selection via ensembling, related approach to reducing LLM-powered data processing costs."
    463     },
    464     {
    465       "title": "SMART: Automatically Scaling Down Language Models with Accuracy Guarantees for Reduced Processing Fees",
    466       "authors": ["Saehan Jo", "Immanuel Trummer"],
    467       "year": 2024,
    468       "arxiv_id": "2403.13835",
    469       "relevance": "Model profiling for cost reduction with accuracy guarantees in LLM-powered data processing."
    470     },
    471     {
    472       "title": "LLM-Powered Proactive Data Systems",
    473       "authors": ["Sepanta Zeighami", "Yiming Lin", "Shreya Shankar", "Aditya Parameswaran"],
    474       "year": 2025,
    475       "arxiv_id": "2502.13016",
    476       "relevance": "Broader vision for LLM-powered data systems that includes semantic joins as a core operation."
    477     },
    478     {
    479       "title": "A declarative system for optimizing AI workloads",
    480       "authors": ["Chunwei Liu", "Matthew Russo", "Michael Cafarella"],
    481       "year": 2024,
    482       "arxiv_id": "2405.14696",
    483       "relevance": "System for optimizing LLM-powered data processing workflows, related approach to cost-efficient AI data management."
    484     }
    485   ]
    486 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs