ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20739B)


      1 {
      2   "paper": {
      3     "title": "Cracking SQL Barriers: An LLM-based Dialect Translation System",
      4     "authors": ["Wei Zhou", "Yuyang Gao", "Xuanhe Zhou", "Guoliang Li"],
      5     "year": 2025,
      6     "venue": "Proc. ACM Manag. Data (SIGMOD)",
      7     "doi": "10.1145/3725278"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/weAIDB/CrackSQL. The abstract states 'The code is available at https://github.com/weAIDB/CrackSQL.'"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Section 7.2 states 'The evaluation set and the translation result have been released at https://github.com/weAIDB/CrackSQL.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Hardware is listed (Xeon E5-2678, RTX 3080 Ti) and database versions (MySQL v8.0, PostgreSQL v14, Oracle 11g), but no requirements.txt, Dockerfile, or detailed software dependency listing is provided in the paper."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper itself. The GitHub repository is referenced but the paper does not include a 'Reproducing Results' section or specific commands."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results in Tables 1-7 are reported as point estimates (e.g., '74.19%') with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CrackSQL outperforms baselines based solely on comparing percentage values. No statistical significance tests are reported."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Percentage improvements with baseline context are provided throughout, e.g., '3.22%-21.95% result accuracy improvement than SQLGlot' and absolute accuracy values are given for all methods in Table 1."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The dataset contains 248, 142, and 111 query pairs for three translation directions. No justification is given for why these sizes are sufficient, nor is any power analysis discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run numbers."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 7.1 lists baselines: SQLGlot, jOOQ, Ora2Pg, SQLines, GPT-4o, CodeLlama-7B, and Llama3.1-8B-Instruct."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include GPT-4o (2024) and Llama3.1 (2024), which are contemporary. Rule-based tools like SQLGlot are actively maintained and represent the state of the art for rule-based translation."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 7.5 provides ablation studies across all three main components: Functionality-based Query Processing (Table 2), Cross-Dialect Embedding Model (Table 4), and Local-To-Global Translation validation mechanisms (Table 2)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Two main metrics are used: Executable Ratio (AccEX) and Result Accuracy (AccRES), plus Retrieval Precision for the embedding model."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of translation quality is included. Evaluation is entirely automated via execution and result comparison against ground truth."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 7.5.4 states 'None of the samples in the evaluation set is included in the training datasets' for the fine-tuning experiments. The evaluation set is separate from training data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Figure 7 provides error distributions broken down by translation type (Syntax Rules, Functions & Operators, Keywords, Data Types, Others). Table 1 shows per-dialect-pair breakdowns."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 7.4 discusses translation error distributions in detail, and Table 3 provides specific examples of translations that different tools fail on. Section 7.3 discusses specific failure cases like incorrect TO_TIMESTAMP translation."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 7.5.4 reports that fine-tuning Llama3.1-8B can hurt performance: 'Llama3.1-8B after fine-tuning (i.e., 38.10% accuracy on average) even performs worse over MySQL→PG data samples than the original version (i.e., 54.76% accuracy on average).'"
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims CrackSQL 'significantly outperforms existing methods (e.g., by up to 77.42%)'. Table 1 supports this with improvements up to 77.42% in some scenarios."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about component contributions are supported by ablation studies (Table 2) that use controlled single-variable manipulation (removing one component at a time)."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper bounds scope to MySQL, PostgreSQL, and Oracle, and Section 9 explicitly states limitations: 'translation types supported by CrackSQL are constrained by the available BNF grammar' and notes issues with new dialects that have incomplete documents."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for why CrackSQL outperforms baselines. For example, whether the improvement comes primarily from additional LLM calls (more compute) vs. the proposed techniques is not analyzed. No threats-to-validity section exists."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper says 'GPT-4o' without specifying an API version or snapshot date. CodeLlama-7B and Llama3.1-8B-Instruct link to HuggingFace model pages but no specific version/checkpoint is given."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section 7.1 states 'The detailed LLM instruction prompts are provided at https://github.com/weAIDB/CrackSQL' with a footnote link. This is an external repository with actual prompts."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 7.1 reports hyperparameters: k=3 for retrieved specifications, LoRA for fine-tuning, learning rate 1e-4, training epoch 50. Embedding model components (StarEncoder, bge-large-en-v1.5, etc.) are specified."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The agentic scaffolding (iterative LLM translation with validation, operation extension, retry logic) is described in detail in Section 6 and Algorithm 1, including the Local-to-Global strategy with syntactic and semantic validation steps."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 7.2 describes data collection from three sources (NL2SQL benchmarks, GitHub repos, StackExchange), de-duplication of query templates, and LLM-assisted extraction from StackExchange. Dataset statistics are provided."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 9 (Conclusion) contains a substantive limitations discussion: 'Despite its effectiveness, CrackSQL has three main limitations' covering BNF grammar constraints, precision loss issues, and human intervention for new dialects."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations in Section 9 are specific to this system: BNF grammar constraints limiting translation types, precision loss issues (DOUBLE vs. DOUBLE PRECISION), and dependency on well-structured dialect documentation."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 2.2 explicitly states scope: 'we mainly consider the common translation types found in existing dialect translation tools, i.e., translating (1) syntax rules, (2) keywords, (3) built-in functions and operators, (4) data types.' Section 9 notes UDFs and stored procedures are excluded."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The evaluation set and translation results are released at the GitHub repository (Section 7.2 footnote 9)."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 7.2 describes three data sources: BIRD NL2SQL benchmark, GitHub repositories (SQLGlot, jOOQ issues and test cases), and StackExchange dumps. Collection methods for each are described."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants in this study. Data comes from publicly available benchmarks, code repositories, and online sources."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 7.2 describes the pipeline: collecting from three sources, de-duplicating query templates, expanding to Oracle, and final dataset statistics (248, 142, 111 query pairs). Section 4.1 documents syntax specification extraction."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section lists funding: National Key R&D Program of China, NSF of China, Shenzhen Project, Zhongguancun Lab, Huawei, and BNRist."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Shanghai Jiao Tong University and Tsinghua University. These are academic institutions, not product vendors being evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funders include Chinese government research programs and Huawei. The paper evaluates open-source tools (SQLGlot, jOOQ) and general LLMs (GPT-4o, Llama). Huawei does not have a direct stake in the CrackSQL system's performance vs. these baselines."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "GPT-4o, CodeLlama, and Llama3.1 are used for translation, but no training data cutoff dates are stated. The benchmark queries could potentially be in LLM training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether LLMs may have seen the benchmark queries (from BIRD, StackExchange, SQLGlot test cases) during pre-training."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The benchmark draws from public sources (BIRD benchmark published 2023, StackExchange, SQLGlot repo) that could be in GPT-4o's training data. This contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "CrackSQL makes multiple LLM API calls (up to 3-7 trials per operation) but no API costs, token counts, or per-query latency figures are reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is listed but no total GPU hours, API spend, or training time for the embedding model is reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CrackSQL significantly outperforms existing methods by up to 77.42% in translation accuracy.",
    286       "evidence": "Table 1 shows AccRES improvements: 3.22%-21.95% over SQLGlot, 3.22%-39.69% over jOOQ, 9.75%-64.51% over SQLines, 7.69%-34.93% over GPT-4o across six dialect translation scenarios.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Query Segmentation significantly enhances translation accuracy by 48.91% on average.",
    291       "evidence": "Table 2 ablation study comparing CrackSQL with and without Query Segmentation on Oracle→PG and Oracle→MySQL scenarios.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "The Cross-Dialect Embedding Model achieves 86.34% higher retrieval precision than baselines on average.",
    296       "evidence": "Table 4 shows retrieval precision across PG→MySQL, MySQL→PG, Oracle→PG at k=1,3,5 compared against BM25, StarEncoder, MiniLM, stella, and MultiEmbed.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "CrackSQL is robust across different underlying LLMs.",
    301       "evidence": "Table 5 shows CrackSQL improves all tested LLMs: 6.67%-13.33% for GPT-4o, 15.39%-20% for Llama3.1, 23.08%-50% for CodeLlama. CrackSQL(Llama3.1) achieves similar performance to CrackSQL(GPT-4o).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Fine-tuning LLMs can negatively affect translation generalizability.",
    306       "evidence": "Table 6 shows Llama3.1-8B after fine-tuning achieves 38.10% accuracy on average for MySQL→PG, worse than the original 54.76%. Section 7.5.4 discusses this.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CrackSQL is an LLM-based SQL dialect translation system that outperforms rule-based tools (SQLGlot, jOOQ, SQLines) and direct LLM use (GPT-4o) across six translation scenarios between MySQL, PostgreSQL, and Oracle. The system uses three components: functionality-based query processing to segment and simplify queries, a cross-dialect embedding model for matching equivalent syntax across dialects, and a local-to-global translation strategy with hybrid validation. Ablation studies show query segmentation provides the largest accuracy gain (~49%), and the system works with different underlying LLMs including smaller open-source models.",
    312   "red_flags": [
    313     {
    314       "flag": "No statistical significance tests",
    315       "detail": "All comparisons between CrackSQL and baselines are based on raw percentage differences with no significance tests, confidence intervals, or variance across runs. With dataset sizes of 111-248 query pairs, observed differences may not be statistically significant."
    316     },
    317     {
    318       "flag": "Benchmark contamination risk",
    319       "detail": "The evaluation dataset draws from public sources (BIRD benchmark, StackExchange, SQLGlot test cases) that may appear in GPT-4o's training data. This could affect both the baseline GPT-4o results and CrackSQL's own performance since it uses GPT-4o internally. This is not discussed."
    320     },
    321     {
    322       "flag": "No cost analysis",
    323       "detail": "CrackSQL makes multiple LLM API calls per query (up to 7 trials) plus embedding model inference, but no cost or latency comparison is provided. This makes practical deployment assessment impossible."
    324     },
    325     {
    326       "flag": "Self-evaluation by authors' own system",
    327       "detail": "The benchmark dataset was partially constructed by the authors (6 human months), and Oracle translations were expanded by 'verify and collect the successfully translated queries.' The selection process could inadvertently favor CrackSQL's capabilities."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Can LLM Already Serve as A Database Interface? A BIg Bench for Large-Scale Database Grounded Text-to-SQLs",
    333       "authors": ["Jinyang Li", "Binyuan Hui", "Ge Qu"],
    334       "year": 2023,
    335       "relevance": "BIRD benchmark used as a data source for the dialect translation evaluation dataset."
    336     },
    337     {
    338       "title": "Lost in Translation: A Study of Bugs Introduced by Large Language Models while Translating Code",
    339       "authors": ["Rangeet Pan", "Ali Reza Ibrahimzada", "Rahul Krishna"],
    340       "year": 2024,
    341       "relevance": "Studies bugs introduced by LLMs during code translation, directly relevant to LLM reliability in code generation."
    342     },
    343     {
    344       "title": "StarCoder: may the source be with you!",
    345       "authors": ["Raymond Li", "Loubna Ben Allal"],
    346       "year": 2023,
    347       "relevance": "StarEncoder used as the syntax structure encoder in CrackSQL; relevant to code LLM capabilities."
    348     },
    349     {
    350       "title": "Large language models: A survey",
    351       "authors": ["Shervin Minaee", "Tomas Mikolov"],
    352       "year": 2024,
    353       "relevance": "Survey of LLM capabilities cited to motivate using LLMs for SQL translation tasks."
    354     },
    355     {
    356       "title": "LLM-R2: A Large Language Model Enhanced Rule-based Rewrite System for Boosting Query Efficiency",
    357       "authors": ["Zhaodonghui Li", "Haitao Yuan"],
    358       "year": 2024,
    359       "relevance": "Related work on using LLMs to enhance rule-based database systems, closely related to the LLM-augmented approach."
    360     },
    361     {
    362       "title": "Query Rewriting via Large Language Models",
    363       "authors": ["Jie Liu", "Barzan Mozafari"],
    364       "year": 2024,
    365       "relevance": "LLM-based query rewriting via iterative self-reflection, related approach to using LLMs for SQL transformation."
    366     },
    367     {
    368       "title": "D-Bot: Database Diagnosis System using Large Language Models",
    369       "authors": ["Xuanhe Zhou", "Guoliang Li"],
    370       "year": 2024,
    371       "relevance": "LLM-based database diagnosis system, part of the broader trend of using LLMs for database tasks."
    372     },
    373     {
    374       "title": "GPTuner: A Manual-Reading Database Tuning System via GPT-Guided Bayesian Optimization",
    375       "authors": ["Jiale Lao", "Yibo Wang"],
    376       "year": 2024,
    377       "relevance": "LLM-augmented database tuning system, relevant to LLM applications in database management."
    378     }
    379   ]
    380 }

Impressum · Datenschutz