scan.json (24235B)
1 { 2 "paper": { 3 "title": "Automated Discovery of Test Oracles for Database Management Systems Using LLMs", 4 "authors": ["Qiuyang Mang", "Runyuan He", "Suyang Zhong", "Xiaoxuan Liu", "Huanchen Zhang", "Alvin Cheung"], 5 "year": 2025, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2510.06663" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The tool Argus is described but no release link is given." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper does not release the bug reports, generated CAQ pairs, SQL snippet corpus, or any other experimental data. Bug report links to individual DBMS issue trackers are not provided as a dataset." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper states '64 cores and 128 GB memory running on Ubuntu 24.04' (Section 7) but does not provide library versions, dependency lists, requirements.txt, or any reproducible environment specification." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithms are described at a conceptual level but there are no concrete instructions for reproducing the experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper reports point estimates for coverage metrics and bug counts without confidence intervals or error bars. Coverage figures (e.g., '19.9% higher line coverage') and metamorphic coverage ratios are presented without uncertainty quantification." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims Argus outperforms baselines in coverage and bug detection but does not apply any statistical significance tests to these comparisons. Differences are reported as raw ratios (e.g., '5.473x', '6.43x') without tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports effect sizes with baseline context throughout: e.g., '19.9% and 18.1% higher line and branch coverage than SQLancer++' (Section 7.2), '5.473x, 6.431x, and 5.571x higher' metamorphic coverage (Table 3), '3.33x more unique logic bugs' (Section 7.3), with absolute values provided in figures and tables." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "Key parameters like '10 test suites, each with 100 test cases' and '20 bug reports' for the false positive analysis are used without justification for why these sample sizes are sufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "The metamorphic coverage experiment uses 10 test suites and reports averages (Table 3), but no standard deviation, variance, or spread measure is reported. Coverage experiments appear to be single runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares Argus against SQLancer and SQLancer++ for code coverage (Section 7.2), a composite baseline of 11 oracles from four prior works (TLP, NoREC, EET, DQP) for bug finding (Section 7.3), and an LLM-as-a-judge baseline for the prover ablation (Section 7.4)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "SQLancer and SQLancer++ are actively maintained state-of-the-art DBMS testing tools. 
The composite baseline includes oracles from 2020-2024 works (TLP 2020, NoREC 2020, EET 2024, DQP 2024). The LLM baseline uses GPT-5." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper includes multiple ablation-like studies: comparing Argus-50 vs Argus-5000 to evaluate oracle quantity (Section 7.3), evaluating the SQL equivalence prover vs LLM-as-a-judge (Section 7.4), and comparing the CAQ-based approach vs a naive baseline for cost/efficiency (Section 7.5)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The evaluation uses line coverage, branch coverage, metamorphic coverage (line/function/branch), number of unique bugs found, false positive rate, false negative rate, cost, and throughput across different experiments." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "Bug reports were manually adjudicated: 'we then manually adjudicated each report as a true positive or a false positive' (Section 7.4). Additionally, all 40 bugs were reported to DBMS developers, with 36 confirmed and 26 fixed by developers." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a software testing tool evaluation, not a machine learning model evaluation. There is no train/test split concept applicable here." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 2 provides per-DBMS breakdowns of bugs found (Dolt, DuckDB, MySQL, PostgreSQL, TiDB), categorized by bug status (fixed, confirmed, duplicate, pending) and bug type (logic vs other). Coverage results are broken down by DBMS and metric type." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 7.4 discusses false positive and false negative cases in detail, including a specific false positive example (Listing 11). 
Section 8 discusses prover bugs and limitations. The paper acknowledges Argus-50 underperformed the baseline." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports several negative results: Argus-50 found fewer bugs than the human-designed baseline (Section 7.3), the prover has a high false negative rate (8/10 rejections were false negatives, Section 7.4), and Argus slightly underperforms SQLancer on PostgreSQL line coverage (Section 7.2)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims '40 previously unknown bugs, 35 of which are logic bugs, with 36 confirmed and 26 already fixed' — directly supported by Table 2. Claims of coverage improvement ('up to 1.19x in code coverage, and 6.43x in metamorphic coverage') are supported by Section 7.2 and Table 3." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims about why Argus works (the CAQ abstraction enables scalability, the prover ensures soundness) and supports these through controlled ablation studies: removing the prover (Section 7.4), varying oracle count (Section 7.3), comparing CAQ vs naive approach (Section 7.5). The ablation designs involve single-variable manipulation." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 8 (Limitations) explicitly bounds generalization: 'Argus currently focuses on relational DBMS and SQL' and acknowledges the prover 'currently supports only a subset of SQL features.' The scope is clearly limited to the five tested DBMSs." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for why Argus finds more bugs. 
For example, it does not consider whether the improvement is due to the LLM's creativity, the formal verification, the larger number of test cases, or the specific SQL features tested. The discussion section covers limitations but not confounding factors." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper states 'o4-mini' and 'GPT-5' but does not provide specific API versions, snapshot dates, or version identifiers (e.g., no model checkpoint or API version string). These are marketing names without version pinning." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes the prompting strategy conceptually: 'The prompt qualitatively asks the LLM to generate novel queries that are different from the provided examples' (Section 5.2) and mentions 'Documentation-augmentation Generation' (Section 6.1), but the actual prompt text is not provided." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported. Algorithm parameters like StopThreshold (initially 3), MaxDepth, N (number of schemas), and K (test cases per oracle) are mentioned but their experimental values are not all specified." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The agentic scaffolding is described in detail: Algorithms 1 and 2 describe the full pipeline including iterative prompting with in-context learning from Equal/Fail sets, k-means clustering of query plans for example selection, the SQL equivalence prover verification loop, and the corpus synthesis and instantiation process." 
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "The paper documents preprocessing steps: grammar-based seed generation (Section 5.1), runtime validation of LLM-generated snippets (Section 6.1, Listing 4), filtering via determinism/null-preserving/empty-preserving constraints (Section 6.2), and cross-combination of expressions (Section 6.1)." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 8 (Discussion) contains a dedicated 'Limitations and future work' subsection that discusses three specific limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The limitations are specific to this study: (1) the SQL equivalence prover supports only a subset of SQL features, limiting oracle scope; (2) the approach is limited to relational DBMS and SQL; (3) no mechanism for prioritizing or ranking generated oracles. Additionally, prover bugs are discussed as a real encountered issue." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 8 explicitly states scope boundaries: 'Argus currently focuses on relational DBMS and SQL' and 'does not provide a mechanism to prioritize or rank the generated oracles.' The paper also notes that the prover 'does not understand the semantics of most functions, treating them as uninterpreted functions.'" 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw data is available. Bug reports, generated CAQ pairs, snippet corpora, coverage measurements, and experimental logs are not released for independent verification." 
177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The bug collection process is well documented: 'We continuously ran Argus on the five DBMSs for a three-month testing campaign. We used the latest development versions and reported bugs only when they could be reproduced on their latest versions' (Section 7.1). The deduplication policy is also described." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants were involved. The study tests automated tools against DBMSs with no human subjects." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The full pipeline from schema generation through CAQ generation, prover verification, corpus synthesis, instantiation, and bug detection is documented in Algorithms 1 and 2. Section 7.1 describes the bug reporting workflow including deduplication against known issues." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "Section 11 (Acknowledgements) thanks individuals for feedback and support but does not disclose any funding sources, grants, or sponsorships." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly stated: UC Berkeley and National University of Singapore. The paper tests third-party open-source DBMSs, not products by the authors' institutions." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is disclosed, making it impossible to assess funder independence. The absence of a funding disclosure is itself a gap." 
209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement or financial interest disclosure is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. LLMs are used as a generation tool within the Argus pipeline, not evaluated for their standalone performance." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not applicable — the paper tests DBMSs for bugs, not LLM benchmark performance. Whether the LLM has seen SQL in training does not affect the validity of formally verified equivalent query pairs." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not applicable — no benchmark evaluation of model capabilities. The LLM is a tool for generating candidate queries, not the subject of evaluation." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study. The paper evaluates an automated testing tool." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants involved." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants involved." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants involved." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants involved." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants involved." 
263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants involved." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section 7.5 reports detailed cost information: 'generating the CAQ pairs costs about $3 in LLM calls, and instantiating test cases costs roughly $1 per 1,000, leveraging a reusable corpus of 100,000 snippets built for only $12.' Figure 5 shows cumulative cost over time." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "The paper states the machine specification (64 cores, 128 GB memory, Ubuntu 24.04) and testing duration (three-month campaign, 24-hour coverage runs), but does not quantify total compute budget (total CPU/GPU hours, total API spend across all experiments)." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Argus discovered 40 previously unknown bugs across five DBMSs, including 35 logic bugs, with 36 confirmed and 26 fixed by developers.", 286 "evidence": "Table 2 in Section 7.1 provides the full breakdown by DBMS, bug status, and bug type. Individual bug examples are shown in Listings 6-10.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Argus achieves up to 6.43x higher metamorphic coverage than SQLancer on DuckDB.", 291 "evidence": "Table 3 reports average metamorphic coverage across 10 test suites: Argus 17.820% vs SQLancer 3.256% for lines (5.473x), 7.910% vs 1.230% for functions (6.431x), 7.315% vs 1.313% for branches (5.571x).", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Argus's new oracles detect 3.33x more unique logic bugs than the union of prior oracles within 6 hours.", 296 "evidence": "Figure 4 shows Argus-5000 finding 10 unique bugs vs the baseline's 3 bugs on Dolt v1.0.0 over 6 hours. 
Unique bugs were verified using git-bisect to link to fix commits.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "The SQL equivalence prover is necessary to eliminate false positives — all 20 bug reports from the LLM-as-a-judge approach were false positives.", 301 "evidence": "Section 7.4 describes the ablation: 'we performed an ablation that disables the prover while using an LLM-as-a-judge approach, and ran Argus on DuckDB until collecting 20 bug reports. We then manually adjudicated each report... all of them were false positives.'", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Argus is cost-effective: generating CAQ pairs costs ~$3 in LLM calls and instantiating test cases costs ~$1 per 1,000.", 306 "evidence": "Section 7.5 and Figure 5 provide cost breakdowns for a 1-hour run on Dolt, showing the two-phase approach with specific dollar amounts.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval", "case-study"], 311 "key_findings": "Argus automates test oracle discovery for DBMS testing by combining LLM-generated Constrained Abstract Query (CAQ) pairs with formal SQL equivalence verification. Evaluated on five widely used DBMSs (Dolt, DuckDB, MySQL, PostgreSQL, TiDB), it discovered 40 previously unknown bugs (35 of them logic bugs), 36 of which were confirmed by developers. The approach achieves up to 6.43x higher metamorphic coverage than state-of-the-art tools, and the prover ablation shows that the formal prover is essential: with an LLM-as-a-judge in its place, all 20 collected bug reports were false positives.", 312 "red_flags": [ 313 { 314 "flag": "No code or data release", 315 "detail": "Despite describing a complete system (Argus) with detailed algorithms, no source code, generated oracles, snippet corpus, or bug report dataset is released, making independent reproduction impossible." 
316 }, 317 { 318 "flag": "No statistical rigor in comparisons", 319 "detail": "Coverage and bug-finding comparisons report only point estimates without confidence intervals, error bars, or significance tests. The metamorphic coverage experiment averages 10 test suites without reporting variance. The coverage experiments appear to be single runs and so do not account for randomness in the generation process." 320 }, 321 { 322 "flag": "Small sample in key ablation", 323 "detail": "The prover vs LLM-as-a-judge comparison (Section 7.4) uses only 20 bug reports and 10 rejected pairs per configuration. Strong conclusions about false positive/negative rates are drawn from these small samples without statistical justification." 324 }, 325 { 326 "flag": "Prompts and hyperparameters not provided", 327 "detail": "The actual prompts used for LLM-based CAQ generation and snippet synthesis are not provided, only described qualitatively. No LLM hyperparameters (temperature, top-p) are reported, despite these significantly affecting generation quality." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Fuzz4all: Universal fuzzing with large language models", 333 "authors": ["Chunqiu Steven Xia", "Matteo Paltenghi", "Jia Le Tian", "Michael Pradel", "Lingming Zhang"], 334 "year": 2024, 335 "relevance": "Demonstrates LLM-based universal fuzzing for automated testing, directly relevant to LLM-aided software testing methodology." 336 }, 337 { 338 "title": "Whitefox: White-box compiler fuzzing empowered by large language models", 339 "authors": ["Chenyuan Yang", "Yinlin Deng", "Runyu Lu", "Jiayi Yao", "Jiawei Liu", "Reyhaneh Jabbarvand", "Lingming Zhang"], 340 "year": 2024, 341 "relevance": "Uses LLMs for compiler testing with white-box techniques, relevant to LLM-aided testing methodology." 
342 }, 343 { 344 "title": "KernelGPT: Enhanced kernel fuzzing via large language models", 345 "authors": ["Chenyuan Yang", "Zijie Zhao", "Lingming Zhang"], 346 "year": 2025, 347 "relevance": "Applies LLMs to kernel fuzzing, relevant to understanding LLM capabilities in automated testing domains." 348 }, 349 { 350 "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 351 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 352 "year": 2024, 353 "relevance": "Evaluates LLM-based program repair with cost analysis, relevant to LLM-aided software engineering productivity." 354 }, 355 { 356 "title": "AlphaEvolve: A coding agent for scientific and algorithmic discovery", 357 "authors": ["Alexander Novikov"], 358 "year": 2025, 359 "arxiv_id": "2506.13131", 360 "relevance": "LLM-based agent for algorithm discovery, relevant to agentic AI capabilities in code generation." 361 }, 362 { 363 "title": "Enhancing static analysis for practical bug detection: An LLM-integrated approach", 364 "authors": ["Haonan Li", "Yu Hao", "Yizhuo Zhai", "Zhiyun Qian"], 365 "year": 2024, 366 "relevance": "Integrates LLMs with static analysis for bug detection, relevant to LLM-aided software quality." 367 }, 368 { 369 "title": "Testing Database Systems with Large Language Model Synthesized Fragments", 370 "authors": ["Suyang Zhong", "Manuel Rigger"], 371 "year": 2025, 372 "arxiv_id": "2505.02012", 373 "relevance": "Uses LLMs to synthesize SQL fragments for DBMS testing, directly complementary to Argus's approach." 374 }, 375 { 376 "title": "Can large language models discover metamorphic relations? A large-scale empirical study", 377 "authors": ["Jiaming Zhang", "Chang-ai Sun", "Huai Liu", "Sijin Dong"], 378 "year": 2025, 379 "relevance": "Empirically evaluates LLM capability in discovering metamorphic relations for testing, directly related to Argus's test oracle discovery." 
380 }, 381 { 382 "title": "LLM-SQL-Solver: Can LLMs determine SQL equivalence?", 383 "authors": ["Fuheng Zhao", "Lawrence Lim", "Ishtiyaque Ahmad", "Divyakant Agrawal", "Amr El Abbadi"], 384 "year": 2023, 385 "arxiv_id": "2312.10321", 386 "relevance": "Evaluates LLM capability in SQL equivalence determination, the core verification task in Argus's pipeline." 387 }, 388 { 389 "title": "S*: Test time scaling for code generation", 390 "authors": ["Dacheng Li"], 391 "year": 2025, 392 "arxiv_id": "2502.14382", 393 "relevance": "Explores test-time scaling for LLM code generation, relevant to LLM-aided programming capabilities." 394 } 395 ] 396 }