scan.json (24517B)
1 { 2 "paper": { 3 "title": "Analysis and Evaluation of Synthetic Data Generation in Speech Dysfluency Detection", 4 "authors": ["Jinming Zhang", "Xuanru Zhou", "Jiachen Lian", "Shuhe Li", "William Li", "Zoe Ezzes", "Rian Bogley", "Lisa Wauters", "Zachary Miller", "Jet Vonk", "Brittany Morin", "Maria Gorno-Tempini", "Gopala Anumanchipalli"], 5 "year": 2025, 6 "venue": "arXiv (submitted to Interspeech)", 7 "arxiv_id": "2505.22029", 8 "doi": "10.48550/arXiv.2505.22029" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "The abstract states 'All data, models, and code are open-sourced at https://github.com/Berkeley-Speech-Group/LLM-Dys.' A specific GitHub URL is provided." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The abstract explicitly states 'All data, models, and code are open-sourced' and points to the GitHub repository. The LLM-Dys dataset is the main contribution of the paper." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency versions are mentioned in the paper. The paper mentions using VITS, Whisper-large-v3-turbo, and E2-TTS but does not provide setup details." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions are provided in the paper itself. The paper refers to the GitHub repository but does not include a 'Reproducing Results' section or specific commands." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "All results in Tables 2-5 are reported as point estimates (e.g., F1=0.91, Precision=0.95) with no confidence intervals, error bars, or uncertainty measures." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper claims 'state-of-the-art' performance and compares against baselines (Wagner et al., Yolo-Stutter, StutterNet) by simply comparing numbers. No statistical significance tests are reported for any comparison." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper reports raw metric values but does not provide effect sizes such as Cohen's d or relative improvements with baseline context. Comparisons between systems are presented as raw numbers in tables without quantifying the magnitude of improvement." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "Key evaluation sample sizes are mentioned (300 unique utterances per type for LLM-Dys testing, 200 manually annotated samples for SEP-28k, 80/120 for UCLASS) but none are justified or supported by power analysis. The choice of 200 manual annotations from a 28,000-clip dataset is not explained." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. It appears all results are from single runs with no indication of variability across seeds or runs." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper compares against Wagner et al. [11], Yolo-Stutter [14], and StutterNet [1] in Tables 3 and 5. Cross-dataset audio quality comparisons include VCTK-token, VCTK, LibriTTS, and LibriStutter in Fig. 2." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The baselines are contemporary: Wagner et al. (Interspeech 2024), Yolo-Stutter (Interspeech 2024), and VCTK-Token (2024). StutterNet (2024) is also recent. 
These represent the current state of the art in dysfluency detection." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper includes ablation-like studies: scaling law experiments (Section 4.3.3) showing the impact of dataset size, and an analysis of the fluent-to-disfluent speech ratio (Section 4.3.4). However, these focus on data characteristics rather than model components." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper uses multiple metrics: Precision, Recall, F1-score, Accuracy, Token Error Rate (TER), and Token Distance (TD), reported across Tables 2-5." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": false, 84 "justification": "The paper relies on Meta Audiobox Aesthetics (an automated tool) for quality evaluation and on standard automated metrics for detection performance. No human evaluation of the synthetic speech quality or detection results is conducted, despite claims about 'naturalness', which is inherently a subjective quality." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "Testing on SEP-28k uses 200 manually annotated samples separate from training (zero-shot evaluation). UCLASS uses 120 test samples separate from the 80 fine-tuning samples. The LLM-Dys test set uses 300 utterances per type from a designated testing set." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Results are broken down by dysfluency type (Insertion, Repetition, Pause, Prolongation) and by level (word-level vs phoneme-level) across all tables. Figs. 2-3 also show per-type comparisons." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": false, 99 "justification": "No failure cases or error analysis are discussed. The paper does not show examples where the system fails or analyze systematic patterns in errors. Section 4.3.3 notes performance plateaus but does not analyze why."
100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section 4.3.3 reports that 'simply increasing textual data or diversity may not yield additional performance improvements' and shows performance plateaus or slight declines beyond certain dataset sizes. Section 2.2 notes VITS 'shows limitations in synthesizing fillers like um and uh.'" 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims 'state-of-the-art performance' which is supported by Tables 3 and 5 showing improvements over baselines. The claim of 'most comprehensive dysfluent speech corpus' is supported by Table 1 (12,790 hours, 11 categories). The claim about LLM-enhanced quality is supported by Fig. 2." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": false, 116 "justification": "The paper makes causal claims such as 'LLM-Dys achieves superior synthesis quality compared to other text-diversity-constrained simulation corpora' and attributes performance to 'consistent patterns in LLM-generated dysfluencies and our standardized TTS pipeline.' However, no controlled experiments isolate the specific causal factors — the LLM-based text generation, the TTS model choice, and the scale are confounded." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The paper tests only in English with VCTK speakers and specific TTS models, yet the title and abstract make broad claims about 'speech dysfluency detection' without bounding to English or the specific speaker set. The conclusion mentions 'cross-lingual coverage' as future work, implicitly acknowledging this gap, but the main claims are not bounded." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "No alternative explanations are discussed for the observed results. 
The improvement could be due to scale (12,790 hours vs smaller baselines), TTS model choice, LLM text quality, or speaker diversity, but these factors are not disentangled or discussed." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper states 'claude-3-5-sonnet' for text generation and 'Whisper-large-v3-turbo' for detection, but does not provide snapshot dates or API versions for Claude. The VITS and E2-TTS implementations lack version specifics." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper states 'The prompts we used can be found at our open-sourced page' but does not include the actual prompt text in the paper or appendix. Only a reference to the GitHub repository is given." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": false, 143 "justification": "Key hyperparameters are largely unreported. The paper mentions pause durations (0.8-3.5s for word-level, 0.3-1.5s for phoneme-level) and prolongation extension (0.17-0.8s), but does not report LLM temperature/sampling settings, Whisper fine-tuning hyperparameters (learning rate, epochs, batch size), or TTS model parameters." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "No agentic scaffolding is used. The LLM (Claude) is used for single-turn text generation, not as an agent." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 2 describes the data pipeline: LLM text generation (Section 2.1), speech synthesis via VITS/E2-TTS (Section 2.2), special handling for pause and prolongation markers, and speaker variation (109 VCTK speakers). The processing steps from text to final audio are documented." 
154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion briefly mentions future directions (expanding styles, languages, articulatory priors) but does not discuss limitations of the current work." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No specific threats to validity are discussed. There is no mention of the synthetic-to-real domain gap, potential biases in LLM-generated text patterns, or limitations of using automated quality metrics instead of human evaluation." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "No explicit scope boundaries are stated. The paper does not note that results are specific to English, VCTK speakers, or the particular TTS models used. The conclusion mentions future cross-lingual work but does not frame current results as limited to the tested setting." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "The paper states all data are open-sourced at the GitHub repository. The entire LLM-Dys dataset (12,790 hours of synthetic speech) is made available." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 2 describes the full data generation process: LLM-based text generation (Section 2.1), TTS synthesis (Section 2.2), with details on speaker variation, pause/prolongation handling, and statistics (Section 2.3)." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants were recruited. The dataset is entirely synthetic, generated from LLMs and TTS models using existing speech corpora (VCTK)." 
188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The pipeline is documented in Section 2 and Figure 1: LLM generates dysfluent text with labels, text is fed to TTS (VITS or E2-TTS), pause/prolongation markers are processed, and 109 speaker variants are generated per utterance. Table 1 provides counts at each level." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 6 (Acknowledgements) lists funding support: 'UC Noyce Initiative, Society of Hellman Fellows, NIH/NIDCD, and the Schwab Innovation fund.'" 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly listed: Zhejiang University (China), UC Berkeley (USA), and UCSF (USA). The paper does not evaluate a commercial product from any of these institutions." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "The funders (UC Noyce Initiative, NIH/NIDCD, Hellman Fellows, Schwab Innovation fund) are academic/government research funders with no direct financial stake in the dysfluency detection results." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is provided. The paper uses Anthropic's Claude for text generation but does not disclose whether any authors have financial relationships with Anthropic." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper uses Whisper-large-v3-turbo as the base model for fine-tuning and evaluates on SEP-28k and UCLASS benchmarks. No training data cutoff is stated for the Whisper model, making it impossible to assess whether benchmark data could have been in Whisper's pre-training data." 
222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "No discussion of potential train/test overlap. The paper fine-tunes Whisper on synthetic data and tests on real data (SEP-28k, UCLASS), but does not discuss whether Whisper's pre-training data might include speech from VCTK speakers or overlap with test samples." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "SEP-28k (2021) and UCLASS (2009) were both available before Whisper's training. The paper does not address whether Whisper may have been exposed to these datasets during its large-scale pre-training on web audio." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants are involved. The study uses synthetic data generation and automated evaluation." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants. The 'speakers' are from the VCTK corpus, not study participants." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 
269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "No inference cost or latency is reported. The system uses Claude API calls for text generation and runs TTS models for 12,790 hours of synthesis, but no costs are mentioned." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "No computational budget is stated. Generating 12,790 hours of synthetic speech likely required substantial compute, and fine-tuning Whisper-large-v3-turbo is also non-trivial, but neither the GPU hours, training time, nor API costs are reported." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "LLM-Dys is the most comprehensive dysfluent speech corpus with over 10,000 hours covering 11 dysfluency categories.", 287 "evidence": "Table 1 shows 12,790 total hours across 11 types at word and phoneme levels. Comparison with prior datasets in Section 1 shows no prior dataset at this scale.", 288 "supported": "strong" 289 }, 290 { 291 "claim": "LLM-Dys achieves superior synthesis quality compared to other simulation corpora and is comparable to real fluent speech.", 292 "evidence": "Fig. 2 shows Meta Audiobox Aesthetics scores (CE, CU, PQ) where LLM-Dys scores are close to or exceed those of VCTK and LibriTTS (real speech corpora). However, evaluation is entirely automated — no human perceptual evaluation.", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "The system achieves state-of-the-art performance on real-world dysfluency detection benchmarks (SEP-28k and UCLASS).", 297 "evidence": "Table 3 shows higher F1 scores than Wagner et al. and Yolo-Stutter on most dysfluency types for SEP-28k. Table 5 shows improved accuracy (0.971) and F1 (0.977) vs StutterNet (0.932) on UCLASS. 
However, comparisons are incomplete — Yolo-Stutter reports only recall, and the evaluation is zero-shot for SEP-28k.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "Simply increasing textual data or diversity may not yield additional performance improvements unless a high-quality TTS model is employed.", 302 "evidence": "Section 4.3.3 and Fig. 5 show that performance plateaus after 3x4000 samples, with marginal gains or slight declines at 3x12000. However, this finding is limited to the specific TTS model used and the claim about TTS quality is not experimentally verified.", 303 "supported": "weak" 304 }, 305 { 306 "claim": "The optimal fluent-to-disfluent speech ratio is approximately 0.05.", 307 "evidence": "Section 4.3.4 and Fig. 5 (right) show that performance peaks at a ratio of ~0.05. However, this is specific to the SEP-28k disfluency distribution and is not generalizable.", 308 "supported": "moderate" 309 } 310 ], 311 "methodology_tags": ["benchmark-eval"], 312 "key_findings": "The paper introduces LLM-Dys, a large-scale (12,790 hours) synthetic dysfluent speech dataset covering 11 categories at both word and phoneme levels, generated using Claude 3.5 Sonnet for text simulation and VITS/E2-TTS for speech synthesis. The system achieves state-of-the-art performance on SEP-28k and UCLASS benchmarks for dysfluency detection using a fine-tuned Whisper-large-v3-turbo model. Scaling experiments show diminishing returns beyond 3x4000 training samples per type, and the optimal fluent-to-disfluent ratio is approximately 0.05. Quality evaluation via Meta Audiobox Aesthetics shows the synthetic speech approaches real speech quality.", 313 "red_flags": [ 314 { 315 "flag": "No error bars or uncertainty quantification", 316 "detail": "All results across Tables 2-5 are single point estimates with no confidence intervals, standard deviations, or indication of variability across runs.
For a machine learning paper, this makes it impossible to assess whether differences are meaningful." 317 }, 318 { 319 "flag": "No human evaluation of speech quality", 320 "detail": "The paper makes strong claims about 'naturalness' and 'quality' of synthetic speech but relies entirely on an automated metric (Meta Audiobox Aesthetics). Speech naturalness is inherently perceptual and subjective, making human evaluation critical for such claims." 321 }, 322 { 323 "flag": "Incomplete baseline comparisons", 324 "detail": "Comparisons with Yolo-Stutter are recall-only (missing precision and F1). The comparison with Wagner et al. shows only F1. The metrics are not consistent across baselines, making fair comparison difficult." 325 }, 326 { 327 "flag": "Confounded experimental design", 328 "detail": "The improvement over prior work could be due to multiple confounded factors: LLM text generation quality, larger dataset scale (12,790 hours vs. smaller baselines), TTS model choice (VITS/E2-TTS), or speaker diversity (109 speakers). No controlled experiment isolates which factor drives the improvement." 329 }, 330 { 331 "flag": "No limitations section", 332 "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries. Generalization claims are unbounded — results are specific to English with VCTK speakers and particular TTS models, but the paper does not acknowledge this." 333 }, 334 { 335 "flag": "Small manually annotated test sets", 336 "detail": "The SEP-28k evaluation uses only 200 manually annotated samples from a 28,000-clip dataset. The UCLASS evaluation uses 120 test samples with only 80 for fine-tuning. These small sample sizes are not justified and limit the reliability of the reported metrics." 337 } 338 ], 339 "cited_papers": [ 340 { 341 "title": "Large language models for dysfluency detection in stuttered speech", 342 "authors": ["D. Wagner", "S. P. Bayerl", "I. Baumann", "K. Riedhammer", "E. Nöth", "T.
Bocklet"], 343 "year": 2024, 344 "relevance": "Uses LLMs for speech dysfluency detection, directly relevant to LLM capabilities in speech analysis tasks." 345 }, 346 { 347 "title": "Yolo-stutter: End-to-end region-wise speech dysfluency detection", 348 "authors": ["X. Zhou", "A. Kashyap", "S. Li"], 349 "year": 2024, 350 "relevance": "End-to-end dysfluency detection baseline, representative of current approaches in the field." 351 }, 352 { 353 "title": "Ssdm: Scalable speech dysfluency modeling", 354 "authors": ["J. Lian", "X. Zhou", "Z. Ezzes"], 355 "year": 2024, 356 "relevance": "NeurIPS 2024 paper on scalable dysfluency modeling using TTS-based simulation, directly related methodology." 357 }, 358 { 359 "title": "Robust speech recognition via large-scale weak supervision", 360 "authors": ["A. Radford", "J. W. Kim", "T. Xu", "G. Brockman", "C. McLeavey", "I. Sutskever"], 361 "year": 2023, 362 "relevance": "Whisper model used as the base for the detection framework, relevant to LLM/foundation model evaluation methodology." 363 }, 364 { 365 "title": "Scaling laws for neural language models", 366 "authors": ["J. Kaplan", "S. McCandlish", "T. Henighan", "T. B. Brown"], 367 "year": 2020, 368 "arxiv_id": "2001.08361", 369 "relevance": "Referenced for the scaling law experiments; relevant to understanding data scaling in AI systems." 370 }, 371 { 372 "title": "Time and tokens: Benchmarking end-to-end speech dysfluency detection", 373 "authors": ["X. Zhou", "J. Lian", "C. J. Cho"], 374 "year": 2024, 375 "arxiv_id": "2409.13582", 376 "relevance": "VCTK-Token benchmark and token-based detection framework that this paper builds upon." 377 }, 378 { 379 "title": "Meta audiobox aesthetics: Unified automatic quality assessment for speech, music, and sound", 380 "authors": ["A. Tjandra", "Y.-C. Wu", "B. 
Guo"], 381 "year": 2025, 382 "arxiv_id": "2502.05139", 383 "relevance": "Automated audio quality evaluation tool used as the primary quality metric, relevant to evaluation methodology." 384 }, 385 { 386 "title": "F5-tts: A fairytaler that fakes fluent and faithful speech with flow matching", 387 "authors": ["Y. Chen", "Z. Niu", "Z. Ma"], 388 "year": 2024, 389 "arxiv_id": "2410.06885", 390 "relevance": "TTS model variant (E2-TTS) used for word-level insertion synthesis in the pipeline." 391 } 392 ] 393 }