scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30117B)
      1 {
      2   "paper": {
      3     "title": "Improving Robustness of LLM-based Speech Synthesis by Learning Monotonic Alignment",
      4     "authors": [
      5       "Paarth Neekhara",
      6       "Shehzeen Hussain",
      7       "Subhankar Ghosh",
      8       "Jason Li",
      9       "Rafael Valle",
     10       "Rohan Badlani",
     11       "Boris Ginsburg"
     12     ],
     13     "year": 2024,
     14     "venue": "Interspeech",
     15     "arxiv_id": "2406.17957",
     16     "doi": "10.48550/arXiv.2406.17957"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "An encoder-decoder T5-based TTS model with guided cross-attention alignment learning significantly reduces character error rate from 9.03% to 3.92% on challenging texts with repeating words. Cross-attention heads in LLM-based TTS models implicitly learn text-speech alignment during next-token prediction training, and this alignment can be made more robust through a combination of beta-binomial attention priors and CTC-based alignment loss without adding new parameters. Finite Scalar Quantization (FSQ) spectral codecs outperform RVQ-based codecs (Encodec, Dac) for TTS quality and enable parallel codebook prediction.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No source code repository is provided. The paper links to audio examples at https://t5tts.github.io/ but does not release any implementation code."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Training uses three public datasets (LibriTTS, HiFiTTS, LibriVox MLS) but also a 'proprietary, 2-speaker, 63 hour dataset' that is not released. The 100 challenging texts used for the key evaluation in Table 2 are also not provided."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions 32 NVIDIA A100 GPUs and AdamW optimizer but does not provide a requirements.txt, Dockerfile, or detailed software environment specification with library versions."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided. The paper describes the method but does not include scripts or a README for reproducing experiments."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "MOS scores in Table 2 include 95% confidence intervals (e.g., '4.06 ± 0.038'). However, the primary objective metrics (CER and WER for intelligibility, SSIM for speaker similarity) in Tables 1-3 are reported as point estimates without error bars."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No statistical significance tests are performed. All comparisons between models are based on point estimates (e.g., CER 9.03% vs 3.92%) without any test of statistical significance."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Effect sizes are reported with baseline context. Table 2 shows CER reduction from 9.03% (no alignment) to 3.92% (with alignment), and comparison against baselines like VALL-E-X (8.37% CER) and SpeechT5 (6.14% CER), providing sufficient context to judge magnitude."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The evaluation uses 200 holdout utterances for seen speakers and 200 for unseen speakers, 100 challenging texts, and 100 Harvard Sentences, but no justification is given for these sample sizes."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance across experimental runs is reported. Tables 1-3 show single-run results. The MOS confidence intervals reflect inter-listener variance, not experimental variance across model training runs."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 2 compares against three prior LLM-based TTS models: VALL-E-X, Bark, and SpeechT5. Table 1 includes ablation variants as internal baselines."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include VALL-E-X (2023), Bark (2023), and SpeechT5 (2022), all reasonably contemporary for a 2024 paper in the LLM-based TTS space."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Three ablation variants are compared in Tables 1-2: (1) No Prior, No L_align, (2) With Prior, No L_align, (3) With Prior, With L_align. Table 3 ablates across three codec choices."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Four metrics are used: Character Error Rate (CER), Word Error Rate (WER), Speaker Similarity (SSIM), and Mean Opinion Score (MOS). Table 2 additionally reports character insertions, deletions, and substitutions."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "MOS evaluation is conducted on Amazon Mechanical Turk using 100 Harvard Sentences. 'Each audio is rated by at least 10 independent listeners' on a 1-5 scale, totaling 2000 evaluations per model (Section 4.2)."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Seen-speaker evaluation uses '200 holdout utterances of the train-clean-360 set.' Unseen-speaker evaluation uses VCTK speakers not in the training data (Section 4.2)."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down by: seen vs unseen speakers (Table 1), context location (encoder vs decoder), codec choice (Table 3), and standard vs challenging texts (Tables 1 vs 2)."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper discusses failure modes: 'Without the prior and with L_align, we obtain monotonic but unaligned attention maps, leading to no speech synthesis' (Section 4.2). The introduction discusses hallucination failures (repeating words, missing words, looping, infinite silences)."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The model trained with L_align but without the prior produced 'no speech synthesis' (Section 4.2). Dac codec yielded worse CER/WER than spectral codec (Table 3). Encoder context yielded lower speaker similarity for unseen speakers than decoder context (Table 1)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims the technique 'significantly improves robustness of LLM-based TTS models.' Table 2 supports this with CER reduction from 9.03% to 3.92% and outperformance of prior models. The claim about cross-attention heads learning alignment is supported by Figure 1."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The main causal claim is that attention prior and alignment loss improve robustness. This is supported by controlled ablation: three model variants differ only in the alignment learning components, trained on the same data with the same architecture. The ablation design (single-variable manipulation) adequately supports the causal claim."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title claims improvement for 'LLM-based Speech Synthesis' broadly, but experiments are conducted only on English, with a single encoder-decoder architecture (T5), and specific codecs. Decoder-only LLMs (GPT-style) are not tested despite being mentioned. The generalization to other languages, architectures, and settings is not bounded."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No alternative explanations for the results are discussed. For example, the improvement could partly stem from the pretrained T5 initialization interacting differently with the alignment constraints, or the baselines may perform differently with more training data. These possibilities are not considered."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper uses CER/WER from an ASR model as a proxy for intelligibility, SSIM for speaker similarity, and MOS for naturalness. These are standard, well-established proxies in speech synthesis, and the paper's claims match the granularity of measurements (e.g., 'reduces Character Error Rate' rather than vague claims about quality)."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The model architecture is fully specified: 12 layers, 12 attention heads, 768 embedding dimension, 4096 FFN dimension, 220M parameters (Section 4.1). Evaluation models are linked to specific HuggingFace checkpoints (conformer_transducer_large for ASR, wavlm-base-plus-sv for speaker verification). Baselines use released checkpoints [19, 36, 37]."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The task prompts are provided: 'Phoneme TTS' or 'Text to Speech' prepended to input text (Section 3.1). The full input structure (text tokens + reference audio tokens → target audio tokens) is described in Section 3.2."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Extensive hyperparameters are reported: batch size 192, 32 A100 GPUs, 250,000 steps, learning rate 1e-4, AdamW, dropout 0.1, top-k=80, temperature=0.85, context duration 3 seconds, S1=8000, S2=15000 for prior annealing, max sequence length 1536 (Section 4.1)."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. This is a standalone TTS model that takes text input and generates speech tokens autoregressively."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The paper describes tokenization of text (sentence-piece and phonemes), audio codec processing (Encodec, Dac, spectral codec producing T×N matrices of discrete codes), embedding computation via N codebook tables, and the delay pattern scheme for RVQ tokens (Sections 3.1-3.2)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion (Section 5) summarizes contributions but does not discuss limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed. The paper does not address potential issues such as the proprietary training data, single-run results, limited language coverage, or the specificity of the results to one model architecture."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show, such as whether findings generalize beyond English, beyond T5 architecture, or beyond the tested codecs."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "Raw experimental data (synthesized audio files, attention maps) are not available for download. Audio examples are linked on a demo page but raw evaluation data is not released."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Training data is described: '1.8k hours of English TTS data from four datasets: the train-clean-360 subset of LibriTTS, HiFiTTS, a 1000 hour subset of the LibriVox MLS dataset, and a proprietary, 2-speaker, 63 hour dataset' (Section 4.1). Evaluation sets are specified with sizes and speaker counts."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "MTurk workers are used for MOS evaluation but no recruitment details are provided (qualifications, geographic restrictions, screening criteria). The paper only states 'each audio is rated by at least 10 independent listeners.'"
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The pipeline from raw audio to codec tokens to model training is described: audio → neural audio codec → T×N discrete code matrix → embedding via N codebook tables → model input (Section 3.1). The evaluation pipeline (synthesize → ASR transcription → CER/WER computation) is also documented."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding sources are disclosed. All authors are from NVIDIA Corporation but no funding statement or grant information is provided."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All authors are identified as affiliated with 'NVIDIA Corporation, Santa Clara, CA, USA' in the paper header."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "All authors are NVIDIA employees evaluating their own TTS model. NVIDIA has a commercial interest in speech synthesis technology, making the funder non-independent of the outcome."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is provided. NVIDIA employees may hold equity or patents related to speech synthesis technology, but this is not disclosed."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "This paper trains its own TTS model and evaluates speech synthesis quality (intelligibility, naturalness), not a pre-trained model's knowledge on a benchmark. The test is whether synthesized speech matches the input text, not whether the model 'knows' answers."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Not applicable for the same reason: the evaluation tests TTS synthesis quality on held-out utterances, not model knowledge. The paper does use proper train/test splits (holdout utterances, unseen speakers)."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "Not applicable: the paper evaluates a TTS system's ability to synthesize speech, not a pre-trained model's benchmark performance. Contamination in the LLM benchmark sense is not a relevant concern."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No pre-registration is mentioned for the MOS evaluation study on Amazon Mechanical Turk."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "No IRB or ethics board approval is mentioned for the MOS evaluation involving MTurk workers."
    261       },
    262       "demographics_reported": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "No demographics are reported for the MTurk listeners who rated audio quality."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": true,
    269         "answer": false,
    270         "justification": "No inclusion or exclusion criteria for MTurk listeners are stated. The paper does not describe any screening or qualification requirements."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "This is a rating study (MOS evaluation) where listeners rate individual audio samples, not an experimental study with treatment/control assignment requiring randomization."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is a rating study where listeners evaluate individual audio samples. While the standard practice is to present audio without system labels, the study design does not involve treatment/control blinding in the experimental sense."
    281       },
    282       "attrition_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No information on listener attrition or dropout is provided. The paper states each audio was rated by 'at least 10 independent listeners' but does not report how many listeners started vs finished."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference cost, latency, or real-time factor is reported for the TTS model despite this being critical for real-world TTS deployment."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "The paper states '32 NVIDIA A100 GPUs' and '250,000 steps' but does not report total GPU hours, wall-clock training time, or compute cost. Three separate model variants plus three codec variants were trained, representing substantial compute without quantification."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No results across multiple random seeds are reported. All tables appear to show single-run results."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is not stated. It appears each model variant was trained once."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search is described. Key hyperparameters like S1=8000, S2=15000, top-k=80, temperature=0.85 are reported but no justification or search process is documented."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "All three alignment learning configurations are reported in Tables 1-2 (No Prior/No L_align, With Prior/No L_align, With Prior/With L_align), and all three codec variants in Table 3. The selection is transparent rather than cherry-picked."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "NVIDIA authors evaluate their own TTS model against open-source baselines (VALL-E-X, Bark, SpeechT5) using released checkpoints but do not acknowledge the potential bias of evaluating their own system. Training data and compute budgets differ across models."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The T5-TTS model is trained on 1.8k hours with 32 A100 GPUs for 250K steps. The compute budgets and training data sizes for baselines (VALL-E-X, Bark, SpeechT5) likely differ significantly but are not compared or discussed."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "No discussion of whether CER/WER from ASR model transcription truly measures synthesis intelligibility, or whether MOS evaluation captures meaningful quality differences. The construct validity of using ASR model errors as a proxy for human-perceived intelligibility is not examined."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved in TTS synthesis. The models directly generate audio tokens from text input."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The model is initialized from a pretrained T5 checkpoint trained on the Pile, which could contain text overlapping with evaluation sets. This potential leakage is not discussed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup inadvertently leaks information. For example, the reference audio context (3 seconds from the same speaker) could provide phonetic or prosodic cues about the target."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "While the paper uses holdout utterances and unseen speakers, it does not formally discuss non-independence. The seen-speaker test uses LibriTTS holdout utterances, but whether these share content or recording conditions with training data is not analyzed."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention method is applied. The train/test splits rely on standard dataset partitions without verification."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Guided attention training with attention prior and CTC alignment loss reduces CER from 9.03% to 3.92% on challenging texts",
    373       "evidence": "Table 2 shows three T5-TTS variants on 100 challenging texts: No Prior/No L_align (CER 9.03%), With Prior/No L_align (7.27%), With Prior/With L_align (3.92%). All insertions, deletions, and substitutions decrease.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Cross-attention heads in LLM-based TTS models implicitly learn text-speech alignment during next-token prediction training",
    378       "evidence": "Figure 1 visualizes cross-attention score matrices showing near-diagonal patterns indicating monotonic alignment. Section 3.3 describes this observation qualitatively.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "T5-TTS with alignment learning outperforms VALL-E-X, Bark, and SpeechT5 on intelligibility and naturalness",
    383       "evidence": "Table 2 shows T5-TTS (W Prior, W L_align) achieves CER 3.92%, WER 9.22%, MOS 4.06 vs VALL-E-X (8.37%, 16.8%, 3.94), Bark (11.90%, 19.1%, 3.93), SpeechT5 (6.14%, 13.5%, 3.98). MOS includes 95% CIs.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Spectral codec with parallel codebook prediction outperforms Encodec and Dac for TTS synthesis",
    388       "evidence": "Table 3 compares three codecs: Spectral codec (CER 2.16%, MOS 3.91), Encodec (CER 4.01%, MOS 3.57), Dac (CER 6.72%, MOS 3.92). Spectral codec has best intelligibility; Dac has similar naturalness but worse intelligibility.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Feeding context to the decoder yields better speaker similarity and intelligibility for unseen speakers than feeding to the encoder",
    393       "evidence": "Table 1 shows decoder context achieves CER 2.31%, SSIM 0.779 vs encoder context CER 2.86%, SSIM 0.741 on unseen VCTK speakers. Only one evaluation setting.",
    394       "supported": "weak"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "Company evaluating own product",
    400       "detail": "All seven authors are from NVIDIA, evaluating their own TTS model against open-source baselines. The comparison may be influenced by familiarity with their own system's strengths and optimal inference settings."
    401     },
    402     {
    403       "flag": "No statistical significance tests",
    404       "detail": "All performance comparisons are based on point estimates without any significance tests. The differences in MOS (4.06 vs 3.98 for SpeechT5) may not be statistically significant given the overlapping confidence intervals."
    405     },
    406     {
    407       "flag": "Single-run results with no variance reporting",
    408       "detail": "Results appear to be from single training runs. TTS model training can be sensitive to random initialization and data ordering, but no seed sensitivity analysis or multi-run variance is reported."
    409     },
    410     {
    411       "flag": "Unreleased evaluation data",
    412       "detail": "The 100 challenging texts used for the key comparison in Table 2 are described only as texts 'with repeating words' and are not released. This makes the primary evaluation non-reproducible."
    413     },
    414     {
    415       "flag": "Potentially unfair baseline comparison",
    416       "detail": "The T5-TTS model is trained on 1.8k hours of data with 32 A100 GPUs for 250K steps. Training data sizes and compute budgets for VALL-E-X, Bark, and SpeechT5 may differ significantly, making the comparison potentially unfair. No matched-compute or matched-data comparison is provided."
    417     },
    418     {
    419       "flag": "Proprietary training data",
    420       "detail": "One of four training datasets is a 'proprietary, 2-speaker, 63 hour dataset' that cannot be accessed by other researchers, partially limiting reproducibility."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Improving language understanding by generative pre-training",
    426       "authors": ["A. Radford", "K. Narasimhan", "T. Salimans", "I. Sutskever"],
    427       "year": 2018,
    428       "relevance": "Foundational GPT paper establishing the autoregressive language modeling paradigm used in LLM-based speech synthesis."
    429     },
    430     {
    431       "title": "Exploring the limits of transfer learning with a unified text-to-text transformer",
    432       "authors": ["C. Raffel", "N. Shazeer", "A. Roberts", "K. Lee", "S. Narang", "M. Matena", "Y. Zhou", "W. Li", "P. J. Liu"],
    433       "year": 2020,
    434       "relevance": "T5 encoder-decoder architecture paper that provides the backbone for the proposed TTS model."
    435     },
    436     {
    437       "title": "Neural codec language models are zero-shot text to speech synthesizers",
    438       "authors": ["C. Wang", "S. Chen", "Y. Wu", "Z. Zhang", "L. Zhou"],
    439       "year": 2023,
    440       "arxiv_id": "2301.02111",
    441       "relevance": "VALL-E paper demonstrating LLM-based zero-shot TTS capabilities, a key baseline and motivation for this work."
    442     },
    443     {
    444       "title": "Survey of hallucination in natural language generation",
    445       "authors": ["Z. Ji", "N. Lee", "R. Frieske", "T. Yu", "D. Su"],
    446       "year": 2023,
    447       "relevance": "Comprehensive survey on LLM hallucination that contextualizes the attention error/hallucination problem addressed in TTS synthesis."
    448     },
    449     {
    450       "title": "Audiolm: a language modeling approach to audio generation",
    451       "authors": ["Z. Borsos", "R. Marinier", "D. Vincent", "E. Kharitonov"],
    452       "year": 2023,
    453       "relevance": "Pioneered training decoder-only LLMs on discretized audio tokens from neural codecs, establishing the LLM-based audio generation paradigm."
    454     },
    455     {
    456       "title": "SpeechT5: Unified-modal encoder-decoder pre-training for spoken language processing",
    457       "authors": ["J. Ao", "R. Wang", "L. Zhou", "C. Wang", "S. Ren"],
    458       "year": 2022,
    459       "relevance": "Encoder-decoder architecture for unified text-speech processing, a direct baseline in this paper's evaluation."
    460     },
    461     {
    462       "title": "Ella-v: Stable neural codec language modeling with alignment-guided sequence reordering",
    463       "authors": ["Y. Song", "Z. Chen", "X. Wang", "Z. Ma", "X. Chen"],
    464       "year": 2024,
    465       "arxiv_id": "2401.07333",
    466       "relevance": "Concurrent work addressing the same alignment instability problem in LLM-based TTS through sequence reordering rather than attention guidance."
    467     },
    468     {
    469       "title": "High fidelity neural audio compression",
    470       "authors": ["A. Défossez", "J. Copet", "G. Synnaeve", "Y. Adi"],
    471       "year": 2023,
    472       "relevance": "Encodec neural audio codec used as one of the three codec options in this paper's TTS system."
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 2,
    478       "justification": "TTS engineers could apply the attention prior and CTC alignment loss technique to improve robustness of their LLM-based TTS models without architecture changes."
    479     },
    480     "surprise_contrarian": {
    481       "score": 1,
    482       "justification": "The finding that cross-attention heads implicitly learn alignment is interesting but not deeply surprising; the proposed fix (guided attention) builds on known techniques from non-LLM TTS."
    483     },
    484     "fear_safety": {
    485       "score": 0,
    486       "justification": "No safety, security, or risk implications — this is a speech synthesis quality improvement."
    487     },
    488     "drama_conflict": {
    489       "score": 0,
    490       "justification": "No controversy or conflicting claims; straightforward technical contribution."
    491     },
    492     "demo_ability": {
    493       "score": 1,
    494       "justification": "Audio examples are available at https://t5tts.github.io/ but no code or model is released for others to try."
    495     },
    496     "brand_recognition": {
    497       "score": 2,
    498       "justification": "From NVIDIA, a well-known AI hardware and software company, though not one of the headline LLM labs."
    499     }
    500   }
    501 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs