ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27617B)


      1 {
      2   "paper": {
      3     "title": "Text-to-Audio Generation using Instruction-Tuned LLM and Latent Diffusion Model",
      4     "authors": [
      5       "Deepanway Ghosal",
      6       "Navonil Majumder",
      7       "Ambuj Mehrish",
      8       "Soujanya Poria"
      9     ],
     10     "year": 2023,
     11     "venue": "arXiv preprint",
     12     "arxiv_id": "2304.13731",
     13     "doi": "10.48550/arXiv.2304.13731"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [
     17     "experimental_rigor",
     18     "data_leakage"
     19   ],
     20   "methodology_tags": [
     21     "benchmark-eval"
     22   ],
     23   "key_findings": "TANGO, using frozen FLAN-T5-Large as the text encoder in a latent diffusion model, outperforms AudioLDM on most objective metrics (FD 24.52, KL 1.37, FAD 1.59) and subjective evaluations (OVL 85.94, REL 80.36) on the AudioCaps test set, despite training on 63x less data than AudioLDM. Audio pressure level-based mixing augmentation improves over random mixing. When pre-trained on a larger corpus (TANGO-FULL-FT), performance improves further to FD 18.93 and KL 1.12.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "GitHub repository URL provided in the paper header: https://github.com/declare-lab/tango."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The main dataset (AudioCaps) is publicly available. The authors also released their augmented dataset on HuggingFace: https://huggingface.co/datasets/declare-lab/TangoPromptBank (Section 3.1)."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions hardware (4 A6000 GPUs) and some training settings but provides no requirements.txt, Dockerfile, or detailed software dependency list with library versions."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided in the paper. Training details are described but not as a reproducible recipe with explicit commands."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All results in Tables 1-8 are point estimates with no confidence intervals or error bars."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Claims like 'TANGO outperforms AudioLDM' are based solely on comparing raw metric values. No statistical significance tests (p-values, t-tests, etc.) are reported."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Absolute metric values are reported with baselines providing context for magnitude of differences (e.g., TANGO FD 24.52 vs AudioLDM-L FD 27.12, Table 1). Dataset size ratios (63x smaller) are also stated."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No justification is provided for why the AudioCaps test set size is appropriate for the claims made. For subjective evaluation, only 30 randomly selected samples are used with no power analysis."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or multi-run results are reported. All results appear to be from single runs."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple baselines are compared: DiffSound, AudioGen, and several AudioLDM configurations (Section 3.2, Table 1)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "AudioLDM (2023) was the state-of-the-art at time of writing. AudioGen (2022) and DiffSound (2022) are also recent. All baselines are within 1 year of the paper."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 4 ablates augmentation strategy (random vs. pressure-based). Table 5 ablates inference steps and classifier-free guidance scale. Section 2.4 notes that dropping text guidance 10% of the time performed equivalently."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Three objective metrics (FD, KL, FAD) and two subjective metrics (OVL, REL) are used throughout (Table 1)."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Six human evaluators assessed overall audio quality (OVL) and relevance to input text (REL) on 30 randomly selected samples on a 1-100 scale (Section 3.3)."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The AudioCaps test set is used for evaluation, which is separate from the training set (45,438 clips) and validation set (2,240 instances). Best checkpoint selected on validation loss (Section 3.1)."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 6 breaks down by temporal sequence type, Table 7 by number of labels, and Table 8 by audio category (human, animal, natural, things, etc.)."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 5 discusses specific failure cases: 'the generations from TANGO for prompts Chopping tomatoes on a wooden table and Chopping potatoes on a metal table are very similar.' Limitations in fine-grained control are acknowledged."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Table 5 shows poor performance without classifier-free guidance (guidance scale 1). Table 8 shows categories where AudioLDM outperforms TANGO (e.g., animal sounds FD). Authors acknowledge inability to reproduce AudioLDM-L-Full-FT results (Section 3.2)."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims TANGO 'outperforms the state-of-the-art AudioLDM on most metrics and stays comparable on the rest.' Table 1 confirms: TANGO beats AudioLDM-L on all three objective metrics; it beats AudioLDM-L-Full-FT on KL and FAD while trailing slightly on FD. The hedging is appropriate."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper claims 'We may largely attribute this to the use of LLM FLAN-T5' (Section 3.2) but this is confounded: TANGO differs from AudioLDM in text encoder, training procedure (no CLAP), and augmentation strategy simultaneously. No controlled experiment isolates the FLAN-T5 contribution from other architectural differences."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title claims 'Text-to-Audio Generation' broadly, but results are only on the AudioCaps dataset, which is drawn entirely from YouTube clips. The paper does not bound generalizations to this specific domain of short (10s) YouTube audio clips."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper attributes performance to FLAN-T5 but does not substantively discuss alternative explanations such as: differences in training procedure (no CLAP pre-training), different effective batch sizes, or the fact that TANGO has more parameters (866M vs 739M for AudioLDM-L)."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper uses established audio generation metrics (FD, KL, FAD) and explicitly describes what each measures: 'FAD is more related to human perception, KL divergence captures the similarities between the original and generated audio signals based on broad concepts' (Section 3.3). Claims match metric granularity."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "FLAN-T5-LARGE (780M parameters) is specified. AudioLDM checkpoint variants are named precisely (AudioLDM-M-Full-FT, AudioLDM-L-Full, etc.) with parameter counts. The U-Net has 866M parameters (Section 3.1)."
    153       },
    154       "prompts_provided": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "The paper does not use prompting in the LLM sense. FLAN-T5 is used as a frozen text encoder, not prompted. Text inputs are audio descriptions from AudioCaps, not designed prompts."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Reported: AdamW optimizer, learning rate 3e-5, linear scheduler, 40 epochs, per-GPU batch size 3 with 4 gradient accumulation steps, 8 latent channels, cross-attention dimension 1024, compression level 4, guidance scale 3, 200 inference steps (Section 3.1)."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. TANGO is a standard LDM pipeline (text encoder → diffusion → VAE decoder → vocoder)."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Audio resampled to 16KHz, clips longer than 10s segmented into 10s partitions. Augmentation strategy described with pressure-level mixing equations (Section 2.3). VAE compression details provided (Section 2.5)."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 5 'Limitations' is a dedicated section discussing TANGO's inability to finely control its generations through text prompts."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 5 provides specific examples: 'the generations from TANGO for prompts Chopping tomatoes on a wooden table and Chopping potatoes on a metal table are very similar.' This is a concrete, study-specific limitation rather than generic boilerplate."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The limitations section identifies what the model struggles with but does not systematically state what the results do NOT show. No explicit boundaries on the scope of claims (e.g., limited to 10s YouTube audio clips, English captions only)."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "AudioCaps is publicly available. The augmented dataset is released on HuggingFace (https://huggingface.co/datasets/declare-lab/TangoPromptBank). The VAE and vocoder checkpoints are from Liu et al. [18]."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "AudioCaps dataset described: 45,438 training clips, 2,240 validation, 10s clips from YouTube, crowd-sourced from AudioSet. Test set has 5 captions per clip, one chosen at random (Section 3.1). TANGO-FULL-FT dataset statistics in Table 3."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "Six human evaluators are used for subjective evaluation but their recruitment method is not described. Only stated: 'The evaluators were proficient in the English language and instructed well to make a fair assessment' (Section 3.3). No details on who they are or how they were selected."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Pipeline is documented: audio clips → 16KHz resampling → mel-spectrogram → VAE compression → latent space. Augmentation via pressure-level mixing is described with equations. For TANGO-FULL-FT, dataset processing (segmentation, resampling) and label-to-prompt conversion are described (Section 3.1)."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Acknowledgements section: 'We are grateful to ORACLE FOR RESEARCH and HUGGINGFACE for their generous support to the project TANGO.'"
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "All authors are from DeCLaRe Lab, Singapore University of Technology and Design. No conflict with evaluated products (they don't evaluate Oracle or HuggingFace products)."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Oracle for Research and HuggingFace provide general research support. Neither has a direct financial stake in TANGO outperforming AudioLDM specifically."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The paper trains a new LDM and uses FLAN-T5 as a frozen text encoder. It does not evaluate a pre-trained model's capability on a benchmark — it evaluates a newly trained generative system."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The paper uses standard AudioCaps train/test splits for a newly trained model, not evaluating pre-trained model knowledge on a benchmark."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "Not applicable — the paper trains a new model on AudioCaps training set and evaluates on AudioCaps test set. This is standard supervised learning evaluation, not testing pre-trained model knowledge."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "The paper has no human subjects study. The 6 human evaluators rate audio outputs but are not subjects of investigation."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study. Evaluators rate audio quality but are not research participants in the human-subjects sense."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study. The 6 evaluators are described only as 'proficient in the English language.'"
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human subjects study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human subjects study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Inference uses 200 diffusion steps but no wall-clock inference time, latency, or cost per generated audio is reported."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Training budget reported: 'four A6000 GPUs... a total of 52 hours to train 40 epochs' (Section 3.1). TANGO-FULL-FT: 200,000 pre-training steps on 4 A6000 GPUs."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from a single training run."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is never stated. It is unclear whether results come from one run or multiple."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Hyperparameters (learning rate, guidance scale, etc.) appear selected but no search budget or search procedure is described."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Best checkpoint selected on validation loss: 'we report results for the checkpoint with the best validation loss, which we obtained at epoch 39' (Section 3.1). Guidance scale selection justified by Table 5 sweep."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors do not acknowledge the bias of evaluating their own system. They note AudioLDM-L-Full-FT checkpoint was unavailable and they couldn't reproduce its results, but do not discuss systematic self-comparison bias."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "TANGO has 866M parameters vs AudioLDM-L's 739M, a 17% parameter increase. Training compute differences are not discussed as a potential confound for the performance comparison."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper uses FD, KL, and FAD without questioning whether these metrics actually capture audio generation quality. No discussion of construct validity or limitations of these automated metrics beyond briefly noting FAD relates to human perception."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No agentic scaffolding is involved in this generative model evaluation."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "FLAN-T5 was pre-trained on massive text data that could include AudioCaps captions. No discussion of whether the frozen text encoder has seen test set descriptions during pre-training."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether features from the frozen FLAN-T5 encoder or the pre-trained VAE (trained on AudioSet which includes AudioCaps) could leak test-time information."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "AudioCaps is derived from AudioSet. The VAE and vocoder were trained on AudioSet, which includes AudioCaps data. No discussion of whether this creates dependence between the pre-trained components and the evaluation set."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No concrete leakage detection or prevention method is used or described."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "TANGO outperforms AudioLDM and other baseline TTA models on most objective and subjective metrics on AudioCaps, despite training the LDM on a 63x smaller dataset.",
    375       "evidence": "Table 1: TANGO achieves FD 24.52, KL 1.37, FAD 1.59 vs AudioLDM-L (FD 27.12, KL 1.86, FAD 2.08) using only AudioCaps. Subjective: OVL 85.94, REL 80.36 vs AudioLDM-M-Full-FT OVL 79.85, REL 76.84 (Section 3.4).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Using an instruction-tuned LLM (FLAN-T5) as a frozen text encoder is sufficient for TTA generation without CLAP-based joint text-audio pre-training.",
    380       "evidence": "Table 1 shows TANGO with frozen FLAN-T5 outperforms AudioLDM models that use CLAP (Section 2.1, 3.4). However, multiple variables change simultaneously (encoder type, training procedure, augmentation).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Pressure level-based audio mixing augmentation outperforms random mixing.",
    385       "evidence": "Table 4: Relative pressure augmentation yields FD 24.52, KL 1.37, FAD 1.59 vs random augmentation FD 25.84, KL 1.38, FAD 2.72. Controlled comparison within the same TANGO architecture.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "TANGO-FULL-FT achieves state-of-the-art when pre-trained on a larger dataset corpus followed by AudioCaps fine-tuning.",
    390       "evidence": "Table 2: TANGO-FULL-FT achieves FD 18.93, KL 1.12 vs AudioLDM-L-Full-FT FD 23.31, KL 1.59 (Section 3.4).",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "There is substantial improvement when going from 100 to 200 inference steps for TANGO, unlike AudioLDM where performance plateaus.",
    395       "evidence": "Table 5 left: FD improves from 26.13 (100 steps) to 24.52 (200 steps), FAD from 1.87 to 1.59 (Section 3.4).",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No uncertainty quantification",
    402       "detail": "All results across Tables 1-8 are single-point estimates with no error bars, confidence intervals, or multi-run variance. It is impossible to assess whether the reported improvements are within noise."
    403     },
    404     {
    405       "flag": "Confounded causal attribution",
    406       "detail": "The paper attributes improvement to FLAN-T5, but TANGO differs from AudioLDM in multiple ways simultaneously: text encoder (FLAN-T5 vs CLAP text encoder), training procedure (no CLAP pre-training), augmentation strategy, and parameter count (866M vs 739M). No controlled experiment isolates the FLAN-T5 contribution."
    407     },
    408     {
    409       "flag": "Tiny subjective evaluation",
    410       "detail": "Only 6 human evaluators rated 30 samples each. No inter-rater reliability (kappa, ICC) is reported. With this sample size, subjective evaluation results are not statistically robust."
    411     },
    412     {
    413       "flag": "Incomplete baseline comparison",
    414       "detail": "The AudioLDM-L-Full-FT checkpoint was unavailable, and the authors could not reproduce its results. Comparison with this strongest baseline relies entirely on borrowed numbers from the original paper, with no way to verify conditions are matched."
    415     },
    416     {
    417       "flag": "Parameter count mismatch not discussed",
    418       "detail": "TANGO (866M) has 17% more parameters than AudioLDM-L (739M), the most direct baseline. This difference is not discussed as a potential confound for the performance improvements."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Scaling instruction-finetuned language models",
    424       "authors": ["Hyung Won Chung", "Le Hou", "Shayne Longpre", "Barret Zoph", "Yi Tay"],
    425       "year": 2022,
    426       "arxiv_id": "2210.11416",
    427       "relevance": "Foundational work on instruction-tuned LLMs (FLAN-T5), directly used as the text encoder in TANGO and demonstrating instruction tuning improves downstream task performance."
    428     },
    429     {
    430       "title": "Why can GPT learn in-context? Language models secretly perform gradient descent as meta-optimizers",
    431       "authors": ["Damai Dai", "Yutao Sun", "Li Dong", "Yaru Hao"],
    432       "year": 2022,
    433       "relevance": "Theoretical explanation for in-context learning in LLMs, cited as motivation for using FLAN-T5's gradient-descent mimicking property for cross-modal transfer."
    434     },
    435     {
    436       "title": "AudioLDM: Text-to-audio generation with latent diffusion models",
    437       "authors": ["Haohe Liu", "Zehua Chen", "Yi Yuan", "Xinhao Mei"],
    438       "year": 2023,
    439       "arxiv_id": "2301.12503",
    440       "relevance": "Primary baseline and inspiration for TANGO; demonstrates latent diffusion models for text-to-audio generation using CLAP-based embeddings."
    441     },
    442     {
    443       "title": "High-resolution image synthesis with latent diffusion models",
    444       "authors": ["Robin Rombach", "Andreas Blattmann", "Dominik Lorenz", "Patrick Esser"],
    445       "year": 2022,
    446       "relevance": "Foundational latent diffusion model (Stable Diffusion) architecture that TANGO and AudioLDM are based on."
    447     },
    448     {
    449       "title": "Audiogen: Textually guided audio generation",
    450       "authors": ["Felix Kreuk", "Gabriel Synnaeve", "Adam Polyak", "Uriel Singer"],
    451       "year": 2022,
    452       "arxiv_id": "2209.15352",
    453       "relevance": "Key baseline for text-to-audio generation using transformer encoder-decoder with autoregressive audio token generation."
    454     },
    455     {
    456       "title": "Photorealistic text-to-image diffusion models with deep language understanding",
    457       "authors": ["Chitwan Saharia", "William Chan", "Saurabh Saxena"],
    458       "year": 2022,
    459       "relevance": "Studied the effect of LLM text encoders (T5) on image generation quality, directly motivating TANGO's use of instruction-tuned LLMs for audio generation."
    460     },
    461     {
    462       "title": "Classifier-free diffusion guidance",
    463       "authors": ["Jonathan Ho", "Tim Salimans"],
    464       "year": 2021,
    465       "relevance": "Core technique used in TANGO for guiding the reverse diffusion process with text input during inference."
    466     },
    467     {
    468       "title": "WavCaps: A ChatGPT-assisted weakly-labelled audio captioning dataset for audio-language multimodal research",
    469       "authors": ["Xinhao Mei", "Chutong Meng", "Haohe Liu"],
    470       "year": 2023,
    471       "arxiv_id": "2303.17395",
    472       "relevance": "ChatGPT-generated captions dataset used in TANGO-FULL-FT training, relevant to LLM-assisted data creation for multimodal AI."
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 2,
    478       "justification": "Released code, model, and dataset enable practitioners to generate audio from text descriptions, with potential use in media production."
    479     },
    480     "surprise_contrarian": {
    481       "score": 1,
    482       "justification": "Somewhat surprising that a frozen instruction-tuned LLM achieves SOTA with 63x less training data, but the general trend of larger pre-trained models improving downstream tasks is well established."
    483     },
    484     "fear_safety": {
    485       "score": 0,
    486       "justification": "No safety or security concerns raised by text-to-audio generation in this context."
    487     },
    488     "drama_conflict": {
    489       "score": 0,
    490       "justification": "No controversy; straightforward technical contribution with fair comparison framing."
    491     },
    492     "demo_ability": {
    493       "score": 2,
    494       "justification": "Code released on GitHub and a demo page exists at https://tango-web.github.io/, making it testable by others."
    495     },
    496     "brand_recognition": {
    497       "score": 1,
    498       "justification": "From SUTD's DeCLaRe Lab, not a major AI lab, but the work builds on well-known models (FLAN-T5, Stable Diffusion)."
    499     }
    500   }
    501 }

Impressum · Datenschutz