scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18516B)
      1 {
      2   "paper": {
      3     "title": "Empowering LLMs with Pseudo-Untrimmed Videos for Audio-Visual Temporal Understanding",
      4     "authors": ["Yunlong Tang", "Daiki Shimada", "Jing Bi", "Mingqian Feng", "Hang Hua", "Chenliang Xu"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2403.16276"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive link found in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The PU-VALOR dataset is introduced but no download link or repository URL is provided in the paper. The construction pipeline is described but the dataset itself is not released."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions a single NVIDIA 48G A6000 GPU and Vicuna-7B-v1.5, but no requirements.txt, library versions, or environment setup details are provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described at a high level but not with sufficient detail to reproduce without significant guesswork."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates without confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims AVicuna 'surpasses all other LLM-based models' but provides no statistical significance tests to support these comparisons."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Raw scores are reported, but neither effect sizes (Cohen's d, etc.) nor percentage improvements relative to baselines are systematically provided."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for the size of the PU-VALOR dataset (114K) or the evaluation benchmarks used. No power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or multi-run results reported. All results appear to be single-run numbers."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Tables 2, 3, 5, and 6 compare AVicuna against multiple baselines including PandaGPT, Macaw-LLM, AV-LLM, Video-LLaMA, VTimeLLM, ActionFormer, UnAV, and UniAV."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include contemporary models from 2023-2024 such as VTimeLLM, AV-LLM, UniAV-AT/ST, which were recent at time of writing."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 4 provides an ablation study that removes PU-VALOR, AVTI, A5-222K, and audio input individually. Figure 4 and Table 8 analyze Audio-Interleaving Rates."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics used: accuracy on QA benchmarks (MSVD-QA, MSRVTT-QA, etc.), mAP at multiple IoU thresholds for AVEDL, GPT-based evaluation across 5 dimensions, R1@0.5/0.7 and mIoU for VTG."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is included. QA evaluation uses GPT scoring (automated). The paper makes claims about video understanding quality that would benefit from human judgment."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Evaluation is performed on established benchmark test sets (MSVD-QA, MSRVTT-QA, ActivityNet-QA, AVSD, MUSIC-AVQA, UnAV-100) that are separate from training data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per benchmark dataset (Tables 2, 3, 6) and per IoU threshold (Tables 3, 4, 8). GPT evaluation is broken down across 5 dimensions (Table 5)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The Limitations section discusses hallucination, deficiency in spatial comprehension, and insufficient precision for ultra-long videos as failure modes."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The AIR analysis (Figure 4, Table 8) shows performance degradation at high audio-interleaving rates. Ablation results show performance drops. The VTimeLLM reproduction yielded lower-than-reported numbers."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims state-of-the-art on open-ended video QA, AVQA, and AVEDL tasks. Tables 2 and 3 support these claims with the highest scores across benchmarks."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about component contributions are supported by controlled ablation studies in Table 4, where each component is removed individually while keeping others constant."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title and abstract make broad claims about 'Audio-Visual Temporal Understanding' but results are limited to specific benchmarks (UnAV-100, MSVD-QA, etc.) with a 7B parameter model. No discussion of generalization boundaries."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the results. For example, the pseudo-untrimmed dataset construction could introduce artifacts that inflate temporal localization performance on synthetic-like benchmarks."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model versions are stated: Vicuna-7B-v1.5, CLIP ViT-L/14, CLAP. The LLM version (v1.5) is specified."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Prompts for Time-Event Alignment (Q1-Q7), Audio-Text Alignment (Box 1, Q1-Q9), and AudioSet label templates (Box 2, R1-R22) are provided in full."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Learning rates (1e-3 for Stages I-II, 1e-4 for Stages III-IV), epochs (2/2/1/1), training times (5/2/36/6 hours), scaling factor range [0.5, 2], 100 frames per video, and AIR=25% are reported in the Appendix."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. AVicuna is a fine-tuned multimodal LLM, not an agent-based system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The PU-VALOR construction pipeline is described in detail: text embedding extraction, clustering (25,270 clusters), random temporal scaling with range [0.5, 2], permutation, and annotation with temporal boundaries. The A5-222K compilation from AudioSet/AudioCaps/Auto-ACD is documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated Limitations section in the Appendix discusses hallucination, deficiency in spatial comprehension, and insufficient precision for ultra-long videos."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations are specific to this system: hallucination in generated descriptions, spatial comprehension gaps, and the 100-number precision limitation for ultra-long videos."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While limitations are discussed, the paper does not explicitly state what the results do NOT show or what settings are excluded. The broad title suggests general audio-visual understanding without bounding the scope to the tested benchmarks and model size."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Neither the PU-VALOR dataset nor the A5-222K dataset is made available for download. Raw experimental outputs are not provided."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The PU-VALOR construction from VALOR-32K is described in detail (clustering, scaling, permutation, annotation). The A5-222K compilation from three source datasets is documented. Table 7 summarizes all datasets and their sizes."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data comes from existing public datasets (VALOR-32K, AudioSet, AudioCaps, Auto-ACD)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Figure 2 and the Methodology section document the full pipeline: caption embedding → clustering (25,270 clusters) → video selection (3-20 per cluster) → temporal scaling → permutation → annotation → 114K pairs."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: University of Rochester and Sony Group Corporation."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed. One author is from Sony Group Corporation, which has commercial interest in audio-visual AI."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses Vicuna-7B-v1.5 (based on LLaMA) but does not state the training data cutoff date for the base model."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether test benchmark data (MSVD-QA, MSRVTT-QA, etc.) could have been seen during LLaMA/Vicuna pretraining."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The evaluation benchmarks (MSVD-QA from 2011, MSRVTT-QA from 2016, ActivityNet-QA from 2019) predate the model's likely training cutoff. No contamination analysis is provided."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or tokens consumed reported despite the model requiring 100 frames per video and multi-stage processing."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Training times are stated: 5/2/36/6 hours for stages I/II/III/IV on a single NVIDIA 48G A6000 GPU (Appendix)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "AVicuna surpasses all other LLM-based models on both video QA and AVQA benchmarks.",
    286       "evidence": "Table 2 shows AVicuna achieving highest scores: AVSD 53.1, MUSIC-AVQA 49.6, MSVD-QA 70.2, MSRVTT-QA 59.7, ActivityNet-QA 53.0, compared to 8 baseline models.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "AVicuna achieves state-of-the-art on Audio-Visual Event Dense Localization (AVEDL).",
    291       "evidence": "Table 3 shows AVicuna achieving 60.0 mAP@0.5 and 60.3 average mAP on UnAV-100, outperforming UniAV-ST (54.8/51.7) and all other baselines.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "PU-VALOR dataset is critical for temporal understanding — removing it drops average mAP from 60.3 to 27.9.",
    296       "evidence": "Table 4 ablation study shows w/o PU-VALOR drops from 60.3 to 27.9 average mAP.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Optimal audio-interleaving rate is 25%, with excessive audio being detrimental.",
    301       "evidence": "Figure 4 and Table 8 show peak performance at 25% AIR with decline at higher rates.",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "AVicuna introduces a multimodal LLM with an Audio-Visual Token Interleaver (AVTI) and a novel pseudo-untrimmed dataset (PU-VALOR, 114K videos) for audio-visual temporal understanding. The model achieves state-of-the-art on AVEDL (60.3 avg mAP on UnAV-100) and competitive results across video QA and AVQA benchmarks using only 7B parameters. Ablation studies show PU-VALOR and audio input are critical components, and optimal audio-interleaving rate is 25%.",
    307   "red_flags": [
    308     {
    309       "flag": "No uncertainty quantification",
    310       "detail": "All results are single-run point estimates with no error bars, confidence intervals, or variance across runs. Claims of superiority over baselines cannot be assessed for statistical significance."
    311     },
    312     {
    313       "flag": "Synthetic dataset validity unexamined",
    314       "detail": "PU-VALOR is constructed by concatenating trimmed clips with random scaling and permutation. The paper does not examine whether this synthetic construction introduces artifacts that inflate performance on temporal localization tasks, or whether the model would generalize to naturally untrimmed videos."
    315     },
    316     {
    317       "flag": "GPT-based evaluation circularity",
    318       "detail": "Video-based generative performance (Table 5) is evaluated using GPT scoring, which introduces dependence on another LLM's judgment without calibration against human evaluation."
    319     },
    320     {
    321       "flag": "No contamination analysis",
    322       "detail": "Several evaluation benchmarks (MSVD-QA 2011, MSRVTT-QA 2016) predate the collection of LLaMA's pretraining data, so their contents could have leaked into it. No contamination analysis is provided."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Video-LLaMA: An instruction-tuned audio-visual language model for video understanding",
    328       "authors": ["H. Zhang"],
    329       "year": 2023,
    330       "arxiv_id": "2306.02858",
    331       "relevance": "Baseline audio-visual LLM for video understanding, directly compared in experiments."
    332     },
    333     {
    334       "title": "VTimeLLM: Empower LLM to Grasp Video Moments",
    335       "authors": ["B. Huang", "X. Wang", "H. Chen", "Z. Song", "W. Zhu"],
    336       "year": 2023,
    337       "arxiv_id": "2311.18445",
    338       "relevance": "Key baseline for temporal understanding in video LLMs, compared across multiple benchmarks."
    339     },
    340     {
    341       "title": "Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models",
    342       "authors": ["M. Maaz", "H. Rasheed", "S. Khan", "F. S. Khan"],
    343       "year": 2023,
    344       "arxiv_id": "2306.05424",
    345       "relevance": "Baseline video understanding LLM whose GPT-based evaluation methodology is adopted by AVicuna."
    346     },
    347     {
    348       "title": "Visual instruction tuning",
    349       "authors": ["H. Liu", "C. Li", "Q. Wu", "Y. J. Lee"],
    350       "year": 2023,
    351       "relevance": "LLaVA introduces visual instruction tuning, the foundational approach extended by AVicuna to audio-visual modalities."
    352     },
    353     {
    354       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    355       "authors": ["E. J. Hu"],
    356       "year": 2022,
    357       "relevance": "LoRA is used for parameter-efficient fine-tuning in AVicuna's Time-Event Alignment stage."
    358     },
    359     {
    360       "title": "Audio-Visual LLM for Video Understanding",
    361       "authors": ["F. Shu", "L. Zhang", "H. Jiang", "C. Xie"],
    362       "year": 2023,
    363       "arxiv_id": "2312.06720",
    364       "relevance": "Directly compared audio-visual LLM baseline that combines audio and visual embeddings."
    365     },
    366     {
    367       "title": "PandaGPT: One model to instruction-follow them all",
    368       "authors": ["Y. Su", "T. Lan", "H. Li", "J. Xu", "Y. Wang", "D. Cai"],
    369       "year": 2023,
    370       "relevance": "Multimodal LLM baseline supporting audio-visual input, compared in QA experiments."
    371     }
    372   ]
    373 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs