scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28875B)
      1 {
      2   "paper": {
      3     "title": "TimeCMA: Towards LLM-Empowered Multivariate Time Series Forecasting via Cross-Modality Alignment",
      4     "authors": [
      5       "Chenxi Liu",
      6       "Qianxiong Xu",
      7       "Hao Miao",
      8       "Sun Yang",
      9       "Lingzheng Zhang",
     10       "Cheng Long",
     11       "Ziyue Li",
     12       "Rui Zhao"
     13     ],
     14     "year": 2024,
     15     "venue": "AAAI Conference on Artificial Intelligence",
     16     "arxiv_id": "2406.01638",
     17     "doi": "10.1609/aaai.v39i18.34067"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "TimeCMA introduces a cross-modality alignment framework that retrieves disentangled and robust time series embeddings from LLM-empowered prompt embeddings via channel-wise similarity. On eight public time series forecasting datasets, TimeCMA outperforms seven baselines including Time-LLM and iTransformer on average MSE and MAE. A last-token embedding storage design reduces memory by ~97% compared to Time-LLM and speeds inference by ~12x, demonstrating practical efficiency gains for LLM-based time series forecasting.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "GitHub link provided in the abstract: https://github.com/ChenxiLiu-HNU/TimeCMA."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "All eight datasets used (ETTm1, ETTm2, ETTh1, ETTh2, ECL, FRED-MD, ILI, Weather) are standard public benchmarks with references to their original sources."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper mentions 'NVIDIA A100 GPUs' but provides no requirements.txt, Dockerfile, or detailed library version listing. Not enough detail to recreate the environment."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided in the paper text. The code repository exists but the paper itself lacks a 'Reproducing Results' section or commands to run."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Table 1 reports only point estimates for MSE and MAE. No confidence intervals, error bars, or ± notation are present, even though the paper states that experiments are repeated at least three times."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims 'TimeCMA outperforms all baselines in all cases' and reports percentage improvements (e.g., '16.1% in MSE') but uses no statistical significance tests to support these comparative claims."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports relative improvements with baseline context: 'average improvement of 16.1% in MSE and 11.9% in MAE' compared to OFA, and '13.9% in MSE and 12.6% in MAE' compared to UniTime, alongside full comparison tables."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is given for why these 8 datasets were chosen beyond being common benchmarks, and no power analysis or sample size discussion is present."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The paper states 'Each experiment is repeated at least three times with different seeds' but Table 1 and all results show only single point estimates with no standard deviation, IQR, or any spread measure."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Seven baseline models across five categories are compared: Time-LLM, UniTime, OFA, iTransformer, PatchTST, TimesNet, and DLinear."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include recent methods: Time-LLM (ICLR 2024), UniTime (WWW 2024), iTransformer (ICLR 2024), PatchTST (ICLR 2023). These represent the contemporary state of the art."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Fig. 3 shows ablation studies removing individual components: w/o CMA, w/o LLM, w/o TSE, w/o PE, w/o MTD, each tested on FRED and ILI datasets."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Two evaluation metrics are used throughout: mean squared error (MSE) and mean absolute error (MAE)."
     93       },
     94       "human_evaluation": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "Human evaluation is irrelevant for numerical time series forecasting accuracy. Automated MSE/MAE metrics are the appropriate evaluation approach."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The datasets used (ETT, ECL, Weather, etc.) have standard train/validation/test splits that are well-established in the time series forecasting literature."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 1 provides results broken down by each of the 8 datasets and 4 prediction horizons (96/192/336/720 or 24/36/48/60), plus per-dataset averages."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "No failure case analysis is presented. The paper does not discuss where TimeCMA performs poorly or breaks down, even though some individual entries in Table 1 show it is not always best."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "No negative results are reported. Every experiment and ablation shows the proposed method or component contributing positively. No failed approaches or configurations are discussed."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims TimeCMA 'outperforms state-of-the-arts,' which is supported by Table 1 showing best average MSE/MAE across all 8 datasets. The abstract also claims computational efficiency, supported by Table 2."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper makes causal claims through ablation studies (Fig. 3) showing that removing components (CMA, LLM, TSE, PE, MTD) each degrades performance. These are controlled single-variable manipulations adequate for the causal claims being made."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims 'Multivariate Time Series Forecasting' broadly, but results are on 8 specific datasets (mostly energy, weather, economics), and the paper does not explicitly bound its claims to these domains. The abstract says 'extensive experiments on eight real datasets' without qualifying which types of time series the results generalize to."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as whether the improvement comes from increased model capacity rather than the cross-modality alignment, or whether the prompt design choice is the key factor vs. the architecture."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures MSE and MAE for forecasting accuracy, and claims are about forecasting performance. The measurements match the granularity of the claims — no proxy gap exists."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper states 'We selected GPT-2 as the LLM' without specifying the variant (small/medium/large/XL, 124M–1.5B parameters). GPT-2 has four size variants with significantly different capabilities."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Fig. 2 and Fig. 4 show the full prompt structure: 'From t1 to tL, the values were v1, ..., vL every f. The total trend value was ΔT.' All placeholder variables are mathematically defined (Eq. 1 for ΔT), allowing full reconstruction of every prompt from the data."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Key hyperparameters are missing: no learning rate, optimizer, number of epochs, hidden dimension C value, number of encoder layers, or regularization weight λ. Only test batch size (1), training batch size (8 for efficiency comparison), and hardware (A100 GPUs) are mentioned."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. TimeCMA is a neural network architecture for time series forecasting with no agent loops, tool use, or scaffolding."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Preprocessing is documented: reversible instance normalization to zero mean and unit standard deviation (Eq. 2–4), and removal of variables with missing values from FRED-MD. Input sequence lengths specified (36 for ILI/FRED, 96 for others)."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "There is no limitations, threats to validity, or similar section in the paper. The paper ends with a brief conclusion and acknowledgments."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No threats to validity are discussed anywhere in the paper."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show or what domains/settings are excluded from its claims."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "All datasets are publicly available standard benchmarks: ETT series (Zeng et al. 2023), ECL (UCI repository), FRED-MD (McCracken & Ng 2016), ILI and Weather (Wu et al. 2021). Raw data can be independently obtained."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Each dataset is cited with its original source. The paper describes modifications made: 'We removed variables with missing values in the FRED-MD (Qiu et al. 2024) and simplified it as FRED.'"
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. All data comes from standard public benchmarks."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The data pipeline is documented: public datasets → remove missing-value variables (FRED) → reversible instance normalization → inverted embedding → model. Input/output dimensions and sequence lengths are specified."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Acknowledgments section states: 'This study is supported under the RIE2020 Industry Alignment Fund – Industry Collaboration Projects (IAF-ICP) Funding Initiative, as well as cash and in-kind contributions from the industry partner(s).'"
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All author affiliations are listed: NTU S-Lab, Aalborg University, Peking University, HKUST Guangzhou, University of Cologne, and SenseTime Research."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "The RIE2020 IAF-ICP is a Singapore government research initiative. The unnamed 'industry partner(s)' make this partially unclear, but the paper evaluates a novel method on public benchmarks rather than a specific commercial product."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper. One author is from SenseTime Research, a commercial AI company, with no explicit conflict declaration."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "GPT-2's training data cutoff is not stated. While GPT-2 was trained on text (WebText) and the benchmarks are numerical time series, the paper does not address when the model's training data was collected."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of whether GPT-2's training data could overlap with the benchmark data. While the risk is low (text model vs. numerical time series), it is not addressed."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "Benchmark contamination is not discussed. The ETT and Weather benchmarks were released after GPT-2's training data was collected, so memorization is unlikely, but the paper feeds numerical values from these series to GPT-2 as text tokens and does not address the possibility."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Table 2 reports inference speed (seconds/iteration) for TimeCMA (0.09s on ETTm1) compared to baselines, plus memory usage (MiB) and parameter counts."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "Only 'NVIDIA A100 GPUs' is mentioned. No total training time, GPU hours, or total computational budget is reported."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper states 'Each experiment is repeated at least three times with different seeds' but Table 1 shows only point estimates with no standard deviation or variance across seeds."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "The paper states 'Each experiment is repeated at least three times with different seeds on NVIDIA A100 GPUs.'"
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search budget is reported. The paper does not describe how hyperparameters were selected or how many configurations were tried."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No description of how the final configuration was selected. The paper presents results without explaining the selection process for hyperparameters or architecture choices."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The paper reports results for 8 methods on 8 datasets and 4 horizons (8 × 8 × 4 = 256 results per metric) with no multiple comparison correction applied. No statistical tests are used at all."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors compare their method against baselines without acknowledging the bias of authors evaluating their own system. No discussion of whether baseline implementations are from original authors or re-implementations."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Table 2 directly compares efficiency (parameters, memory, speed) alongside forecasting performance for LLM-based methods, showing TimeCMA achieves best performance with lower computational cost."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No discussion of whether the 8 time series benchmarks actually measure meaningful forecasting capability or whether they are representative of real-world forecasting challenges."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No agentic scaffolding is involved. TimeCMA is a neural architecture evaluated end-to-end."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of temporal leakage. The standard train/val/test chronological splits are used but not explicitly verified or discussed."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of feature leakage. The prompt design includes all historical values which could potentially leak future information through the trend calculation, but this is not analyzed."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of non-independence between train and test data. Time series data is inherently autocorrelated, and whether the splits adequately separate dependent observations is not addressed."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention method is applied or discussed."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "TimeCMA outperforms all seven baselines on average across all eight datasets in both MSE and MAE.",
    374       "evidence": "Table 1 shows TimeCMA achieves best average MSE/MAE on all 8 datasets. For example, on FRED: 48.161 MSE vs next-best 61.565 (OFA); on ILI: 1.922 MSE vs next-best 2.108 (UniTime).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Cross-modality alignment is the most impactful component, superior to simple concatenation of modalities.",
    379       "evidence": "Fig. 3 ablation study shows removing CMA (w/o CMA, replaced with concatenation) causes the largest performance degradation on both FRED and ILI datasets.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "The last token of the prompt captures the most essential temporal information from the time series.",
    384       "evidence": "Fig. 5 shows attention visualization from GPT-2's final layer, where the last token (ΔT) directs highest attention to time series value segments. Fig. 4 ablation shows prompts with numeric last tokens outperform others.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "TimeCMA is more computationally efficient than other LLM-based methods, with ~97% less memory than Time-LLM.",
    389       "evidence": "Table 2 shows TimeCMA uses 821 MiB memory vs Time-LLM's 28,882 MiB, with 0.09 s/iter vs 1.08 s/iter speed, on ETTm1-96.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Prompt-based LLMs outperform time series-based LLMs with an average improvement of 16.1% in MSE over OFA.",
    394       "evidence": "Table 1 average results across datasets show TimeCMA outperforms OFA (time series-based LLM) consistently. The 16.1% figure is stated in the main results section.",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No error bars despite multiple runs",
    401       "detail": "The paper claims experiments are 'repeated at least three times with different seeds' but reports only point estimates in all tables. The variance across runs is never shown, making it impossible to assess whether performance differences are statistically meaningful."
    402     },
    403     {
    404       "flag": "No statistical significance testing",
    405       "detail": "Claims of outperformance are based entirely on comparing point estimates. With 8 methods × 8 datasets × 4 horizons, some wins could be due to random variation. No significance tests are applied to any comparison."
    406     },
    407     {
    408       "flag": "No limitations section",
    409       "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries. This is a notable omission for a venue like AAAI."
    410     },
    411     {
    412       "flag": "GPT-2 variant unspecified",
    413       "detail": "GPT-2 has four variants (124M to 1.5B parameters) with substantially different capabilities. The paper says only 'GPT-2' without specifying which variant, making reproduction uncertain."
    414     },
    415     {
    416       "flag": "Missing hyperparameters",
    417       "detail": "Critical training hyperparameters (learning rate, optimizer, epochs, hidden dimension, regularization weight λ, number of encoder/decoder layers) are not reported, hindering reproducibility."
    418     },
    419     {
    420       "flag": "Selective ablation datasets",
    421       "detail": "Ablation studies (Fig. 3) are shown only on FRED and ILI datasets, not all eight. This raises the question of whether ablation results are consistent across all datasets."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Time-LLM: Time series forecasting by reprogramming large language models",
    427       "authors": ["Ming Jin", "Shiyu Wang", "Lintao Ma", "Zhixuan Chu", "James Y. Zhang", "Xiaoming Shi", "Pin-Yu Chen", "Yuxuan Liang", "Yuan-Fang Li", "Shirui Pan", "Qingsong Wen"],
    428       "year": 2024,
    429       "relevance": "Key baseline and representative of prompt-based LLM methods for time series forecasting."
    430     },
    431     {
    432       "title": "One Fits All: Power General Time Series Analysis by Pretrained LM",
    433       "authors": ["Tian Zhou", "Peisong Niu", "Xue Wang", "Liang Sun", "Rong Jin"],
    434       "year": 2023,
    435       "relevance": "Baseline representing time-series-based LLM approaches using pre-trained language models."
    436     },
    437     {
    438       "title": "UniTime: A Language-Empowered Unified Model for Cross-Domain Time Series Forecasting",
    439       "authors": ["Xu Liu", "Junfeng Hu", "Yuan Li", "Shizhe Diao", "Yuxuan Liang", "Bryan Hooi", "Roger Zimmermann"],
    440       "year": 2024,
    441       "relevance": "Prompt-based LLM baseline that uses language to enhance cross-domain time series forecasting."
    442     },
    443     {
    444       "title": "Large Language Models Are Zero-Shot Time Series Forecasters",
    445       "authors": ["Nate Gruver", "Marc Finzi", "Shikai Qiu", "Andrew Gordon Wilson"],
    446       "year": 2023,
    447       "relevance": "Demonstrates LLMs can perform zero-shot time series forecasting, foundational to LLM-for-time-series research."
    448     },
    449     {
    450       "title": "Position Paper: What Can Large Language Models Tell Us about Time Series Analysis",
    451       "authors": ["Ming Jin", "Yifan Zhang", "Wei Chen", "Kexin Zhang", "Yuxuan Liang", "Bin Yang", "Jindong Wang", "Shirui Pan", "Qingsong Wen"],
    452       "year": 2024,
    453       "relevance": "Position paper surveying LLM capabilities for time series analysis, directly relevant to understanding LLM applications beyond NLP."
    454     },
    455     {
    456       "title": "GPT4MTS: Prompt-based Large Language Model for Multimodal Time-series Forecasting",
    457       "authors": ["Furong Jia", "Kevin Wang", "Yixiang Zheng", "Defu Cao", "Yan Liu"],
    458       "year": 2024,
    459       "relevance": "Prompt-based multimodal LLM approach for time series, directly related to the data entanglement issue TimeCMA addresses."
    460     },
    461     {
    462       "title": "TEMPO: Prompt-based Generative Pre-trained Transformer for Time Series Forecasting",
    463       "authors": ["Defu Cao", "Furong Jia", "Sercan O. Arik", "Tomas Pfister", "Yixiang Zheng", "Wen Ye", "Yan Liu"],
    464       "year": 2024,
    465       "relevance": "Prompt-based GPT approach for time series, representative of the class of methods TimeCMA improves upon."
    466     },
    467     {
    468       "title": "LLM2Vec: Large language models are secretly powerful text encoders",
    469       "authors": ["Parishad BehnamGhader", "Vaibhav Adlakha", "Marius Mosbach", "Dzmitry Bahdanau", "Nicolas Chapados", "Siva Reddy"],
    470       "year": 2024,
    471       "relevance": "Demonstrates LLMs as powerful encoders, supporting TimeCMA's use of frozen LLM embeddings for non-NLP tasks."
    472     },
    473     {
    474       "title": "S2IP-LLM: Semantic Space Informed Prompt Learning with LLM for Time Series Forecasting",
    475       "authors": ["Zijie Pan", "Yushan Jiang", "Sahil Garg", "Anderson Schneider", "Yuriy Nevmyvaka", "Dongjin Song"],
    476       "year": 2024,
    477       "relevance": "Prompt learning with LLM for time series, demonstrates semantic space alignment approach related to cross-modality alignment."
    478     },
    479     {
    480       "title": "TEST: Text Prototype Aligned Embedding to Activate LLM's Ability for Time Series",
    481       "authors": ["Chenxi Sun", "Yaliang Li", "Hongyan Li", "Shenda Hong"],
    482       "year": 2024,
    483       "relevance": "Uses text-aligned embeddings to activate LLM capabilities for time series, closely related to cross-modality alignment approach."
    484     },
    485     {
    486       "title": "Foundation models for time series analysis: A tutorial and survey",
    487       "authors": ["Yuxuan Liang", "Haomin Wen", "Yuqi Nie", "Yushan Jiang", "Ming Jin", "Dongjin Song", "Shirui Pan", "Qingsong Wen"],
    488       "year": 2024,
    489       "relevance": "Comprehensive survey of foundation models for time series, provides context for the growing LLM-for-time-series field."
    490     },
    491     {
    492       "title": "Not All Tokens Are What You Need for Pretraining",
    493       "authors": ["Zhenghao Lin", "Zhibin Gou", "Yeyun Gong", "Xiao Liu", "Yelong Shen", "Ruochen Xu", "Chen Lin", "Yujiu Yang", "Jian Jiao", "Nan Duan", "Weizhu Chen"],
    494       "year": 2024,
    495       "relevance": "Foundational work on token importance in LLMs that directly supports TimeCMA's last-token design."
    496     }
    497   ],
    498   "engagement_factors": {
    499     "practical_relevance": {
    500       "score": 1,
    501       "justification": "Code is released but the method is specialized for multivariate time series forecasting research, not immediately usable as a general-purpose tool."
    502     },
    503     "surprise_contrarian": {
    504       "score": 1,
    505       "justification": "The cross-modality alignment and last-token-only ideas are novel design choices but don't fundamentally challenge conventional wisdom about LLMs or time series."
    506     },
    507     "fear_safety": {
    508       "score": 0,
    509       "justification": "No AI safety, security, or risk implications. This is a standard time series forecasting architecture paper."
    510     },
    511     "drama_conflict": {
    512       "score": 0,
    513       "justification": "No controversy or provocative claims. Standard benchmark improvement paper."
    514     },
    515     "demo_ability": {
    516       "score": 1,
    517       "justification": "GitHub code is available but requires setup for time series forecasting experiments; no interactive demo or simple pip-install experience."
    518     },
    519     "brand_recognition": {
    520       "score": 1,
    521       "justification": "Published at AAAI (top venue) with SenseTime co-authorship, but the specific method and authors are not widely known."
    522     }
    523   }
    524 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs