scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28473B)
      1 {
      2   "paper": {
      3     "title": "TimeCMA: Towards LLM-Empowered Multivariate Time Series Forecasting via Cross-Modality Alignment",
      4     "authors": [
      5       "Chenxi Liu",
      6       "Qianxiong Xu",
      7       "Hao Miao",
      8       "Sun Yang",
      9       "Lingzheng Zhang",
     10       "Cheng Long",
     11       "Ziyue Li",
     12       "Rui Zhao"
     13     ],
     14     "year": 2024,
     15     "venue": "AAAI 2025",
     16     "arxiv_id": "2406.01638",
     17     "doi": "10.48550/arXiv.2406.01638"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "TimeCMA proposes a dual-modality encoding framework with cross-modality alignment that retrieves disentangled and robust time series embeddings from LLM prompt embeddings via channel-wise similarity. On 8 standard forecasting datasets, TimeCMA outperforms 7 baselines including Time-LLM and iTransformer on average MSE and MAE. A last-token embedding storage design reduces computational cost and memory usage compared to other LLM-based methods while maintaining competitive performance.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Code released at https://github.com/ChenxiLiu-HNU/TimeCMA, stated in the abstract."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "All 8 datasets (ETTm1, ETTm2, ETTh1, ETTh2, ECL, FRED-MD, ILI, Weather) are publicly available standard benchmarks with cited sources."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper mentions 'NVIDIA A100 GPUs' and 'GPT-2' but provides no requirements.txt, library versions, or detailed environment specification."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided in the paper. The reader would need to consult the GitHub repository."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Table 1 reports only point estimates for MSE and MAE across all datasets and horizons. No confidence intervals, error bars, or ± notation despite claiming experiments are repeated at least three times."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Claims of outperformance (e.g., 'TimeCMA outperforms all baselines in all cases') are based solely on comparing point estimates without any statistical significance tests."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Percentage improvements over baselines are reported: 'average improvement of 16.1% in MSE and 11.9% in MAE' vs OFA, '13.9% in MSE and 12.6% in MAE' vs UniTime, with baseline context from Table 1."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification for why 8 datasets were chosen or why the specific forecast horizons (96, 192, 336, 720) were selected. No power analysis."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Despite stating 'Each experiment is repeated at least three times with different seeds,' no standard deviation, variance, or any spread measure is reported in any table or figure."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Seven baseline models across five categories are compared: prompt-based LLMs (Time-LLM, UniTime), time series-based LLM (OFA), Transformer-based (iTransformer, PatchTST), linear (DLinear), CNN (TimesNet)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include recent models: iTransformer (ICLR 2024), Time-LLM (ICLR 2024), UniTime (WWW 2024), PatchTST (ICLR 2023), TimesNet (ICLR 2023). All are contemporary state-of-the-art."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Fig. 3 shows ablation of model components (w/o CMA, w/o LLM, w/o TSE, w/o PE, w/o MTD) on FRED and ILI datasets. Fig. 4 shows ablation of 5 different prompt designs."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Two evaluation metrics are used throughout: Mean Squared Error (MSE) and Mean Absolute Error (MAE)."
     93       },
     94       "human_evaluation": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "Human evaluation is irrelevant for time series forecasting accuracy, which is measured by MSE and MAE against ground truth values."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Standard benchmark datasets (ETT, ECL, Weather, ILI, FRED-MD) have predefined train/validation/test splits used in the time series forecasting community."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 1 provides per-dataset and per-forecast-horizon breakdowns (96, 192, 336, 720 for most datasets; 24, 36, 48, 60 for ILI and FRED) plus averages."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "No discussion of where TimeCMA fails or underperforms. All results are presented positively. Individual cells in Table 1 where TimeCMA is not best (e.g., ETTm1-336 where PatchTST ties) are not discussed."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "No negative results are reported. All ablations show that the full model is best. No mention of approaches that were tried and abandoned."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims 'outperforms state-of-the-arts,' which is supported by Table 1 showing TimeCMA achieves the best average MSE and MAE across all 8 datasets."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Causal claims like 'cross-modality alignment retrieves disentangled and robust time series embeddings' are supported by controlled ablation studies (Fig. 3) that remove individual components (w/o CMA, w/o LLM, w/o TSE, w/o PE, w/o MTD) and measure impact."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims broad applicability ('Towards LLM-Empowered Multivariate Time Series Forecasting') but experiments cover only 8 standard forecasting benchmarks in specific domains (energy, weather, illness, economics). No discussion of when the method might not generalize."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No alternative explanations for the results are discussed. For example, the improvements could stem from increased model capacity rather than the cross-modality alignment design, but this is not considered."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures MSE and MAE for forecasting and claims forecasting performance. The measurements directly match the claims — no proxy gap exists."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper states 'We selected GPT-2 as the LLM' without specifying which GPT-2 variant (gpt2, gpt2-medium, gpt2-large, gpt2-xl). The hidden dimension E is referenced but the model size is not stated."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Fig. 2 and Fig. 4 show the actual prompt structure: 'From t1 to tL, the values were v1, ..., vj every f. The total trend value was ΔT.' All placeholders are mathematically defined (Eq. 1 for ΔT, timestamps and values from input data)."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Only test batch size (1) and training batch size (8, for efficiency analysis only) are mentioned. Learning rate, optimizer, epochs, temperature, hidden dimension C, number of encoder layers, and other critical hyperparameters are not reported."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. TimeCMA is a standard deep learning model with fixed architecture components."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "Only minimal preprocessing is documented: 'We removed variables with missing values in the FRED-MD' and reversible instance normalization (Eq. 2-4). Train/validation/test split ratios, data normalization parameters, and other preprocessing details are not provided."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No dedicated limitations or threats-to-validity section exists. The paper has Introduction, Related Work, Preliminaries, Methodology, Experiments, Conclusion, and Acknowledgments — no limitations discussion."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No specific threats to validity are discussed anywhere in the paper."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do not show or what settings are excluded."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "All 8 datasets are publicly available standard benchmarks with cited sources: ETT datasets (Zeng et al. 2023), ECL (Asuncion and Newman 2007), FRED-MD (McCracken and Ng 2016), ILI and Weather (Wu et al. 2021)."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Datasets are described by name with citations to original sources. The datasets section identifies them and their properties (e.g., variable counts, domains)."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. All data comes from standard public benchmarks."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "The pipeline from raw benchmark data to model input is not documented. Only the FRED-MD variable removal and the inverted embedding transformation are described; train/test splitting, sequence windowing, and other pipeline steps are omitted."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Acknowledgments state: 'This study is supported under the RIE2020 Industry Alignment Fund – Industry Collaboration Projects (IAF-ICP) Funding Initiative, as well as cash and in-kind contributions from the industry partner(s).'"
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All author affiliations are listed: S-Lab NTU, Aalborg University, Peking University, HKUST (Guangzhou), University of Cologne, and SenseTime Research."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The industry partner providing 'cash and in-kind contributions' is unnamed, making independence impossible to verify. One author is from SenseTime Research, which may have commercial interest in time series forecasting tools."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is provided in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "GPT-2 is used as a frozen feature extractor for time series data, not evaluated on NLP benchmarks. The time series benchmarks (ETT, ECL, Weather, etc.) are numerical data that would not be in GPT-2's text training corpus. Contamination in the traditional sense does not apply."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Same rationale: the pre-trained model (GPT-2) processes text-wrapped time series, not its own training domain. The benchmarks are numerical time series, not text data GPT-2 was trained on."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "GPT-2 was trained on web text; the evaluation benchmarks are numerical time series datasets. There is essentially zero contamination risk from GPT-2 having memorized time series benchmark answers."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Table 2 reports inference speed (seconds), memory usage (MB), and parameter count (M) for TimeCMA vs. LLM-based baselines on ETTm1-96 and ETTm2-96."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "Hardware is mentioned ('NVIDIA A100 GPUs') but total training time, GPU hours, or computational budget for the full experimental campaign are not stated."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper states 'Each experiment is repeated at least three times with different seeds' but reports no per-seed results, standard deviations, or any measure of seed sensitivity."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "The paper states: 'Each experiment is repeated at least three times with different seeds on NVIDIA A100 GPUs.'"
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No mention of hyperparameter search strategy, number of configurations tried, or search budget."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No explanation of how the final configuration was selected. The prompt ablation (Fig. 4) shows Prompt 5 is best, but it is unclear how other hyperparameters were chosen."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite comparisons across 8 datasets × 4 horizons × 7 baselines."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Authors compare TimeCMA against baselines without acknowledging potential bias from implementing and tuning their own system more carefully than baselines."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Table 2 reports efficiency metrics separately from accuracy. Performance is not shown as a function of compute budget, and baselines are not compared at matched compute levels."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No discussion of whether MSE/MAE on these standard benchmarks actually measures meaningful forecasting capability or whether these benchmarks have known limitations."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No agentic scaffolding is used. TimeCMA is a standard neural architecture."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of temporal leakage. While the time series datasets use temporal splits by convention, the paper does not explicitly address whether future information leaks into training."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether any features contain information unavailable at prediction time."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of potential non-independence between train and test windows, which is a known issue in sliding-window time series evaluation."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention methods are discussed or applied."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "TimeCMA outperforms all baselines on average across all 8 datasets in both MSE and MAE.",
    374       "evidence": "Table 1 shows TimeCMA achieves the best average MSE and MAE on all 8 datasets, including against prompt-based LLMs (Time-LLM, UniTime), time series-based LLM (OFA), Transformer-based (iTransformer, PatchTST), linear (DLinear), and CNN (TimesNet) methods.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Cross-modality alignment via similarity-based retrieval is superior to simple concatenation for combining time series and LLM embeddings.",
    379       "evidence": "Fig. 3 ablation study shows w/o CMA (replacing alignment with concatenation) causes the most significant performance degradation on FRED and ILI datasets.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "The last token embedding captures the most essential temporal information from the prompt.",
    384       "evidence": "Fig. 5 attention visualization shows the last token's attention is directed primarily toward time series values. Table 2 shows this design reduces computational cost (0.09s vs 1.08s for Time-LLM, 821MB vs 28,882MB memory).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Prompt-based LLMs outperform time series-based LLMs for forecasting, with TimeCMA showing 16.1% MSE improvement over OFA.",
    389       "evidence": "Table 1 comparison of TimeCMA vs OFA across all datasets. Only one time series-based LLM (OFA) is compared.",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "The Transformer and LLM capture complementary information — local variable-specific vs. global shared dependencies.",
    394       "evidence": "Fig. 6 attention map visualization on ETTh1 shows Transformer attention is local/variable-specific while LLM attention captures broader variable relationships.",
    395       "supported": "weak"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No error bars despite multiple runs",
    401       "detail": "The paper states experiments are repeated at least 3 times with different seeds but reports no variance information. This hides result stability — the claimed improvements could be within the noise margin."
    402     },
    403     {
    404       "flag": "Claims of superiority without statistical tests",
    405       "detail": "All 'outperforms' claims (e.g., 'outperforms all baselines in all cases') are based on comparing point estimates. Without significance tests, many of the small differences in Table 1 may not be statistically meaningful."
    406     },
    407     {
    408       "flag": "No limitations section",
    409       "detail": "The paper has no discussion of limitations, threats to validity, or scope boundaries. This is a significant omission for any empirical paper."
    410     },
    411     {
    412       "flag": "Unnamed industry partner",
    413       "detail": "The acknowledgments reference 'cash and in-kind contributions from the industry partner(s)' without naming them. This prevents assessment of potential conflicts of interest."
    414     },
    415     {
    416       "flag": "Ablation study on only 2 of 8 datasets",
    417       "detail": "The ablation studies (Fig. 3) are shown only on FRED and ILI datasets. The component contributions may differ on other datasets, but this is not tested."
    418     },
    419     {
    420       "flag": "GPT-2 variant not specified",
    421       "detail": "The paper uses 'GPT-2' without specifying which variant (base 124M, medium 355M, large 774M, or XL 1.5B). Different sizes would significantly affect results and efficiency claims."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Time-LLM: Time series forecasting by reprogramming large language models",
    427       "authors": ["Ming Jin", "Shiyu Wang", "Lintao Ma", "Zhixuan Chu", "James Y. Zhang", "Xiaoming Shi", "Pin-Yu Chen", "Yuxuan Liang", "Yuan-Fang Li", "Shirui Pan", "Qingsong Wen"],
    428       "year": 2024,
    429       "relevance": "Key LLM-for-time-series baseline that reprograms LLMs via prompting for forecasting tasks."
    430     },
    431     {
    432       "title": "UniTime: A Language-Empowered Unified Model for Cross-Domain Time Series Forecasting",
    433       "authors": ["Xu Liu", "Junfeng Hu", "Yuan Li", "Shizhe Diao", "Yuxuan Liang", "Bryan Hooi", "Roger Zimmermann"],
    434       "year": 2024,
    435       "relevance": "Language-empowered time series model that contextualizes prompts with time series for cross-domain forecasting."
    436     },
    437     {
    438       "title": "One Fits All: Power General Time Series Analysis by Pretrained LM",
    439       "authors": ["Tian Zhou", "Peisong Niu", "Xue Wang", "Liang Sun", "Rong Jin"],
    440       "year": 2023,
    441       "relevance": "Foundational work on using pre-trained language models for general time series analysis by replacing the tokenizer."
    442     },
    443     {
    444       "title": "Position Paper: What Can Large Language Models Tell Us about Time Series Analysis",
    445       "authors": ["Ming Jin", "Yifan Zhang", "Wei Chen", "Kexin Zhang", "Yuxuan Liang", "Bin Yang", "Jindong Wang", "Shirui Pan", "Qingsong Wen"],
    446       "year": 2024,
    447       "relevance": "Position paper examining the role and capabilities of LLMs for time series analysis."
    448     },
    449     {
    450       "title": "iTransformer: Inverted Transformers Are Effective for Time Series Forecasting",
    451       "authors": ["Yong Liu", "Tengge Hu", "Haoran Zhang", "Haixu Wu", "Shiyu Wang", "Lintao Ma", "Mingsheng Long"],
    452       "year": 2023,
    453       "relevance": "Introduces inverted embedding treating variables as tokens, a core design adopted by TimeCMA."
    454     },
    455     {
    456       "title": "A Time Series is Worth 64 Words: Long-term Forecasting with Transformers",
    457       "authors": ["Yuqi Nie", "Nam H. Nguyen", "Phanwadee Sinthong", "Jayant Kalagnanam"],
    458       "year": 2023,
    459       "relevance": "PatchTST baseline for Transformer-based time series forecasting with channel-independent design."
    460     },
    461     {
    462       "title": "TEMPO: Prompt-based Generative Pre-trained Transformer for Time Series Forecasting",
    463       "authors": ["Defu Cao", "Furong Jia", "Sercan O. Arik", "Tomas Pfister", "Yixiang Zheng", "Wen Ye", "Yan Liu"],
    464       "year": 2024,
    465       "relevance": "Prompt-based generative pre-trained Transformer approach for time series forecasting."
    466     },
    467     {
    468       "title": "Foundation models for time series analysis: A tutorial and survey",
    469       "authors": ["Yuxuan Liang", "Haomin Wen", "Yuqi Nie", "Yushan Jiang", "Ming Jin", "Dongjin Song", "Shirui Pan", "Qingsong Wen"],
    470       "year": 2024,
    471       "relevance": "Survey of foundation models for time series analysis, providing context for the LLM-for-time-series research direction."
    472     },
    473     {
    474       "title": "Large Language Models Are Zero-Shot Time Series Forecasters",
    475       "authors": ["Nate Gruver", "Marc Finzi", "Shikai Qiu", "Andrew Gordon Wilson"],
    476       "year": 2023,
    477       "relevance": "Demonstrates LLMs can perform zero-shot time series forecasting, motivating the use of LLMs in this domain."
    478     },
    479     {
    480       "title": "LLM2Vec: Large language models are secretly powerful text encoders",
    481       "authors": ["Parishad BehnamGhader", "Vaibhav Adlakha", "Marius Mosbach", "Dzmitry Bahdanau", "Nicolas Chapados", "Siva Reddy"],
    482       "year": 2024,
    483       "relevance": "Shows LLMs can serve as powerful encoders, supporting the use of frozen LLMs for embedding extraction in TimeCMA."
    484     },
    485     {
    486       "title": "GPT4MTS: Prompt-based Large Language Model for Multimodal Time-series Forecasting",
    487       "authors": ["Furong Jia", "Kevin Wang", "Yixiang Zheng", "Defu Cao", "Yan Liu"],
    488       "year": 2024,
    489       "relevance": "Prompt-based LLM approach for multimodal time series forecasting, directly related to the data entanglement problem TimeCMA addresses."
    490     },
    491     {
    492       "title": "S2IP-LLM: Semantic Space Informed Prompt Learning with LLM for Time Series Forecasting",
    493       "authors": ["Zijie Pan", "Yushan Jiang", "Sahil Garg", "Anderson Schneider", "Yuriy Nevmyvaka", "Dongjin Song"],
    494       "year": 2024,
    495       "relevance": "Semantic space-informed prompt learning for LLM-based time series forecasting, exploring prompt design for time series."
    496     },
    497     {
    498       "title": "TFB: Towards Comprehensive and Fair Benchmarking of Time Series Forecasting Methods",
    499       "authors": ["Xiangfei Qiu", "Jilin Hu", "Lekui Zhou", "Xingjian Wu", "Junyang Du", "Buang Zhang", "Chenjuan Guo", "Aoying Zhou", "Christian S. Jensen", "Zhenli Sheng", "Bin Yang"],
    500       "year": 2024,
    501       "relevance": "Benchmarking framework for fair evaluation of time series forecasting methods, relevant to evaluation methodology."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 2,
    507       "justification": "Provides a usable time series forecasting framework with code released; practitioners working on multivariate forecasting could adopt the approach."
    508     },
    509     "surprise_contrarian": {
    510       "score": 1,
    511       "justification": "Identifies the data entanglement issue in existing LLM-for-time-series methods, but the solution (cross-modality alignment) follows naturally from the diagnosis."
    512     },
    513     "fear_safety": {
    514       "score": 0,
    515       "justification": "No AI safety, security, or risk concerns raised by this work."
    516     },
    517     "drama_conflict": {
    518       "score": 0,
    519       "justification": "No controversy or conflict with other work or claims."
    520     },
    521     "demo_ability": {
    522       "score": 2,
    523       "justification": "Code released on GitHub; researchers can reproduce experiments on standard datasets with available code."
    524     },
    525     "brand_recognition": {
    526       "score": 1,
    527       "justification": "From NTU S-Lab and SenseTime Research — recognized in the ML/CV community but not household names."
    528     }
    529   }
    530 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs