scan.json (17829B)
1 { 2 "paper": { 3 "title": "Context-Alignment: Activating and Enhancing LLM Capabilities in Time Series", 4 "authors": ["Yuxiao Hu", "Qian Li", "Dongxiao Zhang", "Jinyue Yan", "Yuntian Chen"], 5 "year": 2025, 6 "venue": "ICLR 2025", 7 "arxiv_id": "2501.03747" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repository provided: https://github.com/tokaka22/ICLR25-FSCA, mentioned in the abstract." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "All datasets used (ETT, M4, UEA, Weather, Electricity, Traffic, ILI) are publicly available standard benchmarks." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions PyTorch and NVIDIA H800/RTX 4090 GPUs (Appendix A.1) but does not provide a requirements.txt, Dockerfile, or detailed library versions." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README-level instructions are described in the paper itself." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results are reported as point estimates (e.g., MSE values) with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims FSCA 'outperforms' and 'surpasses' baselines based solely on comparing numbers without any statistical significance tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Percentage improvements with baseline context are reported, e.g., '3.1% MSE reduction over PatchTST', '7.3%, 12.2%, and 16.6%' over LLM-based methods (Sec. 4.2)." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for why these specific datasets or dataset sizes were chosen beyond following prior work conventions." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or multi-run statistics are reported. Results appear to be single-run numbers." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Extensive baselines included: GPT4TS, Time-LLM, S2IP-LLM, iTransformer, PatchTST, DLinear, TimesNet, FEDformer, ETSformer, N-HiTS, N-BEATS (Sec. 4)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include contemporary methods like iTransformer (2023), S2IP-LLM (2024), Time-LLM (2024), PatchTST (2022), which are recent and competitive." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Comprehensive ablation study in Sec. 4.7 and Table 6: removes dual-scale GNNs, coarse-grained branch, varies layer count and insertion positions." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "MSE and MAE for long-term/few-shot/zero-shot forecasting; SMAPE, MASE, and OWA for short-term; accuracy for classification." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "Human evaluation is not relevant for time series forecasting/classification tasks evaluated by automated metrics." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Standard train/test splits from established benchmarks are used. Zero-shot evaluation trains on one dataset and tests on another (Sec. 4.5)." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results broken down per dataset (8 datasets for long-term, 6 M4 subsets for short-term, 10 UEA datasets for classification) and per prediction horizon." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No failure cases or error analysis is discussed. The paper only presents cases where FSCA succeeds." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Ablation study reports configurations that hurt performance: random initialization (A.2), removing coarse-grained branch (B.1), excessive layers (C.5, C.6) causing overfitting." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of effectiveness across tasks, especially few-shot and zero-shot forecasting, are supported by Tables 2-6 showing consistent improvements." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims like 'Context-Alignment activates LLM capabilities' are supported by ablation studies (Table 6) showing controlled single-variable manipulation (removing GNNs, coarse branch, etc.)." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title and abstract claim general 'LLM Capabilities in Time Series' but results are exclusively with GPT-2 as the backbone. No other LLMs are tested." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations for the improvements are discussed. The paper does not consider whether gains could come from added parameters, GNN architecture itself independent of alignment, or other confounds." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper uses 'GPT-2' without specifying which version/size (small, medium, large, xl). Appendix A.1 mentions 'pre-trained models from Wolf et al. [2020]' but no specific model variant." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Actual prompt text is provided: 'Predict future sequences using previous data:' (Sec. 3.2) and 'Predict category (x in total) using previous data:' (Appendix B.1)." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix A.1 reports Adam optimizer, decay rates (0.9, 0.999), learning rates, cosine annealing schedule (Tmax=20, eta_min=10^-8), batch size 256, N=2, early stopping, loss functions." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. This is a standard deep learning method." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Sec. 3.1 describes patching with sliding window of size p and stride s. Dataset details including dimensions, lengths, and frequencies are in Appendix A.2 Tables 7-9." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "No dedicated limitations or threats-to-validity section exists in the paper." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No threats to validity are discussed anywhere in the paper." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit statements about what the results do not show. The paper does not acknowledge that results are limited to GPT-2 or specific dataset types." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "All datasets used are publicly available standard benchmarks (ETT, M4, UEA, Weather, Electricity, Traffic, ILI)." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Dataset details documented in Appendix A.2 with Tables 7-9 describing length, dimensions, frequency, and sources for all datasets." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; all data comes from standard benchmarks." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline from raw time series to patches to embeddings is documented in Sec. 3.1 (Token Embedding) and the experimental setup follows established protocols from Wu et al. [2022]." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Sec. 6 (Acknowledgement) lists NSFC Grant No. 62106116, China Meteorological Administration grant, National Key R&D Program, and Ningbo Major Science and Technology Projects." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations listed: Hong Kong Polytechnic University, Eastern Institute of Technology Ningbo, Shanghai Jiao Tong University." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funders are government research agencies (NSFC, China Meteorological Administration, National Key R&D Program) with no financial stake in the specific outcomes." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "The paper uses GPT-2 as a backbone with frozen/fine-tuned layers for time series tasks, not evaluating the LLM's pretrained knowledge on a benchmark. Contamination of text training data is not relevant to time series evaluation." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Time series benchmarks are numerical data, not subject to LLM text training data contamination. Standard train/test splits are used." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "The evaluation is on time series numerical data, not text benchmarks. LLM text contamination is not applicable." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, latency, or tokens consumed are reported despite the method adding GNN computation on top of LLM inference." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "Hardware is mentioned (H800, RTX 4090) but no total GPU hours, training time, or computational budget is stated." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "FSCA reduces average MSE by 3.1% over PatchTST and outperforms LLM-based methods (S2IP-LLM, Time-LLM, GPT4TS) by 7.3%, 12.2%, and 16.6% on long-term forecasting.", 286 "evidence": "Table 2 (Sec. 4.2) shows MSE/MAE across 8 datasets and multiple horizons.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "FSCA achieves 13.3% improvement over PatchTST in zero-shot forecasting.", 291 "evidence": "Table 5 (Sec. 4.5) shows cross-domain transfer results on ETT datasets.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "FSCA achieves 6.7% MSE reduction over S2IP-LLM in few-shot forecasting with 5% training data.", 296 "evidence": "Table 4 (Sec. 4.4) with results on ETT datasets.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Context-Alignment paradigm is the first to propose context-level alignment between time series and language.", 301 "evidence": "Stated in contributions (Sec. 1) and related work (Sec. 2.2), contrasting with token-level alignment methods.", 302 "supported": "weak" 303 }, 304 { 305 "claim": "FSCA achieves 76.4% average accuracy on UEA classification, a 2.4% increase over the next best model.", 306 "evidence": "Figure 2 (Sec. 4.6) shows average accuracy across 10 UEA datasets.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "The paper proposes Context-Alignment, a paradigm that aligns time series data with linguistic components via Dual-Scale Context-Alignment GNNs (structural and logical alignment) to activate LLM capabilities for time series tasks. FSCA, which integrates few-shot prompting with this framework, achieves consistent improvements over baselines across long-term, short-term, few-shot, and zero-shot forecasting on standard benchmarks, with particularly strong gains in data-scarce settings. Ablation studies confirm that both dual-scale structure and logical alignment via directed edges contribute to performance.", 312 "red_flags": [ 313 { 314 "flag": "No variance or multi-run statistics", 315 "detail": "All results are single-run point estimates with no error bars, standard deviations, or confidence intervals, making it impossible to assess whether differences are statistically meaningful." 316 }, 317 { 318 "flag": "Single LLM backbone", 319 "detail": "All experiments use GPT-2 only, but the paper claims general 'LLM Capabilities in Time Series' without testing on any other LLM." 320 }, 321 { 322 "flag": "No limitations section", 323 "detail": "The paper has no limitations or threats-to-validity discussion despite several obvious limitations (single LLM, no statistical testing, no cost analysis)." 324 }, 325 { 326 "flag": "No significance testing", 327 "detail": "Claims of 'outperforming' and 'surpassing' are based solely on comparing point estimates without any statistical tests." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Time-LLM: Time Series Forecasting by Reprogramming Large Language Models", 333 "authors": ["Ming Jin", "Shiyu Wang", "Lintao Ma"], 334 "year": 2024, 335 "relevance": "Key baseline for LLM-based time series forecasting using reprogramming approach." 336 }, 337 { 338 "title": "One Fits All: Power General Time Series Analysis by Pretrained LM", 339 "authors": ["Tian Zhou", "Peisong Niu", "Liang Sun", "Rong Jin"], 340 "year": 2023, 341 "relevance": "GPT4TS baseline demonstrating frozen pretrained transformers for time series analysis." 342 }, 343 { 344 "title": "S2IP-LLM: Semantic Space Informed Prompt Learning with LLM for Time Series Forecasting", 345 "authors": ["Zijie Pan", "Yushan Jiang", "Sahil Garg"], 346 "year": 2024, 347 "relevance": "Key baseline for semantic space alignment between LLMs and time series." 348 }, 349 { 350 "title": "TEST: Text Prototype Aligned Embedding to Activate LLM's Ability for Time Series", 351 "authors": ["Chenxi Sun", "Hongyan Li", "Yaliang Li", "Shenda Hong"], 352 "year": 2024, 353 "relevance": "Token-level alignment approach for activating LLMs on time series tasks." 354 }, 355 { 356 "title": "TEMPO: Prompt-based Generative Pre-trained Transformer for Time Series Forecasting", 357 "authors": ["Defu Cao", "Furong Jia", "Sercan O Arik"], 358 "year": 2024, 359 "relevance": "Prompt-based approach for LLM time series forecasting using decomposition." 360 }, 361 { 362 "title": "Language Models are Few-Shot Learners", 363 "authors": ["Tom B Brown"], 364 "year": 2020, 365 "arxiv_id": "2005.14165", 366 "relevance": "Foundation work on few-shot prompting that inspired the FSCA approach." 367 }, 368 { 369 "title": "UniTS: Building a Unified Time Series Model", 370 "authors": ["Shanghua Gao", "Teddy Koker", "Owen Queen"], 371 "year": 2024, 372 "arxiv_id": "2403.00131", 373 "relevance": "Unified time series model relevant to LLM-based time series analysis landscape." 374 } 375 ] 376 }