scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17829B)
      1 {
      2   "paper": {
      3     "title": "Context-Alignment: Activating and Enhancing LLM Capabilities in Time Series",
      4     "authors": ["Yuxiao Hu", "Qian Li", "Dongxiao Zhang", "Jinyue Yan", "Yuntian Chen"],
      5     "year": 2025,
      6     "venue": "ICLR 2025",
      7     "arxiv_id": "2501.03747"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/tokaka22/ICLR25-FSCA, mentioned in the abstract."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All datasets used (ETT, M4, UEA, Weather, Electricity, Traffic, ILI) are publicly available standard benchmarks."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions PyTorch and NVIDIA H800/RTX 4090 GPUs (Appendix A.1) but does not provide a requirements.txt, Dockerfile, or detailed library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README-level instructions are described in the paper itself."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates (e.g., MSE values) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims FSCA 'outperforms' and 'surpasses' baselines based solely on comparing numbers without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Percentage improvements with baseline context are reported, e.g., '3.1% MSE reduction over PatchTST', '7.3%, 12.2%, and 16.6%' over LLM-based methods (Sec. 4.2)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for why these specific datasets or dataset sizes were chosen, beyond following prior-work conventions."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or multi-run statistics are reported. Results appear to be single-run numbers."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Extensive baselines included: GPT4TS, Time-LLM, S2IP-LLM, iTransformer, PatchTST, DLinear, TimesNet, FEDformer, ETSformer, N-HiTS, N-BEATS (Sec. 4)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include contemporary methods such as iTransformer (2023), S2IP-LLM (2024), Time-LLM (2024), and PatchTST (2022), which are recent and competitive."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Comprehensive ablation study in Sec. 4.7 and Table 6: removes dual-scale GNNs, coarse-grained branch, varies layer count and insertion positions."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "MSE and MAE for long-term/few-shot/zero-shot forecasting; SMAPE, MASE, and OWA for short-term; accuracy for classification."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation is not relevant for time series forecasting/classification tasks evaluated by automated metrics."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Standard train/test splits from established benchmarks are used. Zero-shot evaluation trains on one dataset and tests on another (Sec. 4.5)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results broken down per dataset (8 datasets for long-term, 6 M4 subsets for short-term, 10 UEA datasets for classification) and per prediction horizon."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No failure cases or error analysis is discussed. The paper only presents cases where FSCA succeeds."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Ablation study reports configurations that hurt performance: random initialization (A.2), removing coarse-grained branch (B.1), excessive layers (C.5, C.6) causing overfitting."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of effectiveness across tasks, especially few-shot and zero-shot forecasting, are supported by Tables 2-6 showing consistent improvements."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims like 'Context-Alignment activates LLM capabilities' are supported by ablation studies (Table 6) showing controlled single-variable manipulation (removing GNNs, coarse branch, etc.)."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title and abstract claim general 'LLM Capabilities in Time Series' but results are exclusively with GPT-2 as the backbone. No other LLMs are tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations for the improvements are discussed. The paper does not consider whether gains could come from added parameters, GNN architecture itself independent of alignment, or other confounds."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'GPT-2' without specifying which version/size (small, medium, large, xl). Appendix A.1 mentions 'pre-trained models from Wolf et al. [2020]' but no specific model variant."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Actual prompt text is provided: 'Predict future sequences using previous data:' (Sec. 3.2) and 'Predict category (x in total) using previous data:' (Appendix B.1)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix A.1 reports Adam optimizer, decay rates (0.9, 0.999), learning rates, cosine annealing schedule (Tmax=20, eta_min=10^-8), batch size 256, N=2, early stopping, loss functions."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a standard deep learning method."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Sec. 3.1 describes patching with sliding window of size p and stride s. Dataset details including dimensions, lengths, and frequencies are in Appendix A.2 Tables 7-9."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statements about what the results do not show. The paper does not acknowledge that results are limited to GPT-2 or specific dataset types."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "All datasets used are publicly available standard benchmarks (ETT, M4, UEA, Weather, Electricity, Traffic, ILI)."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Dataset details documented in Appendix A.2 with Tables 7-9 describing length, dimensions, frequency, and sources for all datasets."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; all data comes from standard benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from raw time series to patches to embeddings is documented in Sec. 3.1 (Token Embedding) and the experimental setup follows established protocols from Wu et al. [2022]."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Sec. 6 (Acknowledgement) lists NSFC Grant No. 62106116, China Meteorological Administration grant, National Key R&D Program, and Ningbo Major Science and Technology Projects."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations listed: Hong Kong Polytechnic University, Eastern Institute of Technology Ningbo, Shanghai Jiao Tong University."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funders are government research agencies (NSFC, China Meteorological Administration, National Key R&D Program) with no financial stake in the specific outcomes."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper uses GPT-2 as a backbone with frozen/fine-tuned layers for time series tasks, not evaluating the LLM's pretrained knowledge on a benchmark. Contamination of text training data is not relevant to time series evaluation."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Time series benchmarks are numerical data, not subject to LLM text training data contamination. Standard train/test splits are used."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The evaluation is on time series numerical data, not text benchmarks. LLM text contamination is not applicable."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or token consumption is reported, despite the method adding GNN computation on top of LLM inference."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is mentioned (H800, RTX 4090) but no total GPU hours, training time, or computational budget is stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "FSCA reduces average MSE by 3.1% over PatchTST and outperforms LLM-based methods (S2IP-LLM, Time-LLM, GPT4TS) by 7.3%, 12.2%, and 16.6% on long-term forecasting.",
    286       "evidence": "Table 2 (Sec. 4.2) shows MSE/MAE across 8 datasets and multiple horizons.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "FSCA achieves 13.3% improvement over PatchTST in zero-shot forecasting.",
    291       "evidence": "Table 5 (Sec. 4.5) shows cross-domain transfer results on ETT datasets.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "FSCA achieves 6.7% MSE reduction over S2IP-LLM in few-shot forecasting with 5% training data.",
    296       "evidence": "Table 4 (Sec. 4.4) with results on ETT datasets.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Context-Alignment paradigm is the first to propose context-level alignment between time series and language.",
    301       "evidence": "Stated in contributions (Sec. 1) and related work (Sec. 2.2), contrasting with token-level alignment methods.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "FSCA achieves 76.4% average accuracy on UEA classification, a 2.4% increase over the next best model.",
    306       "evidence": "Figure 2 (Sec. 4.6) shows average accuracy across 10 UEA datasets.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The paper proposes Context-Alignment, a paradigm that aligns time series data with linguistic components via Dual-Scale Context-Alignment GNNs (structural and logical alignment) to activate LLM capabilities for time series tasks. FSCA, which integrates few-shot prompting with this framework, achieves consistent improvements over baselines across long-term, short-term, few-shot, and zero-shot forecasting on standard benchmarks, with particularly strong gains in data-scarce settings. Ablation studies confirm that both dual-scale structure and logical alignment via directed edges contribute to performance.",
    312   "red_flags": [
    313     {
    314       "flag": "No variance or multi-run statistics",
    315       "detail": "All results are single-run point estimates with no error bars, standard deviations, or confidence intervals, making it impossible to assess whether differences are statistically meaningful."
    316     },
    317     {
    318       "flag": "Single LLM backbone",
    319       "detail": "All experiments use GPT-2 only, but the paper claims general 'LLM Capabilities in Time Series' without testing on any other LLM."
    320     },
    321     {
    322       "flag": "No limitations section",
    323       "detail": "The paper has no limitations or threats-to-validity discussion despite several obvious limitations (single LLM, no statistical testing, no cost analysis)."
    324     },
    325     {
    326       "flag": "No significance testing",
    327       "detail": "Claims of 'outperforming' and 'surpassing' are based solely on comparing point estimates without any statistical tests."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Time-LLM: Time Series Forecasting by Reprogramming Large Language Models",
    333       "authors": ["Ming Jin", "Shiyu Wang", "Lintao Ma"],
    334       "year": 2024,
    335       "relevance": "Key baseline for LLM-based time series forecasting using reprogramming approach."
    336     },
    337     {
    338       "title": "One Fits All: Power General Time Series Analysis by Pretrained LM",
    339       "authors": ["Tian Zhou", "Peisong Niu", "Liang Sun", "Rong Jin"],
    340       "year": 2023,
    341       "relevance": "GPT4TS baseline demonstrating frozen pretrained transformers for time series analysis."
    342     },
    343     {
    344       "title": "S2IP-LLM: Semantic Space Informed Prompt Learning with LLM for Time Series Forecasting",
    345       "authors": ["Zijie Pan", "Yushan Jiang", "Sahil Garg"],
    346       "year": 2024,
    347       "relevance": "Key baseline for semantic space alignment between LLMs and time series."
    348     },
    349     {
    350       "title": "TEST: Text Prototype Aligned Embedding to Activate LLM's Ability for Time Series",
    351       "authors": ["Chenxi Sun", "Hongyan Li", "Yaliang Li", "Shenda Hong"],
    352       "year": 2024,
    353       "relevance": "Token-level alignment approach for activating LLMs on time series tasks."
    354     },
    355     {
    356       "title": "TEMPO: Prompt-based Generative Pre-trained Transformer for Time Series Forecasting",
    357       "authors": ["Defu Cao", "Furong Jia", "Sercan O Arik"],
    358       "year": 2024,
    359       "relevance": "Prompt-based approach for LLM time series forecasting using decomposition."
    360     },
    361     {
    362       "title": "Language Models are Few-Shot Learners",
    363       "authors": ["Tom B Brown"],
    364       "year": 2020,
    365       "arxiv_id": "2005.14165",
    366       "relevance": "Foundation work on few-shot prompting that inspired the FSCA approach."
    367     },
    368     {
    369       "title": "UniTS: Building a Unified Time Series Model",
    370       "authors": ["Shanghua Gao", "Teddy Koker", "Owen Queen"],
    371       "year": 2024,
    372       "arxiv_id": "2403.00131",
    373       "relevance": "Unified time series model relevant to LLM-based time series analysis landscape."
    374     }
    375   ]
    376 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs