scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29917B)
      1 {
      2   "paper": {
      3     "title": "LLMs as Zero-shot Graph Learners: Alignment of GNN Representations with LLM Token Embeddings",
      4     "authors": [
      5       "Duo Wang",
      6       "Yuan Zuo",
      7       "Fengzhi Li",
      8       "Junjie Wu"
      9     ],
     10     "year": 2024,
     11     "venue": "Neural Information Processing Systems (NeurIPS 2024)",
     12     "arxiv_id": "2408.14512",
     13     "doi": "10.48550/arXiv.2408.14512"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "TEA-GLM aligns GNN representations with LLM token embeddings via feature-wise contrastive learning with PCA-derived principal components, enabling cross-dataset and cross-task zero-shot graph learning. The framework outperforms existing LLM-as-predictor methods (GraphGPT, LLaGA) on unseen citation and e-commerce datasets for both node classification and link prediction, achieving 0.848 accuracy on Pubmed (vs. 0.793 for LLaGA) and 0.689 AUC on Pubmed link prediction (vs. 0.569 for LLaGA). Ablation studies show both feature-wise contrastive learning and graph token embeddings are essential for cross-task transfer, with the feature-wise constraint particularly important for unseen tasks.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states in the abstract: 'Our code is available at https://github.com/W-rudder/TEA-GLM.' A concrete repository URL is provided."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All datasets used are publicly available standard benchmarks: Arxiv from Open Graph Benchmark, Pubmed, Cora, and e-commerce datasets from the TAG benchmark. The paper references specific public dataset sources and provides scripts for data splits."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions '2 NVIDIA A100 GPUs with 80GB memory each, using CUDA version 11.7' (Section 3.1) but does not provide a requirements.txt, Dockerfile, or detailed list of library versions sufficient to recreate the environment."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "While code is released on GitHub, the paper itself does not contain step-by-step reproduction instructions. No 'Reproducing Results' section or specific commands are provided in the paper."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Tables 1, 2, 5, and 6 report results with ± notation (e.g., '0.848±0.010'), representing standard deviation across runs. This serves as error bars on the main results."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims TEA-GLM 'outperforms all state-of-the-art models' and 'significantly outperforms' methods, but no statistical significance tests (t-tests, Mann-Whitney, etc.) are reported. Comparisons are based solely on comparing mean values."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Results are reported as absolute accuracy/F1/AUC values with baseline context in Tables 1, 2, 5, and 6, allowing readers to compute percentage improvements. For example, TEA-GLM achieves 0.848 vs. LLaGA's 0.793 on Pubmed."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for why 5 random seeds were chosen, and no power analysis is provided. The choice of 5 seeds appears to follow convention rather than any stated rationale."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states 'we conduct five experiments with random seed values ranging from 0 to 4 and report the mean and standard deviation of the results' (Section 3.1). Standard deviations are reported in all main result tables."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Comprehensive baselines are included across multiple categories: MLP, supervised GNNs (GCN, GraphSAGE, GAT), self-supervised (DGI), knowledge distillation (GKD, GLNN), transformers (NodeFormer, DIFFormer), transfer methods (OFA), and LLM-based methods (Vicuna-7B, GraphGPT, LLaGA)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include contemporary methods: LLaGA (2024), GraphGPT (2023), OFA (2024), DIFFormer (2023), and Vicuna-7B-v1.5 (2023). These represent recent state of the art for graph zero-shot learning."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 3.4 presents an ablation study removing two key components: feature-wise contrastive learning ('w/o FC') and graph token embeddings ('w/o GT'), with results shown in Figure 2 for both node classification and link prediction."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Three evaluation metrics are used: Accuracy and Macro F1 for node classification (Tables 1 and 5), and AUC for link prediction (Table 2)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation is included. All evaluation is automated via accuracy, Macro F1, and AUC metrics. Human evaluation of output quality could have been informative for assessing the LLM's reasoning."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The zero-shot evaluation paradigm inherently uses held-out data: models are trained on one dataset (Arxiv/Computer) and evaluated on entirely unseen datasets (Pubmed, Cora, etc.). Data splits follow established protocols from GraphGPT and TAG benchmark."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by individual dataset across two domains (citation and e-commerce) in Tables 1, 2, 5, and 6. Parameter sensitivity analysis is also provided per-dataset in Figures 3 and 4."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "The paper briefly notes TEA-GLM does not outperform all baselines on Sports (Table 2) but provides no systematic error analysis, no discussion of failure modes, and no qualitative analysis of when or why the approach fails."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 6 shows TEA-GLM underperforms several baselines on supervised learning (training datasets), achieving 0.655 vs. OFA's 0.682 on Arxiv. The ablation study also shows removing feature-wise contrastive learning slightly improves performance on seen datasets while hurting unseen ones."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims 'state-of-the-art performance on unseen datasets and tasks compared to other methods using LLMs as predictors.' Tables 1 and 2 support this — TEA-GLM achieves the best results across most unseen datasets for both node classification and link prediction among LLM-as-predictor methods."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper makes causal claims through ablation studies (Section 3.4): removing feature-wise contrastive learning and graph token embeddings each cause performance declines. The ablation design uses controlled single-variable manipulation, which is adequate for these causal claims."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims 'LLMs as Zero-shot Graph Learners' broadly, but experiments are limited to citation networks and e-commerce co-purchase graphs — two fairly similar domains with text-attributed nodes. The paper does not test on social networks, molecular graphs, knowledge graphs, or other graph types. Section 5 acknowledges graph-level tasks are untested but doesn't bound the domain generalization claim."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No substantive discussion of alternative explanations. For example, the improvements could partly stem from Vicuna's pre-existing knowledge of paper titles and product names rather than graph structure alignment. The paper does not consider this or other confounding factors."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures accuracy, Macro F1, and AUC on specific classification/prediction tasks and frames its claims at the same granularity — zero-shot performance on specific graph tasks. There is no substantial proxy gap between what is measured and what is claimed."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper specifies 'Vicuna-7B-v1.5' as the LLM (Section 2.3.3) and 'GraphSAGE' as the GNN encoder. BERT is also specified for feature encoding. The Vicuna version (v1.5) is a specific release identifier."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Complete instruction templates are provided in Appendix D (Figure 5) for both node classification and link prediction tasks, including the full prompt text with placeholders clearly marked."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 3.1 reports: GNN layers=2, batch size=512, 60 epochs, learning rate=2×10⁻², projector batch size=2 per GPU, 1 epoch, learning rate=1×10⁻³, Adam optimizer. Parameter sensitivity analysis for K and P is in Appendix C."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. TEA-GLM is a GNN + linear projector + frozen LLM pipeline, not an agent-based system."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 2.3.3 describes using a pretrained BERT model to encode raw node text into features. Section 3.1 describes following GraphGPT's data split methodology for citation datasets and TAG benchmark scripts for e-commerce datasets. View generation (edge removal, feature masking) is described in Section 2.2.1."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5 is titled 'Limitations' and discusses the fact that graph-level tasks have not been explored despite the framework's applicability."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The limitations section is extremely brief (2 sentences) and only mentions the absence of graph-level task experiments. It does not discuss specific threats to validity such as domain bias in the evaluation, potential text-knowledge confounds, or limitations of the PCA alignment approach."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper mentions graph-level tasks as future work but does not explicitly state what the results do NOT show — for example, that results may not generalize beyond text-attributed graphs, beyond the two domains tested, or beyond the specific LLM used."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "All datasets are publicly available: Arxiv from Open Graph Benchmark, Pubmed, Cora, and TAG benchmark e-commerce datasets. Raw data can be independently obtained from these sources."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Appendix A provides detailed dataset descriptions including what nodes and edges represent, the source of each dataset, and basic statistics (Table 3). The data split methodology references established protocols."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. All data sources are standard public benchmarks."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is documented: raw text → BERT encoding → node features → GNN pretraining with contrastive loss → linear projector training → zero-shot evaluation. Data splits follow referenced protocols. View generation (Equations 1-2) and feature projection (Equation 7) are formally specified."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Section 7 (Acknowledgement) discloses funding: National Key R&D Program of China (2023YFC3304700), National Natural Science Foundation of China (NSFC) grants 71901012, 72242101, 72031001, and Outstanding Young Scientist Program of Beijing Universities."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All authors are disclosed as affiliated with 'MIIT Key Laboratory of Data Intelligence and Management, Beihang University' on the first page."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Funding is from the Chinese government (National Key R&D Program, NSFC) and Beijing municipal programs, which have no direct financial interest in the specific outcomes of this graph learning research."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper does not state Vicuna-7B-v1.5's training data cutoff date. Since the LLM is used as a zero-shot predictor on text (paper titles, product names), knowledge of this text from pretraining is a potential confound that is not addressed."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether Vicuna's pretraining data includes text from the benchmark datasets (e.g., Arxiv paper titles, product descriptions). Since Vicuna-7B alone achieves 0.719 accuracy on Pubmed using only text, the LLM clearly has relevant knowledge, but potential overlap is not discussed."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "The benchmarks used (Arxiv, Pubmed, Cora) are well-known public datasets that predate Vicuna's training and could therefore have been seen during it, yet the paper does not discuss whether these benchmarks or their labels were in Vicuna's training data."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, latency, or cost per example is reported. The method involves GNN inference + linear projection + LLM inference but no timing or cost data is provided."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The paper mentions '2 NVIDIA A100 GPUs with 80GB memory each' but does not report total GPU hours, training time, or total computational budget for the experiments."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Results are reported across 5 random seeds (0-4) with mean and standard deviation in all main tables (Tables 1, 2, 5, 6)."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 3.1 explicitly states: 'we conduct five experiments with random seed values ranging from 0 to 4.'"
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper states 'For baseline models, we adjust hyperparameters and utilize the optimal settings' but does not report the search budget, number of configurations tried, or search method for either baselines or TEA-GLM."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper does not explain how the reported configuration (e.g., K=3, P=1000) was selected as optimal. Parameter sensitivity analysis in Appendix C shows different K and P values but doesn't clarify whether selection was done on validation data."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper makes many comparisons across methods and datasets without any statistical significance tests, let alone corrections for multiple comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement and evaluate baselines themselves (or use author-provided checkpoints in some cases) without acknowledging the bias of self-implemented baselines. No independent evaluation is mentioned."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "TEA-GLM requires GNN pretraining + projector training + LLM inference, which may differ substantially in compute from baselines. This is not discussed or compared."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper does not discuss whether node classification accuracy on Arxiv/Cora/Pubmed or link prediction AUC actually measures 'zero-shot graph learning' capability, or whether the benchmarks test something else (e.g., text understanding by the LLM)."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No agentic scaffolding is involved. The system is a GNN + linear projector + frozen LLM pipeline."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Vicuna-7B was trained on data that likely includes Arxiv papers, Pubmed abstracts, and product descriptions used in the benchmarks. The paper does not discuss whether the LLM's pretraining data temporally overlaps with the benchmark data."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The paper provides paper titles and candidate labels in the instruction, which may allow the LLM to answer from pretrained knowledge alone (Vicuna-7B achieves 0.719 on Pubmed with just text). This potential feature leakage from the LLM's pretraining is not discussed."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether train and test examples share structural similarities or whether the public benchmarks have known overlap issues."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference, temporal splits, or decontamination)."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "TEA-GLM achieves state-of-the-art zero-shot accuracy on unseen datasets compared to other LLM-as-predictor methods.",
    370       "evidence": "Table 1 shows TEA-GLM achieves the best accuracy on all 6 unseen datasets: Pubmed (0.848 vs. LLaGA 0.793), Cora (0.202 vs. LLaGA 0.168), Children (0.271 vs. Vicuna 0.270), History (0.528 vs. Vicuna 0.363), Photo (0.497 vs. GLNN 0.403), Sports (0.404 vs. LLaGA 0.352).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "TEA-GLM outperforms baselines on cross-task zero-shot link prediction without fine-tuning.",
    375       "evidence": "Table 2 shows TEA-GLM achieves the best AUC on 7 of 8 datasets for link prediction, with models trained only on node classification. E.g., Pubmed: 0.689 vs. LLaGA 0.569. The exception is Sports, where LLaGA (0.597) outperforms TEA-GLM (0.553).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Feature-wise contrastive learning with token embeddings is essential for cross-task zero-shot transfer.",
    380       "evidence": "Figure 2 ablation study shows removing feature-wise contrastive learning ('w/o FC') causes performance decline on unseen datasets and especially on link prediction (unseen task), while slightly improving seen-dataset performance on trained tasks.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Graph token embeddings provide crucial graph information to the LLM.",
    385       "evidence": "Figure 2 ablation study shows removing graph token embeddings ('w/o GT') causes significant performance decline across both node classification and link prediction tasks on all datasets.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "A linear projector is sufficient to map GNN representations to LLM-compatible token embeddings without tuning the LLM.",
    390       "evidence": "The method uses only a linear layer (Equation 9) and keeps LLM parameters frozen throughout all phases (Section 2.3.3). Results demonstrate competitive performance with this simple design.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "LLM text knowledge confound",
    397       "detail": "Vicuna-7B achieves 0.719 accuracy on Pubmed using only text (paper titles), suggesting the LLM's pretrained knowledge contributes substantially to predictions. The paper does not disentangle how much of TEA-GLM's improvement comes from graph structure alignment vs. the LLM's existing text knowledge. This is especially concerning since paper titles and product names may be in Vicuna's training data."
    398     },
    399     {
    400       "flag": "No statistical significance testing",
    401       "detail": "The paper claims 'significant improvements' and 'outperforms all state-of-the-art models' but provides no statistical significance tests. Many differences between methods are smaller than the reported standard deviations (e.g., TEA-GLM 0.271±0.010 vs. Vicuna 0.270±0.001 on Children)."
    402     },
    403     {
    404       "flag": "Missing GraphGPT results on e-commerce",
    405       "detail": "GraphGPT results are missing for all e-commerce datasets in both Tables 1 and 2, attributed to 'considerable time cost.' This incomplete comparison makes it harder to assess relative performance on the more challenging domain."
    406     },
    407     {
    408       "flag": "Extremely brief limitations section",
    409       "detail": "Section 5 (Limitations) contains only two sentences and discusses only the absence of graph-level experiments. It omits discussion of domain limitations, the LLM knowledge confound, the limited graph types tested (only text-attributed), and the lack of contamination analysis."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Llaga: Large language and graph assistant",
    415       "authors": ["Runjin Chen", "Tong Zhao", "Ajay Jaiswal", "Neil Shah", "Zhangyang Wang"],
    416       "year": 2024,
    417       "relevance": "Directly compared baseline that uses LLMs as predictors for graph tasks by translating graph data into LLM-compatible sequences."
    418     },
    419     {
    420       "title": "GraphGPT: Graph instruction tuning for large language models",
    421       "authors": ["Jiabin Tang", "Yuhao Yang", "Wei Wei", "Lei Shi", "Lixin Su", "Suqi Cheng", "Dawei Yin", "Chao Huang"],
    422       "year": 2023,
    423       "arxiv_id": "2310.13023",
    424       "relevance": "Key baseline that aligns LLMs with graph transformers via dual-stage instruction tuning for graph reasoning."
    425     },
    426     {
    427       "title": "One for all: Towards training one graph model for all classification tasks",
    428       "authors": ["Hao Liu", "Jiarui Feng", "Lecheng Kong", "Ningyue Liang", "Dacheng Tao", "Yixin Chen", "Muhan Zhang"],
    429       "year": 2024,
    430       "relevance": "Cross-domain graph learning framework used as a baseline for zero-shot graph classification."
    431     },
    432     {
    433       "title": "Can llms effectively leverage graph structural information: when and why",
    434       "authors": ["Jin Huang", "Xingjian Zhang", "Qiaozhu Mei", "Jiaqi Ma"],
    435       "year": 2023,
    436       "arxiv_id": "2309.16595",
    437       "relevance": "Key reference showing LLMs benefit from structural information only when nodes lack sufficient text, directly motivating TEA-GLM's design choice to use only titles."
    438     },
    439     {
    440       "title": "Exploring the potential of large language models (LLMs) in learning on graph",
    441       "authors": ["Zhikai Chen", "Haitao Mao", "Hang Li", "Wei Jin"],
    442       "year": 2023,
    443       "relevance": "Explores encoding graph structures as text for LLM input, a paradigm that TEA-GLM aims to improve upon."
    444     },
    445     {
    446       "title": "TEST: Text prototype aligned embedding to activate LLM's ability for time series",
    447       "authors": ["Chenxi Sun", "Hongyan Li", "Yaliang Li", "Shenda Hong"],
    448       "year": 2024,
    449       "relevance": "Similar alignment approach for time series that projects representations into LLM token embedding space, directly related to TEA-GLM's methodology."
    450     },
    451     {
    452       "title": "Finetuned language models are zero-shot learners",
    453       "authors": ["Jason Wei", "Maarten Bosma", "Vincent Zhao", "Kelvin Guu"],
    454       "year": 2022,
    455       "relevance": "Foundational work on instruction-tuned LLMs as zero-shot learners that directly inspires TEA-GLM's approach."
    456     },
    457     {
    458       "title": "Deep graph contrastive representation learning",
    459       "authors": ["Yanqiao Zhu", "Yichen Xu", "Feng Yu", "Qiang Liu", "Shu Wu", "Liang Wang"],
    460       "year": 2020,
    461       "arxiv_id": "2006.04131",
    462       "relevance": "Contrastive learning framework for graphs that TEA-GLM builds upon for its instance-wise contrastive learning component."
    463     },
    464     {
    465       "title": "Inductive representation learning on large graphs",
    466       "authors": ["Will Hamilton", "Zhitao Ying", "Jure Leskovec"],
    467       "year": 2017,
    468       "relevance": "GraphSAGE — the GNN architecture used as the graph encoder in TEA-GLM."
    469     },
    470     {
    471       "title": "GraphTranslator: Aligning graph model to large language model for open-ended tasks",
    472       "authors": ["Mengmei Zhang", "Mingwei Sun", "Peng Wang"],
    473       "year": 2024,
    474       "relevance": "Related work on aligning graph models with LLMs for graph understanding tasks."
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 1,
    480       "justification": "The framework requires GNN pretraining and LLM integration — interesting research but not immediately deployable by practitioners."
    481     },
    482     "surprise_contrarian": {
    483       "score": 1,
    484       "justification": "Novel approach of PCA-based alignment between GNN and LLM spaces, but does not strongly contradict conventional wisdom."
    485     },
    486     "fear_safety": {
    487       "score": 0,
    488       "justification": "No safety, security, or risk concerns raised by this graph learning methodology paper."
    489     },
    490     "drama_conflict": {
    491       "score": 0,
    492       "justification": "No controversy or provocative claims — straightforward benchmark improvement paper."
    493     },
    494     "demo_ability": {
    495       "score": 2,
    496       "justification": "Code released on GitHub (https://github.com/W-rudder/TEA-GLM), but requires A100 GPUs and significant setup effort."
    497     },
    498     "brand_recognition": {
    499       "score": 1,
    500       "justification": "Published at NeurIPS (prestigious venue) but from Beihang University, not a widely recognized AI lab in the broader tech community."
    501     }
    502   }
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs