scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30109B)
      1 {
      2   "paper": {
      3     "title": "IDGenRec: LLM-RecSys Alignment with Textual ID Learning",
      4     "authors": [
      5       "Juntao Tan",
      6       "Shuyuan Xu",
      7       "Wenyue Hua",
      8       "Yingqiang Ge",
      9       "Zelong Li",
     10       "Yongfeng Zhang"
     11     ],
     12     "year": 2024,
     13     "venue": "Annual International ACM SIGIR Conference on Research and Development in Information Retrieval",
     14     "arxiv_id": "2403.19021",
     15     "doi": "10.1145/3626772.3657821"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract states: 'Code and data are open-sourced at https://github.com/agiresearch/IDGenRec.' A working GitHub URL is provided."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper uses publicly available Amazon Review datasets and the Yelp dataset, with download links in footnotes. The abstract also claims data is open-sourced at the GitHub repository."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper mentions using T5 models and SentencePiece tokenizer but does not provide a requirements.txt, Dockerfile, or detailed environment specification listing library versions."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions are provided in the paper. The training procedure is described at a high level (alternate training, 3 iterations) but no specific commands or scripts are documented."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results in Tables 4, 5, 6, and 7 are reported as point estimates with no confidence intervals or error bars."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Table 4 caption states: 'All improvements are significant at p < 0.05 compared to the best baseline under the student's t-test.'"
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 3.4.2 reports percentage improvements with baseline context: 'improvements of 39.44%, 23.55%, 42.37%, and 36.76% on the Sports, Beauty, Toys, and Yelp datasets, respectively.' Absolute values for both baseline and IDGenRec are shown in Table 4."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification is given for why these particular datasets or their sizes are sufficient. No power analysis is discussed."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs in any of the results tables. The t-test in Table 4 implies multiple runs were performed, but variance is never shown."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Extensive baselines are included: 7 traditional methods (GRU4Rec, Caser, HGN, SASRec, Bert4Rec, FDSA, S3Rec) and 3 generative methods (P5-SID, P5-CID, P5-SemID) for standard evaluation, plus UniSRec for zero-shot evaluation."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include recent methods: P5 variants (2023), UniSRec (2022), S3Rec (2020). For a 2024 paper, these are reasonably contemporary. The older baselines (GRU4Rec 2015, Caser 2018) are included alongside recent ones for comprehensive coverage."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Two ablation studies are presented: Table 5 compares ID-only training, recommender-only training, and alternate training. Table 6 evaluates user ID only, item ID only, and both combined."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Four evaluation metrics are used: HR@5, HR@10, NDCG@5, and NDCG@10."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No human evaluation is included. All evaluation is based on automated ranking metrics (HR, NDCG). Human evaluation of recommendation quality could have been informative for validating the semantic richness claims."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "A leave-one-out strategy is used for standard evaluation testing. The zero-shot evaluation uses 6 entirely unseen datasets for testing, clearly separated from the 19 training datasets."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down by individual dataset (4 for standard, 6 for zero-shot) and by scenario type (intra-platform vs inter-platform) in Table 7."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No failure cases or error analysis is presented. The paper does not discuss where IDGenRec produces poor recommendations or under what conditions it fails."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Table 5 shows that training only the ID generator yields poor results. Section 3.2 notes that 'Further training negatively affected the zero-shot performance.' Table 7 shows UniSRec outperforms IDGenRec on HR@10 for the Music dataset."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims the framework 'consistently surpasses existing models in sequential recommendation' (supported by Table 4) and that zero-shot performance is 'comparable to or even better than some traditional recommendation models' (supported by Table 7 comparisons with GRU4Rec, Bert4Rec, Caser)."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper claims textual IDs 'better utilize the semantic understanding abilities of LLMs' and that IDGenRec improvements come 'solely from a more elegant ID selection.' However, IDGenRec differs from P5 variants in multiple ways (ID generation, alternate training, constrained decoding, model initialization) — the ablation studies isolate training strategy and user IDs but do not isolate the effect of textual IDs vs. numerical IDs while holding all else constant."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title claims 'LLM-RecSys Alignment' broadly, and the paper discusses a 'foundational generative recommendation model,' but experiments use only T5-small/base on Amazon Review and Yelp datasets (e-commerce and restaurants). Claims about foundation models are not bounded to these domains or model sizes."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "No alternative explanations are discussed. For example, the improvements could partially stem from the richer input representations (full item metadata vs. numerical IDs), the specific T5 initialization for the ID generator, or the alternate training procedure rather than the textual ID concept itself. None of these confounds are considered."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper measures HR@k and NDCG@k and frames these as recommendation performance metrics. These are standard information retrieval metrics directly measuring ranking quality, and the paper does not overclaim beyond what these metrics capture."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper refers to 'T5 model,' 'T5 small model,' and 'standard pretrained T5 checkpoint' without specifying exact model sizes for the base recommender or specific checkpoint versions. The ID generator is initialized from a HuggingFace model (nandakishormpai/t5-small-machine-articles-tag-generation) which is specific, but the base recommender's T5 variant is ambiguous."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper states 'We have developed 10 such templates' but only shows one example in Figure 2. The remaining 9 templates are described as having 'minor differences' but their actual text is not provided. The paper references P5 templates but does not reproduce them."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Key hyperparameters are reported in Section 3.2: learning rates (1e-3 for recommender, 1e-8 for ID generator), training epochs (1×3 for ID generator, 10×3 for recommender), vocabulary size (32,128), DBS parameters (k=10, λ starting at 1, max 10), and ID length range [1,10)."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No agentic scaffolding is used. The system is a standard training/inference pipeline with two LLM components (ID generator and recommender)."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 3.1 describes filtering users and items with fewer than 5 interactions, following prior work's exact processing steps. For the Fusion dataset, large datasets are downsampled to 30,000 users. Table 2 provides the density-based categorization for train/test split decisions."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "There is no limitations section or threats-to-validity discussion anywhere in the paper. The conclusion section discusses future potential but does not acknowledge any limitations."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No threats to validity are discussed, either specific or generic."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what domains or model architectures are excluded, or what limitations apply to the 'foundation model' claims."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "All datasets used are publicly available: Amazon Review datasets with download links referenced via prior work, and Yelp dataset with a footnote URL (https://www.yelp.com/dataset). The GitHub repository also claims to include data."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The datasets are standard public benchmarks. Section 3.1 describes the data selection rules: density-based categorization of Amazon datasets (Table 2), filtering criteria (5+ interactions), and downsampling procedure (30,000 users). Table 3 provides complete statistics."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants are involved. Data comes from standard public recommendation benchmarks."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The pipeline is documented: raw Amazon Review and Yelp datasets → filter users/items with <5 interactions → density-based train/test split → downsampling to 30,000 users for large datasets → leave-one-out for testing. Table 3 provides final statistics."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Acknowledgement section states: 'The work was supported in part by NSF IIS2046457 and IIS-2007907.'"
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "All six authors are affiliated with Rutgers University, clearly stated in the paper header. They are not evaluating a commercial product they are affiliated with."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "The funder is the US National Science Foundation (NSF), a government agency with no financial stake in whether IDGenRec outperforms baselines."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is present in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "T5's pre-training data cutoff is not stated. Since T5 was pre-trained on C4 (web crawl data), it could potentially contain Amazon review text. No mention of when training data was collected."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No discussion of whether T5's pre-training data includes Amazon Review or Yelp content, which is plausible since both are widely available online. The zero-shot evaluation assumes the model has not seen test domain data, but pre-training contamination is not considered."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The Amazon Review and Yelp datasets had been publicly available for years before T5's training. No discussion of whether T5's pre-training on web data could include these review datasets, which would provide the model with prior knowledge of item descriptions and user patterns."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study. All experiments are automated benchmark evaluations."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No inference cost, latency, or time per recommendation is reported. The system requires running two models (ID generator + recommender) but the cost implications are not discussed."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No GPU hours, training time, or hardware specifications are mentioned. The alternate training strategy with 3 iterations of ID generator + recommender training presumably requires substantial compute, but this is not quantified."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No random seeds or seed sensitivity analysis are reported. The t-test mentioned in Table 4's caption implies multiple runs, but per-seed results are never shown."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of experimental runs is never explicitly stated. The t-test mentioned in Table 4's caption implies multiple runs, but the number is not reported."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Hyperparameters are stated but no search budget is reported. It is unclear how the learning rates (1e-3 and 1e-8), number of iterations (3), and DBS parameters were selected."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The same configuration is used across all datasets with no justification for why these specific hyperparameters are optimal. No validation set performance is shown for hyperparameter selection."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Student's t-tests are applied across 4 datasets × 4 metrics = 16 comparisons in Table 4, plus additional comparisons in Tables 5-7, with no correction for multiple comparisons (e.g., Bonferroni)."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors implement their own system and compare against baseline implementations (some likely re-implementations) without acknowledging or addressing author-evaluation bias."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "IDGenRec uses two models (ID generator + recommender) with alternate training over 3 iterations, likely requiring substantially more compute than single-model baselines like SASRec or P5. This compute difference is never discussed or controlled for."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "No discussion of whether HR@k and NDCG@k on leave-one-out evaluation actually measure real-world recommendation quality. The construct validity of this standard evaluation protocol is not questioned."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "No agentic scaffolding is involved in this work."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of temporal leakage. T5 was pre-trained on web data that may include Amazon review content from the same time period as the benchmark datasets."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the item metadata used for ID generation leaks information about user preferences or future interactions."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "For the zero-shot evaluation, the 19 training and 6 test datasets are all from Amazon Review (except Yelp). No analysis of whether structural similarities between Amazon domains (shared items, similar review patterns) compromise the independence assumption."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No concrete leakage detection or prevention method is applied. No membership inference, n-gram overlap analysis, or decontamination pipeline is used."
    362       }
    363     }
    364   },
    365   "scan_version": 3,
    366   "active_modules": [
    367     "experimental_rigor",
    368     "data_leakage"
    369   ],
    370   "claims": [
    371     {
    372       "claim": "IDGenRec significantly outperforms all baselines in standard sequential recommendation, with average improvements of 39.44%, 23.55%, 42.37%, and 36.76% on Sports, Beauty, Toys, and Yelp respectively.",
    373       "evidence": "Table 4 shows IDGenRec achieving the highest scores across all 4 metrics on all 4 datasets. Section 3.4.2 reports the percentage improvements. Statistical significance at p < 0.05 under student's t-test is claimed.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Alternate training of the ID generator and base recommender significantly boosts performance over training either component alone.",
    378       "evidence": "Table 5 shows alternate training outperforms ID-only and Rec-only training on Sports and Beauty datasets. For example, HR@5 on Sports: ID-only 0.0102, Rec-only 0.0350, Alternate 0.0429.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "The zero-shot performance of the pre-trained foundation model is comparable to or even better than some traditional recommendation models based on supervised training.",
    383       "evidence": "Section 3.5.2 and Table 7 show the foundation model surpasses GRU4Rec on all 4 shared datasets, Bert4Rec on 3/4, and Caser on 2/4 in zero-shot setting. On Yelp, zero-shot IDGenRec outperforms all traditional supervised baselines except P5.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "IDGenRec achieves 353.46% improvement over UniSRec on the cross-platform Yelp dataset in zero-shot evaluation, demonstrating superior generalizability.",
    388       "evidence": "Table 7 shows IDGenRec vs UniSRec on Yelp: HR@5 0.0300 vs 0.0064, NDCG@5 0.0248 vs 0.0051. Section 3.5.2 reports the 353.46% figure.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Including user IDs alongside item IDs enhances recommendation performance.",
    393       "evidence": "Table 6 shows User & Item ID consistently outperforms Item ID alone across all metrics on both Sports and Beauty datasets. For example, HR@5 on Sports: Item ID 0.0404, User & Item ID 0.0429.",
    394       "supported": "strong"
    395     }
    396   ],
    397   "methodology_tags": [
    398     "benchmark-eval"
    399   ],
    400   "key_findings": "IDGenRec proposes learning unique textual IDs from item metadata using human vocabulary tokens, enabling LLM-based generative recommendation without out-of-vocabulary tokens. The method significantly outperforms 10 baselines on 4 sequential recommendation benchmarks (23-42% average improvement). A foundation model trained on 19 Amazon domains shows zero-shot recommendation on 6 unseen datasets that is comparable to some supervised baselines, with particularly strong cross-platform transfer to Yelp (353% improvement over UniSRec).",
    401   "red_flags": [
    402     {
    403       "flag": "No variance or error bars reported",
    404       "detail": "All results tables show only point estimates. The t-test in Table 4 implies multiple runs were performed, but neither the number of runs nor any spread measure (std dev, IQR) is ever reported, making it impossible to assess result stability."
    405     },
    406     {
    407       "flag": "Suspiciously large improvements without robustness evidence",
    408       "detail": "Reported improvements of 23-42% over best baselines are unusually large for an incremental method change (different ID representation). Without variance reporting, seed sensitivity analysis, or a stated number of runs, these large gains are difficult to verify."
    409     },
    410     {
    411       "flag": "No limitations section",
    412       "detail": "The paper contains no limitations, threats to validity, or scope boundary discussion. For a paper making broad claims about 'foundation recommendation models,' this absence is concerning."
    413     },
    414     {
    415       "flag": "Foundation model claims from limited testing",
    416       "detail": "The 'foundation model' claim rests on zero-shot evaluation across Amazon Review subdomains (same platform) plus one cross-platform dataset (Yelp). True foundation model validation would require diverse platforms, modalities, and substantially more test domains."
    417     },
    418     {
    419       "flag": "Compute budget unreported despite two-model architecture",
    420       "detail": "The method requires training two separate models (ID generator and recommender) in alternating fashion for 3 iterations. No compute costs, GPU hours, or training time are reported, making it impossible to assess whether the improvements justify the additional computational overhead compared to single-model baselines."
    421     },
    422     {
    423       "flag": "Causal claims confounded by multiple simultaneous changes",
    424       "detail": "The paper attributes improvements to textual IDs, but IDGenRec differs from P5 baselines in multiple ways: ID representation, ID generator model, alternate training, constrained decoding, and model initialization. The ablation studies address training strategy and user IDs but do not isolate the effect of textual vs. numerical IDs while holding the training procedure constant."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Recommendation as language processing (rlp): A unified pretrain, personalized prompt & predict paradigm (p5)",
    430       "authors": ["Shijie Geng", "Shuchang Liu", "Zuohui Fu", "Yingqiang Ge", "Yongfeng Zhang"],
    431       "year": 2022,
    432       "relevance": "Foundational work on LLM-based generative recommendation using text-to-text paradigm with numerical item IDs."
    433     },
    434     {
    435       "title": "How to Index Item IDs for Recommendation Foundation Models",
    436       "authors": ["Wenyue Hua", "Shuyuan Xu", "Yingqiang Ge", "Yongfeng Zhang"],
    437       "year": 2023,
    438       "relevance": "Study of item ID indexing strategies for generative recommendation, the direct predecessor to IDGenRec's approach."
    439     },
    440     {
    441       "title": "Towards universal sequence representation learning for recommender systems",
    442       "authors": ["Yupeng Hou", "Shanlei Mu", "Wayne Xin Zhao", "Yaliang Li", "Bolin Ding", "Ji-Rong Wen"],
    443       "year": 2022,
    444       "relevance": "UniSRec: encoder-only foundation model for recommendation using item text representations, key zero-shot baseline."
    445     },
    446     {
    447       "title": "Exploring the limits of transfer learning with a unified text-to-text transformer",
    448       "authors": ["Colin Raffel", "Noam Shazeer", "Adam Roberts"],
    449       "year": 2020,
    450       "arxiv_id": "1910.10683",
    451       "relevance": "T5 model used as backbone for both the ID generator and base recommender in IDGenRec."
    452     },
    453     {
    454       "title": "A Survey on Large Language Models for Recommendation",
    455       "authors": ["Likang Wu", "Zhi Zheng", "Zhaopeng Qiu"],
    456       "year": 2023,
    457       "arxiv_id": "2305.19860",
    458       "relevance": "Comprehensive survey classifying LLM-based recommender systems into discriminative and generative categories."
    459     },
    460     {
    461       "title": "Large Language Models for Generative Recommendation: A Survey and Visionary Discussions",
    462       "authors": ["Lei Li", "Yongfeng Zhang", "Dugang Liu", "Li Chen"],
    463       "year": 2023,
    464       "arxiv_id": "2309.01157",
    465       "relevance": "Survey covering generative recommendation approaches including direct item generation from LLMs."
    466     },
    467     {
    468       "title": "Training language models to follow instructions with human feedback",
    469       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    470       "year": 2022,
    471       "relevance": "InstructGPT/RLHF methodology; the paper draws parallels to its finding that training LLMs beyond one epoch causes overfitting."
    472     },
    473     {
    474       "title": "Large language models are zero-shot rankers for recommender systems",
    475       "authors": ["Yupeng Hou", "Junjie Zhang", "Zihan Lin"],
    476       "year": 2023,
    477       "arxiv_id": "2305.08845",
    478       "relevance": "Explores LLMs as zero-shot recommendation rankers without fine-tuning, relevant to LLM capability assessment."
    479     },
    480     {
    481       "title": "M6-rec: Generative pretrained language models are open-ended recommender systems",
    482       "authors": ["Zeyu Cui", "Jianxin Ma", "Chang Zhou", "Jingren Zhou", "Hongxia Yang"],
    483       "year": 2022,
    484       "arxiv_id": "2205.08084",
    485       "relevance": "Early work on using generative pretrained LMs as open-ended recommender systems."
    486     },
    487     {
    488       "title": "Recommendation as instruction following: A large language model empowered recommendation approach",
    489       "authors": ["Junjie Zhang", "Ruobing Xie", "Yupeng Hou"],
    490       "year": 2023,
    491       "arxiv_id": "2305.07001",
    492       "relevance": "LLM-empowered recommendation through instruction following, part of the generative recommendation paradigm."
    493     }
    494   ],
    495   "engagement_factors": {
    496     "practical_relevance": {
    497       "score": 2,
    498       "justification": "Code and data are open-sourced, and the approach could be implemented by practitioners building LLM-based recommendation systems, though it requires training two models."
    499     },
    500     "surprise_contrarian": {
    501       "score": 1,
    502       "justification": "The idea that textual IDs outperform numerical IDs for LLM-based recommendation is a modest insight rather than a paradigm-challenging claim."
    503     },
    504     "fear_safety": {
    505       "score": 0,
    506       "justification": "No AI safety, security, or risk concerns raised by this recommendation system research."
    507     },
    508     "drama_conflict": {
    509       "score": 0,
    510       "justification": "No controversy or conflict; a straightforward technical contribution to recommendation systems."
    511     },
    512     "demo_ability": {
    513       "score": 2,
    514       "justification": "GitHub repository with code and data is provided, allowing replication on standard public datasets."
    515     },
    516     "brand_recognition": {
    517       "score": 1,
    518       "justification": "Published at SIGIR (a top-tier IR venue) by authors from Rutgers University, whose group is recognized in recommendation research but is not a household-name AI lab."
    519     }
    520   }
    521 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs