scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25770B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "IDGenRec: LLM-RecSys Alignment with Textual ID Learning",
      6     "authors": [
      7       "Juntao Tan",
      8       "Shuyuan Xu",
      9       "Wenyue Hua",
     10       "Yingqiang Ge",
     11       "Zelong Li",
     12       "Yongfeng Zhang"
     13     ],
     14     "year": 2024,
     15     "venue": "Annual International ACM SIGIR Conference on Research and Development in Information Retrieval",
     16     "arxiv_id": "2403.19021",
     17     "doi": "10.1145/3626772.3657821"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract's main claims — consistent surpassing of baselines in standard evaluation (Table 4) and zero-shot performance comparable to supervised models (Table 7) — are tested and supported by empirical results.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Ablation studies in Tables 5 and 6 isolate the contribution of alternate training and user IDs, providing adequate support for the causal claims about specific model components.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Claims are bounded to specific experimental settings: 'under standard experimental setting' for Table 4, and 'completely zero-shot setting' for Table 7, tested on named datasets.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not discuss whether improvements could stem from the T5 tag-generation initialization, differences in model parameter counts versus baselines, or other confounding factors.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper measures NDCG and HR, which are standard ranking metrics that directly correspond to the claimed recommendation accuracy improvements — no proxy/outcome conflation.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion briefly discusses future work but not limitations of the current study.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No specific threats to validity are discussed, such as potential data leakage from T5 pre-training on Amazon/Yelp web text, limited dataset coverage, or sensitivity to hyperparameters.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not explicitly state what its results do NOT show (e.g., no claims about cold-start, real-time latency, or non-sequential recommendation settings being out of scope).",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgement section states: 'The work was supported in part by NSF IIS2046457 and IIS-2007907.'",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All six authors list Rutgers University as their affiliation in the paper header.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Funding is from NSF, a public government agency with no commercial stake in the IDGenRec framework or recommendation performance outcomes.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "There is no competing interests statement or declaration of patents, equity, or consulting relationships anywhere in the paper.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper explicitly defines what 'textual IDs' must satisfy (unique, concise, semantically rich, platform-agnostic), and clearly defines the generative recommendation paradigm and what distinguishes it from discriminative methods.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper clearly states it contributes a framework (IDGenRec) combining a textual ID generator, a diverse ID generation algorithm, and an alternate training strategy to enable LLM-based generative recommendation.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 4 (Related Works) and Table 1 explicitly compare IDGenRec to P5, P5-variants, UniSRec, and Recformer, explaining why each prior approach is limited and how IDGenRec differs.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "The abstract explicitly states: 'Code and data are open-sourced at https://github.com/agiresearch/IDGenRec.'",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "All datasets used (Amazon Review Datasets and Yelp) are standard publicly available benchmarks with links and citations provided.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No requirements.txt, Dockerfile, or environment specification is provided in the paper; only the tokenizer (SentencePiece) and model (T5) are named without versioned dependency lists.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "Section 3.2 provides training hyperparameters but no step-by-step instructions to reproduce the experiments from scratch; the released code may contain these but the paper itself does not.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Tables 4, 5, 6, and 7 report point estimates only with no confidence intervals or error bars across runs.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Table 4 caption states: 'All improvements are significant at p < 0.05 compared to the best baseline under the student's t-test.'",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "The paper reports average percentage improvements over second-best baselines: 39.44%, 23.55%, 42.37%, and 36.76% for Sports, Beauty, Toys, and Yelp respectively.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No power analysis or justification for dataset sizes is provided; datasets are selected based on prior usage in comparable baselines, not statistical power considerations.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "All results in Tables 4–7 are single-run point estimates; no standard deviation or variance across multiple training runs is reported.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Seven traditional sequential baselines (GRU4Rec, Caser, HGN, SASRec, Bert4Rec, FDSA, S3Rec) and three generative baselines (P5-SID, P5-CID, P5-SemID) are included in Table 4.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Baselines include P5 variants from 2022–2023 and SASRec/S3Rec which remain competitive; UniSRec (2022) is used for zero-shot comparison as the direct competitor.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Section 3.4.3 presents two ablations: alternate vs. ID-only vs. Rec-only training (Table 5), and item-only vs. user-only vs. both IDs (Table 6).",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Four metrics are reported for all experiments: HR@5, HR@10, NDCG@5, NDCG@10.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "This is a standard automatic benchmark evaluation of recommendation accuracy; human evaluation of item recommendations is not standard or expected for this type of study.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Section 3.3 states: 'we adopt a leave-one-out strategy for testing,' holding out the last interaction per user as the test item.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down per dataset (Sports, Beauty, Toys, Yelp for standard; Music, Instruments, Yelp cross-platform for zero-shot), providing per-task performance breakdowns.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "Section 3.4.4 shows qualitative case studies of good ID generation; systematic failure cases are not analyzed, and the Music HR@10 underperformance is noted but not investigated.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Table 7 shows IDGenRec underperforms UniSRec on Music HR@10, and Table 5 shows ID-only training yields poor results, both reported without obfuscation.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "The paper refers to 'standard pretrained T5 checkpoint' for the base recommender without specifying model size (small/base/large); only the ID generator's HuggingFace checkpoint URL is given.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Figure 2 shows a fully completed real prompt example with actual item IDs filled in, and the template structure is described with fill values illustrated.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Section 3.2 reports learning rates (1e-3 for recommender, 1e-8 for ID generator), DBS parameters (k=10, λ starting at 1, max 10), ID length limits [1,10) and [10,20), and training iterations (3 rounds).",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "This is a standard supervised ML pipeline, not an agentic system; no scaffolding or multi-turn agent orchestration is involved.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 3.1 documents filtering (users/items with fewer than 5 interactions), dataset density-based selection rules, and downsampling large datasets to 30,000 users.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "All datasets are publicly available: Amazon Review Datasets (with citations to He & McAuley 2016, McAuley et al. 2015) and the Yelp dataset at the linked URL.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 3.1 describes dataset selection criteria, density categories, which datasets were used for pre-training vs. testing, and the construction of the 'Fusion' corpus from 19 Amazon datasets.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participant recruitment; all data comes from existing public benchmark datasets.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Section 3.1 documents the full pipeline from raw Amazon/Yelp data through density-based filtering, train/test splitting, and downsampling to form the Fusion training corpus.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "The T5 model's training data cutoff is not mentioned; T5 was pre-trained on C4 which includes web-crawled text that may overlap with Amazon review content used for evaluation.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "While training and test recommendation interactions are separated by design, no discussion addresses whether T5's pre-training data contains item descriptions or reviews from the test datasets.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "Amazon Review Datasets and Yelp reviews are likely present in T5's C4 training corpus (web-crawled data), potentially giving the model prior knowledge of item metadata; this is not acknowledged.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants in this study.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in this study.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in this study.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in this study.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in this study.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in this study.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference latency, throughput, or cost figures are reported; important given that the framework requires running two LLMs (ID generator + recommender) at inference time.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No compute budget, GPU hours, or hardware specifications are mentioned anywhere in the paper.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "IDGenRec consistently surpasses all baselines in standard sequential recommendation on 4 datasets, with 23–42% average improvement over the second-best baseline.",
    376       "evidence": "Table 4 with statistical significance (p < 0.05, Student's t-test) across HR@5, HR@10, NDCG@5, NDCG@10 on Sports, Beauty, Toys, Yelp.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Zero-shot IDGenRec outperforms the zero-shot baseline UniSRec on 5 of 6 unseen datasets, with 353% improvement on cross-platform Yelp.",
    381       "evidence": "Table 7 comparing IDGenRec vs. UniSRec zero-shot performance on 6 datasets; Music HR@10 is the single exception where UniSRec wins.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "The alternate training strategy is critical: training only the ID generator or only the recommender gives substantially worse results.",
    386       "evidence": "Table 5 ablation showing ID-only (HR@5=0.0102) vs. Rec-only (0.0350) vs. Alternate (0.0429) on Sports.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Textual IDs better utilize LLM semantic knowledge than numerical/OOV IDs used by P5 and its variants.",
    391       "evidence": "IDGenRec outperforms P5-SID, P5-CID, P5-SemID in Table 4; the comparison is made under controlled conditions using the same T5 backbone.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Zero-shot IDGenRec surpasses several traditional supervised models (GRU4Rec, Bert4Rec on 3/4 datasets, Caser on 2/4) despite never seeing the test data.",
    396       "evidence": "Table 7 zero-shot scores compared to Table 4 supervised baselines on shared datasets.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Including user IDs alongside item IDs improves recommendation performance.",
    401       "evidence": "Table 6 shows User & Item ID (HR@5=0.0429) outperforms Item ID only (0.0404) on Sports, with similar pattern on Beauty.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "benchmark-eval"
    407   ],
    408   "key_findings": "IDGenRec introduces a T5-based ID generator that produces unique, semantically rich textual IDs for recommendation items, replacing the OOV numerical tokens used in prior generative recommendation work. On 4 standard sequential recommendation benchmarks, IDGenRec outperforms both traditional and generative baselines by 23–42% on average, with statistical significance. In a zero-shot foundation model experiment trained on 19 Amazon datasets, IDGenRec achieves recommendation performance comparable to or exceeding several supervised traditional models on 6 unseen datasets, including a 353% improvement over UniSRec on a cross-platform Yelp test — suggesting that platform-agnostic textual item representations enable LLM knowledge transfer to recommendation tasks.",
    409   "red_flags": [
    410     {
    411       "flag": "No variance across runs",
    412       "detail": "All results in Tables 4–7 are single-run point estimates; no standard deviation or confidence intervals are reported, making it impossible to assess result stability given stochastic training."
    413     },
    414     {
    415       "flag": "No limitations section",
    416       "detail": "The paper contains no dedicated limitations or threats-to-validity section; the conclusion only discusses future potential without acknowledging current constraints (scalability, cold-start, latency, etc.)."
    417     },
    418     {
    419       "flag": "Contamination not addressed",
    420       "detail": "T5 was pre-trained on C4 (web-crawled text), which likely includes Amazon reviews and Yelp content used as test data. Prior exposure to item descriptions could give IDGenRec an unfair advantage in ID generation that is not discussed."
    421     },
    422     {
    423       "flag": "T5 model size unspecified for base recommender",
    424       "detail": "'Standard pretrained T5 checkpoint' is used as the base recommender without specifying whether this is T5-small, T5-base, or T5-large, making parameter count comparisons to baselines ambiguous."
    425     },
    426     {
    427       "flag": "No financial interests declaration",
    428       "detail": "Despite being an academic paper, no competing interests statement is included — particularly relevant as several authors have co-authored many papers together in a lab context."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Recommendation as language processing (RLP): A unified pretrain, personalized prompt & predict paradigm (P5)",
    434       "relevance": "Direct predecessor: first text-to-text generative recommendation system, uses OOV numerical IDs that IDGenRec replaces"
    435     },
    436     {
    437       "title": "How to Index Item IDs for Recommendation Foundation Models",
    438       "relevance": "Direct related work studying ID indexing strategies (SID, CID, SemID) that IDGenRec outperforms"
    439     },
    440     {
    441       "title": "Towards universal sequence representation learning for recommender systems (UniSRec)",
    442       "relevance": "Primary zero-shot baseline; encoder-only foundation recommendation model representing the competing approach"
    443     },
    444     {
    445       "title": "Exploring the limits of transfer learning with a unified text-to-text transformer (T5)",
    446       "relevance": "Backbone model for both the ID generator and base recommender"
    447     },
    448     {
    449       "title": "A Survey on Large Language Models for Recommendation",
    450       "relevance": "Survey contextualizing IDGenRec within the broader LLM-recommendation literature"
    451     },
    452     {
    453       "title": "Large language models are zero-shot rankers for recommender systems",
    454       "relevance": "Competing zero-shot recommendation approach using prompting rather than fine-tuning"
    455     },
    456     {
    457       "title": "Is ChatGPT a good recommender? A preliminary study",
    458       "relevance": "Cited to establish that direct ChatGPT prompting cannot match traditional recommendation baselines, motivating IDGenRec's fine-tuning approach"
    459     },
    460     {
    461       "title": "Diverse beam search: Decoding diverse solutions from neural sequence models",
    462       "relevance": "Core algorithm used in the diverse ID generation component to ensure unique IDs"
    463     }
    464   ],
    465   "engagement_factors": {
    466     "practical_relevance": {
    467       "score": 2,
    468       "justification": "Recommendation systems are widely deployed at scale; the open-sourced framework could be adapted by practitioners, though it requires training two LLMs and the inference cost is unreported."
    469     },
    470     "surprise_contrarian": {
    471       "score": 2,
    472       "justification": "Challenges the dominant approach of using numerical/OOV tokens in generative recommendation, arguing that learned natural-language IDs unlock LLM knowledge — a counterintuitive but tested claim."
    473     },
    474     "fear_safety": {
    475       "score": 0,
    476       "justification": "Pure recommendation systems research with no safety or risk implications discussed."
    477     },
    478     "drama_conflict": {
    479       "score": 1,
    480       "justification": "Mild competition angle with P5 and its variants (same group's prior work); the paper positions itself as solving a fundamental limitation of prior methods."
    481     },
    482     "demo_ability": {
    483       "score": 2,
    484       "justification": "Code is publicly released at GitHub with standard public datasets; a practitioner could reproduce the experiments, though compute requirements for training foundation models are non-trivial."
    485     },
    486     "brand_recognition": {
    487       "score": 1,
    488       "justification": "Rutgers University AI group with NSF funding; published at SIGIR (top IR venue) but not from a major industry lab, limiting name recognition outside the RecSys community."
    489     }
    490   },
    491   "hn_data": {
    492     "threads": [
    493       {
    494         "hn_id": "22678328",
    495         "title": "Modelling transmission and control of the Covid-19 pandemic in Australia [pdf]",
    496         "points": 4,
    497         "comments": 0,
    498         "url": "https://news.ycombinator.com/item?id=22678328"
    499       },
    500       {
    501         "hn_id": "39764168",
    502         "title": "A tweezer array with 6100 highly coherent atomic qubits",
    503         "points": 3,
    504         "comments": 0,
    505         "url": "https://news.ycombinator.com/item?id=39764168"
    506       },
    507       {
    508         "hn_id": "47566068",
    509         "title": "Security awareness in LLM agents: the NDAI zone case",
    510         "points": 2,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=47566068"
    513       },
    514       {
    515         "hn_id": "39508338",
    516         "title": "CLoVe: Encoding Compositional Language in Contrastive Vision-Language Models",
    517         "points": 1,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=39508338"
    520       }
    521     ],
    522     "top_points": 4,
    523     "total_points": 10,
    524     "total_comments": 0
    525   }
    526 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs