scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29564B)
      1 {
      2   "paper": {
      3     "title": "User Feedback Alignment for LLM-powered Exploration in Large-scale Recommendation Systems",
      4     "authors": [
      5       "Jianling Wang",
      6       "Yifan Liu",
      7       "Yinghao Sun",
      8       "Xuejian Ma",
      9       "Yueqi Wang",
     10       "He Ma",
     11       "Zhengyang Su",
     12       "Minmin Chen",
     13       "Mingyan Gao",
     14       "Onkar Dalal",
     15       "Ed H. Chi",
     16       "Lichan Hong",
     17       "Ningren Han",
     18       "Haokai Lu"
     19     ],
     20     "year": 2025,
     21     "venue": "Annual Meeting of the Association for Computational Linguistics",
     22     "arxiv_id": "2504.05522",
     23     "doi": "10.48550/arXiv.2504.05522"
     24   },
     25   "scan_version": 3,
     26   "active_modules": ["experimental_rigor", "data_leakage"],
     27   "methodology_tags": ["benchmark-eval", "case-study"],
     28   "key_findings": "A decomposed dual-LLM approach for recommendation exploration—separating novelty generation from user-preference alignment—simultaneously improves novelty and user satisfaction on a commercial short-form video platform serving billions of users. Direct RLHF on the novelty model failed (format collapse to 2% compliance and reward hacking), motivating the decoupled design with inference-time best-of-n selection via an alignment model trained on collective user feedback. Pointwise labeling slightly outperformed pairwise labeling and was 2x faster to train. Live A/B experiments showed increased positive playback rate, completion rate, and unique user-cluster engagement pairs.",
     29   "checklist": {
     30     "artifacts": {
     31       "code_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No source code repository, GitHub link, or code archive is mentioned anywhere in the paper."
     35       },
     36       "data_released": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The training data and user feedback signals are proprietary YouTube/Google data. No dataset is released or made available."
     40       },
     41       "environment_specified": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No environment specifications, dependency lists, or hardware details are provided. The paper mentions using Gemini but gives no setup details."
     45       },
     46       "reproduction_instructions": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No reproduction instructions are provided. The system is a proprietary production deployment and the paper provides no guidance for replication."
     50       }
     51     },
     52     "statistical_methodology": {
     53       "confidence_intervals_or_error_bars": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No confidence intervals or error bars are reported on the live experiment results in Figure 4. The offline training curves in Figure 3 also lack error bands."
     57       },
     58       "significance_tests": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper claims 'significantly improved user satisfaction' and 'significant gains' but no statistical significance tests (p-values, t-tests, etc.) are reported for any comparison."
     62       },
     63       "effect_sizes_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Figure 4(a) shows percentage improvements in novel impression ratio and positive feedback ratio relative to the hierarchical contextual bandit baseline. Figure 3 reports specific ppb rate improvements (+0.09%, +0.20%) for NDCG and F1 lifts."
     67       },
     68       "sample_size_justified": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The novelty model uses '<8k training examples' (Section 3) with no justification for this number. No power analysis or sample size justification is provided for the live experiments either."
     72       },
     73       "variance_reported": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No variance, standard deviation, or any spread measure is reported across experimental runs for either offline or live experiments."
     77       }
     78     },
     79     "evaluation_design": {
     80       "baselines_included": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 5.1 lists multiple baselines: hierarchical contextual bandit (Song et al., 2022), neural linear bandit (Su et al., 2024), two-tower model (Yang et al., 2020), transformer-based sequential model, and the baseline LLM-powered exploration without alignment (Wang et al., 2024c)."
     84       },
     85       "baselines_contemporary": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Baselines include recent work (Su et al. 2024, Wang et al. 2024c) and current production models. The baselines span both exploration-oriented and exploitation-oriented approaches."
     89       },
     90       "ablation_study": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper compares pairwise vs. pointwise labeling strategies (Figures 4b-d) and evaluates alignment models trained for 50k vs. 100k steps (Section 5.2), both of which serve as ablations of design choices."
     94       },
     95       "multiple_metrics": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Multiple metrics are used: novel impression ratio, positive playback rate, completion rate, unique engaged user-cluster (UEUC), F1@K, and NDCG@K."
     99       },
    100       "human_evaluation": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Live A/B experiments on a platform serving billions of users constitute a large-scale user study. Real users evaluate the system's recommendations through their engagement behavior (playback, completion, likes, shares)."
    104       },
    105       "held_out_test_set": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 5.2 states: 'Offline evaluation is done on a holdout set of interest cluster sequences, the novel interest transitions and user's feedback scores.'"
    109       },
    110       "per_category_breakdown": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Figure 4(a) breaks results down by model category (exploration-oriented vs. exploitation-oriented). Section 5.3 notes that UEUC is 'higher for more active users,' providing a user-segment breakdown."
    114       },
    115       "failure_cases_discussed": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 3 extensively discusses failed approaches: RLHF caused format collapse (the correct-format rate dropped from 99+% to 2%) and reward hacking (predicting high-reward words like 'cat', 'BTS'), and scaling up the training data yielded only neutral results."
    119       },
    120       "negative_results_reported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 3 reports that increasing training examples yielded 'neutral results,' that RLHF 'always resulted in the model quickly collapsing,' and that training beyond the convergence point (100k steps) led to worse live performance than the 50k-step checkpoint."
    124       }
    125     },
    126     "claims_and_evidence": {
    127       "abstract_claims_supported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The abstract claims 'significant gains in both user satisfaction (measured by watch activity and active user counts) and exploration diversity.' Section 5.3 and Figure 4 show improvements in positive playback rate, completion rate, and UEUC, supporting these claims."
    131       },
    132       "causal_claims_justified": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Causal claims ('improves user satisfaction,' 'enhances exploration efficiency') are supported by live A/B experiments comparing treatment (user-aligned model) against control groups, which is an appropriate causal identification strategy."
    136       },
    137       "generalization_bounded": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The title frames the approach as for 'Large-scale Recommendation Systems' generally, but all experiments are on a single commercial short-form video platform (YouTube Shorts). No evidence is presented for other recommendation domains, content types, or platforms."
    141       },
    142       "alternative_explanations_discussed": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper does not discuss alternative explanations for the observed improvements. It does not consider confounding factors such as novelty effects or seasonal changes in user behavior, nor whether the improvement is due to the dual-LLM architecture rather than simply more compute at inference time."
    146       },
    147       "proxy_outcome_distinction": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper measures positive playback rate and completion rate but frames these as 'user satisfaction' without discussing whether engagement metrics actually capture satisfaction. Users may watch more without being more satisfied, and no discussion of this proxy gap appears."
    151       }
    152     },
    153     "setup_transparency": {
    154       "model_versions_specified": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Section 5.1 states 'we employed Gemini (Team et al., 2024)' with a citation to Gemini 1.5. No specific version, model size, or snapshot date is given. Just 'Gemini' is insufficient per the schema requirements."
    158       },
    159       "prompts_provided": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Figure 2 provides the actual prompt text used for the novelty model, including specific cluster examples and generation instructions ('With less than 30 words, generate a new and different short-form video cluster...')."
    163       },
    164       "hyperparameters_reported": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The paper mentions 'high temperature setting' for sampling and K=2 for cluster sequence length, but does not report the actual temperature value, learning rate, or other training hyperparameters. Section 5.2 mentions 50k and 100k training steps but no other details."
    168       },
    169       "scaffolding_described": {
    170         "applies": false,
    171         "answer": false,
    172         "justification": "The system uses a recommendation pipeline with two LLMs feeding into a traditional recommender, not agentic scaffolding with tools, retry logic, or memory management."
    173       },
    174       "data_preprocessing_documented": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 4.1 describes preprocessing steps: normalizing skewed feedback scores, filtering cluster transition pairs with insufficient user feedback, and rounding scores to fixed intervals to account for margin of error."
    178       }
    179     },
    180     "limitations_and_scope": {
    181       "limitations_section_present": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "There is no dedicated limitations section. Section 3 discusses limitations of the baseline approach (hierarchical planning) but the proposed method's limitations are not addressed."
    185       },
    186       "threats_to_validity_specific": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No threats to validity are discussed for the proposed method. The paper does not address potential confounds in the A/B testing or limitations of the offline evaluation."
    190       },
    191       "scope_boundaries_stated": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show or what settings the approach has not been tested in."
    195       }
    196     },
    197     "data_integrity": {
    198       "raw_data_available": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "All data is proprietary YouTube/Google user interaction data. No raw data is available for independent verification."
    202       },
    203       "data_collection_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 4.1 describes the data collection: per-query logging of users' cluster sequences and feedback (playback, like, share, skip) on LLM predictions, then aggregation across users for each cluster transition pair."
    207       },
    208       "recruitment_methods_described": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "The paper says the platform serves 'billions of users' but does not describe how users are selected for the live experiments, whether all users or a subset are included, or any sampling criteria."
    212       },
    213       "data_pipeline_documented": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "While Section 4.1 describes preprocessing steps (normalize, filter, round), exact thresholds and criteria are not given. The number of examples retained at each stage is not reported, and the filtering criteria for 'little user feedback' are unspecified."
    217       }
    218     },
    219     "conflicts_of_interest": {
    220       "funding_disclosed": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding disclosure or acknowledgments section is present. All authors are Google employees but no explicit funding statement is made."
    224       },
    225       "affiliations_disclosed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Author affiliations are clearly listed: Google DeepMind, YouTube, and Google Labs. The connection between authors and the evaluated platform is transparent."
    229       },
    230       "funder_independent_of_outcome": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "All authors are employed by Google, which owns YouTube, the platform being evaluated. Google has a direct financial interest in demonstrating that its recommendation system improvements work."
    234       },
    235       "financial_interests_declared": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    239       }
    240     },
    241     "contamination": {
    242       "training_cutoff_stated": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The paper fine-tunes Gemini on proprietary recommendation data and evaluates on proprietary holdout sets, not on any public benchmark testing model knowledge. Contamination of public benchmarks is not a concern here."
    246       },
    247       "train_test_overlap_discussed": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "The paper fine-tunes Gemini on proprietary recommendation data and evaluates on proprietary holdout data and live A/B tests, not on public benchmarks where overlap with pre-training data is a concern, so this item does not apply."
    251       },
    252       "benchmark_contamination_addressed": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No public benchmarks are used. The evaluation is entirely on proprietary data and live traffic, making benchmark contamination inapplicable."
    256       }
    257     },
    258     "human_studies": {
    259       "pre_registered": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No pre-registration is mentioned for the live A/B experiments. The experiments appear to be standard industry A/B tests without academic pre-registration."
    263       },
    264       "irb_or_ethics_approval": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No IRB or ethics board approval is mentioned, despite the experiments being conducted on billions of real users on a commercial platform."
    268       },
    269       "demographics_reported": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "No user demographics are reported. The paper mentions 'billions of users' and that UEUC is 'higher for more active users' but provides no demographic characterization."
    273       },
    274       "inclusion_exclusion_criteria": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "No inclusion or exclusion criteria for experiment participants are stated. It is unclear whether all platform users were included or if a subset was selected."
    278       },
    279       "randomization_described": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "The paper mentions 'experiment and control' groups (Figure 4) implying randomization, but the randomization procedure (how users were assigned to arms, stratification, etc.) is not described."
    283       },
    284       "blinding_described": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No blinding is described. Users presumably did not know they were in an experiment, but this is not explicitly stated or discussed."
    288       },
    289       "attrition_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No attrition data is reported. The number of users who started vs. completed the experiment period is not mentioned."
    293       }
    294     },
    295     "cost_and_practicality": {
    296       "inference_cost_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Section 4.2 notes there is 'no latency impact' since inference happens offline, and mentions '5 times more predictions,' but no actual inference costs (compute time, API costs, tokens consumed) are reported."
    300       },
    301       "compute_budget_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Training steps (50k, 100k) are mentioned but no GPU hours, hardware specifications, total compute budget, or training time are reported."
    305       }
    306     },
    307     "experimental_rigor": {
    308       "seed_sensitivity_reported": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No seed sensitivity analysis is reported. Results appear to be from single runs of the models."
    312       },
    313       "number_of_runs_stated": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The number of experimental runs for offline evaluation or model training is not stated."
    317       },
    318       "hyperparameter_search_budget": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Section 5.2 mentions offline evaluation guided hyperparameter tuning (e.g., 'score normalization strategy') but no search budget, number of configurations tried, or search method is reported."
    322       },
    323       "best_config_selection_justified": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Section 5.2 justifies selecting the 50k-step checkpoint because 'F1 converged in offline evaluation as shown in Figure 3.' The 100k-step arm was deployed as a comparison arm and showed worse live performance, validating the selection."
    327       },
    328       "multiple_comparison_correction": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    332       },
    333       "self_comparison_bias_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Google employees evaluate a system deployed on Google's YouTube platform. The paper does not acknowledge or discuss the inherent bias of evaluating one's own system."
    337       },
    338       "compute_budget_vs_performance": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The proposed method uses 5x more inference compute (5 independent samplings) than the baseline novelty model, but performance is not explicitly compared at matched compute budgets."
    342       },
    343       "benchmark_construct_validity": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Section 5.2 discusses that F1@K is more appropriate than NDCG for their top-n selection task: 'F1@K focuses on the model's ability to identify the top-K most relevant clusters, which is more crucial for our top-n selection task. Memorizing the exact rankings is unnecessary.'"
    347       },
    348       "scaffold_confound_addressed": {
    349         "applies": false,
    350         "answer": false,
    351         "justification": "The evaluation compares the complete system (novelty LLM + alignment LLM + recommender) against other production systems. No model-level comparisons with different scaffolding are made — the system IS the thing being tested."
    352       }
    353     },
    354     "data_leakage": {
    355       "temporal_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the holdout set's temporal relationship to training data could introduce leakage. The paper does not describe how the holdout set was constructed temporally."
    359       },
    360       "feature_leakage_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of whether the aggregated user feedback signals used for training the alignment model leak information about the holdout evaluation data."
    364       },
    365       "non_independence_addressed": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No discussion of whether training and holdout cluster sequences are independent. User cluster sequences may share structural patterns across the split."
    369       },
    370       "leakage_detection_method": {
    371         "applies": true,
    372         "answer": false,
    373         "justification": "No leakage detection or prevention methods are described."
    374       }
    375     }
    376   },
    377   "claims": [
    378     {
    379       "claim": "The decomposed dual-LLM approach (novelty + alignment models) simultaneously improves recommendation novelty and user satisfaction, achieving a more optimal point on the novelty-quality operating curve.",
    380       "evidence": "Figure 4(a) shows the proposed method in the upper-right quadrant (highest novelty AND highest quality improvement relative to hierarchical contextual bandit baseline), outperforming all existing production models. Section 5.3 discusses this.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Direct RLHF on the novelty model causes catastrophic format collapse and reward hacking.",
    385       "evidence": "Section 3 reports that after 5k RLHF steps, 'the LLM's chance of predicting in the correct format drops from 99+% to 2%' and the model learned to predict high-reward words like 'cat', 'BTS', 'toys'.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Inference-time scaling with best-of-n selection from the alignment model improves exploration efficiency.",
    390       "evidence": "Section 4.2 describes sampling 5x more predictions and selecting top-k using the alignment model. The live results in Figure 4 show improvements, but no ablation isolates the inference-scaling contribution from the alignment model itself.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "Pointwise labeling slightly outperforms pairwise labeling for the alignment model.",
    395       "evidence": "Figures 4(b), (c), (d) show both approaches improve user satisfaction and exploration, 'with the pointwise model slightly outperforming the pairwise model' (Section 5.3). Pointwise training is also 2x faster.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "The alignment model trained with collective user feedback effectively predicts user preferences over novel interest clusters.",
    400       "evidence": "Figure 3 shows offline NDCG and F1 lift over random improving consistently during training. Figure 2 shows qualitative examples of alignment model predictions matching intuitive user preferences.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Company evaluating its own product",
    407       "detail": "All 14 authors are Google employees (Google DeepMind, YouTube, Google Labs) evaluating a system deployed on YouTube. No independent evaluation or external validation is provided."
    408     },
    409     {
    410       "flag": "Redacted experimental details",
    411       "detail": "Exact dates are redacted in Figure 4, and specific metric values in the live experiments are shown only as relative percentages. This makes independent verification of the claimed improvements impossible."
    412     },
    413     {
    414       "flag": "No error bars or statistical tests on live experiments",
    415       "detail": "The word 'significant' is used multiple times to describe live experiment results, but no statistical significance tests, confidence intervals, or error bars are reported for any of the A/B test comparisons."
    416     },
    417     {
    418       "flag": "No limitations section for proposed method",
    419       "detail": "While Section 3 discusses limitations of the baseline approach, the proposed dual-LLM method's limitations (e.g., dependency on sufficient collective feedback, cold-start for new cluster transitions, scalability of K) are not discussed."
    420     },
    421     {
    422       "flag": "Inference-time scaling contribution not isolated",
    423       "detail": "The paper claims inference-time scaling (best-of-n) is effective, but no ablation separates its contribution from the alignment model's contribution. The improvement could be entirely from better scoring rather than repeated sampling."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Training language models to follow instructions with human feedback",
    429       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    430       "year": 2022,
    431       "relevance": "Foundational RLHF paper; the current work builds on and identifies limitations of this approach for structured recommendation tasks."
    432     },
    433     {
    434       "title": "Large language monkeys: Scaling inference compute with repeated sampling",
    435       "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Ehrlich"],
    436       "year": 2024,
    437       "arxiv_id": "2407.21787",
    438       "relevance": "Introduces the inference-time scaling approach via repeated sampling that this paper adapts for recommendation."
    439     },
    440     {
    441       "title": "LLMs for user interest exploration in large-scale recommendation systems",
    442       "authors": ["Jianling Wang", "Haokai Lu", "Yifan Liu"],
    443       "year": 2024,
    444       "relevance": "The base hierarchical planning system this paper extends with user feedback alignment; describes the deployed LLM-powered exploration paradigm."
    445     },
    446     {
    447       "title": "A survey on large language models for recommendation",
    448       "authors": ["Likang Wu", "Zhi Zheng", "Zhaopeng Qiu"],
    449       "year": 2024,
    450       "relevance": "Comprehensive survey of LLMs applied to recommendation systems, providing context for the broader research area."
    451     },
    452     {
    453       "title": "TallRec: An effective and efficient tuning framework to align large language model with recommendation",
    454       "authors": ["Keqin Bao", "Jizhi Zhang", "Yang Zhang"],
    455       "year": 2023,
    456       "relevance": "Fine-tuning framework for aligning LLMs with recommendation tasks, directly related to the alignment challenge addressed in this paper."
    457     },
    458     {
    459       "title": "Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context",
    460       "authors": ["Gemini Team"],
    461       "year": 2024,
    462       "arxiv_id": "2403.05530",
    463       "relevance": "The base model used for both the novelty and alignment LLMs in this paper's experiments."
    464     },
    465     {
    466       "title": "Large language models are zero-shot rankers for recommender systems",
    467       "authors": ["Yupeng Hou", "Junjie Zhang", "Zihan Lin"],
    468       "year": 2024,
    469       "relevance": "Demonstrates LLM capabilities as zero-shot rankers in recommendation, relevant to understanding LLM reasoning in recommendation tasks."
    470     },
    471     {
    472       "title": "Better generalization with semantic IDs: A case study in ranking for recommendations",
    473       "authors": ["Anima Singh", "Trung Vu", "Nikhil Mehta"],
    474       "year": 2024,
    475       "relevance": "Fine-tuning LLMs for recommendation ranking with semantic IDs, related approach to aligning LLMs with recommendation objectives."
    476     },
    477     {
    478       "title": "Long-term value of exploration: Measurements, findings and algorithms",
    479       "authors": ["Yi Su", "Xiangyu Wang", "Elaine Ya Le"],
    480       "year": 2024,
    481       "relevance": "Addresses long-term value of exploration in recommendation systems, directly related to the exploration-exploitation tradeoff this paper tackles."
    482     },
    483     {
    484       "title": "Large language models as data augmenters for cold-start item recommendation",
    485       "authors": ["Jianling Wang", "Haokai Lu", "James Caverlee"],
    486       "year": 2024,
    487       "relevance": "Uses LLMs as data augmentation tools for recommendation systems, an alternative paradigm to the direct LLM planning approach in this paper."
    488     }
    489   ],
    490   "engagement_factors": {
    491     "practical_relevance": {
    492       "score": 2,
    493       "justification": "Describes a deployed production system on YouTube with concrete architectural patterns, but the approach requires proprietary infrastructure and data that most practitioners cannot replicate."
    494     },
    495     "surprise_contrarian": {
    496       "score": 1,
    497       "justification": "The finding that RLHF catastrophically fails for structured recommendation tasks (format collapse to 2%) is moderately surprising given RLHF's success elsewhere."
    498     },
    499     "fear_safety": {
    500       "score": 0,
    501       "justification": "No AI safety or security concerns are raised; the paper focuses on recommendation quality optimization."
    502     },
    503     "drama_conflict": {
    504       "score": 0,
    505       "justification": "No controversy or provocative claims; straightforward system improvement paper."
    506     },
    507     "demo_ability": {
    508       "score": 0,
    509       "justification": "No code, demo, or reproducible artifacts are released; the system is entirely proprietary."
    510     },
    511     "brand_recognition": {
    512       "score": 3,
    513       "justification": "Google DeepMind and YouTube authors working on YouTube's recommendation system with Gemini; high brand recognition across all dimensions."
    514     }
    515   }
    516 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs