scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31693B)
      1 {
      2   "paper": {
      3     "title": "Following the Autoregressive Nature of LLM Embeddings via Compression and Alignment",
      4     "authors": [
      5       "Jingcheng Deng",
      6       "Zhongtao Jiang",
      7       "Liang Pang",
      8       "Zihao Wei",
      9       "Liwei Chen",
     10       "Kun Xu",
     11       "Yang Song",
     12       "Huawei Shen",
     13       "Xueqi Cheng"
     14     ],
     15     "year": 2025,
     16     "venue": "Conference on Empirical Methods in Natural Language Processing",
     17     "arxiv_id": "2502.11401",
     18     "doi": "10.48550/arXiv.2502.11401"
     19   },
     20   "scan_version": 3,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "AutoRegEmbed combines information compression and conditional distribution alignment to produce LLM-based text embeddings that follow the autoregressive nature of decoder models. On 10 STS benchmarks, it achieves an average Spearman correlation of 84.59 with Mistral-7B using 275k training samples, outperforming LLM2Vec-Mistral (84.01), which uses 544k samples. Ablation shows that information compression contributes a 16.99% improvement and conditional distribution alignment adds a further 9.17%. The method also achieves competitive retrieval performance on MS MARCO (42.49 nDCG@10), though it requires an additional epoch of contrastive fine-tuning for retrieval tasks.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The abstract states 'Our code is available at https://github.com/TrustedLLM/AutoRegEmbed' — a concrete GitHub URL is provided."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "All training and evaluation data are publicly available: PWC (Ge et al., 2024), MEDI (Wang et al., 2024a), BGE (Chen et al., 2024b), MS MARCO (Nguyen et al., 2016), and standard STS benchmarks. The paper uses no proprietary data."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Appendix A mentions 'bfloat16 format, enable FlashAttention 2, and train on four A100-80G GPUs with DeepSpeed and Zero-2' but provides no requirements.txt, Dockerfile, or library version numbers sufficient to recreate the environment."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Appendix A provides hyperparameters and training details but no step-by-step reproduction instructions or scripts. A researcher would need to infer the training pipeline from the method description."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Tables 1, 2, 3, 6, and 7 all report point estimates only (e.g., '84.59') with no confidence intervals, error bars, or ± notation."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper claims AutoRegEmbed 'significantly outperforms' and 'surpasses all leading methods' based solely on comparing point estimates. No p-values, t-tests, or any statistical significance tests are reported."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper reports differences with baseline context: 'performance 20% lower' for untrained models, '4.74% lower than AutoRegEmbed' for LLM2Vec unsupervised, 'outperforming LLM2Vec by a margin of 0.58' (Section 4.2). Ablation reports '9.17%' and '16.99%' improvements (Table 3)."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No justification for why 50,000 or 274,951 training samples were chosen. No power analysis or sample size reasoning."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or spread measures are reported anywhere. All results appear to be from single experimental runs."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 1 includes extensive baselines across three categories: no contrastive training (Echo, PromptEOL, MetaEOL, GenEOL), unsupervised contrastive training (LLM2Vec), and supervised contrastive training (NV-Embed, SFR-Embedding-2_R, gte-Qwen2-7B-instruct, LLM2Vec, plus fair baselines). Table 2 similarly includes baselines for retrieval."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include NV-Embed (2024), LLM2Vec (2024), SFR-Embedding-2_R (2024), and gte-Qwen2-7B-instruct (2024), which are recent SOTA models from the MTEB leaderboard."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 3 presents ablation results: removing conditional distribution alignment reduces average from 83.24 to 73.90, and the base model without training scores 56.91. Additionally, variants of the loss function (Log_sigmoid, KL divergence, JS divergence) are tested. Tables 6 and 7 provide further ablation on alignment strategies and temperature parameters."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Spearman correlation is used for STS tasks (Table 1) and nDCG@10 for retrieval tasks (Table 2). These are two distinct metric types across different evaluation paradigms."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "All evaluation is automated: Spearman correlation against human-annotated similarity scores (STS) and nDCG@10 for retrieval. No human evaluation of embedding quality is performed."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper evaluates on established benchmark test sets (STS12-STS22, STS-B, BIOSSES, SICK-R, MS MARCO, NFcorpus, SCIDOCS) which are standard held-out test sets separate from training data."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 1 reports results on each of the 10 individual STS datasets, and Table 2 reports results on each of the 3 retrieval datasets separately, rather than only aggregate averages."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No error analysis or qualitative examination of failure cases. The paper acknowledges weaker retrieval generalization ('AutoRegEmbed is trained solely on MS MARCO') but does not examine specific failure examples or where embeddings break down."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Table 3 shows that KL divergence (79.82) and JS divergence (79.02) variants underperform the original loss (83.24). Table 6 shows four alternative alignment strategies all performing worse. Appendix F shows some hyperparameter configurations that degrade performance (e.g., τ=1.0 drops to 81.61)."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims AutoRegEmbed 'significantly outperforms traditional contrastive learning approaches' — supported by Table 1 margins. It claims 'performance comparable to state-of-the-art models when using the same amount of data' — supported by Table 1 showing 84.59 vs LLM2Vec-supervised 84.01 with less data."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal claims ('information compression contributes a 16.99% improvement', 'Conditional Distribution Alignment improves performance by 9.17%') are based on controlled ablation in Table 3 where individual components are removed while holding the rest constant. This single-variable manipulation is adequate for the claims."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title ('Following the Autoregressive Nature of LLM Embeddings') and abstract make broad claims about LLM embeddings, but experiments use only two model families (LLaMA2-7B and Mistral-v0.1-7B) at a single scale (7B). No experiments on other architectures, sizes, or languages. The retrieval evaluation acknowledges MS MARCO-only training but the broader embedding claims are not bounded."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper attributes improvements to following the 'autoregressive nature' but does not consider alternative explanations — e.g., whether the information compression stage provides an advantage simply through additional training data (16k PWC samples), or whether the frozen decoder acts as a regularizer independent of the autoregressive motivation."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures Spearman correlation on STS benchmarks and nDCG@10 on retrieval benchmarks and frames results in terms of these specific metrics. It does not overclaim beyond 'text embedding quality' as measured by these standard benchmarks."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper specifies LLaMA2 7B and Mistral-v0.1 7B as base models. For open-source models with unique base variants at each size, this is sufficient to identify the exact model."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Table 5 in Appendix D provides the full instruction text for both retrieval and STS tasks. For example, I_next for STS: 'This sentence means in one word: \"'. These are the actual prompts used in experiments."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix A reports learning rates (2e-5 and 5e-6), batch sizes (32), epochs (2 and 4), temperature parameters (τ=0.05, β=0.1), max token length (512), number of compressed tokens (5), precision format (bfloat16), and hardware (4×A100-80G)."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. This is a training method for text embeddings."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 4.1 documents data preprocessing: PWC dataset deduplicated from 241,564 to 16,382 samples ('To reduce redundancy caused by repeated contexts, we remove duplicates'). For retrieval, hard negative mining is described: '7 hard negatives from the ranked list positions 30 to 210' using NV-Embed."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6 is titled 'Limitations' and contains two substantive paragraphs about the inability to filter harmful content and bias risks in training data."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The limitations section discusses only safety/ethical concerns (harmful content, biased embeddings) — not specific threats to the validity of the experimental results. No discussion of threats like single-run evaluation, limited model families, or STS benchmark saturation."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to tested model families (LLaMA2/Mistral only), tested scale (7B only), tested language (English only), or tested task types. The retrieval discussion partially bounds that domain but broader embedding claims are unbounded."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "All datasets are publicly available: STS benchmarks via MTEB, MS MARCO, PWC, MEDI, and BGE datasets are all standard public resources. Independent verification of the training and evaluation data is possible."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 4.1 describes training data sources in detail: PWC dataset with deduplication (241,564→16,382), NLI portion of MEDI (50,000 samples), BGE (274,951 samples), and MS MARCO training set for retrieval. Evaluation datasets are standard benchmarks cited with references."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. All data sources are standard public benchmarks and datasets."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The two-stage pipeline is documented: (1) information compression training on PWC-Unique (16,382 samples, 2 epochs), (2) conditional distribution alignment on NLI data (50k or 275k samples, 4 epochs). For retrieval, an additional contrastive fine-tuning epoch is described. Hard negative mining procedure is detailed."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The Acknowledgments section lists specific grants: Strategic Priority Research Program of CAS (No.XDB0680302), NSFC (No.62276248), Key R&D Program of Xinjiang (No.2024B03026), Beijing Nova Program (No.20250484765), and Youth Innovation Promotion Association CAS (No.2023111)."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Institute of Computing Technology (Chinese Academy of Sciences), University of Chinese Academy of Sciences, and Kuaishou Technology."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Funders are Chinese government research agencies (CAS, NSFC, Beijing Nova Program) and a regional government program. These have no direct financial interest in the performance of a text embedding method."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial disclosure statement is included. Three authors are affiliated with Kuaishou Technology (a major tech company), but no declaration of potential financial interests or conflict of interest statement is present."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper uses LLaMA2 and Mistral-v0.1 as base models but does not state their training data cutoff dates. This is necessary to assess whether STS benchmark data could have been in pre-training."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "STS benchmarks (2012-2017) were publicly available well before LLaMA2 and Mistral training. No discussion of whether STS test examples appeared in the pre-training data."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "STS12-STS17 were published 2012-2017, and STS-B (2017) is drawn from those same SemEval tasks — all predating LLaMA2/Mistral training. The paper does not discuss contamination risk despite evaluating on these benchmarks."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference cost or latency is reported. Appendix A mentions reduced pairwise computations (64 vs 4096) during training but does not report inference-time cost, latency, or throughput."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Appendix A states: 'The information compression task takes 20 minutes, while the conditional distribution alignment task, involving 50,000 samples, takes approximately 1 hour' on 'four A100-80G GPUs with DeepSpeed and Zero-2.' Fair baselines trained 'in 1 hour' on same hardware."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper does not state how many experimental runs produced the reported results."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Appendix F reports the grid search: τ ∈ {0.02, 0.05, 0.1, 0.2, 1.0} and β ∈ {0.1, 0.2, 0.3, 0.4}. Table 7 shows 9 tested configurations. For fair baselines, Appendix A reports tested batch sizes {128, 256, 512, 1024} and learning rates {1e-5, 5e-5, 1e-4, 2e-4}."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Table 7 in Appendix F shows all tested hyperparameter configurations with their performance. The paper reports that τ=0.05, β=0.1 performs best. For baselines, batch size 512 and learning rate 1e-4 were found optimal via grid search."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Comparisons across 10 STS datasets and multiple baselines involve many pairwise comparisons. No correction for multiple comparisons (Bonferroni, Holm, etc.) is applied."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors implement both their method and the 'fair baselines' themselves. No acknowledgment of self-comparison bias per Lucic et al. (2018), despite the fact that their baseline implementations could systematically underperform."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Figure 1 plots training data size vs STS performance for all methods, showing Pareto fronts. Figure 3 directly compares learning efficiency (number of samples vs STS performance). Table 1 includes a Vol. column showing training data requirements for each method."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper uses STS benchmarks and MTEB without discussing whether Spearman correlation on human similarity ratings actually measures the embedding quality the paper claims. No discussion of construct validity or comparison with alternative evaluation paradigms."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding is involved in this embedding method."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "STS benchmarks (2012-2017) and MS MARCO (2016) were published years before LLaMA2 and Mistral were trained. The paper does not discuss whether these benchmark examples could have been in the models' pre-training data."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the evaluation setup leaks information. Not addressed."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of whether training data (MEDI NLI, BGE) overlaps with or shares structural similarity with the STS evaluation benchmarks."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "AutoRegEmbed significantly outperforms traditional contrastive learning approaches on STS benchmarks using the same training data.",
    375       "evidence": "Table 1 shows AutoRegEmbed-LLaMA2 at 83.24 (10 datasets) vs the best fair contrastive baseline at 81.90 with identical 274,951 training samples. With 50k samples, AutoRegEmbed (83.24) vs best contrastive baseline (81.53). Section 4.2.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "AutoRegEmbed achieves performance comparable to SOTA with far fewer training samples.",
    380       "evidence": "Table 1 shows AutoRegEmbed-Mistral at 84.59 with 275k samples vs LLM2Vec-Mistral (supervised) at 84.01 with 544k samples, NV-Embed at 82.84 with 1M+ samples, and gte-Qwen2 at 83.06 with ~791M samples. However, these comparisons are confounded by different base models, multi-task training, and closed-source data.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Information compression contributes a 16.99% improvement and conditional distribution alignment adds 9.17%.",
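    385       "evidence": "Table 3 ablation: base LLaMA2 without training = 56.91, with information compression only = 73.90, with both tasks = 83.24. The 16.99 figure equals the absolute difference 73.90 - 56.91; the second step computes to 83.24 - 73.90 = 9.34 points, close to but not identical to the reported 9.17. Section 4.4.",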
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "AutoRegEmbed exhibits the fastest learning efficiency growth compared to contrastive learning baselines.",
    390       "evidence": "Figure 3 shows AutoRegEmbed consistently above four contrastive learning baselines at every data point from 0 to 40,000 samples. 'With just 15,000 samples, AutoRegEmbed already surpasses the maximum performance of other contrastive learning models.' Section 4.5.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "The original loss function (Equation 2) outperforms KL divergence, JS divergence, and Log-sigmoid variants.",
    395       "evidence": "Table 3: Equation 2 achieves 83.24 vs Log_sigmoid 82.93, KL divergence 79.82, JS divergence 79.02. Section 4.4 and Appendix B provide analysis of why KL/JS underperform.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "AutoRegEmbed outperforms most SOTA models on MS MARCO retrieval.",
    400       "evidence": "Table 2 shows AutoRegEmbed at 42.49 nDCG@10 vs LLM2Vec-supervised 41.45 and SFR-Embedding-2_R 42.18, but below gte-Qwen2 45.98. On NFcorpus and SCIDOCS, performance is lower than several baselines. Section 4.3.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No error bars or uncertainty quantification",
    407       "detail": "All results across all tables (1, 2, 3, 6, 7) are single-point estimates with no confidence intervals, standard deviations, or variance from multiple runs. For a paper claiming to 'significantly outperform' baselines, the absence of uncertainty quantification means claimed differences could be within noise."
    408     },
    409     {
    410       "flag": "No contamination analysis despite using old benchmarks",
    411       "detail": "STS benchmarks (2012-2017) and their labeled pairs were publicly available years before LLaMA2 and Mistral were trained. The paper does not discuss whether these test examples could have been memorized during pre-training, which would confound the evaluation of the fine-tuning method's effectiveness."
    412     },
    413     {
    414       "flag": "Self-comparison bias",
    415       "detail": "The authors implement their own 'fair baselines' for contrastive learning and compare against these. Per Lucic et al. (2018), authors' re-implementations of baselines systematically underperform. No independent evaluation or acknowledgment of this bias."
    416     },
    417     {
    418       "flag": "Retrieval results undermine core claim",
    419       "detail": "The paper claims the method eliminates the need for traditional contrastive learning, yet for retrieval tasks (Section 4.1), 'we perform an additional epoch of contrastive fine-tuning to better align with the evaluation process.' This additional cosine-based training partially contradicts the stated advantage."
    420     },
    421     {
    422       "flag": "SOTA comparisons confounded by data differences",
    423       "detail": "The paper compares data efficiency against SOTA models (NV-Embed, gte-Qwen2, SFR-Embedding-2_R) that use closed-source data and multi-task training. The difference could be attributed to data quality/diversity rather than the training method, making the data-efficiency claim difficult to verify."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Llm2vec: Large language models are secretly powerful text encoders",
    429       "authors": ["Parishad BehnamGhader", "Vaibhav Adlakha", "Marius Mosbach", "Dzmitry Bahdanau", "Nicolas Chapados", "Siva Reddy"],
    430       "year": 2024,
    431       "arxiv_id": "2404.05961",
    432       "relevance": "Key baseline for converting decoder-only LLMs into text encoders via bidirectional attention and contrastive learning."
    433     },
    434     {
    435       "title": "Nv-embed: Improved techniques for training llms as generalist embedding models",
    436       "authors": ["Chankyu Lee", "Rajarshi Roy", "Mengyao Xu", "Jonathan Raiman", "Mohammad Shoeybi", "Bryan Catanzaro", "Wei Ping"],
    437       "year": 2024,
    438       "arxiv_id": "2405.17428",
    439       "relevance": "SOTA LLM-based embedding model using latent attention layer pooling, main baseline for embedding quality comparison."
    440     },
    441     {
    442       "title": "MTEB: massive text embedding benchmark",
    443       "authors": ["Niklas Muennighoff", "Nouamane Tazi", "Loïc Magne", "Nils Reimers"],
    444       "year": 2023,
    445       "relevance": "Standard evaluation framework used for STS and retrieval benchmarks; defines the evaluation methodology for text embedding models."
    446     },
    447     {
    448       "title": "Direct preference optimization: Your language model is secretly a reward model",
    449       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D. Manning", "Stefano Ermon", "Chelsea Finn"],
    450       "year": 2023,
    451       "relevance": "DPO alignment technique that inspired the conditional distribution alignment loss design (S2 function in Equation 2)."
    452     },
    453     {
    454       "title": "Improving text embeddings with large language models",
    455       "authors": ["Liang Wang", "Nan Yang", "Xiaolong Huang", "Linjun Yang", "Rangan Majumder", "Furu Wei"],
    456       "year": 2024,
    457       "relevance": "Uses LLM-generated synthetic data for embedding fine-tuning; MEDI dataset used as training data in this work."
    458     },
    459     {
    460       "title": "Llama2vec: Unsupervised adaptation of large language models for dense retrieval",
    461       "authors": ["Chaofan Li", "Zheng Liu", "Shitao Xiao", "Yingxia Shao", "Defu Lian"],
    462       "year": 2024,
    463       "relevance": "Most closely related work: proposes pretext tasks for unsupervised LLM adaptation before contrastive fine-tuning for embeddings."
    464     },
    465     {
    466       "title": "In-context autoencoder for context compression in a large language model",
    467       "authors": ["Tao Ge", "Jing Hu", "Lei Wang", "Xun Wang", "Si-Qing Chen", "Furu Wei"],
    468       "year": 2024,
    469       "relevance": "Inspiration for the information compression task; PWC dataset from this work is used for compression training."
    470     },
    471     {
    472       "title": "Generative representational instruction tuning",
    473       "authors": ["Niklas Muennighoff", "Hongjin Su", "Liang Wang", "Nan Yang", "Furu Wei", "Tao Yu", "Amanpreet Singh", "Douwe Kiela"],
    474       "year": 2024,
    475       "arxiv_id": "2402.09906",
    476       "relevance": "Demonstrates joint generative and representational training of LLMs, relevant to combining generative and discriminative objectives."
    477     },
    478     {
    479       "title": "SimCSE: Simple contrastive learning of sentence embeddings",
    480       "authors": ["Tianyu Gao", "Xingcheng Yao", "Danqi Chen"],
    481       "year": 2021,
    482       "relevance": "Foundational contrastive learning method for text embeddings; LLM2Vec's unsupervised approach builds on SimCSE."
    483     },
    484     {
    485       "title": "Fine-tuning llama for multi-stage text retrieval",
    486       "authors": ["Xueguang Ma", "Liang Wang", "Nan Yang", "Furu Wei", "Jimmy Lin"],
    487       "year": 2024,
    488       "relevance": "RepLLaMA baseline using LLaMA's last-token hidden state for dense retrieval via contrastive fine-tuning."
    489     },
    490     {
    491       "title": "Representation learning with contrastive predictive coding",
    492       "authors": ["Aäron van den Oord", "Yazhe Li", "Oriol Vinyals"],
    493       "year": 2018,
    494       "arxiv_id": "1807.03748",
    495       "relevance": "InfoNCE loss function that forms the structural basis for the conditional distribution alignment loss."
    496     }
    497   ],
    498   "engagement_factors": {
    499     "practical_relevance": {
    500       "score": 2,
    501       "justification": "Practitioners working with LLM-based text embeddings could adopt AutoRegEmbed for more efficient embedding training; the code is publicly released."
    502     },
    503     "surprise_contrarian": {
    504       "score": 1,
    505       "justification": "Challenges the default use of standard contrastive learning for LLM embeddings, but the idea of leveraging autoregressive properties is a natural extension rather than a paradigm shift."
    506     },
    507     "fear_safety": {
    508       "score": 0,
    509       "justification": "No safety or security implications; this is a training method for text embeddings."
    510     },
    511     "drama_conflict": {
    512       "score": 0,
    513       "justification": "No controversy or conflict; a technical improvement to embedding training methodology."
    514     },
    515     "demo_ability": {
    516       "score": 2,
    517       "justification": "Code is available on GitHub and the method can be applied to open-source LLMs (LLaMA2, Mistral), though it requires GPU resources to train."
    518     },
    519     "brand_recognition": {
    520       "score": 0,
    521       "justification": "The authors are from the Chinese Academy of Sciences and Kuaishou Technology — respected in NLP research, but with low general brand recognition."
    522     }
    523   }
    524 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs