scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25043B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM Alignment as Retriever Optimization: An Information Retrieval Perspective",
      6     "authors": [
      7       "Bowen Jin",
      8       "Jinsung Yoon",
      9       "Zhen Qin",
     10       "Ziqi Wang",
     11       "Wei Xiong"
     12     ],
     13     "year": 2025,
     14     "venue": "International Conference on Machine Learning",
     15     "arxiv_id": "2502.03699",
     16     "doi": "10.48550/arXiv.2502.03699"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The 38.9% and 13.7% averaged relative improvements on AlpacaEval2 and MixEval-Hard are supported by Table 2 results (e.g., LARPO LambdaRank 34.9% vs SimPO 21.5% LC WR on Mistral-Base).",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about hard negatives, list size, and optimization objective are each tested in controlled ablation studies (Figure 4a/b/c, Table 3, Table 4) with individual variables manipulated.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper makes broad claims about 'LLM alignment' in the title and conclusions but only tests 7B-class models on two benchmarks (AlpacaEval2, MixEval); no scope limitations on model scale, domain, or safety alignment are stated.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "LARPO uses 10 candidate responses vs. 2 for DPO baselines, confounding data quantity with loss function design; the paper does not discuss whether improvements stem from more responses rather than the IR-inspired objectives.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims to improve 'alignment quality' broadly but measures only instruction-following win rates on AlpacaEval2 (LLM-judged) and MixEval; the gap between these proxies and actual alignment (safety, truthfulness, etc.) is not discussed.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no limitations section; the Impact Statement explicitly states 'we do not believe any specific impacts warrant explicit discussion,' and no threats-to-validity section exists.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed anywhere in the paper, including the confound of different candidate list sizes between LARPO and baselines.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper never explicitly states what the results do not show (e.g., no discussion of applicability to larger models, safety alignment, or non-instruction-following tasks).",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments disclose funding from Apple PhD Fellowship, DARPA, ONR, NSF, Cisco, and Center for Intelligent Information Retrieval.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly disclosed: UIUC, Google Cloud AI Research, Google DeepMind, and University of Virginia.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funders (NSF, DARPA, ONR, Cisco) are independent of the alignment benchmark outcomes; Google-affiliated authors evaluate primarily Mistral models, not Google products.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present beyond the funding acknowledgments.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The IR-alignment analogy (LLM as retriever, reward model as reranker) is precisely defined in Section 2, and LARPO's objectives are formally stated in Table 1 with proofs in Appendix F.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Four explicit contributions are listed in the introduction: the IR-alignment framework, significance of three IR principles, the LARPO method, and empirical IR-metric analysis of LLMs.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 7 relates LARPO to DPO, SimPO, LiPO, and iterative DPO, explaining how this work differs from and builds on each; the most related work (LiPO) is specifically distinguished.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository or release is mentioned anywhere in the paper.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All evaluation benchmarks (AlpacaEval2, MixEval, GSM8K, MATH) and training data (Ultrafeedback) are standard publicly available datasets.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or equivalent dependency specification is provided; hardware and software environment are not described.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Appendix H provides hyperparameters but no step-by-step reproduction instructions; critical details like GPU type, training time, and data preprocessing code are absent.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No confidence intervals or error bars are reported for any result across Tables 2-4 or Figures 4-6.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied despite comparative claims against multiple baselines.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Relative improvements are quantified (38.9% on AlpacaEval2, 13.7% on MixEval-Hard) with baseline values provided for context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The choice of benchmark sizes (805 AlpacaEval2 questions, 1000 MixEval-Hard) is not discussed in terms of statistical power or justification.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or results across multiple runs are reported for any experimental condition.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Eight offline baselines (RRHF, SLiC-HF, DPO, IPO, CPO, KTO, RDPO, SimPO) and one online baseline (Iterative DPO) are included in Table 2.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include SimPO (2024), Iterative DPO (2024), and KTO (2024), which are current state-of-the-art direct preference optimization methods.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Separate ablations study the optimization objective (Table 3), hard negative hardness (Figure 4a/b), candidate list size (Figure 4c), and memorization (Table 4).",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are reported: LC Win Rate and raw Win Rate on AlpacaEval2, and Score on both MixEval and MixEval-Hard.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Evaluation uses LLM-as-judge (AlpacaEval2 with GPT-4 or Llama-3-70B, MixEval automated scoring); no human evaluation of system outputs is conducted.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "AlpacaEval2 and MixEval serve as held-out test sets; training is on Ultrafeedback, and GSM8K/MATH use standard train/test splits.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "No per-category or per-task breakdown of results is provided; metrics are reported only as aggregate scores across all benchmark questions.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No failure cases or error analysis is presented; the paper only reports aggregate performance improvements.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Figure 4b shows that very low temperature for negative generation degrades performance, and Appendix H.6 notes that responses become near-identical below a threshold, constituting a reported negative finding.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model identifiers are given: Mistral-7b-base, Mistral-7b-it, Gemma2-2b-it, Mathstral-7b-it, with references to the original model papers.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "No prompts, system instructions, or prompt templates used during training or evaluation are provided.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Appendices H.1-H.7 report learning rates (e.g., 5e-7), number of iterations (3), number of responses (10), temperature search ranges, and epoch counts for each experiment.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "This paper is about training-time alignment methods, not agentic scaffolding; no agentic scaffolding is used.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "Data preprocessing for Ultrafeedback training is not documented; how prompts are sampled and filtered from the dataset is not described.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No model checkpoints, generated response sets, or reward model scores are made available for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Algorithm 1 and Section 4 describe the iterative data collection process (generate k responses, score with reward model, rank and select for training) in sufficient detail.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants are involved; standard benchmark datasets are used.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Algorithm 1 documents the full pipeline from prompt sampling through response generation, reward scoring, ranking, and model update across iterations.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The training data cutoffs of the Mistral-7b and Gemma2-2b base models are not stated, which matters since AlpacaEval2 and MixEval questions may appear in pretraining data.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of potential overlap between base model pretraining data and the evaluation benchmarks (AlpacaEval2, MixEval, GSM8K, MATH).",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The possibility that GSM8K, MATH, or AlpacaEval2 questions appeared in the base models' pretraining data is not discussed anywhere.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants involved.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants involved.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants involved.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants involved.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants involved.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants involved.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants involved.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference latency or cost is reported despite iterative training with 10 generations per prompt being substantially more expensive than DPO.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No GPU type, GPU hours, or total computational budget is stated for any experiment.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "LARPO achieves 38.9% relative averaged improvement on AlpacaEval2 and 13.7% on MixEval-Hard compared to competitive baselines.",
    375       "evidence": "Table 2 shows LARPO (LambdaRank) reaches 34.9% LC WR vs 21.5% for SimPO on Mistral-Base with LLM-Blender reward model; relative improvement calculation over averaged baselines.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Listwise objectives (LambdaRank, ListMLE) outperform pairwise (DPO) and contrastive objectives for LLM alignment.",
    380       "evidence": "Table 3 shows LambdaRank (40.29% LC WR) and ListMLE (38.02%) exceed pairwise DPO (36.43%) for Mistral-7b-it; consistent pattern across both models.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Harder negative samples (lower temperature, on-prompt responses) lead to stronger trained LLMs.",
    385       "evidence": "Figure 4a shows hardest negatives (temp=0.7, correct prompt) achieve highest accuracy (~0.83) vs. easiest random negatives (~0.75) across 3 iterations on GSM8K.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Larger candidate lists improve alignment performance, with diminishing returns.",
    390       "evidence": "Figure 4c shows win rate increases from ~50% (4 responses) to ~62% (10 responses) on AlpacaEval2 with Mistral-7b-it using contrastive objective.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Including responses from all previous iterations in the candidate pool improves alignment over using only the current iteration.",
    395       "evidence": "Table 4 shows 'w. current + all prev' achieves 72.50% WR vs. 66.56% for 'w. current only' using Lpair on Gemma2-2b-it.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "The LLM-as-retriever analogy is empirically validated: Pass@N curves for LLMs mirror Recall@N curves for IR retrievers.",
    400       "evidence": "Figure 2 shows parallel increasing curves for e5 retriever (Recall@N) and Mathstral-7b-it (Pass@N) on NQ and GSM8K respectively as N increases.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "theoretical"
    407   ],
    408   "key_findings": "LARPO reframes LLM alignment as iterative retriever optimization by formally mapping LLMs to bi-encoder retrievers and reward models to cross-encoder rerankers, deriving listwise alignment objectives from IR ranking theory. Applied to 7B-class models on AlpacaEval2 and MixEval, LARPO's listwise objectives consistently outperform DPO and SimPO, with the LambdaRank variant achieving the strongest results. Ablations confirm three independent contributors (harder negative samples, larger candidate lists, and listwise over pairwise objectives), each providing additive gains. The analogy is further empirically validated by showing that Pass@N curves for LLMs mirror Recall@N curves for traditional retrievers.",
    409   "red_flags": [
    410     {
    411       "flag": "Unfair baseline comparison",
    412       "detail": "Offline baseline scores for Table 2 are taken directly from the SimPO paper (Meng et al., 2024b), while LARPO and iterative DPO are evaluated by the authors. LLM-as-judge scores on AlpacaEval2 are not stable across time and evaluator versions, making cross-paper score comparisons unreliable."
    413     },
    414     {
    415       "flag": "Data quantity confound",
    416       "detail": "LARPO generates 10 candidate responses per prompt while DPO baselines use 2. The ablation in Figure 4c shows more responses improve performance, but the main Table 2 comparison does not control for this, making it impossible to isolate the effect of the IR-inspired loss from the effect of more training signal."
    417     },
    418     {
    419       "flag": "No variance or significance testing",
    420       "detail": "No confidence intervals, error bars, or statistical significance tests are reported for any result, despite iterative training introducing substantial run-to-run variance."
    421     },
    422     {
    423       "flag": "No code or checkpoint release",
    424       "detail": "No implementation code, trained model checkpoints, or generated response sets are released, preventing reproduction of the claimed improvements."
    425     },
    426     {
    427       "flag": "Compute cost unreported",
    428       "detail": "LARPO requires iterative retraining with 10 generated responses per prompt (vs. 2 for DPO); the substantially higher computational cost is never quantified or discussed."
    429     },
    430     {
    431       "flag": "No limitations section",
    432       "detail": "The paper has no dedicated limitations section; the Impact Statement explicitly declines to discuss any specific societal implications."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    438       "relevance": "Primary baseline and theoretical foundation; LARPO is explicitly framed as an enhancement of DPO's pairwise assumption with listwise IR objectives."
    439     },
    440     {
    441       "title": "SimPO: Simple Preference Optimization with a Reference-Free Reward",
    442       "relevance": "Strongest offline baseline against which LARPO is compared; provides the baseline checkpoint scores used in Table 2."
    443     },
    444     {
    445       "title": "LiPO: Listwise Preference Optimization through Learning-to-Rank",
    446       "relevance": "Most related prior work applying learning-to-rank objectives to LLM alignment; LARPO differentiates itself by online iterative data generation vs. LiPO's off-the-shelf listwise data."
    447     },
    448     {
    449       "title": "Iterative Preference Learning from Human Feedback: Bridging Theory and Practice for RLHF under KL-Constraint",
    450       "relevance": "Online alignment baseline (Iterative DPO) directly compared against LARPO; provides the iterative training framework that LARPO extends."
    451     },
    452     {
    453       "title": "Optimizing Dense Retrieval Model Training with Hard Negatives",
    454       "relevance": "Key IR paper establishing importance of hard negatives for retriever training; motivates LARPO's hard negative strategy for LLM alignment."
    455     },
    456     {
    457       "title": "RocketQA: An Optimized Training Approach to Dense Passage Retrieval",
    458       "relevance": "IR work on candidate list construction and retriever optimization that directly inspires LARPO's inclusiveness and memorization principles."
    459     },
    460     {
    461       "title": "AlpacaEval: A Simple Way to Debias Automatic Evaluators",
    462       "relevance": "Primary evaluation benchmark; length-controlled win rate metric is the main measure of LARPO's performance."
    463     },
    464     {
    465       "title": "MixEval: Deriving Wisdom of the Crowd from LLM Benchmark Mixtures",
    466       "relevance": "Secondary evaluation benchmark used to validate LARPO performance beyond the AlpacaEval2 results."
    467     },
    468     {
    469       "title": "UltraFeedback: Boosting Language Models with Scaled AI Feedback",
    470       "relevance": "Training dataset used for all LARPO and baseline experiments; central to reproducibility."
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "Proposes a concrete training method (LARPO) that practitioners could apply to LLM alignment, but no code release limits immediate adoption."
    477     },
    478     "surprise_contrarian": {
    479       "score": 2,
    480       "justification": "The reframing of LLM alignment as an IR retrieval problem is a genuinely novel perspective that challenges the typical RL/preference-optimization framing."
    481     },
    482     "fear_safety": {
    483       "score": 1,
    484       "justification": "Addresses AI alignment tangentially but focuses on instruction-following quality rather than safety or risk; no safety implications discussed."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "Positions itself against DPO's dominance and makes strong performance claims, but the competitive framing is standard for alignment papers."
    489     },
    490     "demo_ability": {
    491       "score": 0,
    492       "justification": "No code, no demo, no interactive system released; results are only reproducible with significant compute and implementation effort."
    493     },
    494     "brand_recognition": {
    495       "score": 2,
    496       "justification": "Authors from Google Cloud AI Research and Google DeepMind, published at ICML 2025, providing significant institutional credibility."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "43876843",
    503         "title": "Stop treating `AGI' as the north-star goal of AI research",
    504         "points": 46,
    505         "comments": 32,
    506         "url": "https://news.ycombinator.com/item?id=43876843"
    507       }
    508     ],
    509     "top_points": 46,
    510     "total_points": 46,
    511     "total_comments": 32
    512   }
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs