scan.json (27489B)
1 { 2 "paper": { 3 "title": "RankLLM: A Python Package for Reranking with LLMs", 4 "authors": [ 5 "Sahel Sharifymoghaddam", 6 "Ronak Pradeep", 7 "Andre Slavescu", 8 "Ryan Nguyen", 9 "Andrew Xu", 10 "Zijian Chen", 11 "Yilin Zhang", 12 "Yidi Chen", 13 "Jasper Xian", 14 "Jimmy Lin" 15 ], 16 "year": 2025, 17 "venue": "SIGIR 2025", 18 "arxiv_id": "2505.19284", 19 "doi": "10.1145/3726302.3730331" 20 }, 21 "scan_version": 2, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "methodology_tags": ["benchmark-eval"], 24 "key_findings": "RankLLM is an open-source Python package supporting pointwise, pairwise, and listwise LLM-based reranking in a modular framework. Reproduction experiments on TREC DL19-DL23 show specialized fine-tuned models (RankZephyr, DuoT5) outperform out-of-the-box LLMs, while GPT-4o-mini generates 28-75% malformed responses depending on prompt template yet still achieves competitive nDCG@10 through graceful error handling. The framework emphasizes reproducibility through end-to-end scripts and two-click reproduction pages.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "Repository publicly available at rankllm.ai and installable via PyPI ('pip install rank-llm[all]'). GitHub URL github.com/castorini/rank_llm provided. Source installation instructions included." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "All experiments use publicly available TREC DL19-DL23 datasets from MS MARCO V1/V2. The retriever component supports these datasets with prebuilt indexes and caching." 36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": true, 40 "justification": "Specific package version (0.25.0) given for pip install with dependency specification via PyPI. GPU hardware specified ('single NVIDIA RTX A6000 GPU'). CUDA-enabled PyTorch required as prerequisite." 
41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": true, 45 "justification": "Section 5: 'All reported results are generated by running the end-to-end demos/experimental_results.py script.' Two-click reproduction (2CR) pages provided with reproduction matrices. run_rank_llm.py wrapper script encapsulates retrieval and reranking." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": false, 52 "justification": "Table 1 reports point estimates of nDCG@10 only. No confidence intervals or error bars. The paper explicitly states 'we report single-run results' and acknowledges non-deterministic behavior." 53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "No statistical significance tests used. Table 1 compares models by raw nDCG@10 scores without any tests of statistical difference." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": false, 62 "justification": "Table 1 shows raw nDCG@10 scores with BM25 baseline in the same table, but no explicit effect sizes, percentage improvements, or relative gains are reported." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "No justification for using DL19-DL23 datasets specifically, or discussion of whether five test collections provide sufficient statistical power for the comparisons shown." 68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": false, 72 "justification": "Section 5 explicitly states: 'To save costs, we report single-run results.' No variance, standard deviation, or spread measures across runs despite acknowledging that 'out-of-the-box prompt-decoders rank non-deterministically.'" 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "BM25 first-stage baseline shown in Table 1. 
Multiple reranking approaches compared: pointwise (MonoT5), pairwise (DuoT5), and various listwise methods." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "Baselines include recent models: Gemini Flash 2.0, GPT-4o-mini, LLaMA 3.1 8B, Qwen 2.5 7B (all 2024-2025 models), alongside established reranking models." 85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": false, 89 "justification": "No ablation study of framework components. The sliding window parameters (window size 20, stride 10) and prompt templates are configurable but not ablated. Rows 4d-4f compare prompt templates but this is a comparison, not a systematic ablation." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": false, 94 "justification": "Only nDCG@10 is reported for ranking quality (Table 1). The paper mentions supporting mAP@100 and recall@20 (Section 4.4, Figure 8) but does not report these in the experiments." 95 }, 96 "human_evaluation": { 97 "applies": false, 98 "answer": false, 99 "justification": "Human evaluation is not relevant to this paper's claims. The framework reproduces benchmark results using TREC relevance judgments as the established ground truth for IR evaluation." 100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results reported on standard TREC DL19-DL23 test sets with established qrels (relevance judgments). These are well-defined test collections separate from any training data." 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Table 1 breaks down results per dataset (DL19-DL23) and per model/method. Table 2 breaks down malformed response types (OK, Wrong Format, Repetition, Missing) per model." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": true, 114 "justification": "Table 2 shows malformed response distributions. 
Paper discusses that GPT-4o-mini generates '28% to 75% missing candidate ids' depending on prompt template. Section 4.3 describes how malformed responses are processed gracefully." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Several negative results reported: RankVicuna underperforms most methods, RankGPTAPEER has 75.7% missing documents, Qwen 2.5 has 26.4% repetition errors, and out-of-the-box LLMs frequently generate malformed responses." 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "Abstract claims about modularity, configurability, diverse model support, reproducibility features, and integration with Pyserini are all demonstrated through the paper's architecture description, code examples, and experimental results." 127 }, 128 "causal_claims_justified": { 129 "applies": false, 130 "answer": false, 131 "justification": "The paper is primarily a systems/framework paper that describes and demonstrates a toolkit. It does not make causal claims about what causes performance differences — it presents reproduction results and notes that 'comparing model effectiveness is not the main focus.'" 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": true, 136 "justification": "Claims are appropriately scoped to the reranking task on specific TREC DL19-DL23 datasets. The paper does not overclaim beyond the tested setting and explicitly notes it is a framework paper, not a model comparison paper." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "No discussion of alternative explanations for observed performance differences between models. No consideration of confounding factors such as model size, training data differences, or prompt sensitivity beyond noting non-determinism." 
142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper measures nDCG@10 and frames it as 'reranking effectiveness,' which is the standard meaning in IR. No proxy gap — the measurement matches the claim granularity." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": false, 153 "justification": "Open-source models have specific HuggingFace paths (e.g., 'castorini/monot5-3b-msmarco-10k', 'castorini/first_mistral'). Gemini uses specific version 'gemini-2.0-flash-001'. However, GPT-4o-mini (used for rows 4d-4f, 3 key experiments) has no snapshot date or API version specified." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": true, 158 "justification": "Prompt templates (RankGPT, RankGPTAPEER, LRL) are named and implemented in the open-source repository at rankllm.ai. While full prompt text is not reproduced in the paper, the code with exact prompts is publicly available." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": false, 163 "justification": "Reranking-specific parameters reported: window size 20, stride 10, top-100 candidates, context size 4096 for some models. However, LLM generation hyperparameters (temperature, top-p, max tokens) are not stated for any model despite using LLM APIs." 164 }, 165 "scaffolding_described": { 166 "applies": false, 167 "answer": false, 168 "justification": "No agentic scaffolding is used. RankLLM is a reranking framework that sends prompts to LLMs and processes responses — it does not involve autonomous agents, tool use, or multi-step reasoning loops." 
169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "The full pipeline is documented: BM25 retrieves top 100 candidates, sliding window (size 20, stride 10) processes them, long passages truncated to fit context size, malformed responses processed by removing duplicates and appending missing IDs." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": false, 180 "justification": "No dedicated limitations or threats-to-validity section in the paper. Section 6 (Discussion) compares to related tools but does not discuss limitations of RankLLM or the experimental evaluation." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": false, 185 "justification": "No specific threats to validity discussed anywhere in the paper. The only acknowledgment of a limitation is the brief note that results are single-run due to cost constraints." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "No explicit scope boundaries stated. While the paper notes 'comparing model effectiveness is not the main focus,' it does not state what the results do NOT show or what populations/settings are excluded." 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": true, 197 "justification": "All underlying data is publicly available: TREC DL19-DL23 datasets, MS MARCO corpus, and qrels. The reproduction scripts can regenerate all results from scratch." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Data sources are standard IR benchmarks (TREC DL19-DL23 from MS MARCO V1/V2). Collection method is BM25 retrieval of top 100 candidates per query from prebuilt indexes, well described in Section 4.2." 
203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. All data comes from standard public IR benchmarks." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "Full pipeline documented: query → BM25 retrieval (top 100) → Request object creation → sliding window reranking → Result objects → TREC eval format → nDCG@10 evaluation. Each step is explained with code examples." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Acknowledgments section discloses NSERC, Microsoft Accelerating Foundation Models Research program, and IITP grant from Korean Government (MSIT)." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "All ten authors are affiliated with University of Waterloo. Author affiliations are clearly listed. The castorini lab created several evaluated models (RankVicuna, RankZephyr, MonoT5, DuoT5, LiT5) but this relationship is implicit from the shared affiliation and citation structure." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": true, 229 "justification": "Funders (NSERC, Microsoft research program, Korean government IITP grant) are academic funding bodies with no direct stake in which reranking model performs best. The paper is a framework demonstration, not a product evaluation." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests or financial interests statement in the paper. The authors' lab created several of the evaluated models (RankVicuna, RankZephyr, LiT5, MonoT5, DuoT5) which could constitute a form of interest." 
235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "No training data cutoff dates stated for any model used (GPT-4o-mini, Gemini Flash 2.0, LLaMA 3.1, Qwen 2.5). These models may have been trained on data containing TREC/MS MARCO benchmarks." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "No discussion of potential train/test overlap. MS MARCO and TREC DL datasets are widely published and could appear in training data of the evaluated models." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "Not addressed. TREC DL19-DL23 and MS MARCO have been publicly available since before the training cutoffs of models like GPT-4o-mini and Gemini Flash 2.0. No discussion of contamination risk." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this study." 
289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No inference costs, latency measurements, or per-query timing reported. The paper mentions using GPT-4o-mini and Gemini Flash APIs but does not report API costs, wall-clock time, or tokens consumed." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "Hardware mentioned ('single NVIDIA RTX A6000 GPU') but no total compute budget: no GPU hours, wall-clock time, or API costs quantified." 301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": false, 307 "justification": "Section 5 explicitly states: 'To save costs, we report single-run results.' No seed sensitivity analysis despite acknowledging that models 'rank non-deterministically.'" 308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": true, 312 "justification": "Explicitly stated: 'we report single-run results' and 'For more accurate results, we recommend running each experiment multiple times.'" 313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": false, 317 "justification": "No hyperparameter search conducted or budget reported. Default settings used throughout without exploration of alternatives." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": true, 322 "justification": "The paper uses default/standard configurations for each model and explicitly states the defaults (window size 20, stride 10, top 100 candidates). Since this is a reproduction study, using original papers' defaults is the appropriate choice." 323 }, 324 "multiple_comparison_correction": { 325 "applies": false, 326 "answer": false, 327 "justification": "No statistical tests performed, so multiple comparison correction is not applicable." 
328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": false, 332 "justification": "The authors' lab (castorini, University of Waterloo) created RankVicuna, RankZephyr, LiT5, MonoT5, DuoT5, and FirstMistral — 6 of 12 evaluated systems. This potential self-evaluation bias is not acknowledged or discussed." 333 }, 334 "compute_budget_vs_performance": { 335 "applies": true, 336 "answer": false, 337 "justification": "No comparison of performance at matched compute budgets. Models range from small local models (MonoT5-3B) to large proprietary APIs (GPT-4o-mini, Gemini Flash) with vastly different compute requirements, but this is not discussed." 338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of whether nDCG@10 on TREC DL19-DL23 actually measures what matters for practical reranking quality. The benchmarks are used without questioning construct validity." 343 }, 344 "scaffold_confound_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "Different models use different prompt templates, context sizes, and inference frameworks. Rows 4d-4f compare GPT-4o-mini with different prompts (showing prompt effect), but the scaffold confound across models is not explicitly discussed as a variable." 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "Not addressed. DL19-DL23 benchmarks predate the training of GPT-4o-mini, Gemini Flash 2.0, LLaMA 3.1, and Qwen 2.5 — these models may have seen the benchmark data during training." 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "Not addressed. No discussion of whether the reranking setup leaks information not available in real-world usage." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "Not addressed. 
No discussion of independence between training data of evaluated models and the MS MARCO/TREC test collections." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": false, 369 "justification": "No leakage detection or prevention methods used. No canary strings, membership inference tests, or decontamination analysis." 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "RankLLM supports diverse reranking models (pointwise, pairwise, listwise) across proprietary and open-source LLMs in a modular framework.", 376 "evidence": "Architecture described in Sections 3-4 with code examples (Figures 3-9). Table 1 shows results across 12 different model configurations spanning all three reranking paradigms.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Graceful handling of malformed LLM responses allows competitive ranking results despite high error rates in out-of-the-box prompt-decoders.", 381 "evidence": "Table 2 shows GPT-4o-mini has 28-75% malformed responses depending on prompt template, yet Table 1 shows competitive nDCG@10 scores (e.g., RankGPT at 0.7338 on DL19). Section 4.3 describes the graceful processing pipeline.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "The framework enables reproducible results from prior work (RankGPT, RankVicuna, RankZephyr, etc.).", 386 "evidence": "Table 1 presents reproduction results. End-to-end script demos/experimental_results.py generates all results. 2CR reproduction pages provided. However, results are single-run with no variance reported, and 'newer model versions' are used where applicable, so exact reproduction is not verified.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Specialized fine-tuned rerankers (RankZephyr, DuoT5) outperform out-of-the-box LLMs on TREC DL benchmarks.", 391 "evidence": "Table 1 shows RankZephyr (0.7412 on DL19) outperforms Qwen 2.5 (0.6784) and LLaMA 3.1 (0.6688). 
However, single-run results without significance testing make this comparison unreliable.", 392 "supported": "weak" 393 }, 394 { 395 "claim": "RankLLM has achieved widespread community adoption, as evidenced by GitHub stars, PyPI downloads, and integration with frameworks like LlamaIndex, LangChain, and Rerankers.", 396 "evidence": "Stated in Section 1 and Section 7 but no specific numbers (download counts, star counts) are provided to quantify adoption.", 397 "supported": "weak" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "Self-evaluation bias", 403 "detail": "The authors' lab (castorini, University of Waterloo) created 6 of 12 evaluated systems (RankVicuna, RankZephyr, LiT5, MonoT5, DuoT5, FirstMistral) and the framework itself. This self-evaluation bias is not acknowledged. RankZephyr (their model) achieves the best overall results in Table 1." 404 }, 405 { 406 "flag": "Single-run results despite known non-determinism", 407 "detail": "The paper explicitly acknowledges that 'out-of-the-box prompt-decoders rank non-deterministically' but reports only single-run results 'to save costs,' making all comparative claims unreliable." 408 }, 409 { 410 "flag": "No error bars or uncertainty quantification", 411 "detail": "All 60 numbers in Table 1 are point estimates with no confidence intervals, standard deviations, or significance tests despite the acknowledged non-determinism of the evaluated systems." 412 }, 413 { 414 "flag": "No limitations section", 415 "detail": "The paper has no dedicated limitations or threats-to-validity section, despite clear limitations including single-run results, potential contamination, self-evaluation bias, and single-metric evaluation." 416 }, 417 { 418 "flag": "Contamination risk unaddressed", 419 "detail": "All TREC DL benchmarks (2019-2023) and MS MARCO predate the training of the proprietary models (GPT-4o-mini, Gemini Flash). These benchmarks could be in training data, potentially inflating the proprietary models' scores. 
Not discussed." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agents", 425 "authors": ["Weiwei Sun", "Lingyong Yan", "Xinyu Ma"], 426 "year": 2023, 427 "relevance": "Foundational work on LLM-based listwise reranking (RankGPT), directly reproduced in this paper." 428 }, 429 { 430 "title": "RankVicuna: Zero-shot Listwise Document Reranking with Open-Source Large Language Models", 431 "authors": ["Ronak Pradeep", "Sahel Sharifymoghaddam", "Jimmy Lin"], 432 "year": 2023, 433 "arxiv_id": "2309.15088", 434 "relevance": "Demonstrates distilling proprietary LLM reranking capability into open-source models, a core use case of RankLLM." 435 }, 436 { 437 "title": "RankZephyr: Effective and Robust Zero-Shot Listwise Reranking is a Breeze!", 438 "authors": ["Ronak Pradeep", "Sahel Sharifymoghaddam", "Jimmy Lin"], 439 "year": 2023, 440 "arxiv_id": "2312.02724", 441 "relevance": "State-of-the-art open-source listwise reranker achieving better results than its teacher model, key model in RankLLM ecosystem." 442 }, 443 { 444 "title": "FIRST: Faster Improved Listwise Reranking with Single Token Decoding", 445 "authors": ["Revanth Gangi Reddy", "JaeHyeok Doo", "Yifei Xu"], 446 "year": 2024, 447 "relevance": "Novel single-token decoding approach for efficient listwise reranking using learning-to-rank losses, integrated into RankLLM." 448 }, 449 { 450 "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention", 451 "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"], 452 "year": 2023, 453 "relevance": "vLLM inference framework used by RankLLM for efficient open-source model serving." 
454 }, 455 { 456 "title": "Drowning in Documents: Consequences of Scaling Reranker Inference", 457 "authors": ["Mathew Jacob", "Erik Lindgren", "Matei Zaharia"], 458 "year": 2024, 459 "arxiv_id": "2411.11767", 460 "relevance": "Analyzes scalability challenges of LLM-based reranking, directly relevant to the efficiency concerns RankLLM addresses." 461 }, 462 { 463 "title": "Zero-Shot Listwise Document Reranking with a Large Language Model", 464 "authors": ["Xueguang Ma", "Xinyu Zhang", "Ronak Pradeep", "Jimmy Lin"], 465 "year": 2023, 466 "arxiv_id": "2305.02156", 467 "relevance": "LRL zero-shot listwise reranking method, one of the prompt templates implemented and evaluated in RankLLM." 468 }, 469 { 470 "title": "Rankify: A Comprehensive Python Toolkit for Retrieval, Re-Ranking, and Retrieval-Augmented Generation", 471 "authors": ["Abdelrahman Abdallah", "Jamshid Mozafari", "Bhawna Piryani"], 472 "year": 2025, 473 "arxiv_id": "2502.02464", 474 "relevance": "Concurrent reranking toolkit covering retrieval, re-ranking, and RAG — direct competitor to RankLLM." 475 }, 476 { 477 "title": "The Llama 3 Herd of Models", 478 "authors": ["Abhimanyu Dubey", "Abhinav Jauhri"], 479 "year": 2024, 480 "arxiv_id": "2407.21783", 481 "relevance": "LLaMA 3.1 8B used as an out-of-the-box reranking model in RankLLM's benchmark evaluation." 482 }, 483 { 484 "title": "Building a Culture of Reproducibility in Academic Research", 485 "authors": ["Jimmy Lin"], 486 "year": 2022, 487 "arxiv_id": "2212.13534", 488 "relevance": "Foundational work on two-click reproducibility (2CR) methodology adopted by RankLLM." 489 } 490 ] 491 }