scan.json (28927B)
1 { 2 "paper": { 3 "title": "Hint-Augmented Re-ranking: Efficient Product Search using LLM-Based Query Decomposition", 4 "authors": [ 5 "Yilun Zhu", 6 "Nikhita Vedula", 7 "Shervin Malmasi" 8 ], 9 "year": 2025, 10 "venue": "IJCNLP-AACL", 11 "arxiv_id": "2511.13994", 12 "doi": "10.48550/arXiv.2511.13994" 13 }, 14 "scan_version": 2, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "LLM-generated structured \"hints\" (brands, features, alternative queries) decomposing superlative e-commerce queries improve retrieval by 10.9 MAP points and ranking by 5.9 MRR points. Fine-tuned small models (0.5B–3B parameters) augmented with hints outperform 72B listwise LLM rankers and DeepSeek-R1 at 14x lower compute cost. Human evaluation with 10 annotators confirms that the hint-augmented system wins 65% of non-tie comparisons. The approach enables concurrent hint generation and retrieval, adding only 3.5–6.8% latency overhead.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper states 'Code and data is available at: https://github.com/yilunzhu/superhints/' in a footnote on page 1." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The dataset is built on the publicly available Amazon Shopping Queries Dataset (KDD 2022 Cup), and the authors state their code and data are available at their GitHub repository." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "Appendix A.1 mentions hardware (NVIDIA L40s, A100 GPUs, AWS instance types) and training settings (learning rate, batch size, FP16), but no requirements.txt, Dockerfile, or library version specifications are provided." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are included in the paper itself. 
The GitHub link is provided but the paper does not contain a 'Reproducing Results' section or scripts to replicate experiments." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Tables 2, 3, and 4 report only point estimates. No confidence intervals, error bars, or ± notation are provided for any metric." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims improvements (e.g., '+6.2 points', '10.9 points in MAP') but no statistical significance tests (p-values, t-tests, bootstrap tests) are reported." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports absolute point differences with baseline context: e.g., '3B model improves to 64.39 P@1 (+6.2 points)' and 'QE-BM25 achieves 25.93 P@1, significantly outperforming BM25 (+10.5 points over doc2query).' Table 2 provides full baseline numbers for computing relative improvements." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The dataset contains 21,407 queries and 1,070,350 pairs but no justification is given for these sizes. No power analysis or rationale for why these numbers are sufficient." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run numbers. Table 3 reports p5 and p95 percentile latencies but no variance for quality metrics." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Multiple baselines are included: vanilla BM25, doc2query, dense retriever (gte-Qwen2-7B-instruct), listwise LLM rankers (Qwen2.5-72B, DeepSeek-R1), and pointwise models without hints." 
73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "Baselines include DeepSeek-R1 (2025), Qwen2.5 family (2024), gte-Qwen2-7B-instruct (2023), and doc2query (2023). These are recent and competitive models." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper ablates hint augmentation (P vs P+H) across two model sizes (0.5B, 3B), and compares two query generation methods (features vs queries, Table 10). The with/without hints comparison isolates the contribution of LLM-generated interpretations." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Six evaluation metrics are used: P@1, P@3, P@5, P@10, MAP, and MRR (Tables 2 and 4)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": true, 92 "justification": "Section 4.3 describes human evaluation with 10 expert annotators judging 153 queries in a blind setting, with Cohen's kappa of 0.74 measuring inter-annotator agreement." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "Table 1 shows explicit train/dev/test splits (13,914/2,140/5,353 queries). Results in Tables 2 and 4 are reported on the test set. Early stopping uses validation performance." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table 8 (Appendix D.4) provides performance breakdowns across 16 product categories. Table 11 (Appendix D.5) stratifies by number of relevant items per query." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Table 9 and Appendix D.1 identify three error categories (brand recognition, feature interpretation, relevancy assessment) with qualitative examples showing where the baseline fails and hints help." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Table 10 shows that feature-based augmentation performs marginally better at P@1 but worse on other metrics than query reformulations. The listwise 72B model underperforms fine-tuned 0.5B models, demonstrating that larger models don't always win." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Abstract claims of '10.9 points in MAP' (35.74 vs 24.79 in Table 2) and '5.9 points in MRR' (74.74 vs 68.82 in Table 2) are directly supported by the results. The claim that small models surpass large models is confirmed (3B P+H 64.39 P@1 vs 72B 35.31 P@1)." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper's primary causal claim is that hint augmentation improves ranking. This is supported by controlled ablation (P vs P+H) where the only variable changed is the presence of hints, using the same model architecture, training setup, and test data." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": true, 129 "justification": "The Limitations section explicitly bounds scope: 'our method addresses only English (particularly US) queries' and acknowledges limited model diversity (only Qwen family for pointwise, two open-source LLMs for listwise). Future work for non-superlative and multilingual settings is noted." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper does not discuss alternative explanations for why hints improve performance. For example, improvements could be due to query expansion effects (more tokens = better matching) rather than genuine superlative interpretation, but this confound is not addressed." 
135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper measures P@k, MAP, and MRR directly on their ranking task and claims improvements in ranking quality. The human evaluation validates that automated metrics align with human judgment. The measurements match the granularity of the claims." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": true, 146 "justification": "Specific model identifiers are used throughout: 'Claude 3.5 Sonnet v2', 'Claude 3.7 Sonnet', 'qwen2.5-72B-Instruct', 'qwen2.5-0.5B-instruct', 'qwen2.5-3B-instruct', 'gte-Qwen2-7B-instruct', 'DeepSeek-R1'. These are specific versioned model names identifiable on model hubs." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Full prompt texts are provided in the appendix: Tables 12 (query generation), 13 (relevance annotation), 14 (hint generation), and 15 (listwise ranking). These include the complete instructions sent to the models." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "Training hyperparameters for fine-tuned models are reported (learning rate 1e-5, batch sizes, early stopping, FP16). However, temperature, top-p, and other sampling settings for LLM inference (Claude hint generation, DeepSeek-R1 and Qwen-72B listwise ranking) are not stated." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. The system is a standard retrieve-then-rerank pipeline with offline hint generation, not an agent-based system." 
162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Sections 3.1-3.2 document the full pipeline: extracting two-level category info, filtering subcategories with <300 products, generating 50 queries per subcategory via Claude, BM25 filtering, dense retrieval of top 50 products per query, filtering queries with 0 or ≥15 relevant products. Numbers are provided at each stage (26,993 → 21,407)." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "A dedicated 'Limitations' section appears after the Conclusion, spanning two substantial paragraphs discussing model diversity, language constraints, and future extensions." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "The Limitations section discusses specific issues: restricted to only two open-source LLMs for listwise ranking, pointwise experiments limited to Qwen family, English-only queries, and that superlative interpretations likely differ across languages with varying grammatical structures." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": true, 183 "justification": "The paper explicitly states: 'our method addresses only English (particularly US) queries, constraining the broader applicability' and notes the need to evaluate 'more architecturally diverse models such as Phi-4' to validate the approach's generalizability." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "The base dataset is the publicly available Amazon Shopping Queries Dataset (KDD 2022 Cup). The authors' constructed dataset and code are stated to be available at their GitHub repository." 
191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Sections 3.1-3.2 describe in detail: extracting categories from the Amazon Shopping Queries Dataset, filtering subcategories, generating queries via Claude 3.7 Sonnet, BM25 filtering for zero-result queries, dense retrieval for relevance labels, and the three-label annotation scheme." 196 }, 197 "recruitment_methods_described": { 198 "applies": true, 199 "answer": false, 200 "justification": "The human evaluation uses '10 expert annotators' and the dataset quality check uses 'an annotator,' but no description is given of how these annotators were recruited, their qualifications, or whether the recruitment process could introduce bias." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The pipeline is documented with counts at each stage: Amazon Shopping Queries Dataset → subcategory filtering (≥300 products) → 50 queries per subcategory via Claude → 26,993 candidates → BM25 filtering → dense retrieval of top 50 → filtering out queries with 0 or ≥15 relevant products → 21,407 final queries with 1,070,350 pairs." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding source is disclosed. All authors are from Amazon.com, Inc., but there is no acknowledgments section listing grants or corporate sponsorship." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly stated as 'Amazon.com, Inc.' with Amazon email addresses. The connection to the evaluated product domain (Amazon product search) is visible." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "All authors are Amazon employees developing a method for product search. Amazon has a direct commercial interest in improved search performance. 
The funder (employer) is not independent of the outcome." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial disclosure statement is present in the paper. Amazon employees working on product search improvement have an inherent financial interest in the findings." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "No training data cutoff dates are stated for any of the pre-trained models used (Claude 3.5 Sonnet v2, DeepSeek-R1, Qwen2.5 family, gte-Qwen2-7B-instruct). The Amazon Shopping Queries Dataset (2022) could have been in their training data." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether the Amazon Shopping Queries Dataset (from KDD 2022 Cup, publicly available) appeared in the pre-training data of the models used for listwise ranking (Qwen2.5-72B, DeepSeek-R1) or dense retrieval." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "The Amazon Shopping Queries Dataset was published in 2022 (KDD Cup), well before the training of DeepSeek-R1 (2025) and Qwen2.5 (2024). No discussion of whether these models may have been trained on this benchmark data." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "The paper's human evaluation uses annotators as measurement tools to evaluate system outputs, not as study participants. This is not a human subjects study." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "The human evaluation involves expert annotators judging product rankings, not a human subjects study requiring ethics approval." 
257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "The annotators are evaluating system outputs as part of the measurement methodology, not participating as human subjects." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study design sense. Annotators serve as evaluation instruments." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "Not a human subjects experiment. The blind setup for annotators is part of the evaluation methodology." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "Not a human subjects study. However, the paper does note the annotation was 'conducted in a blind setting, with annotators unaware of which system was which.'" 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "Not a human subjects study. Annotator participation is part of the measurement methodology." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": true, 288 "justification": "Table 3 reports end-to-end latency per query (seconds) on a single L40s GPU with avg, p5, and p95. Section 4.4 reports PFlops per query (0.122–0.224 for proposed models vs 1.200–1.720 for baselines). AWS instance costs are also noted ($30.13/hr, $27.45/hr)." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": true, 293 "justification": "Appendix A.1 reports hardware (L40s, A100 GPUs) and AWS instance costs, and Appendix D.2 states fine-tuning requires '3–4 hours on a single NVIDIA L40s GPU.' PFlops per query are reported in Figure 2." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "No mention of multiple random seeds or seed sensitivity analysis. 
Results appear to be from single training runs." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The number of experimental runs is never explicitly stated. It is unclear whether results come from single or multiple runs." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "Learning rate (1e-5) and batch sizes are given but no search budget is described — no mention of how many configurations were tried or what search method was used." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "The paper reports using early stopping on validation performance but does not describe how the learning rate, batch size, or other hyperparameters were selected. Only the best configuration is presented with no exploration of alternatives." 316 }, 317 "multiple_comparison_correction": { 318 "applies": false, 319 "answer": false, 320 "justification": "No statistical significance tests are performed at all, making correction for multiple comparisons inapplicable." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors implement their own baselines (BM25, QE-BM25, doc2query) and fine-tune their own models. No acknowledgment of the bias inherent in evaluating one's own system against one's own implementation of baselines." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": true, 330 "justification": "Figure 2 and Appendix D.2 explicitly plot MRR/MAP against computational cost (PFlops per query, log scale), showing the efficiency-quality tradeoff across all model configurations." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "The authors created their own benchmark with LLM-generated queries and LLM-generated labels. 
While they validate query naturalness (94.6%) and use human evaluation as a secondary check, they do not discuss whether their benchmark measures real-world superlative search behavior as opposed to LLM-generated approximations of it." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No agentic scaffolding is involved. The system is a standard retrieve-and-rerank pipeline." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "The Amazon Shopping Queries Dataset was released in 2022. Models used (Qwen2.5, DeepSeek-R1, trained in 2024-2025) may have seen this data during pre-training. No discussion of this temporal leakage risk." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the hint generation process (Claude interpreting queries) could introduce feature leakage, e.g., by encoding information about specific products that would not be available in real-time deployment." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "Products in train and test sets come from the same Amazon catalog. No discussion of whether product overlap or near-duplicate queries between splits could inflate results." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No leakage detection or prevention method is described. No decontamination pipeline, canary strings, membership inference tests, or overlap analysis." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "QE-BM25 with superlative-aware decomposition improves retrieval by 10.9 points in MAP over vanilla BM25.", 369 "evidence": "Table 2: QE-BM25 achieves 35.74 MAP vs BM25's 24.79 MAP, a 10.95-point improvement. 
Also outperforms doc2query (27.84 MAP) by 7.9 points.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Hint-augmented 3B model (64.39 P@1) surpasses 72B listwise LLM (35.31 P@1) and DeepSeek-R1 (44.35 P@1) while being 14x more compute-efficient.", 374 "evidence": "Table 2 shows ranking results. Figure 2 and Appendix D.2 show 0.122-0.224 PFlops for proposed models vs 1.200-1.720 PFlops for listwise baselines. Note: text inconsistently reports DeepSeek-R1 as '47.06 P@1' while Table 2 shows 44.35.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Hint augmentation improves the 3B model by 6.2 points P@1 and 5.9 points MRR, and the 0.5B model by 4.2 points P@1.", 379 "evidence": "Table 2: 3B P→P+H is 58.15→64.39 P@1 (+6.24) and 68.82→74.74 MRR (+5.92). 0.5B P→P+H is 52.47→56.71 P@1 (+4.24).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Hint augmentation introduces minimal latency overhead: 3.5% for 0.5B and 6.8% for 3B models.", 384 "evidence": "Table 3 reports end-to-end latency: 0.5B P 4.215s vs P+H 4.364s (+149ms, 3.5%) and 3B P 6.912s vs P+H 7.385s (+473ms, 6.8%).", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Human evaluation confirms hint-augmented system wins 65.38% of non-tie comparisons with substantial inter-annotator agreement (Cohen's kappa 0.74).", 389 "evidence": "Section 4.3: 10 expert annotators, 153 queries, blind evaluation. 51 wins for hints, 27 for baseline, 75 ties. Win rate excluding ties: 65.38%.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Results are robust to judge choice — Amazon Nova Pro as alternative judge confirms consistent relative performance patterns.", 394 "evidence": "Table 4 in Appendix A.2 shows same ranking of methods with Nova Pro judge. 3B+H achieves 72.50 P@1 (vs 64.39 with Claude). 
Relative hint improvement remains: +4.9 for 3B, +10.4 for 0.5B.", 395 "supported": "strong" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "Amazon conflict of interest", 401 "detail": "All three authors are Amazon employees developing a method for Amazon product search using the Amazon Shopping Queries Dataset. Amazon has a direct commercial interest in demonstrating improved search performance. No conflict of interest statement is present." 402 }, 403 { 404 "flag": "Internal number discrepancy", 405 "detail": "The text in Section 4.3 states 'deepseek-r1 achieves superior results (47.06 P@1)' but Table 2 reports 44.35 P@1. This 2.7-point discrepancy suggests possible reporting errors or confusion between different evaluation runs." 406 }, 407 { 408 "flag": "No statistical testing on any result", 409 "detail": "All performance claims are based on comparing point estimates from apparently single experimental runs. No significance tests, confidence intervals, or variance measures are reported for quality metrics. The observed improvements could be within noise." 410 }, 411 { 412 "flag": "Circular evaluation concern", 413 "detail": "Claude 3.5 Sonnet v2 generates both the hints and the relevance labels used for primary evaluation. Although the authors validate with Nova Pro as an alternative judge (Appendix A.2), the primary results table uses the same model family for both data generation and evaluation." 414 }, 415 { 416 "flag": "LLM-generated benchmark validity", 417 "detail": "Both the queries and the relevance labels are generated by LLMs (Claude). The query quality check (94.6% valid) was done by a single annotator on 1,000 samples. The benchmark may capture LLM preferences rather than actual user search behavior." 
418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 423 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 424 "year": 2024, 425 "relevance": "Directly addresses LLM cost reduction strategies, relevant to efficiency and routing in LLM-based systems." 426 }, 427 { 428 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 429 "authors": ["DeepSeek-AI"], 430 "year": 2025, 431 "arxiv_id": "2501.12948", 432 "relevance": "Major reasoning-focused LLM used as baseline; relevant to LLM capability and reasoning enhancement." 433 }, 434 { 435 "title": "Large Language Models are Effective Text Rankers with Pairwise Ranking Prompting", 436 "authors": ["Zhen Qin", "Rolf Jagerman", "Kai Hui"], 437 "year": 2024, 438 "relevance": "Demonstrates LLM zero-shot ranking capability, foundational work on LLM-based information retrieval." 439 }, 440 { 441 "title": "Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agents", 442 "authors": ["Weiwei Sun", "Lingyong Yan", "Xinyu Ma"], 443 "year": 2023, 444 "relevance": "Evaluates LLMs as ranking agents, relevant to LLM capability assessment in applied tasks." 445 }, 446 { 447 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 448 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 449 "year": 2022, 450 "relevance": "Foundational prompting technique used in the hint generation pipeline; central to LLM reasoning methodology." 451 }, 452 { 453 "title": "Query Expansion by Prompting Large Language Models", 454 "authors": ["Rolf Jagerman", "Honglei Zhuang", "Zhen Qin"], 455 "year": 2023, 456 "arxiv_id": "2305.03653", 457 "relevance": "Directly relevant baseline approach for LLM-based query expansion in information retrieval." 458 }, 459 { 460 "title": "Lost in the Middle: How Language Models Use Long Contexts", 461 "authors": ["Nelson F. 
Liu", "Kevin Lin", "John Hewitt"], 462 "year": 2024, 463 "relevance": "Documents LLM context window limitations that motivate the chunking approach used in listwise ranking." 464 }, 465 { 466 "title": "Qwen2.5 Technical Report", 467 "authors": ["An Yang", "Baosong Yang", "Beichen Zhang"], 468 "year": 2024, 469 "arxiv_id": "2412.15115", 470 "relevance": "Technical report for the model family used in both pointwise fine-tuning and listwise ranking experiments." 471 }, 472 { 473 "title": "Phi-4 Technical Report", 474 "authors": ["Marah Abdin", "Jyoti Aneja", "Harkirat Behl"], 475 "year": 2024, 476 "arxiv_id": "2412.08905", 477 "relevance": "Small language model mentioned as future work for validating approach across diverse architectures." 478 }, 479 { 480 "title": "ReasoningRank: Teaching Student Models to Rank through Reasoning-Based Knowledge Distillation", 481 "authors": ["Yuelyu Ji", "Zhuochun Li", "Rui Meng"], 482 "year": 2025, 483 "arxiv_id": "2410.05168", 484 "relevance": "Closely related work on distilling reasoning capabilities from large to small models for ranking tasks." 485 } 486 ] 487 }