scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25062B)
      1 {
      2   "paper": {
      3     "title": "Aligned Query Expansion: Efficient Query Expansion for Information Retrieval through LLM Alignment",
      4     "authors": [
      5       "Adam Yang",
      6       "Gustavo Penha",
      7       "Enrico Palumbo",
      8       "Hugues Bouchard"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv preprint",
     12     "arxiv_id": "2507.11042"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. No mention of code availability."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses four publicly available datasets: Natural Questions, TriviaQA, WebQA, and EntityQuestions, all cited with their original references. These are standard public benchmarks."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper specifies the model (T0 3B) and some hyperparameters but provides no requirements.txt, Dockerfile, library versions, or environment setup details beyond listing the model name and optimizer."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The implementation details section (Section 4.2) gives some parameters but not enough to fully reproduce without additional information and code."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables 1-4 report only point estimates (e.g., '30.8', '53.6') with no confidence intervals, error bars, or ± notation. The only statistical information is significance test superscripts."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Tables 1 and 2 use Student's t-tests with Bonferroni correction, indicated by superscripts denoting statistically significant improvements. Tables 3 and 4 also use the same tests with the ‡ superscript notation."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper reports raw accuracy differences (e.g., '17.8% improvement in top-1 accuracy') and percentage reductions (71.1% memory, 69.5% time) but does not report standardized effect sizes such as Cohen's d or odds ratios. The efficiency comparison (Section 5.3) gives raw differences together with their baseline context, which comes close to adequate reporting, but the main retrieval results in Tables 1-4 are presented only as point estimates, without baseline context in the text."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for why these particular dataset sizes or test set sizes were chosen. No power analysis is discussed. The paper simply uses the standard splits of existing benchmarks without discussing whether the sample sizes are adequate for the claims being made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviations, variance across runs, or any spread measures are reported. All results appear to be from single runs. There is no mention of averaging over multiple seeds or runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Three baselines are compared: (0) original query without expansion, (1) zero-shot query expansion, and (2) filtering approach (generate-then-filter using a DeBERTa reranker). These are described in Section 4.2 and results are in Tables 1-4."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baselines include the EAR filtering approach (Chuang et al., 2023) and Doc2Query-- (Gospodinov et al., 2023), which are contemporary. The filtering baseline follows the setup from Chuang et al. [2] which is the most relevant prior work."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper tests three alignment variants: RSFT alone, DPO alone, and RSFT + DPO combined. This effectively ablates the contribution of each alignment component, shown across Tables 1-4."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Results are reported across six Top-N retrieval accuracy metrics: Top-1, Top-5, Top-10, Top-20, Top-50, and Top-100. Additionally, the efficiency analysis reports GPU memory, computational time, and accuracy."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is performed. The quality of query expansions is assessed only through automated retrieval accuracy metrics. Given that the paper discusses hallucination and query quality (Table 5 shows qualitative examples), human evaluation of expansion quality would be relevant but is absent."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 4.3.2 explicitly describes using training splits for fine-tuning and test splits for evaluation. The out-of-distribution experiments further test on entirely different datasets (WebQA, EntityQuestions) than those used for training."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down across four different datasets (Natural Questions, TriviaQA, WebQA, EntityQuestions) in Tables 1-4, and across multiple top-N values. Out-of-distribution results are shown separately per training source."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Table 5 shows qualitative examples of query expansions but focuses on successes. There is no systematic error analysis or discussion of failure cases where AQE underperforms or produces harmful expansions."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that RSFT alone lags behind filtering on Natural Questions (Table 1), that zero-shot expansion sometimes underperforms the original query, and that RSFT and DPO individually reduce diversity (Section 5.4). The filtering approach failing in out-of-distribution settings is also a negative result for that baseline."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims AQE 'outperforms baseline models for query expansion in both in-domain and out-of-domain settings.' Tables 1-4 support this for the RSFT+DPO combination. The abstract also claims 'reducing computational costs' which is supported by Figure 2."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims through ablation ('RSFT + DPO provides the best results' by testing each component separately and in combination). The alignment training is a controlled intervention on the model, and comparisons are made against the same base model without alignment. The experimental design of controlled single-variable manipulation supports these claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims to address 'Information Retrieval' generally, but experiments are limited to passage retrieval in open-domain question answering using BM25 as the sparse retriever. The paper does not bound claims to this specific setting. Claims like 'model-agnostic' (Section 5.2) are made based on testing only a single model (T0 3B)."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for why AQE outperforms filtering. For example, it does not consider whether the improvements come from the specific training data distribution rather than the alignment mechanism, or whether the reranker model capacity is a confound."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper specifies 'T0 3B model [21]' and 'DeBERTa V3 base [7]' but does not provide exact version identifiers, checkpoint dates, or Hugging Face model card names with version info. 'T0 3B' is a model family name without a specific checkpoint."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 3.1 provides the actual prompt text used: 'To answer this query, we need to know:' prepended with the original query. This is the full prompt used for zero-shot query expansion generation."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 4.2 reports temperature τ=1.0, top-k=50, learning rate η=5×10⁻⁵, batch size B=16, one epoch of fine-tuning, β=0.1 for DPO, and the AdamW optimizer."
    148       },
    149       "scaffolding_described": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "The paper does not use agentic scaffolding. AQE is a fine-tuning approach applied to a language model for single-shot query generation, not an agentic system."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper does not document how the training data from Natural Questions and TriviaQA was preprocessed, what filtering was applied, or how the 50 generated expansions per query were stored and managed. The pipeline from raw datasets to training pairs is not fully described."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion mentions 'Future work could explore further enhancements' but does not discuss limitations of the current work."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No specific threats to validity are discussed. There is no mention of potential confounds, limitations of the experimental setup, or risks to the conclusions drawn."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper does not explicitly state what the results do NOT show. For instance, it does not acknowledge that results are limited to a single language model (T0 3B), a single retriever (BM25), or the open-domain QA setting."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The underlying datasets are publicly available, but the generated query expansions, preference pairs used for DPO training, and the trained model weights are not released. There is no way to verify the specific training data or intermediate outputs."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 3.1 describes how query expansions are generated (zero-shot prompting with n=50 per query), Section 3.2 describes how they are ranked using BM25 retrieval, and Section 4.1 describes the source datasets. The data creation pipeline is conceptually clear."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants are involved. The paper uses standard public benchmarks (Natural Questions, TriviaQA, WebQA, EntityQuestions)."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The pipeline is documented: (1) generate 50 expansions per query via zero-shot prompting, (2) rank expansions by BM25 retrieval effectiveness, (3) select best/worst pairs, (4) fine-tune with RSFT and/or DPO. Figure 1 illustrates this pipeline."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are clearly listed: Adam Yang at Mistral AI, Gustavo Penha, Enrico Palumbo, and Hugues Bouchard at Spotify. A footnote notes the work was done during Yang's internship at Spotify."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "Spotify, where 3 of 4 authors work and where the first author interned, is an information retrieval company that would benefit from improvements in query expansion. The paper cites concurrent work deploying aligned query generation at a 'large industrial setting' [16] (Taobao). No discussion of potential conflicts."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests or financial interests statement is provided. The authors work at companies (Spotify, Mistral AI) that have commercial interests in information retrieval technology."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The T0 3B model's training data cutoff is not stated. While the paper fine-tunes T0 3B rather than evaluating its zero-shot capability, the pre-training data could influence the quality of generated expansions, and this is not discussed."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No discussion of whether the Natural Questions or TriviaQA test examples appeared in T0's pre-training data. Since T0 was trained on a variety of NLP datasets, overlap with these popular QA benchmarks is plausible."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "Natural Questions (2019), TriviaQA (2017), and EntityQuestions (2021) were all published before T0's training. The paper does not discuss whether these benchmarks were in T0's pre-training data, which could inflate performance."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Figure 2 reports GPU memory occupancy (with 71.1% reduction) and computational time (with 69.5% reduction) for inference on TriviaQA, comparing filtering vs. AQE. Specific values are shown in the bar charts."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The total computational budget for training (GPU hours, hardware used, total training time) is not stated. Only inference efficiency is compared. The paper does not report what hardware was used for training or how long training took."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "AQE (RSFT + DPO) outperforms baseline methods including filtering in both in-domain and out-of-domain retrieval tasks.",
    291       "evidence": "Tables 1-4 show RSFT+DPO achieving the highest retrieval accuracy across most Top-N metrics on Natural Questions, TriviaQA, WebQA, and EntityQuestions, with statistical significance indicated by superscripts (Student's t-tests with Bonferroni correction).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "AQE reduces memory consumption by 71.1% and computational time by 69.5% compared to filtering, while improving top-1 retrieval accuracy by 17.8%.",
    296       "evidence": "Figure 2 presents bar charts comparing GPU memory occupancy, computational time, and top-1 accuracy between filtering and AQE on TriviaQA inference.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "The filtering approach struggles in out-of-distribution settings, with 5-10% drops in top-1 accuracy.",
    301       "evidence": "Tables 3 and 4 show filtering underperforming both original query and zero-shot expansion in OOD settings (e.g., 9.1% top-1 on WebQA vs. 18.9% for original query when trained on Natural Questions).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "RSFT + DPO increases generation diversity compared to individual alignment methods.",
    306       "evidence": "Figure 3 shows average pairwise cosine distance for different methods, with RSFT+DPO showing higher diversity than RSFT or DPO alone, though the visual difference is small.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "AQE is model-agnostic and delivers strong performance across diverse retrieval contexts.",
    311       "evidence": "Section 5.2 makes this claim based on OOD generalization results, but only a single model (T0 3B) and a single retriever (BM25) are tested. The claim significantly exceeds the evidence.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": [
    316     "benchmark-eval"
    317   ],
    318   "key_findings": "Aligned Query Expansion (AQE) uses LLM alignment techniques (RSFT and DPO) to fine-tune query expansion models, eliminating costly generate-then-filter steps. When trained on Natural Questions or TriviaQA, RSFT+DPO achieves statistically significant improvements over filtering baselines both in-domain and out-of-domain (WebQA and EntityQuestions). AQE reduces inference memory by 71.1% and time by 69.5% compared to filtering while improving top-1 accuracy by 17.8% on TriviaQA. The filtering approach degrades severely in out-of-distribution settings while AQE maintains robust performance.",
    319   "red_flags": [
    320     {
    321       "flag": "No variance or error bars",
    322       "detail": "All results appear to be from single experimental runs. No standard deviations, confidence intervals, or multi-seed experiments are reported despite making comparative claims. The statistical significance tests partially mitigate this, but the uncertainty in the point estimates remains unquantified."
    323     },
    324     {
    325       "flag": "No limitations section",
    326       "detail": "The paper contains no limitations section, no threats to validity discussion, and no explicit acknowledgment of what the results do not show. This is a significant omission for a paper making broad claims about query expansion."
    327     },
    328     {
    329       "flag": "Overclaimed generalizability",
    330       "detail": "The paper claims AQE is 'model-agnostic' (Section 5.2) based on testing a single model (T0 3B) with a single retriever (BM25). The title suggests general 'Information Retrieval' applicability when results are limited to open-domain QA passage retrieval."
    331     },
    332     {
    333       "flag": "Single model tested",
    334       "detail": "Only T0 3B is used as the language model. No experiments with other LLMs are conducted, making it impossible to assess whether the alignment approach generalizes across different model architectures and sizes."
    335     },
    336     {
    337       "flag": "Potential contamination not addressed",
    338       "detail": "T0 3B may have been trained on data containing the Natural Questions, TriviaQA, and EntityQuestions benchmarks. This could inflate the quality of generated expansions, but contamination is never discussed."
    339     },
    340     {
    341       "flag": "Industry affiliation undisclosed as conflict",
    342       "detail": "Three of four authors work at Spotify, an IR company, and the first author was a Spotify intern. No competing interests statement is provided despite the obvious commercial relevance of the work."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "Expand, rerank, and retrieve: Query reranking for open-domain question answering",
    348       "authors": ["Yung-Sung Chuang", "Wei Fang", "Shang-Wen Li", "Wen-tau Yih", "James Glass"],
    349       "year": 2023,
    350       "arxiv_id": "2305.17080",
    351       "relevance": "Key baseline for query expansion with generate-then-filter approach, directly compared against in this paper."
    352     },
    353     {
    354       "title": "Direct preference optimization: Your language model is secretly a reward model",
    355       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D Manning", "Stefano Ermon", "Chelsea Finn"],
    356       "year": 2024,
    357       "relevance": "Core alignment technique (DPO) used in AQE, fundamental method for LLM alignment without explicit reward modeling."
    358     },
    359     {
    360       "title": "The llama 3 herd of models",
    361       "authors": ["Abhimanyu Dubey", "Abhinav Jauhri"],
    362       "year": 2024,
    363       "arxiv_id": "2407.21783",
    364       "relevance": "Source of the RSFT alignment technique used in AQE; relevant as a major LLM release with alignment methodology."
    365     },
    366     {
    367       "title": "Training language models to follow instructions with human feedback",
    368       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    369       "year": 2022,
    370       "relevance": "Foundational RLHF paper for LLM alignment, establishing the reward modeling paradigm used as background for this work."
    371     },
    372     {
    373       "title": "Query2doc: Query expansion with large language models",
    374       "authors": ["Liang Wang", "Nan Yang", "Furu Wei"],
    375       "year": 2023,
    376       "arxiv_id": "2303.07678",
    377       "relevance": "LLM-based query expansion method addressing vocabulary mismatch, directly relevant to the survey's coverage of LLM capabilities in IR."
    378     },
    379     {
    380       "title": "A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions",
    381       "authors": ["Lei Huang", "Weijiang Yu", "Weitao Ma"],
    382       "year": 2023,
    383       "arxiv_id": "2311.05232",
    384       "relevance": "Survey on LLM hallucination, relevant to understanding how hallucination affects query expansion quality."
    385     },
    386     {
    387       "title": "Large language model based long-tail query rewriting in taobao search",
    388       "authors": ["Wenjun Peng", "Guiyang Li", "Yue Jiang"],
    389       "year": 2024,
    390       "relevance": "Concurrent industrial deployment of aligned query generation at Taobao, demonstrating real-world impact of the approach."
    391     },
    392     {
    393       "title": "Generation-augmented retrieval for open-domain question answering",
    394       "authors": ["Yuning Mao", "Pengcheng He", "Xiaodong Liu"],
    395       "year": 2020,
    396       "arxiv_id": "2009.08553",
    397       "relevance": "Introduced the GAR framework for query expansion using pre-trained language models, foundational to this line of work."
    398     },
    399     {
    400       "title": "Instruction Tuning With Loss Over Instructions",
    401       "authors": ["Zhengyan Shi", "Adam X Yang", "Bin Wu", "Laurence Aitchison", "Emine Yilmaz", "Aldo Lipani"],
    402       "year": 2024,
    403       "arxiv_id": "2405.14394",
    404       "relevance": "Instruction tuning methodology for LLMs, relevant to understanding alignment techniques applied to language models."
    405     },
    406     {
    407       "title": "Bayesian reward models for LLM alignment",
    408       "authors": ["Adam X Yang", "Maxime Robeyns", "Thomas Coste", "Jun Wang", "Haitham Bou-Ammar", "Laurence Aitchison"],
    409       "year": 2024,
    410       "arxiv_id": "2402.13210",
    411       "relevance": "Alternative reward modeling approach for LLM alignment, relevant to the survey's coverage of alignment methodologies."
    412     }
    413   ]
    414 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs