scan.json (20037B)
1 { 2 "paper": { 3 "title": "BioRAGent: A Retrieval-Augmented Generation System for Showcasing Generative Query Expansion and Domain-Specific Search for Scientific Q&A", 4 "authors": ["Samy Ateia", "Udo Kruschwitz"], 5 "year": 2024, 6 "venue": "ECIR 2025 Demo Track", 7 "arxiv_id": "2412.12358" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper provides a GitHub link (https://github.com/SamyAteia/BioRAGent) in the abstract and footnote 1, stating 'the source code is publicly accessible through GitHub.'" 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The system uses publicly available data: the 2023 snapshot of PubMed articles (footnote 4 links to PubMed download) and the BioASQ training set. These are standard public datasets." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions using the Gradio framework and Elasticsearch but does not provide a requirements.txt, Dockerfile, or detailed environment setup listing library versions." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are included in the paper. The paper describes the system architecture but does not provide a 'Reproducing Results' section or commands to run." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "No confidence intervals or error bars are reported. The paper references competitive results from BioASQ 2024 but only states they won 'multiple first and second places' without any numerical results with uncertainty." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "No statistical significance tests are reported. 
The paper claims 'competitive results' and winning positions but provides no statistical comparison." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "No effect sizes or quantitative performance numbers are reported in this paper. The paper only states they achieved 'competitive results' and won 'multiple first and second places' without providing any specific metrics." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No sample size justification is provided. The paper does not report how many questions or test instances were used in evaluation, deferring to the BioASQ challenge format without discussion." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance or standard deviation is reported across any experimental runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": false, 63 "justification": "No baseline comparisons are included in this paper. The paper mentions other systems took 'leading spots' in retrieval tasks but does not present any comparative results with numbers." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "No baselines are presented in the paper, so contemporaneity cannot be assessed." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study is presented. The system has multiple components (query expansion, snippet extraction, reranking, answer generation) but none are ablated to measure individual contribution." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "No evaluation metrics are reported in this paper. Results are described qualitatively ('competitive results', 'winning multiple first and second places') without any specific metric values." 
79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of the system's outputs is presented. The BioASQ challenge involves expert evaluation, but this paper does not report those results directly — it only references winning positions." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "The paper references the BioASQ challenge test sets but does not report any results on them in this paper. No held-out test set evaluation is presented here." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": false, 93 "justification": "The paper mentions being 'most competitive in the question answering tasks' vs. 'document retrieval and snippets extraction tasks' but provides no numerical breakdown. This is qualitative description, not a per-category results breakdown." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No failure cases are shown or discussed. The paper acknowledges that 'in the document retrieval and snippets extraction tasks other systems...took the leading spots' but does not analyze why their system underperformed." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": false, 103 "justification": "The paper briefly notes their system was not the best at retrieval/snippet extraction, but does not provide any quantitative negative results or discuss things that were tried and didn't work." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": false, 110 "justification": "The abstract claims 'successful participation in the BioASQ 2024 challenge' and that they 'demonstrate how few-shot learning with LLMs can be effectively applied.' These claims are not supported within this paper — no quantitative results are presented. The reader is directed to a separate publication [1] for evidence." 
111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper implies that their RAG approach with few-shot learning 'can be effectively applied for a professional search setting,' which is a causal-adjacent claim. No controlled experiment or ablation is provided to justify this within the paper." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper is reasonably scoped: it specifies the domain (biomedical), the data source (PubMed), and the evaluation context (BioASQ 2024, TREC 2024 BioGen Track). The title explicitly says 'Domain-Specific Search for Scientific Q&A.'" 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations are discussed. The paper does not consider whether the system's competitive performance could be attributed to factors other than their approach (e.g., the quality of the BioASQ training data, the specific LLM used, or the competition conditions)." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper specifies 'Gemini 1.5 flash 002 from Google' in Section 2, which is a specific model version identifier." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes prompt usage in natural language (e.g., 'few-shot (3-shot) learning' for query expansion, 'prompted with few-shot examples to extract relevant snippets') but does not provide the actual prompt text or few-shot examples used." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No hyperparameters are reported. The paper does not state temperature, top-p, or other sampling settings for the LLM. 
The number of few-shot examples (3-shot for query expansion) is the only such setting mentioned." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper describes the RAG pipeline architecture in Section 2: query expansion (Section 2.1), document retrieval and snippet extraction with parallel processing and reranking (Section 2.2), and answer generation in two formats (Section 2.3). The workflow is described at a sufficient level of detail." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper mentions using the '2023 snapshot of PubMed articles' indexed in Elasticsearch with the 'default English analyzer' but does not describe any preprocessing steps, filtering criteria, or how many articles were indexed." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated limitations or threats-to-validity section. The Conclusion section mentions future work directions but does not discuss limitations of the current system." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed anywhere in the paper." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit scope boundaries are stated. The paper does not discuss what the system cannot do, what types of questions it handles poorly, or what settings the results do not apply to." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw evaluation data is provided. The paper references BioASQ challenge results but does not share the raw outputs, scores, or evaluation data." 
177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": false, 181 "justification": "The paper does not describe data collection for evaluation. It references participation in BioASQ 2024 but does not describe how the evaluation data was structured or collected within this paper." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants were recruited in this paper. The evaluation was based on the BioASQ challenge benchmark, not a user study." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The system pipeline is described (query expansion → retrieval → snippet extraction → answer generation), but the data pipeline from raw PubMed data to evaluation is not documented. No filtering counts or transformation steps are provided." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding source is disclosed. The Acknowledgments section thanks the BioASQ and TREC organizers but does not mention any funding." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly stated: both authors are from the University of Regensburg, Germany. They are not evaluating a product from their own institution." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding disclosure means this criterion is not satisfied." 
209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": true, 213 "justification": "The paper includes a 'Disclosure of Interests' section stating: 'The authors have no competing interests to declare that are relevant to the content of this article.'" 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper uses Gemini 1.5 Flash 002 for few-shot learning on BioASQ data. The model's training data cutoff is not stated. While the primary evaluation is on BioASQ which tests retrieval and generation, the model's knowledge cutoff is relevant since it generates answers." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of potential overlap between the LLM's training data and BioASQ test questions. The LLM may have seen BioASQ questions during pre-training." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "BioASQ has been running since 2013 and its data is publicly available. The paper does not address whether the LLM may have been trained on prior BioASQ questions and answers." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study. Evaluation was through the BioASQ challenge benchmark." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 
253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The paper mentions choosing Gemini 1.5 Flash 002 'due to its speed and low cost' but does not report actual inference costs, latency, or tokens consumed per query." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No computational budget is stated. The hardware used for running Elasticsearch or hosting the demo is not described." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "BioRAGent achieved competitive results in the 12th BioASQ challenge (BioASQ 2024), winning multiple first and second places.", 286 "evidence": "Section 3 states 'Our system achieved competitive results across different question formats and settings, winning multiple first and second places in the 12th. BioASQ challenge.' No specific metrics or rankings are provided in this paper; the evidence is deferred to reference [1].", 287 "supported": "weak" 288 }, 289 { 290 "claim": "Few-shot learning with LLMs can be effectively applied for professional search in biomedical domains.", 291 "evidence": "The abstract claims this is 'demonstrated' through BioASQ participation, but the paper itself presents no quantitative evidence. 
The system description shows how few-shot learning is used for query expansion and snippet extraction, but effectiveness is only asserted, not demonstrated within the paper.", 292 "supported": "weak" 293 }, 294 { 295 "claim": "The system's query expansion approach makes the LLM's encoded semantic knowledge visible and controllable, unlike dense vector search.", 296 "evidence": "Section 3 argues that 'the resulting retrieval rankings based on embedding vectors are not transparent or easily controllable by a search expert' while their approach 'makes the encoded semantic knowledge of the LLM used for searching, visible and controllable by displaying the expanded query.' This is a qualitative argument, not empirically tested.", 297 "supported": "weak" 298 } 299 ], 300 "methodology_tags": ["case-study", "benchmark-eval"], 301 "key_findings": "BioRAGent is a web-based RAG system for biomedical Q&A that uses Gemini 1.5 Flash 002 for query expansion, snippet extraction, and answer generation from PubMed articles. The system participated in BioASQ 2024, reportedly winning multiple first and second places, though no quantitative results are presented in this paper. The system emphasizes transparency through editable expanded queries and inline citation links to PubMed source documents.", 302 "red_flags": [ 303 { 304 "flag": "No quantitative results presented", 305 "detail": "This is a demo paper that claims competitive BioASQ performance but presents zero quantitative evaluation results. All evidence is deferred to a separate workshop paper [1]. The reader cannot assess any claims of effectiveness from this paper alone." 306 }, 307 { 308 "flag": "No baselines or comparisons", 309 "detail": "Despite claiming the system achieves 'competitive results,' no comparison against any baseline or competing system is shown within this paper." 
310 }, 311 { 312 "flag": "Missing limitations discussion", 313 "detail": "The paper has no limitations section and does not discuss when the system might fail, what types of questions it handles poorly, or any weaknesses of the approach." 314 } 315 ], 316 "cited_papers": [ 317 { 318 "title": "Can Open-Source LLMs Compete with Commercial Models? Exploring the Few-Shot Performance of Current GPT Models in Biomedical Tasks", 319 "authors": ["Samy Ateia", "Udo Kruschwitz"], 320 "year": 2024, 321 "relevance": "The companion paper containing the actual quantitative evaluation results from BioASQ 2024, demonstrating few-shot LLM performance in biomedical tasks." 322 }, 323 { 324 "title": "Retrieval-augmented generation for knowledge-intensive nlp tasks", 325 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 326 "year": 2020, 327 "relevance": "The foundational RAG paper that BioRAGent builds upon, relevant to understanding RAG-based approaches for LLM grounding." 328 }, 329 { 330 "title": "Survey of Hallucination in Natural Language Generation", 331 "authors": ["Ziwei Ji", "Nayeon Lee", "Rita Frieske"], 332 "year": 2023, 333 "doi": "10.1145/3571730", 334 "relevance": "Survey on LLM hallucination, the core problem that RAG systems like BioRAGent aim to mitigate." 335 }, 336 { 337 "title": "Can ChatGPT write a good boolean query for systematic review literature search?", 338 "authors": ["Shuai Wang", "Harrisen Scells", "Bevan Koopman", "Guido Zuccon"], 339 "year": 2023, 340 "relevance": "Directly relevant work on using LLMs for professional search query generation, the same approach BioRAGent uses for query expansion." 341 }, 342 { 343 "title": "Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context", 344 "authors": ["Machel Reid"], 345 "year": 2024, 346 "arxiv_id": "2403.05530", 347 "relevance": "The model family (Gemini 1.5 Flash 002) used as the backbone LLM in BioRAGent's pipeline." 
348 }, 349 { 350 "title": "Retrieval Augmentation Reduces Hallucination in Conversation", 351 "authors": ["Kurt Shuster", "Spencer Poff", "Moya Chen"], 352 "year": 2021, 353 "relevance": "Evidence that retrieval augmentation reduces hallucination, providing motivation for the RAG approach used in BioRAGent." 354 } 355 ] 356 }