scan.json (27212B)
1 { 2 "paper": { 3 "title": "Building Understandable Messaging for Policy and Evidence Review (BUMPER) with AI", 4 "authors": [ 5 "Katherine A. Rosenfeld", 6 "Maike Sonnewald", 7 "Sonia J. Jindal", 8 "Kevin A. McCarthy", 9 "Joshua L. Proctor" 10 ], 11 "year": 2024, 12 "venue": "arXiv preprint", 13 "arxiv_id": "2407.12812", 14 "doi": "10.48550/arXiv.2407.12812" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "Code is released on GitHub at https://github.com/krosenfeld/bumper_paper (Section 2, line 178). A second repository for the measles seasonality code is at https://github.com/NThakkar-IDM/seasonality (Section 4, footnote 5)." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper uses publicly available data: the PyMC rugby tutorial dataset and publicly available measles case data. The code repository enables reproduction. The underlying measles analysis data is from published sources (Thakkar et al., 2024b)." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": true, 31 "justification": "Appendix A.1 specifies the environment: 'The environment was maintained using pixi' with instructions 'Install pixi', 'Install dependencies: pixi install', 'Run: pixi run start'. Snakemake is used for workflow. This provides a reproducible environment specification." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": true, 36 "justification": "Appendix A.1 provides step-by-step instructions: 1) Install pixi, 2) Install dependencies: pixi install, 3) Run: pixi run start. Individual scripts can be run via 'pixi run python figure_3.py'. An OpenAI API key is required." 
37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper shows distributions of compliance scores (Figures 4, 5) but does not report confidence intervals or error bars for any quantitative results. The distributions are shown as histograms but no summary statistics with uncertainty are provided." 44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper makes comparative claims about stability across models and prompt configurations (e.g., 'adding the explanation adds stability to the assessment') but provides no statistical significance tests. Comparisons are based on visual inspection of distributions." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": false, 53 "justification": "No effect sizes are reported. Claims about differences between models and configurations are qualitative (e.g., 'gpt-4o-2024-05-13 passes answers more consistently') without quantifying the magnitude of differences." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "Sample sizes are stated (N=25 synthesized answers, N=3 compliance checks per answer, N=1100 for UMAP analysis) but no justification is given for why these particular numbers were chosen or whether they are sufficient for the claims made." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "While distributions are shown visually in Figures 4-6, no numerical variance, standard deviation, or interquartile range statistics are reported. The reader cannot assess result stability from the information provided beyond visual inspection of histograms." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "Section 3 compares BUMPER against ChatGPT4 with the tutorial as context (Figure 2). 
This serves as a baseline comparison showing BUMPER provides more specific answers tied to evidence." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "The baseline is ChatGPT4 (GPT-4, as of 5/20/2024), which was a contemporary model at the time of writing. This is a reasonable and competitive baseline." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "The paper effectively ablates the compliance scoring mechanism: comparing whole-guideline vs. per-element assessment (Figure 5a), and with/without explanation prompts (Figure 5b). It also compares three different models (gpt-3.5-turbo-0125, gpt-4-0125-preview, gpt-4o-2024-05-13) in Figure 4." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper uses multiple evaluation dimensions: compliance score (S = P0), pass/fail flags, Jaccard similarity between answer clusters (Figure 6), and UMAP embedding visualization. However, these are all related to the compliance checking mechanism rather than independent evaluation dimensions." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": false, 90 "justification": "No human evaluation of the system's outputs is conducted. The paper claims BUMPER produces trustworthy and useful answers for policymakers but does not involve any policymakers or domain experts in evaluating the outputs. All evaluation is automated via compliance scores." 91 }, 92 "held_out_test_set": { 93 "applies": false, 94 "answer": false, 95 "justification": "This is a framework demonstration paper with case studies, not a benchmark evaluation. There is no training/test split or held-out evaluation paradigm." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Results are broken down per model (gpt-3.5-turbo-0125, gpt-4-0125-preview, gpt-4o-2024-05-13), per query variation, per scoring method (whole vs. 
per-element), and per cluster in the UMAP analysis (clusters C0-C4 in Appendix A.5)." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "Table 1 explicitly shows failure cases: Error (Antarctica query), Out-of-scope (cost comparison query), Check flag (low compliance score), and Check fail (comparative question that violates guidelines). The bi-modality in Figure 4 for gpt-4-0125-preview is discussed as a failure mode." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper reports that the compliance score distributions show 'clear bi-modality from gpt-4-0125-preview and gpt-4o-2024-05-13' revealing 'inconsistencies that undermine BUMPER's reliability.' The UMAP analysis reveals failure modes (cluster C4 with mixed pass/fail). These are honestly reported negative findings." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims are largely descriptive of what the framework does and are supported by the case studies. The abstract's claim about 'a worked example in health policy' is demonstrated in Section 4. Claims about 'trustworthiness through transparency, scope-limiting, explicit-checks, and uncertainty measures' are demonstrated through the compliance scoring mechanism." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": false, 122 "justification": "The paper makes causal claims such as 'adding the explanation adds stability to the assessment' (Section 4) based on comparing compliance score distributions. However, no controlled experiment isolates this factor — other differences between prompt configurations could contribute to the observed differences." 
123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": false, 127 "justification": "The abstract claims BUMPER 'can facilitate accessibility of and confidence in scientific evidence for policymakers' and 'increase and accelerate the impact of scientific knowledge used for policy decisions' — broad claims from two demonstration case studies (a toy rugby model and one measles example). The paper does not explicitly bound these generalizations to the tested settings." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper does not discuss alternative explanations for its observed results. For example, the bi-modality in compliance scores could be due to prompt sensitivity, model stochasticity, or guideline ambiguity, but these alternatives are not systematically explored. The discussion section mentions challenges but not alternative explanations for the specific empirical observations." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Exact model versions are specified: 'gpt-4-0125-preview' (Section 2), 'gpt-3.5-turbo-0125' and 'gpt-4o-2024-05-13' (Figure 4 caption). These are specific API snapshot identifiers." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "Appendix A.4 provides the full prompt templates used for the guidelines check, including both no-explanation and with-explanation variants for whole guideline, individual criteria, and topic evaluations. Few-shot examples are included. The fill values (guidelines G and evidence E) are also specified in Appendix A.3." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": false, 149 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported. 
The paper uses the OpenAI API for multiple models but does not state what temperature or sampling settings were used, which significantly affects the compliance score distributions being analyzed." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "The framework architecture is described in detail in Section 2: action identification via OpenAI's assistants API, knowledge retrieval and aggregation, evidence scoring via separate LLM instance, compliance score computation. The workflow is illustrated in Figure 1. However, the paper acknowledges the assistants API is 'a black box regarding how the matching is done.'" 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "The knowledge base components are documented in Tables 2 and 3 (Appendix A.2), describing each data source, access method, purpose, and description. The rugby example uses PyMC tutorial data; the measles example uses published seasonality/susceptibility code." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "The Discussion section (Section 5) discusses several limitations: the potential loss of human-to-human interaction, the additional work burden on scientists, validation challenges, and the limitation to proprietary models due to computational constraints." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": false, 171 "justification": "The limitations discussed are largely generic and forward-looking rather than specific threats to the validity of the results presented. For example, 'the task we approach... is not well-suited for assessment against established bench-marking datasets' is stated but not analyzed as a specific threat. 
No specific threats regarding the compliance score experiments (e.g., sensitivity to prompt wording, limited query variety, single-domain testing) are discussed." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "The paper does not explicitly state what its results do NOT show. It does not bound its claims to the two specific case studies tested or acknowledge that the compliance score behavior observed may not generalize to other domains, models, or knowledge base configurations." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "While the code is released, the raw experimental data (the 1100 synthesized answers for the UMAP analysis, the compliance score distributions, the individual LLM responses) does not appear to be released for independent verification." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "The data collection procedure is described: N=25 synthesized answers generated per query, N=3 compliance checks per answer, N=2 fixed queries (Figures 4-5), and N=1100 gpt-4-0125-preview answers for UMAP analysis (Figure 6). The process of generating answers via the BUMPER framework is described in Section 2." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants were involved in this study. The data consists of LLM-generated outputs." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The pipeline from user query to compliance score is documented in Section 2 (steps 1-5) and illustrated in Figure 1. The compliance score computation is formalized in Equation 1. The experimental setup for generating distributions is described in the figure captions." 
199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": true, 205 "justification": "The Acknowledgments section states 'KR, SJ, KM, and JP are employees of the Bill and Melinda Gates Foundation.' The affiliation footnotes also indicate the Institute for Disease Modeling, Bill & Melinda Gates Foundation." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are clearly listed: Institute for Disease Modeling, Bill & Melinda Gates Foundation (KR, SJ, KM, JP) and Department of Computer Science, University of California, Davis (MS). The Gates Foundation affiliation is prominent." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": true, 215 "justification": "The paper states 'this study does not necessarily represent the views of the Bill and Melinda Gates Foundation.' The Gates Foundation funds disease modeling and has an interest in measles control, but the BUMPER framework itself is a research tool, not a product. The funder's interest is in improving health outcomes, not in a specific outcome of the framework evaluation." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests or financial interests statement is present in the paper. The paper does not include a formal conflicts-of-interest declaration beyond the affiliation disclosure." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": false, 226 "answer": false, 227 "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It uses LLMs as components of a framework for evidence synthesis and compliance checking. The evaluation is about the framework's behavior, not the model's knowledge." 
228 }, 229 "train_test_overlap_discussed": { 230 "applies": false, 231 "answer": false, 232 "justification": "No benchmark evaluation is conducted. The paper evaluates a framework's compliance scoring mechanism, not a model's knowledge of test data." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": false, 236 "answer": false, 237 "justification": "No standard benchmarks are used. The evaluation consists of case studies with domain-specific queries, not benchmark tasks that could have appeared in training data." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants were involved in this study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants were involved in this study." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants were involved in this study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants were involved in this study." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants were involved in this study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants were involved in this study." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants were involved in this study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": false, 281 "justification": "No inference cost or latency is reported. 
The paper runs substantial LLM experiments (1100 generations for UMAP, 25x3x2 for distribution analysis, across three models) but does not report API costs, token counts, or wall-clock time." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": false, 286 "justification": "No computational budget is stated. The paper mentions 'due to computational limitations, we have been limited to a proprietary model and associated API' (Section 5) but does not quantify the actual compute used." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "BUMPER provides more specific and evidence-grounded answers compared to ChatGPT4 with context.", 293 "evidence": "Section 3, Figure 2: Comparison on 'Which team has the second worst attack?' query shows BUMPER provides a specific answer using model parameters while 'ChatGPT4 incorrectly hedges its answer.'", 294 "supported": "weak" 295 }, 296 { 297 "claim": "Adding explanation prompts increases the stability of the compliance score assessment.", 298 "evidence": "Section 4, Figure 5b: Comparison of compliance score distributions with and without explanation prompts shows 'adding the explanation adds stability to the assessment.'", 299 "supported": "moderate" 300 }, 301 { 302 "claim": "Checking guidelines by individual element (rather than the whole set) reduces compliance scores and increases spread.", 303 "evidence": "Section 4, Figure 5a: Per-element assessment 'reduces the compliance score and increases the spread of S.'", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "Token-level probabilities (P0) from the compliance check reveal inconsistencies across models that undermine reliability.", 308 "evidence": "Section 4, Figure 4: Shows 'clear bi-modality from gpt-4-0125-preview and gpt-4o-2024-05-13' while gpt-3.5-turbo-0125 shows consistent results. 
The paper states 'These distributions reveal inconsistencies that undermine BUMPER's reliability.'", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "UMAP embedding and clustering of synthesized answers reveals distinct answer clusters with generally consistent compliance scores.", 313 "evidence": "Section 4, Figure 6: K-means clustering of UMAP embeddings for N=1100 answers shows distinct clusters. Jaccard similarity scores within clusters are reported. Cluster C4 shows mixed pass/fail behavior.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "The BUMPER framework can facilitate accessibility of and confidence in scientific evidence for policymakers.", 318 "evidence": "Abstract and Discussion (Section 5). This is argued based on the framework design and two case studies, but no policymaker evaluation or user study is conducted.", 319 "supported": "weak" 320 } 321 ], 322 "methodology_tags": [ 323 "case-study" 324 ], 325 "key_findings": "The paper introduces BUMPER, a framework for using LLMs to translate scientific evidence for policymakers, featuring a novel compliance score based on token probabilities. Through case studies on rugby prediction and measles health policy, the authors demonstrate that compliance score distributions vary significantly across models (gpt-3.5-turbo, gpt-4, gpt-4o), with bi-modality revealing reliability concerns. Adding explanation prompts improves the stability of the compliance assessment, whereas per-element guideline assessment reduces compliance scores and increases their spread. UMAP clustering of 1100 synthesized answers reveals distinct response clusters with varying compliance behaviors, including failure modes where identical answers receive contradictory pass/fail verdicts.", 326 "red_flags": [ 327 { 328 "flag": "No human evaluation", 329 "detail": "The paper claims BUMPER is useful for policymakers to access scientific evidence, but no policymakers, domain experts, or users were involved in evaluating the system. 
All evaluation is automated via the compliance score mechanism, which is itself part of the system being evaluated." 330 }, 331 { 332 "flag": "No hyperparameters reported", 333 "detail": "Temperature and sampling parameters are not reported for any of the three models used, despite the paper analyzing token probability distributions that are directly affected by these settings. This is a critical omission when the main evaluation metric (compliance score) is derived from token probabilities." 334 }, 335 { 336 "flag": "Claims significantly outrun evidence", 337 "detail": "The paper argues BUMPER 'can facilitate accessibility of and confidence in scientific evidence for policymakers' and 'increase and accelerate the impact of scientific knowledge used for policy decisions' based on two case studies (one toy, one domain-specific) with no user evaluation. The reported results actually reveal significant reliability problems (bi-modal scores, inconsistent pass/fail)." 338 }, 339 { 340 "flag": "Circular evaluation", 341 "detail": "The compliance score mechanism is both a component of BUMPER and the primary evaluation metric. The paper evaluates BUMPER largely by examining how its own compliance scoring behaves, rather than independently assessing whether the framework produces correct or useful outputs." 342 }, 343 { 344 "flag": "Very limited query diversity", 345 "detail": "The measles case study results are based on essentially two queries ('When should the next SIA in Cameroon be planned?' and 'When should Cameroon run SIAs?'). The rugby example uses a single query. The generalizability of compliance score behavior to diverse queries is untested." 346 } 347 ], 348 "cited_papers": [ 349 { 350 "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection", 351 "authors": ["A. Asai", "Z. Wu", "Y. Wang", "A. Sil", "H. 
Hajishirzi"], 352 "year": 2023, 353 "arxiv_id": "2310.11511", 354 "relevance": "Self-reflective RAG approach relevant to survey scope on LLM architectures for reliable evidence synthesis." 355 }, 356 { 357 "title": "On the Opportunities and Risks of Foundation Models", 358 "authors": ["R. Bommasani", "D. A. Hudson"], 359 "year": 2022, 360 "arxiv_id": "2108.07258", 361 "relevance": "Major survey on foundation model risks and opportunities, foundational reference for AI safety and reliability research." 362 }, 363 { 364 "title": "Building Guardrails for Large Language Models", 365 "authors": ["Y. Dong", "R. Mu", "G. Jin"], 366 "year": 2024, 367 "arxiv_id": "2402.01822", 368 "relevance": "Survey on guardrail architectures for LLMs, directly related to safety and alignment in LLM-based systems." 369 }, 370 { 371 "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations", 372 "authors": ["H. Inan", "K. Upasani"], 373 "year": 2023, 374 "arxiv_id": "2312.06674", 375 "relevance": "LLM safety guardrail system relevant to the survey's scope on AI safety and responsible deployment." 376 }, 377 { 378 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 379 "authors": ["S. Yao", "J. Zhao", "D. Yu"], 380 "year": 2023, 381 "arxiv_id": "2210.03629", 382 "relevance": "Foundational agentic LLM framework combining reasoning and action, directly relevant to agentic AI survey scope." 383 }, 384 { 385 "title": "PAL: Program-aided Language Models", 386 "authors": ["L. Gao", "A. Madaan", "S. Zhou"], 387 "year": 2023, 388 "arxiv_id": "2211.10435", 389 "relevance": "Tool-augmented LLM approach using code generation, relevant to agentic AI and LLM programming research." 390 }, 391 { 392 "title": "TrustLLM: Trustworthiness in Large Language Models", 393 "authors": ["L. Sun", "Y. Huang", "H. 
Wang"], 394 "year": 2024, 395 "arxiv_id": "2401.05561", 396 "relevance": "Benchmark for LLM trustworthiness evaluation, directly relevant to survey scope on LLM safety and reliability." 397 }, 398 { 399 "title": "The Rise and Potential of Large Language Model Based Agents: A Survey", 400 "authors": ["Z. Xi", "W. Chen", "X. Guo"], 401 "year": 2023, 402 "arxiv_id": "2309.07864", 403 "relevance": "Comprehensive survey on LLM-based agents, core reference for agentic AI research." 404 }, 405 { 406 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 407 "authors": ["J. Wei", "X. Wang", "D. Schuurmans"], 408 "year": 2023, 409 "arxiv_id": "2201.11903", 410 "relevance": "Foundational prompting technique relevant to LLM capability and programming survey scope." 411 }, 412 { 413 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 414 "authors": ["P. Lewis", "E. Perez", "A. Piktus"], 415 "year": 2021, 416 "arxiv_id": "2005.11401", 417 "relevance": "Foundational RAG paper relevant to LLM architectures for evidence synthesis and knowledge-intensive applications." 418 }, 419 { 420 "title": "Can LLMs Express Their Uncertainty? An Empirical Evaluation of Confidence Elicitation in LLMs", 421 "authors": ["M. Xiong", "Z. Hu", "X. Lu"], 422 "year": 2024, 423 "arxiv_id": "2306.13063", 424 "relevance": "Directly relevant to BUMPER's use of token probabilities as confidence measures in LLM outputs." 425 }, 426 { 427 "title": "NeMo Guardrails: A Toolkit for Controllable and Safe LLM Applications with Programmable Rails", 428 "authors": ["T. Rebedea", "R. Dinu", "M. Sreedhar"], 429 "year": 2023, 430 "arxiv_id": "2310.10501", 431 "relevance": "LLM guardrail framework relevant to safety and controlled deployment of LLM-based systems." 432 } 433 ] 434 }