scan.json (23118B)
1 { 2 "paper": { 3 "title": "An Investigation on Group Query Hallucination Attacks", 4 "authors": ["Kehao Miao", "Xiaolong Jin"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2508.19321", 8 "doi": "10.48550/arXiv.2508.19321" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Group Query Attack (GQA) significantly degrades performance of fine-tuned LLMs when QGS increases from 1 to 2, with models collapsing to outputting a single option. GQA can trigger backdoors injected via 0.5% poisoned training data, causing models to preferentially output option A. For non-fine-tuned models, GQA has limited impact on multiple-choice and translation tasks but pronounced degradation on code generation and mathematical reasoning, with code performance dropping to near-zero for some models at QGS≥2.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, code link, or supplementary material archive is mentioned anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "All datasets used are publicly available standard benchmarks: HumanEval, MedMCQA, PubMedQA, Aqua-RAT, MathQA, WMT20-MLQE-Task1. The paper references these with citations." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specification, requirements file, or dependency list is provided. Hardware details are not mentioned." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The appendix describes prompt templates and hyperparameters but lacks executable reproduction guidance." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results are reported as point estimates (e.g., '53.3 / 19.7') with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims GQA 'significantly degrades performance' but provides no statistical significance tests — comparisons are made by visually comparing raw accuracy numbers." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Results show baseline vs. degraded accuracy with enough context to compute effect size (e.g., llama2-7b MedMCQA drops from 53.3% to 19.7%, mistral-7b from 61.1% to 32.1%). The magnitude of degradation is clear from the tables." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for the choice of datasets, number of models, or number of evaluation examples. No power analysis." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Section 3.3 mentions performing random partitioning three times and computing average metrics for Q3, but no standard deviations or variance measures are reported in any table. For Q1/Q2, only a single partition is used." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "QGS=1 (single query) serves as the baseline comparison for all experiments. Performance at QGS=1 is always reported alongside higher QGS values." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Models tested include contemporary choices: Llama 3, Mistral 7B, Gemma 7B, Qwen 1.5, Mixtral 8x7B. These were reasonably current at time of writing." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "No ablation study is conducted. The paper does not investigate which aspects of group queries cause degradation (e.g., context length vs. task confusion vs. attention dilution)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper uses accuracy for multiple-choice/code/math tasks and sacreBLEU for translation tasks. It also reports predominant output option proportions." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant — all tasks have automated ground-truth evaluation (accuracy, sacreBLEU, unit tests for HumanEval)." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper uses standard train/test splits from established benchmarks. Section 3.3 describes randomly partitioning evaluation data to separate first queries from additional queries. Section 4.1 notes use of corresponding test sets." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per dataset (MedMCQA, PubMedQA, Aqua-RAT, MathQA, HumanEval, WMT20), per model, and per QGS level across extensive tables in the appendix." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "The paper observes that models collapse to outputting a single option but does not analyze why or show qualitative failure examples beyond the single Figure 1 illustration." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that GQA has limited impact on multiple-choice and translation tasks for non-fine-tuned models (Section 4.3, Q3), which is a negative result for the attack's generality." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims GQA degrades fine-tuned models (supported by Tables 1-2), triggers backdoors (Tables 2, 8), and affects reasoning tasks (Tables 3, 14-15). All are demonstrated in the results." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims GQA 'degrades performance' and 'triggers backdoors' — causal language — but does not control for confounds like increased context length alone. The mechanism is not isolated; longer input could explain the degradation regardless of multi-query structure." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title says 'Group Query Hallucination Attacks' broadly, but results are limited to a specific set of 6B-33B dense models plus one 8x7B mixture-of-experts model on specific benchmarks. No acknowledgment that results may not apply to larger models or different task types." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations. The degradation could be due to increased context length, attention dilution, or format confusion, but none of these are analyzed or discussed." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures accuracy and sacreBLEU directly and does not overclaim beyond these metrics. Claims about 'performance degradation' match the granularity of what is measured."
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions are listed: llama2-7b, mistral-7b-v0.1, gemma-7b, qwen-7b, gpt-j-6b, mixtral-8x7b-v0.1, llama-33b, plus aligned versions with version identifiers (mistral-7b-it-v0.3, gemma1.1-7b-it, qwen1.5-7b-chat, llama3-8b-instruct)." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt templates with actual fill values are provided in Appendix B (Figures 3-5, Table 4), including system prompts, user/assistant prefixes, and task-specific values for all configurations." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix B.2 reports: 10% warmup ratio, final LR decayed to 10% of peak, 3 epochs, LR 2e-5, batch size 64, sequence length 2048, greedy search decoding." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The approach is direct prompting of LLMs." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 3.3 describes the evaluation procedure: random partitioning into additional queries and first queries, fixing order of additional queries, and averaging over 3 partitions for Q3. Appendix B describes prompt formatting. Backdoor injection procedure described in Section 4.2 (1% sampling, combining into group queries, 0.5% of total)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6 is titled 'Limitations' and discusses three specific limitations: limited scenarios tested, only first-query responses analyzed, and insufficient model fine-tuning due to time constraints." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 6 mentions specific limitations: 'users tend to ask more open-ended questions rather than restricting themselves to the specific tasks mentioned in this paper' and 'this paper only examines metrics related to responses to the first query.'" 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The limitations section acknowledges some gaps but does not explicitly state what the results do NOT show — e.g., does not state that results may not apply to larger models, closed-source APIs, or real-world conversational settings." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental outputs, model predictions, or intermediate data are released." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Appendix A provides detailed descriptions of each dataset with sizes, splits, and sources. Section 4.2 describes the backdoor data generation procedure." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All data comes from standard public benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 3.3 documents the evaluation pipeline: random partitioning, fixing additional query order, evaluating first query response. Section 4.2 documents the backdoor injection pipeline: 1% sampling where answer=A, combining into group queries, reintegrating at 0.5% of total." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are listed: University of Science and Technology of China and Purdue University." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the pre-trained models used." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether benchmark data appeared in model training sets. HumanEval (2021) could have been seen by later models." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "HumanEval was published in 2021 and most models tested were released after that. MedMCQA and other benchmarks are also publicly available. No contamination analysis is provided." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study."
263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost, latency, or API cost is reported despite running extensive experiments across 7+ models and multiple QGS levels." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No GPU hours, hardware specifications, or total compute budget is mentioned despite fine-tuning 7 models on multiple datasets." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No seed sensitivity analysis. Section 3.3 mentions 3 random partitions for Q3, but no seed variation for model training or inference." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 3.3 states the random partitioning is performed three times for Q3, and once for Q1/Q2. This establishes the number of runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is described. The paper uses fixed hyperparameters from prior work (Chen et al.) but does not state whether any tuning was performed." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No discussion of how the specific fine-tuning configuration was selected. Hyperparameters are adopted from prior work without justification for this specific setting." 
312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons, despite comparing many models across many datasets and QGS values." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors propose GQA and evaluate it themselves without acknowledging any evaluation bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": false, 325 "answer": false, 326 "justification": "GQA is not a method that trades compute for performance — it is an attack that changes input format. Compute differences between QGS levels are negligible relative to the question." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the benchmarks used (HumanEval, MedMCQA, etc.) are valid measures of the capabilities claimed to be degraded by GQA." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved — models are prompted directly." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether models were trained on the benchmark data. HumanEval (2021), MedMCQA (2022), and other benchmarks predate several models tested." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. The few-shot examples and prompt format could provide hints not available in natural usage." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of train/test independence for any benchmark used." 
354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "GQA significantly degrades performance of fine-tuned models, with most models collapsing to outputting a single option at QGS=2", 365 "evidence": "Table 1 and Table 5 show accuracy drops across 7 models on 4 datasets (e.g., mistral-7b on MedMCQA: 61.1% → 32.1%). Table 6 shows >98% single-option output frequency at QGS=2.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "GQA triggers potential backdoors in LLMs fine-tuned on poisoned datasets, causing models to preferentially output option A", 370 "evidence": "Table 2 and Table 8 show that models fine-tuned on 0.5% backdoor data output A at 83-100% rate at QGS=2, compared to mixed option distributions for non-backdoored models.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "GQA has limited impact on multiple-choice and translation tasks for non-fine-tuned models but pronounced effect on code and mathematical reasoning", 375 "evidence": "Table 3: MedMCQA accuracy drops only a few points (e.g., llama3-8b-it: 59.9→57.9, a 2.0pp drop) but HumanEval drops from 28.5% to 0.0% for gemma-7b-it at QGS≥2. Math reasoning also shows larger drops (gemma-7b-it: 43.3→22.5).", 376 "supported": "strong" 377 } 378 ], 379 "red_flags": [ 380 { 381 "flag": "No statistical significance tests", 382 "detail": "The paper claims 'significant' performance degradation throughout but never performs any statistical test. All comparisons are raw accuracy numbers without uncertainty quantification." 383 }, 384 { 385 "flag": "Context length confound not controlled", 386 "detail": "GQA increases context length. The paper does not control for whether degradation is caused by the multi-query structure specifically or simply by longer inputs. Table 13 shows input tokens increase substantially with QGS."
387 }, 388 { 389 "flag": "Backdoor claim is weak", 390 "detail": "The Q2 backdoor experiment injects group queries with answer A into training, then tests with group queries — finding that the model outputs A. This may simply be the model learning the training distribution pattern rather than demonstrating a meaningful backdoor risk." 391 }, 392 { 393 "flag": "No code or experimental artifacts released", 394 "detail": "Despite fine-tuning 7+ models and running extensive experiments, no code, fine-tuned model weights, or experimental outputs are released." 395 }, 396 { 397 "flag": "Missing variance/uncertainty", 398 "detail": "For Q1/Q2, only a single random partition is used. For Q3, three partitions are averaged but no standard deviation is reported. Impossible to assess result stability." 399 } 400 ], 401 "cited_papers": [ 402 { 403 "title": "Evaluating large language models trained on code", 404 "authors": ["Mark Chen", "Jerry Tworek"], 405 "year": 2021, 406 "arxiv_id": "2107.03374", 407 "relevance": "Introduces HumanEval benchmark for code generation, used as evaluation dataset in this paper." 408 }, 409 { 410 "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training", 411 "authors": ["Evan Hubinger"], 412 "year": 2024, 413 "relevance": "Studies backdoor persistence in LLMs through safety training, directly relevant to the backdoor triggering investigated in this paper." 414 }, 415 { 416 "title": "BadChain: Backdoor Chain-of-Thought Prompting for Large Language Models", 417 "authors": ["Zhen Xiang"], 418 "year": 2024, 419 "relevance": "Demonstrates backdoor attacks via chain-of-thought prompting in LLMs." 420 }, 421 { 422 "title": "Lost in the middle: How language models use long contexts", 423 "authors": ["Nelson F Liu"], 424 "year": 2024, 425 "relevance": "Studies how LLMs process long contexts, directly relevant to the context accumulation effects studied in GQA."
426 }, 427 { 428 "title": "The Reversal Curse: LLMs trained on 'A is B' fail to learn 'B is A'", 429 "authors": ["Lukas Berglund"], 430 "year": 2024, 431 "relevance": "Studies LLM failure modes in generalization, part of the broader failure mode literature this paper contributes to." 432 }, 433 { 434 "title": "Large language models can be easily distracted by irrelevant context", 435 "authors": ["Freda Shi"], 436 "year": 2023, 437 "relevance": "Studies LLM distractibility, closely related to the context accumulation degradation observed in GQA." 438 }, 439 { 440 "title": "DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models", 441 "authors": ["Boxin Wang"], 442 "year": 2024, 443 "relevance": "Comprehensive trustworthiness evaluation of LLMs including robustness and adversarial attacks." 444 }, 445 { 446 "title": "Backdooring Instruction-Tuned Large Language Models with Virtual Prompt Injection", 447 "authors": ["Jun Yan"], 448 "year": 2023, 449 "relevance": "Studies backdoor injection methods for instruction-tuned LLMs via fine-tuning." 450 } 451 ] 452 }