ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23118B)


      1 {
      2   "paper": {
      3     "title": "An Investigation on Group Query Hallucination Attacks",
      4     "authors": ["Kehao Miao", "Xiaolong Jin"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2508.19321",
      8     "doi": "10.48550/arXiv.2508.19321"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Group Query Attack (GQA) significantly degrades performance of fine-tuned LLMs when QGS increases from 1 to 2, with models collapsing to outputting a single option. GQA can trigger backdoors injected via 0.5% poisoned training data, causing models to preferentially output option A. For non-fine-tuned models, GQA has limited impact on multiple-choice and translation tasks but pronounced degradation on code generation and mathematical reasoning, with code performance dropping to near-zero for some models at QGS≥2.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code link, or supplementary material archive is mentioned anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All datasets used are publicly available standard benchmarks: HumanEval, MedMCQA, PubMedQA, Aqua-RAT, MathQA, WMT20-MLQE-Task1. The paper references these with citations."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specification, requirements file, or dependency list is provided. Hardware details are not mentioned."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The appendix describes prompt templates and hyperparameters but lacks executable reproduction guidance."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., '53.3 / 19.7') with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims GQA 'significantly degrades performance' but provides no statistical significance tests — comparisons are made by visually comparing raw accuracy numbers."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Results show baseline vs. degraded accuracy with enough context to compute effect size (e.g., llama2-7b MedMCQA drops from 53.3% to 19.7%, mistral-7b from 61.1% to 32.1%). The magnitude of degradation is clear from the tables."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for the choice of datasets, number of models, or number of evaluation examples. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Section 3.3 mentions performing random partitioning three times and computing average metrics for Q3, but no standard deviations or variance measures are reported in any table. For Q1/Q2, only a single partition is used."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "QGS=1 (single query) serves as the baseline comparison for all experiments. Performance at QGS=1 is always reported alongside higher QGS values."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Models tested include contemporary choices: Llama 3, Mistral 7B, Gemma 7B, Qwen 1.5, Mixtral 8x7B. These were reasonably current at time of writing."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study is conducted. The paper does not investigate which aspects of group queries cause degradation (e.g., context length vs. task confusion vs. attention dilution)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses accuracy for multiple-choice/code/math tasks and sacreBLEU for translation tasks. It also reports predominant output option proportions."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant — all tasks have automated ground-truth evaluation (accuracy, sacreBLEU, unit tests for HumanEval)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses standard train/test splits from established benchmarks. Section 3.3 describes randomly partitioning evaluation data to separate first queries from additional queries. Section 4.1 notes use of corresponding test sets."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per dataset (MedMCQA, PubMedQA, Aqua-RAT, MathQA, HumanEval, WMT20), per model, and per QGS level across extensive tables in the appendix."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper observes that models collapse to outputting a single option but does not analyze why or show qualitative failure examples beyond the single Figure 1 illustration."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that GQA has limited impact on multiple-choice and translation tasks for non-fine-tuned models (Section 4.3, Q3), which is a negative result for the attack's generality."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims GQA degrades fine-tuned models (supported by Tables 1-2), triggers backdoors (Tables 2, 8), and affects reasoning tasks (Tables 3, 14-15). All are demonstrated in the results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims GQA 'degrades performance' and 'triggers backdoors' — causal language — but does not control for confounds like increased context length alone. The mechanism is not isolated; longer input could explain the degradation regardless of multi-query structure."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Group Query Hallucination Attacks' broadly, but results are limited to a specific set of models (ranging from gpt-j-6b to mixtral-8x7b) on specific benchmarks. No acknowledgment that results may not apply to larger models or different task types."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations. The degradation could be due to increased context length, attention dilution, or format confusion, but none of these are analyzed or discussed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures accuracy and sacreBLEU directly and does not overclaim beyond these metrics. Claims about 'performance degradation' match the granularity of what is measured."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model versions are listed: llama2-7b, mistral-7b-v0.1, gemma-7b, qwen-7b, gpt-j-6b, mixtral-8x7b-v0.1, llama-33b, plus aligned versions with version identifiers (mistral-7b-it-v0.3, gemma1.1-7b-it, qwen1.5-7b-chat, llama3-8b-instruct)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt templates with actual fill values are provided in Appendix B (Figures 3-5, Table 4), including system prompts, user/assistant prefixes, and task-specific values for all configurations."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix B.2 reports: 10% warmup ratio, final LR decayed to 10% of peak, 3 epochs, LR 2e-5, batch size 64, sequence length 2048, greedy search decoding."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The approach is direct prompting of LLMs."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.3 describes the evaluation procedure: random partitioning into additional queries and first queries, fixing order of additional queries, and averaging over 3 partitions for Q3. Appendix B describes prompt formatting. Backdoor injection procedure described in Section 4.2 (1% sampling, combining into group queries, 0.5% of total)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 is titled 'Limitations' and discusses three specific limitations: limited scenarios tested, only first-query responses analyzed, and insufficient model fine-tuning due to time constraints."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6 mentions specific limitations: 'users tend to ask more open-ended questions rather than restricting themselves to the specific tasks mentioned in this paper' and 'this paper only examines metrics related to responses to the first query.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The limitations section acknowledges some gaps but does not explicitly state what the results do NOT show — e.g., does not state that results may not apply to larger models, closed-source APIs, or real-world conversational settings."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental outputs, model predictions, or intermediate data are released."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Appendix A provides detailed descriptions of each dataset with sizes, splits, and sources. Section 4.2 describes the backdoor data generation procedure."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 3.3 documents the evaluation pipeline: random partitioning, fixing additional query order, evaluating first query response. Section 4.2 documents the backdoor injection pipeline: 1% sampling where answer=A, combining into group queries, reintegrating at 0.5% of total."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: University of Science and Technology of China and Purdue University."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the pre-trained models used."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether benchmark data appeared in model training sets. HumanEval (2021) could have been seen by later models."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "HumanEval was published in 2021 and most models tested were trained after that. MedMCQA and other benchmarks are also publicly available. No contamination analysis is provided."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or API cost is reported despite running extensive experiments across 7+ models and multiple QGS levels."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No GPU hours, hardware specifications, or total compute budget is mentioned despite fine-tuning 7 models on multiple datasets."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No seed sensitivity analysis. Section 3.3 mentions 3 random partitions for Q3, but no seed variation for model training or inference."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 3.3 states the random partitioning is performed three times for Q3, and once for Q1/Q2. This establishes the number of runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is described. The paper uses fixed hyperparameters from prior work (Chen et al.) but does not state whether any tuning was performed."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No discussion of how the specific fine-tuning configuration was selected. Hyperparameters are adopted from prior work without justification for this specific setting."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons, despite comparing many models across many datasets and QGS values."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors propose GQA and evaluate it themselves without acknowledging any evaluation bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "GQA is not a method that trades compute for performance — it is an attack that changes input format. Compute differences between QGS levels are negligible relative to the question."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the benchmarks used (HumanEval, MedMCQA, etc.) are valid measures of the capabilities claimed to be degraded by GQA."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved — models are prompted directly."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether models were trained on the benchmark data. HumanEval (2021), MedMCQA (2022), and other benchmarks predate several models tested."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information. The few-shot examples and prompt format could provide hints not available in natural usage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of train/test independence for any benchmark used."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "GQA significantly degrades performance of fine-tuned models, with most models collapsing to outputting a single option at QGS=2",
    365       "evidence": "Table 1 and Table 5 show accuracy drops across 7 models on 4 datasets (e.g., mistral-7b on MedMCQA: 61.1% → 32.1%). Table 6 shows >98% single-option output frequency at QGS=2.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "GQA triggers potential backdoors in LLMs fine-tuned on poisoned datasets, causing models to preferentially output option A",
    370       "evidence": "Table 2 and Table 8 show that models fine-tuned on 0.5% backdoor data output A at 83-100% rate at QGS=2, compared to mixed option distributions for non-backdoored models.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "GQA has limited impact on multiple-choice and translation tasks for non-fine-tuned models but pronounced effect on code and mathematical reasoning",
    375       "evidence": "Table 3: MedMCQA accuracy drops only a few pp (e.g., llama3-8b-it: 59.9→57.9, a 2.0pp drop) but HumanEval drops from 28.5% to 0.0% for gemma-7b-it at QGS≥2. Math reasoning also shows larger drops (gemma-7b-it: 43.3→22.5).",
    376       "supported": "strong"
    377     }
    378   ],
    379   "red_flags": [
    380     {
    381       "flag": "No statistical significance tests",
    382       "detail": "The paper claims 'significant' performance degradation throughout but never performs any statistical test. All comparisons are raw accuracy numbers without uncertainty quantification."
    383     },
    384     {
    385       "flag": "Context length confound not controlled",
    386       "detail": "GQA increases context length. The paper does not control for whether degradation is caused by the multi-query structure specifically or simply by longer inputs. Table 13 shows input tokens increase substantially with QGS."
    387     },
    388     {
    389       "flag": "Backdoor claim is weak",
    390       "detail": "The Q2 backdoor experiment injects group queries with answer A into training, then tests with group queries — finding model outputs A. This may simply be the model learning the training distribution pattern rather than demonstrating a meaningful backdoor risk."
    391     },
    392     {
    393       "flag": "No code or artifact release",
    394       "detail": "Despite fine-tuning 7 models and running extensive experiments, no code, fine-tuned model weights, or experimental outputs are released (the underlying benchmark datasets are public)."
    395     },
    396     {
    397       "flag": "Missing variance/uncertainty",
    398       "detail": "For Q1/Q2, only a single random partition is used. For Q3, three partitions are averaged but no standard deviation is reported. Impossible to assess result stability."
    399     }
    400   ],
    401   "cited_papers": [
    402     {
    403       "title": "Evaluating large language models trained on code",
    404       "authors": ["Mark Chen", "Jerry Tworek"],
    405       "year": 2021,
    406       "arxiv_id": "2107.03374",
    407       "relevance": "Introduces HumanEval benchmark for code generation, used as evaluation dataset in this paper."
    408     },
    409     {
    410       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    411       "authors": ["Evan Hubinger"],
    412       "year": 2024,
    413       "relevance": "Studies backdoor persistence in LLMs through safety training, directly relevant to the backdoor triggering investigated in this paper."
    414     },
    415     {
    416       "title": "BadChain: Backdoor Chain-of-Thought Prompting for Large Language Models",
    417       "authors": ["Zhen Xiang"],
    418       "year": 2024,
    419       "relevance": "Demonstrates backdoor attacks via chain-of-thought prompting in LLMs."
    420     },
    421     {
    422       "title": "Lost in the middle: How language models use long contexts",
    423       "authors": ["Nelson F Liu"],
    424       "year": 2024,
    425       "relevance": "Studies how LLMs process long contexts, directly relevant to the context accumulation effects studied in GQA."
    426     },
    427     {
    428       "title": "The Reversal Curse: LLMs trained on 'A is B' fail to learn 'B is A'",
    429       "authors": ["Lukas Berglund"],
    430       "year": 2024,
    431       "relevance": "Studies LLM failure modes in generalization, part of the broader failure mode literature this paper contributes to."
    432     },
    433     {
    434       "title": "Large language models can be easily distracted by irrelevant context",
    435       "authors": ["Freda Shi"],
    436       "year": 2023,
    437       "relevance": "Studies LLM distractibility, closely related to the context accumulation degradation observed in GQA."
    438     },
    439     {
    440       "title": "DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models",
    441       "authors": ["Boxin Wang"],
    442       "year": 2024,
    443       "relevance": "Comprehensive trustworthiness evaluation of LLMs including robustness and adversarial attacks."
    444     },
    445     {
    446       "title": "Backdooring Instruction-Tuned Large Language Models with Virtual Prompt Injection",
    447       "authors": ["Jun Yan"],
    448       "year": 2023,
    449       "relevance": "Studies backdoor injection methods for instruction-tuned LLMs via fine-tuning."
    450     }
    451   ]
    452 }

Impressum · Datenschutz