scan.json (26499B)
1 { 2 "paper": { 3 "title": "AQUA-LLM: Evaluating Accuracy, Quantization, and Adversarial Robustness Trade-offs in LLMs for Cybersecurity Question Answering", 4 "authors": [ 5 "Onat Gungor", 6 "Roshan Sood", 7 "Harold Wang", 8 "Tajana Rosing" 9 ], 10 "year": 2025, 11 "venue": "arXiv preprint", 12 "arxiv_id": "2509.13514" 13 }, 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The authors reference external libraries (Unsloth, BitsAndBytes, DeepTeam) but do not release their own evaluation framework code." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses two publicly available benchmarks: CyberBench (reference [18]) and CyberMetric (reference [8]). These are standard public datasets that the authors did not modify in a proprietary way. The adversarial prompts are generated via the DeepTeam framework using a described procedure." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions hardware (NVIDIA A100 GPU, 80 GB, 16-core CPU, 32 GB RAM, Linux VM) in Section III-G and libraries used (Unsloth, BitsAndBytes, Transformers) but does not provide a requirements.txt, Dockerfile, or detailed dependency version listing sufficient to recreate the environment." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. While the methodology sections describe the experimental pipeline conceptually, there are no specific commands or reproduction guides." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results are reported as point estimates (e.g., '98.75% accuracy', '77.92% ASR'). 
No confidence intervals, error bars, or uncertainty measures are provided on any metric." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes numerous comparative claims (e.g., 'fine-tuning substantially improves task performance', 'quantization accelerates inference at the cost of robustness') but relies entirely on comparing raw numbers without any statistical significance tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports percentage improvements with baseline context, e.g., 'Mistral-7B reaches 100% accuracy on CyberMetric, compared to 70% in its base configuration' (Section IV.1), 'ASR increases from 52.25% in base models to 77.92% in quantized models' (Section IV.2), and 'up to 1.28× speedup' and '16× loss in robustness' (Table II)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The evaluation uses 50 and 100 questions per benchmark subset and 50 or 100 adversarial prompts. These are very small sample sizes for the comparative claims being made, and no justification or power analysis is provided for why these sizes are sufficient." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviations, variance, or spread measures across experimental runs are reported. Results appear to be from single runs. There is no mention of multiple seeds or repeated trials." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares four configurations (base, quantized, fine-tuned, fine-tuned+quantized) across six models, where base models serve as baselines for each model's other configurations. Table I and Figures 2-3 show these comparisons." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The six evaluated models are all recent: Meta LLaMA-3.1-8B-Instruct, Mistral-7B-Instruct, Phi-3.5-Mini-Instruct, Foundation-Sec-8B, Qwen 2.5-7B-Instruct, and DeepSeek-R1-Distill. These are contemporary models as of 2025." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The four configurations (base, quantized, fine-tuned, fine-tuned+quantized) effectively constitute an ablation study showing the individual and combined effects of quantization and fine-tuning on accuracy and robustness." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Three evaluation metrics are used: Accuracy (QA correctness), Attack Success Rate (ASR, adversarial robustness), and Inference Latency (efficiency), as described in Section III-G." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a benchmark evaluation paper measuring automated metrics (accuracy, ASR, latency) on multiple-choice QA tasks. Human evaluation of system outputs is not relevant to the claims being made." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Section III-C describes explicit train/test splits: CyberMetric uses 850 training and 150 testing samples; CyberBench fine-tunes on SecMMLU (116 questions) and evaluates on CyQuiz (128 questions), a separate subset." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per model (6 models), per configuration (4 configurations), per dataset (CyberBench and CyberMetric), and per evaluation size (50 and 100 questions) in Tables I and II and Figures 2-5." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses cases where the expected pattern breaks down, e.g., Finding 5 notes Foundation-Sec-8B achieves 'up to 100% ASR' despite strong accuracy, and Finding 3 notes 'only a few exceptions across the evaluated LLMs, e.g., Meta-Llama-3.1-8B' where the FTQ robustness improvement does not hold." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Several negative findings are prominently reported: quantization alone degrades both accuracy and robustness (Finding 4), cybersecurity-specialized LLMs exhibit 'alarmingly poor robustness' (Finding 5), and fine-tuning alone 'frequently reduces robustness compared to base models' (Conclusion)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims that (1) quantization alone yields lowest accuracy and robustness, (2) combining quantization with fine-tuning enhances both robustness and performance. Both claims are supported by the experimental results in Section IV (Tables I-II, Figures 2-5)." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about the effects of fine-tuning and quantization (e.g., 'fine-tuning substantially improves task performance'). These are supported by controlled comparisons — the same models are evaluated across four configurations that differ only in the applied treatment (fine-tuning, quantization, or both), constituting adequate single-variable manipulation." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title and abstract make broad claims about 'LLMs for Cybersecurity Question Answering' but the study tests only six small open-source models (roughly 4-8B parameters) on two benchmarks with only multiple-choice QA format. The paper does not adequately bound its conclusions to this narrow scope — no mention that results may not generalize to larger models, closed-source models, or other QA formats." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for its findings. For example, it speculates that FTQ may act as 'a regularization mechanism that enhances model robustness' (Section IV.3) without considering alternatives. No threats-to-validity section or systematic consideration of confounds is provided." 131 } 132 }, 133 "setup_transparency": { 134 "model_versions_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "Models are identified by general names (e.g., 'Meta LLaMA-3.1-8B-Instruct', 'Mistral-7B-Instruct', 'Phi-3.5-Mini-Instruct') but without exact version hashes, checkpoint dates, or Hugging Face model IDs with specific revisions. The paper uses marketing/product names rather than precise reproducible identifiers." 138 }, 139 "prompts_provided": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper describes using 'model-specific chat templates' (Section III-C) and formatting prompts as 'instruction-response pairs' but does not provide the actual prompt text used for evaluation or fine-tuning. No prompt templates or examples are included." 143 }, 144 "hyperparameters_reported": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section III-C and III-E report key hyperparameters: LoRA rank=16, alpha=16, no dropout, 60 training steps, AdamW optimizer, peak learning rate 2×10^-4. 
Section III-D specifies 4-bit NF4 quantization with BitsAndBytes." 148 }, 149 "scaffolding_described": { 150 "applies": false, 151 "answer": false, 152 "justification": "No agentic scaffolding is used. The evaluation is a standard benchmark evaluation pipeline with no agent loops, tool use, or multi-step reasoning scaffolds." 153 }, 154 "data_preprocessing_documented": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section III-C documents data preprocessing: loading Hugging Face datasets, standardizing into instruction-response pairs using Unsloth, formatting with model-specific chat templates, and tokenization. Section III-B describes how CyberBench was adapted into multiple-choice QA format. Training/test splits are explicitly stated." 158 } 159 }, 160 "limitations_and_scope": { 161 "limitations_section_present": { 162 "applies": true, 163 "answer": false, 164 "justification": "There is no dedicated Limitations, Threats to Validity, or similar section in the paper. The paper goes directly from experimental results to conclusions without discussing limitations." 165 }, 166 "threats_to_validity_specific": { 167 "applies": true, 168 "answer": false, 169 "justification": "No threats to validity are discussed anywhere in the paper, whether specific or generic." 170 }, 171 "scope_boundaries_stated": { 172 "applies": true, 173 "answer": false, 174 "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge limitations such as: only small models tested, only multiple-choice QA format evaluated, only one type of adversarial attack used, or that results may not generalize to other cybersecurity tasks or larger models." 175 } 176 }, 177 "data_integrity": { 178 "raw_data_available": { 179 "applies": true, 180 "answer": false, 181 "justification": "No raw experimental data (individual model responses, per-question accuracy breakdowns, individual adversarial prompt results) is made available. 
Only aggregated metrics in tables and figures are reported." 182 }, 183 "data_collection_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "The data collection procedure is described: CyberBench and CyberMetric are publicly available benchmarks, adversarial prompts are generated using the DeepTeam framework with the IllegalActivity vulnerability class targeting cybercrime scenarios (Section III-F), and GPT-3.5 generates 50 or 100 adversarial examples per configuration." 187 }, 188 "recruitment_methods_described": { 189 "applies": false, 190 "answer": false, 191 "justification": "No human participants are involved. The study uses standard public benchmarks and automated adversarial prompt generation." 192 }, 193 "data_pipeline_documented": { 194 "applies": true, 195 "answer": true, 196 "justification": "The data pipeline is documented across Sections III-B through III-F: dataset loading → preprocessing into instruction-response pairs → tokenization → model inference → evaluation via accuracy and ASR metrics. The adversarial evaluation pipeline (prompt generation via DeepTeam → model querying → GPT-3.5 harmfulness assessment) is also described." 197 } 198 }, 199 "conflicts_of_interest": { 200 "funding_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "The Acknowledgments section discloses funding from NSF (with specific award numbers) and DARPA (via PRISM and CoCoSys centers in JUMP 2.0, an SRC program)." 204 }, 205 "affiliations_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "All four authors list their affiliation as the Department of Computer Science and Engineering, University of California, San Diego. No commercial product of their own is being evaluated." 
209 }, 210 "funder_independent_of_outcome": { 211 "applies": true, 212 "answer": true, 213 "justification": "Funding is from NSF and DARPA, which are government agencies without a direct financial interest in the specific outcome of whether fine-tuned quantized models outperform other configurations." 214 }, 215 "financial_interests_declared": { 216 "applies": true, 217 "answer": false, 218 "justification": "No competing interests or financial interests statement is present in the paper." 219 } 220 }, 221 "contamination": { 222 "training_cutoff_stated": { 223 "applies": true, 224 "answer": false, 225 "justification": "The paper does not state the training data cutoff dates for any of the six models evaluated. This is relevant because the models may have been trained on data that includes the benchmark questions." 226 }, 227 "train_test_overlap_discussed": { 228 "applies": true, 229 "answer": false, 230 "justification": "No discussion of potential train/test overlap. The CyberMetric benchmark questions were generated by GPT-3.5 and the models evaluated could have been trained on similar or identical data. No contamination analysis is performed." 231 }, 232 "benchmark_contamination_addressed": { 233 "applies": true, 234 "answer": false, 235 "justification": "CyberBench and CyberMetric were published before 2025, and the evaluated models (especially 2024-2025 vintage) could have been trained on data containing these benchmarks. The paper does not address this contamination risk at all." 236 } 237 }, 238 "human_studies": { 239 "pre_registered": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants are involved in this study." 243 }, 244 "irb_or_ethics_approval": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study." 248 }, 249 "demographics_reported": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in this study." 
253 }, 254 "inclusion_exclusion_criteria": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in this study." 258 }, 259 "randomization_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in this study." 263 }, 264 "blinding_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in this study." 268 }, 269 "attrition_reported": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants are involved in this study." 273 } 274 }, 275 "cost_and_practicality": { 276 "inference_cost_reported": { 277 "applies": true, 278 "answer": true, 279 "justification": "Table II reports per-question inference latency in seconds for each model configuration (base vs. quantized), with speedup ratios. For example, Meta-Llama-3.1-8B: 0.140s (base) vs. 0.112s (quantized), 1.25× speedup." 280 }, 281 "compute_budget_stated": { 282 "applies": true, 283 "answer": false, 284 "justification": "The hardware is specified (NVIDIA A100 80GB GPU) but total compute budget (GPU hours for fine-tuning, total experiment wall time, API costs for GPT-3.5 calls used in adversarial prompt generation and harmfulness evaluation) is not reported." 285 } 286 } 287 }, 288 "claims": [ 289 { 290 "claim": "Task-specific fine-tuning substantially improves QA accuracy, with fine-tuned models achieving 98.75% on CyberMetric and 97.17% on CyberBench compared to 70.42% and 69.83% for base models.", 291 "evidence": "Table I shows accuracy before and after fine-tuning across all six models on both datasets. All models show large accuracy gains (e.g., Mistral-7B: 70% → 100% on CyberMetric). 
Section IV Finding 1.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Quantization accelerates inference by up to 1.28× but increases ASR from 52.25% (base) to 77.92% (quantized), representing up to 16× robustness loss.", 296 "evidence": "Table II provides per-model latency and ASR comparisons between base and quantized configurations. The 16× figure comes from Meta-Llama-3.1-8B (5% → 80% ASR). Section IV Finding 2.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Combining fine-tuning with quantization (FTQ) preserves accuracy while improving robustness over fine-tuning alone, with average ASR of 64.58% vs. 84.08% on CyberBench.", 301 "evidence": "Section IV Finding 3 and Figure 4 compare FT vs. FTQ configurations. The paper acknowledges exceptions (e.g., Meta-Llama-3.1-8B). Average FTQ accuracy is 94.92% vs. 97.17% for FT on CyberBench.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "The cybersecurity-specialized Foundation-Sec-8B achieves top accuracy but exhibits 'alarmingly poor robustness' with ASR up to 100%.", 306 "evidence": "Section IV Finding 5 and Figures 2-3 show Foundation-Sec-8B reaching 100% accuracy in FT/FTQ but ASR up to 100% on CyberMetric for fine-tuned configuration.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Fine-tuning combined with quantization may act as a regularization mechanism that enhances model robustness.", 311 "evidence": "Section IV Finding 3 speculates this based on observed lower ASR for FTQ vs. FT models. No mechanistic analysis or theoretical justification is provided.", 312 "supported": "weak" 313 } 314 ], 315 "methodology_tags": [ 316 "benchmark-eval" 317 ], 318 "key_findings": "AQUA-LLM evaluates six small open-source LLMs in four configurations (base, quantized, fine-tuned, fine-tuned+quantized) on cybersecurity QA accuracy, adversarial robustness (prompt injection ASR), and inference latency. 
Fine-tuning dramatically improves QA accuracy (from ~70% to ~98%), while quantization alone degrades both accuracy and robustness. Combining fine-tuning with quantization preserves most accuracy gains while improving robustness over fine-tuning alone, suggesting FTQ as the preferred deployment configuration. The cybersecurity-specialized Foundation-Sec-8B achieves top accuracy but exhibits severely poor adversarial robustness.", 319 "red_flags": [ 320 { 321 "flag": "Very small evaluation samples", 322 "detail": "The evaluation uses only 50 and 100 questions per benchmark configuration, and 50 or 100 adversarial prompts. These sample sizes are too small for reliable comparative claims — a few changed answers could flip the relative rankings. No confidence intervals or significance tests compensate for this." 323 }, 324 { 325 "flag": "Single-run results with no variance reporting", 326 "detail": "All results appear to be from single experimental runs with no repeated trials, seeds, or variance reporting. Given the stochastic nature of LLM inference and adversarial prompt generation, this makes the results unreliable." 327 }, 328 { 329 "flag": "Suspiciously high fine-tuning accuracy", 330 "detail": "Several models achieve 98-100% accuracy after fine-tuning on small datasets (116-850 training samples) evaluated on 50-100 test questions. This raises concerns about potential train/test contamination, memorization, or the test sets being too easy. No contamination analysis is performed." 331 }, 332 { 333 "flag": "GPT-3.5 as adversarial judge", 334 "detail": "Both adversarial prompt generation and harmfulness evaluation are performed by GPT-3.5 API calls. This introduces a dependency on a black-box third-party model whose behavior may change over time, and the reliability of GPT-3.5 as a harmfulness classifier is not validated." 
335 }, 336 { 337 "flag": "No limitations section", 338 "detail": "The paper contains no limitations, threats to validity, or discussion of scope boundaries. This is a significant omission for a paper making broad claims about LLM deployment in safety-critical cybersecurity applications." 339 }, 340 { 341 "flag": "Benchmark contamination risk unaddressed", 342 "detail": "CyberBench and CyberMetric were published before the evaluated models' likely training cutoffs. CyberMetric questions were generated by GPT-3.5, and the models may have been trained on similar or identical data. This is never discussed." 343 }, 344 { 345 "flag": "Overlapping test subsets", 346 "detail": "Section III-C states that for CyberBench, 'the CyQuiz test set is further partitioned into overlapping subsets of 50 and 100 questions.' The 50- and 100-question subsets therefore share questions (the smaller may even be fully contained in the larger), meaning the two reported numbers are not independent evaluations." 347 } 348 ], 349 "cited_papers": [ 350 { 351 "title": "CyberBench: A Multi-Task Benchmark for Evaluating Large Language Models in Cybersecurity Applications", 352 "authors": ["Z. Liu", "J. Shi", "J. F. Buford"], 353 "year": 2024, 354 "relevance": "Cybersecurity-specific LLM benchmark used as one of the two evaluation datasets in this study." 355 }, 356 { 357 "title": "CyberMetric: A Benchmark Dataset Based on Retrieval-Augmented Generation for Evaluating LLMs in Cybersecurity Knowledge", 358 "authors": ["N. Tihanyi"], 359 "year": 2024, 360 "relevance": "RAG-based cybersecurity QA benchmark used as one of the two evaluation datasets." 361 }, 362 { 363 "title": "CyberSecEval 2: A Wide-Ranging Cybersecurity Evaluation Suite for Large Language Models", 364 "authors": ["R. Bhatt", "S. Chennabasappa"], 365 "year": 2024, 366 "arxiv_id": "2404.13161", 367 "relevance": "Comprehensive cybersecurity evaluation suite that identifies safety-utility trade-offs in LLMs, directly relevant to LLM safety benchmarking." 
368 }, 369 { 370 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 371 "authors": ["K. Greshake"], 372 "year": 2023, 373 "relevance": "Foundational work on indirect prompt injection attacks against LLMs, relevant to adversarial robustness evaluation." 374 }, 375 { 376 "title": "Sleeper Agents: Training Deceptive LLMs That Persist Through Safety Training", 377 "authors": ["E. Hubinger"], 378 "year": 2024, 379 "arxiv_id": "2401.05566", 380 "relevance": "Demonstrates persistent deceptive behavior in LLMs surviving safety training, relevant to LLM safety and adversarial robustness." 381 }, 382 { 383 "title": "Exploiting LLM Quantization", 384 "authors": ["K. Egashira"], 385 "year": 2024, 386 "arxiv_id": "2405.18137", 387 "relevance": "Shows that quantized models can exhibit adversarial behavior not present in full-precision counterparts, directly relevant to quantization safety." 388 }, 389 { 390 "title": "Fine-tuning, Quantization, and LLMs: Navigating Unintended Outcomes", 391 "authors": ["D. Kumar"], 392 "year": 2024, 393 "arxiv_id": "2404.04392", 394 "relevance": "Demonstrates that fine-tuning and quantization combined can degrade robustness, the key prior work this paper extends." 395 }, 396 { 397 "title": "PromptBench: Towards Evaluating the Robustness of Large Language Models on Adversarial Prompts", 398 "authors": ["Y. Zhu"], 399 "year": 2023, 400 "arxiv_id": "2306.04528", 401 "relevance": "Benchmark for evaluating LLM robustness to adversarial prompts, relevant to prompt injection evaluation methodology." 402 }, 403 { 404 "title": "LoRA: Low-Rank Adaptation of Large Language Models", 405 "authors": ["E. J. Hu"], 406 "year": 2022, 407 "relevance": "The parameter-efficient fine-tuning method used in this study, widely used in LLM adaptation research." 408 }, 409 { 410 "title": "QLoRA: Efficient Finetuning of Quantized LLMs", 411 "authors": ["T. Dettmers", "A. Pagnoni", "A. Holtzman", "L. 
Zettlemoyer"], 412 "year": 2023, 413 "relevance": "Introduces the 4-bit NF4 quantization approach (implemented in the BitsAndBytes library) used in this paper's quantization pipeline." 414 }, 415 { 416 "title": "CyberLLMInstruct: A New Dataset for Analyzing Safety of Fine-Tuned LLMs Using Cyber Security Data", 417 "authors": ["A. ElZemity", "B. Arief", "S. Li"], 418 "year": 2025, 419 "relevance": "Investigates safety risks of fine-tuning LLMs on cybersecurity data, finding increased vulnerability to adversarial prompts." 420 }, 421 { 422 "title": "Exploring the Role of Large Language Models in Cybersecurity: A Systematic Survey", 423 "authors": ["S. Tian"], 424 "year": 2025, 425 "arxiv_id": "2504.15622", 426 "relevance": "Systematic survey of LLMs in cybersecurity, relevant as a comprehensive review of the field this paper contributes to." 427 } 428 ] 429 }