scan.json (21832B)
1 { 2 "paper": { 3 "title": "HarmTransform: Transforming Explicit Harmful Queries into Stealthy via Multi-Agent Debate", 4 "authors": ["Shenzhe Zhu"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2512.23717", 8 "doi": "10.48550/arXiv.2512.23717" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "HarmTransform, a multi-agent debate framework for transforming harmful queries into stealthier forms, achieves 0.36 attack effectiveness vs 0.24 for the best baseline (SingleLLM), while maintaining 0.73 intent preservation. Ablation studies show that increasing debaters beyond 3 or rounds beyond 1 yields diminishing or negative returns. Qualitative analysis reveals debate is a double-edged sword: it can improve stealth via collaborative camouflage but also cause regressions through over-specification.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL or code link is provided in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses the publicly available Safe-RLHF dataset (Dai et al., 2023) as the source of harmful queries." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, dependency lists, or setup instructions are provided." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No reproduction instructions or scripts are provided." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Table 1 and Figures 4-5 report point estimates only with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims HarmTransform outperforms baselines by 0.12 but provides no statistical significance tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports absolute scores and differences (e.g., 0.36 vs 0.24 effectiveness, an improvement of 0.12), providing enough context to assess magnitude." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper samples 100 queries from Safe-RLHF with no justification for why 100 is sufficient." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviations, variance, or spread measures are reported across runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Three baselines are compared: SingleLLM, SingleLLMReflect, and HarmTransform-NoDebate (Section 5.1)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "The paper acknowledges 'no prior work directly addresses this task' and designs its own baselines. These are intuitive ablations rather than competitive external methods." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 5.3 provides ablation studies varying the number of debaters (3-6) and debate rounds (0-4)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Two metrics are used: Preservation (intent preservation) and Effectiveness (attack effectiveness), defined in Section 4." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "All evaluation is automated via LLM-as-judge. No human evaluation of query quality or stealthiness is performed." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "The paper samples 100 queries and evaluates on them directly. No train/dev/test separation is described." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": false, 98 "justification": "Results are reported as aggregate scores only. No breakdown by harm category or query type is provided." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6 provides a detailed case study of debate regression (6 cases) and debate improvement (8 cases), analyzing why debate fails or succeeds." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that SingleLLMReflect performs worse than SingleLLM (0.18 vs 0.24), and that additional debate rounds yield diminishing returns (Section 5.3)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims HarmTransform 'significantly outperforms standard baselines' which is supported by Table 1 (0.36 vs 0.24). The 'double-edged sword' claim is supported by the case study in Section 6." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims about debate improving stealth ('debate improves the effectiveness of query stealth') via ablation, but the ablation design is confounded — the debate and no-debate conditions differ in multiple ways (agent interaction, context length, summarization input), making it hard to isolate the debate mechanism." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper tests only on 100 queries from one dataset with one model (DeepSeek-V3) but makes broad claims about multi-agent debate for 'LLM safety alignment' generally." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for the results, such as whether the improvement is due to increased compute/tokens rather than the debate structure itself." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper uses LLM-as-judge for intent preservation and refusal detection for effectiveness, both proxies. No discussion of whether these automated measures actually capture stealthiness or harmfulness as perceived by humans." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper states 'DeepSeek-V3' but provides no version number, snapshot date, or API version." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt texts for debaters, summarizer, generator, and harmful intent judge are provided in Appendix B (Figures 8-12)." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No temperature, top-p, max tokens, or other LLM API hyperparameters are reported." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The multi-agent debate pipeline is described in detail in Section 3, including persona assignment, local-history sharing, summarization, and generation steps (Figure 1)." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper states 'we sample the first 100 queries' from Safe-RLHF but does not describe any filtering or preprocessing criteria beyond this." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7 'Limitation and Future Study' discusses intent shift and information overload as key limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 7 identifies specific threats: agents drifting from original intent and debates accumulating redundant content that reduces stealth quality." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The limitations section discusses failure modes but does not explicitly state what the results do NOT show (e.g., that results are limited to DeepSeek-V3, English queries, or the specific harm categories in Safe-RLHF)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data (transformed queries, debate logs, judge outputs) is released for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 5.1 states the source dataset (Safe-RLHF) and sampling method ('we sample the first 100 queries')." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data source is a standard public benchmark." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from query to evaluation is described at a high level but the case study (Section 6) selects 50 queries then extracts 14 divergent cases without explaining selection criteria for the 50." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliation is clearly stated as University of Toronto." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial disclosure statement is provided." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "This paper tests a red-teaming framework's ability to generate stealthy queries, not a pre-trained model's benchmark knowledge. The LLM is used as a tool for generation, not evaluated for learned capabilities." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Same as above — the paper does not evaluate a model's pre-trained knowledge on a benchmark." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Same as above — contamination in the traditional sense is not applicable." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "The framework calls DeepSeek-V3 many times (3+ debaters, summarizer, generator, judge) per query. No cost, latency, or token usage is reported." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No compute budget, API costs, or total resource usage is stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of random seeds or sensitivity analysis across runs." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "No statement of how many runs produced the reported results." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of debaters (3) and rounds (1) appear chosen without justification. No search budget described." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The default configuration (M=3, N=1) is used for main results without justifying why this configuration was selected." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "All baselines are designed by the authors. No acknowledgment of self-comparison bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "HarmTransform uses significantly more compute than SingleLLM (3 debaters + summarizer + generator vs 1 LLM call) but this compute disparity is never discussed." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "Both metrics rely on LLM-as-judge (same model, DeepSeek-V3). No discussion of whether this judge accurately measures intent preservation or attack effectiveness." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "The paper evaluates its own pipeline, not comparing different models within different scaffolds." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The Safe-RLHF dataset (2023) may have been in DeepSeek-V3's training data. Not discussed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The same model (DeepSeek-V3) is used for generation and judging, creating potential bias. Not discussed." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of potential dependencies between the 100 sampled queries." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are used." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "HarmTransform achieves the highest attack effectiveness (0.36) among all methods, exceeding the second-best by 0.12.", 365 "evidence": "Table 1 (Section 5.2) shows effectiveness scores: HarmTransform 0.36, SingleLLM 0.24, HarmTransform-NoDebate 0.22, SingleLLMReflect 0.18.", 366 "supported": "weak" 367 }, 368 { 369 "claim": "HarmTransform maintains competitive intent preservation (0.73) while achieving highest effectiveness.", 370 "evidence": "Table 1 shows preservation scores: SingleLLM 0.77, HarmTransform 0.73, HarmTransform-NoDebate 0.73, SingleLLMReflect 0.37.", 371 "supported": "weak" 372 }, 373 { 374 "claim": "Increasing the number of debaters beyond 3 does not improve attack effectiveness.", 375 "evidence": "Figure 4 shows effectiveness remains relatively flat across 3-6 debaters.", 376 "supported": "weak" 377 }, 378 { 379 "claim": "Additional debate rounds beyond 1 lead to diminishing or negative returns.", 380 "evidence": "Figure 5 shows effectiveness peaks at round 1 and declines with more rounds.", 381 "supported": "weak" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "Tiny sample size", 387 "detail": "Only 100 queries are used for evaluation, and the case study analyzes just 14 divergent cases from 50 queries. No error bars or significance tests accompany any results." 388 }, 389 { 390 "flag": "Same model used for generation and evaluation", 391 "detail": "DeepSeek-V3 is used for all components: debaters, summarizer, generator, and both judges (intent preservation and refusal detection). This circularity may inflate results." 392 }, 393 { 394 "flag": "LLM-as-judge without validation", 395 "detail": "Both metrics rely entirely on LLM-based judgment with no human validation of the judge's accuracy. The intent preservation judge prompt (Appendix B.4) makes binary decisions with no inter-rater reliability check." 396 }, 397 { 398 "flag": "Unfair compute comparison", 399 "detail": "HarmTransform uses 3 debaters × multiple rounds + summarizer + generator (5+ LLM calls), while SingleLLM uses 1 call. The 0.12 effectiveness improvement may simply reflect more compute, not better methodology." 400 }, 401 { 402 "flag": "Case study attribution uses same LLM", 403 "detail": "The qualitative case study (Section 6) uses LLM-based attribution analysis to explain why debate helps or hurts, introducing another layer of unvalidated LLM judgment." 404 } 405 ], 406 "cited_papers": [ 407 { 408 "title": "Safe RLHF: Safe Reinforcement Learning from Human Feedback", 409 "authors": ["Josef Dai", "Xuehai Pan", "Ruiyang Sun"], 410 "year": 2023, 411 "arxiv_id": "2310.12773", 412 "relevance": "Safety alignment dataset used as the source of harmful queries in this study." 413 }, 414 { 415 "title": "Foundational Challenges in Assuring Alignment and Safety of Large Language Models", 416 "authors": ["Usman Anwar"], 417 "year": 2024, 418 "arxiv_id": "2404.09932", 419 "relevance": "Survey of foundational challenges in LLM safety alignment." 420 }, 421 { 422 "title": "Jailbroken: How Does LLM Safety Training Fail?", 423 "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"], 424 "year": 2024, 425 "arxiv_id": "2307.02483", 426 "relevance": "Analysis of how LLM safety training fails against jailbreaking attacks." 427 }, 428 { 429 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 430 "authors": ["Andy Zou", "Zifan Wang", "J Zico Kolter", "Matt Fredrikson"], 431 "year": 2023, 432 "arxiv_id": "2307.15043", 433 "relevance": "Foundational work on adversarial attacks against LLM safety alignment." 434 }, 435 { 436 "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate", 437 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba"], 438 "year": 2023, 439 "arxiv_id": "2305.14325", 440 "relevance": "Foundational work on multi-agent debate for improving LLM reasoning." 441 }, 442 { 443 "title": "Encouraging Divergent Thinking in Large Language Models through Multi-Agent Debate", 444 "authors": ["Tian Liang"], 445 "year": 2023, 446 "arxiv_id": "2305.19118", 447 "relevance": "Multi-agent debate framework for enhancing LLM diversity, directly extended by HarmTransform." 448 }, 449 { 450 "title": "SORRY-Bench: Systematically Evaluating Large Language Model Safety Refusal", 451 "authors": ["Tinghao Xie"], 452 "year": 2025, 453 "relevance": "Systematic LLM safety refusal evaluation benchmark, provides the refusal indicator definition used in this paper." 454 }, 455 { 456 "title": "Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned", 457 "authors": ["Deep Ganguli"], 458 "year": 2022, 459 "arxiv_id": "2209.07858", 460 "relevance": "Foundational red-teaming methodology for LLM safety evaluation." 461 }, 462 { 463 "title": "Prompt Injection Attack Against LLM-Integrated Applications", 464 "authors": ["Yi Liu"], 465 "year": 2023, 466 "arxiv_id": "2306.05499", 467 "relevance": "Prompt injection attacks on LLM applications, related threat vector." 468 }, 469 { 470 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 471 "authors": ["Qingyun Wu"], 472 "year": 2023, 473 "arxiv_id": "2308.08155", 474 "relevance": "Multi-agent conversation framework for LLM applications." 475 }, 476 { 477 "title": "Agent-SafetyBench: Evaluating the Safety of LLM Agents", 478 "authors": ["Zhexin Zhang"], 479 "year": 2024, 480 "arxiv_id": "2412.14470", 481 "relevance": "Benchmark for evaluating LLM agent safety." 482 } 483 ] 484 }