scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25260B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "HarmTransform: Transforming Explicit Harmful Queries into Stealthy via Multi-Agent Debate",
      6     "authors": [
      7       "Shenzhe Zhu"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2512.23717",
     12     "doi": "10.48550/arXiv.2512.23717"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Abstract claims that HarmTransform outperforms baselines (Table 1: 0.36 vs 0.24 effectiveness) and acts as a double-edged sword (Section 6 documents both improvement and regression cases).",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "Paper claims debate 'improves' effectiveness but lacks randomized assignment or controlled comparison. Ablation studies (Figures 4-5) examine the effect of debate components, but the overall study design is observational rather than a controlled experiment.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Evaluation limited to 100 queries from Safe-RLHF, single model (DeepSeek-V3), but paper makes no explicit statement that findings are bounded to this setting or tested only on one model.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 6.3 discusses why debate succeeds (collaborative camouflage, legitimization framing) and fails (over-specification, optimization backfire) with specific mechanisms for each.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Paper clearly defines preservation score (binary intent-preserved judgment) and effectiveness score (refusal bypass rate). Metrics directly measure stated claims.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 7 titled 'Limitation and Future Study' is dedicated and discusses intent shift and information overload as specific limitations.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Limitations mention intent drift and redundancy but miss critical threats: sample size justification absent, no human evaluation validity, circular evaluation bias (same model generates and judges), no significance testing, single-model generalizability.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Paper does not explicitly state scope boundaries. It implicitly bounds evaluation to 100 Safe-RLHF queries and DeepSeek-V3, but does not clearly state 'results do not generalize to' or 'were only tested on.'",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding sources mentioned anywhere in the paper (no acknowledgments section visible).",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliation stated: University of Toronto. No industry or product company affiliations mentioned.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Appears to be unfunded independent work.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement or financial interest declaration included.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key metrics formally defined: 'harmful intent preservation' (Eq. 1), 'attacking effectiveness' (Eq. 4). Personas explicitly listed in Appendix A.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Introduction explicitly states contributions: first multi-agent debate framework for transforming harmful queries into stealthier forms while preserving intent, comprehensive evaluation protocol, analysis of debate dynamics.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 2.1 and 2.2 review AI safety alignment and multi-agent debate literature. Paper shows gap in prior work: existing safety research addresses explicit harmful queries, not implicit ones.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "No mention of code release, repository URL, or availability statement anywhere in the paper.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "Uses public Safe-RLHF dataset as input, but the 100 sampled queries and generated transformed queries are not stated as released.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Only specifies model name (DeepSeek-V3) with no dependency specs, requirements.txt, Dockerfile, or environment isolation details.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Appendix B provides prompts (Figures 8-12) but no step-by-step reproduction instructions. Unclear how to set up the debate framework, how to invoke LLM APIs, or which exact 100 queries were sampled.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Table 1 reports only point estimates (0.36, 0.24, etc.). Figures 4-5 show curves but no error bars, confidence intervals, or variance bands.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No statistical significance tests, p-values, or hypothesis tests reported despite comparative claims (e.g., 0.36 vs 0.24 effectiveness).",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Paper reports raw scores (0.36 vs 0.24) but no formal effect size metrics (Cohen's d, relative improvement ratio, odds ratio).",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No justification provided for 100 queries, 3 debaters, or 1 debate round. No power analysis or sample size calculation documented.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Figures 4-5 show point curves with no variance, standard deviation, or confidence bands across runs.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Three baselines compared: SingleLLM (0.24 effectiveness), SingleLLMReflect (0.18), HARMTRANSFORM-NoDebate (0.22). Table 1.",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "No prior work directly addresses this task, so baselines are reasonable (single-LLM, reflection-augmented, and debate-ablated variants).",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Section 5.3 ablates number of debaters (3-6, Figure 4) and debate rounds (0-4, Figure 5).",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Two complementary metrics: preservation score (intent preservation, Section 4.1) and effectiveness score (attack success, Section 4.2).",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "No human evaluation. Both preservation and effectiveness metrics are computed using LLM judges (DeepSeek-V3), not human annotators.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "100 queries from Safe-RLHF evaluated, but no train/test split mentioned. Unclear if the same 100 were used for development and final evaluation.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": false,
    214           "justification": "No breakdown by query category, intent type, or difficulty. All 100 queries aggregated into single metrics.",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Section 6 presents 14 divergent cases: 6 debate regressions (debate made queries more detectable) and 8 debate improvements with detailed qualitative analysis.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Figures 4-5 show that increasing debaters beyond 3 and adding rounds beyond 1 provide no benefit or degrade effectiveness. Results honestly reported.",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "DeepSeek-V3 specified with citation to Liu et al. (2024) technical report. Model version is clear.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Appendix B provides full system and user prompts for debater (Figures 8-9), summarizer (Figure 10), generator (Figure 11), and judge (Figure 12).",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "No temperature, top-p, max_tokens, or other LLM sampling hyperparameters reported for any component.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Debate scaffolding detailed in Section 3.1: M debaters, personas, N debate rounds, local-history sharing mechanism. Figure 1 shows pipeline overview.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": false,
    258           "justification": "Paper states 'sample the first 100 queries' from Safe-RLHF but provides no preprocessing details, filtering criteria, or sampling seed.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "Safe-RLHF is public, but the specific subset of 100 queries used and the transformed outputs are not stated as available.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Collection procedure is minimal: 'sample the first 100 queries' from Safe-RLHF. Sampling method (first in order? random seed?) not specified.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants, so NA.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Pipeline clearly documented: harmful query → debate (Section 3.1) → summarization (Section 3.2) → generation → evaluation (Section 4). Figure 1 shows full pipeline.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": false,
    291           "answer": false,
    292           "justification": "Not evaluating model capabilities on benchmarks; using models as tools to generate and judge queries. NA.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": false,
    297           "answer": false,
    298           "justification": "Same as above, NA.",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": false,
    303           "answer": false,
    304           "justification": "Same as above, NA.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants, NA.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants, NA.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants, NA.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants, NA.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants, NA.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants, NA.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants, NA.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "Multi-agent debate with 3 agents × multiple rounds, each using DeepSeek-V3 for debaters, summarizer, generator, and judges. No API cost or token count reported.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No computational budget, total tokens, or cost estimates provided anywhere in the paper.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "HARMTRANSFORM significantly outperforms baselines in producing effective query transformations that bypass LLM safety mechanisms",
    371       "evidence": "Table 1 shows HARMTRANSFORM achieves 0.36 effectiveness vs 0.24 for SingleLLM and 0.18 for SingleLLMReflect",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "Multi-agent debate acts as a double-edged sword: it can improve stealth but may also introduce topic shifts or unnecessary complexity",
    376       "evidence": "Section 6 identifies 8 debate improvement cases (collaborative camouflage, legitimization framing) and 6 regression cases (over-specification, optimization backfire)",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Intent preservation remains strong (0.73) even as effectiveness is optimized to 0.36",
    381       "evidence": "Table 1 reports preservation of 0.73 for HARMTRANSFORM, matching HARMTRANSFORM-NoDebate (0.73) while outperforming baselines",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Increasing number of debaters beyond 3 provides no meaningful improvement in attack effectiveness",
    386       "evidence": "Figure 4 shows effectiveness remains flat at ~0.35-0.40 across 3-6 debaters; intent preservation peaks at 6 debaters (0.86) but effectiveness does not increase",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "More than one round of debate leads to diminishing or negative returns in effectiveness",
    391       "evidence": "Figure 5 shows effectiveness peaks at 1 round (0.36) and drops to ~0.25-0.30 at 3-4 rounds; rounds beyond 1 introduce information overload",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Debate improves stealth through collaborative camouflage by revising red-flag phrasing with domain-specific substitutions",
    396       "evidence": "Section 6.3.1 describes this mechanism with example of shifting 'attack methods' to 'accidental exposure', analyzed on 8 improvement cases",
    397       "supported": "weak"
    398     },
    399     {
    400       "claim": "Debate can inadvertently expose harmful intent by over-specification and optimization backfire",
    401       "evidence": "Section 6.3.2 describes cases where adding concreteness or stripping defensive framing makes intent more salient, identified in 6 regression cases",
    402       "supported": "weak"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "benchmark-eval",
    407     "case-study"
    408   ],
    409   "key_findings": "HarmTransform uses multi-agent debate with three personas to iteratively refine harmful queries into stealthier versions. On 100 Safe-RLHF queries evaluated with DeepSeek-V3, the framework achieves 0.36 attack effectiveness (vs 0.24 single-LLM baseline) with an intent preservation score of 0.73. Qualitative analysis of 14 divergent cases reveals debate acts as a double-edged sword: collaborative refinement can improve stealth through camouflage and academic framing, but over-specification and optimization backfire can paradoxically expose intent. Ablations show effectiveness peaks at 1 debate round with 3 debaters; additional rounds and debaters provide no benefit.",
    410   "red_flags": [
    411     {
    412       "flag": "No human evaluation",
    413       "detail": "All evaluations (intent preservation, attack effectiveness) use LLM judges (DeepSeek-V3), not human annotators. No validation that the 'stealthier' queries are actually harder to detect for humans or other models."
    414     },
    415     {
    416       "flag": "Tiny evaluation set with no justification",
    417       "detail": "Only 100 queries sampled from Safe-RLHF with no sample size justification or power analysis. Results may not generalize beyond this small sample; it is unclear whether the findings hold for larger corpora."
    418     },
    419     {
    420       "flag": "No statistical significance testing",
    421       "detail": "Paper claims that the debate framework 'significantly outperforms' baselines (0.36 vs 0.24) but provides no p-values, confidence intervals, or significance tests. Differences may not be statistically reliable."
    422     },
    423     {
    424       "flag": "Circular evaluation bias",
    425       "detail": "The same model (DeepSeek-V3) is used to run the debate, generate the transformed queries, and judge preservation/effectiveness. The model may have systematic biases in what it considers 'stealthy' or 'preserved'."
    426     },
    427     {
    428       "flag": "Sampling method ambiguous",
    429       "detail": "Paper states 'sample the first 100 queries' from Safe-RLHF but does not specify ordering, random seed, or whether this is truly first-in-order. Reproducibility and selection bias uncertain."
    430     },
    431     {
    432       "flag": "No hyperparameter specification",
    433       "detail": "Temperature, top-p, max_tokens, and other sampling parameters not reported for any LLM calls. Different hyperparameters could yield different results."
    434     },
    435     {
    436       "flag": "Single-model evaluation",
    437       "detail": "All experiments use DeepSeek-V3. Unknown whether transformed queries fool other models (GPT-4, Claude, Llama) or only DeepSeek-V3's safety mechanisms."
    438     },
    439     {
    440       "flag": "No cost or practicality analysis",
    441       "detail": "Multi-agent debate framework involves 3 agents × multiple calls to DeepSeek-V3. No inference cost, latency, or computational budget reported, limiting practical applicability."
    442     },
    443     {
    444       "flag": "Ethical framing unclear",
    445       "detail": "Paper frames harmful query generation as 'safety alignment research' but does not adequately discuss the direct utility for actual jailbreaking attacks vs. defense development."
    446     }
    447   ],
    448   "cited_papers": [
    449     {
    450       "title": "Foundational challenges in assuring alignment and safety of large language models",
    451       "relevance": "Provides broader context on LLM safety challenges beyond explicit harmful queries"
    452     },
    453     {
    454       "title": "Improving factuality and reasoning in language models through multiagent debate",
    455       "relevance": "Foundational work on multi-agent debate approach applied here to harmful query transformation"
    456     },
    457     {
    458       "title": "Encouraging divergent thinking in large language models through multi-agent debate",
    459       "relevance": "Prior MAD work showing structured debate can improve reasoning; adapted here for safety application"
    460     },
    461     {
    462       "title": "Universal and transferable adversarial attacks on aligned language models",
    463       "relevance": "Related work on adversarial attacks and jailbreaking techniques that current paper builds upon"
    464     },
    465     {
    466       "title": "GPT-4 is too smart to be safe: Stealthy chat with LLMs via cipher",
    467       "relevance": "Shows stealthy query transformation methods using ciphers; similar goals to this work"
    468     },
    469     {
    470       "title": "Multi-step jailbreaking privacy attacks on chatgpt",
    471       "relevance": "Documents multi-turn jailbreaking strategies; contextualizes implicit query attacks"
    472     },
    473     {
    474       "title": "On the resilience of llm-based multi-agent collaboration with faulty agents",
    475       "relevance": "Examines when multi-agent collaboration fails; relevant to understanding debate failure cases"
    476     }
    477   ],
    478   "engagement_factors": {
    479     "practical_relevance": {
    480       "score": 2,
    481       "justification": "Practitioners interested in safety testing could use this framework, but primary utility appears to be advancing attack capabilities rather than defense."
    482     },
    483     "surprise_contrarian": {
    484       "score": 1,
    485       "justification": "Finding that multi-agent debate improves query stealth is intuitive; the double-edged sword finding (debate sometimes hurts) is mildly interesting but not surprising."
    486     },
    487     "fear_safety": {
    488       "score": 3,
    489       "justification": "Directly demonstrates techniques for bypassing LLM safety mechanisms; raises concern about arms race between attacks and defenses."
    490     },
    491     "drama_conflict": {
    492       "score": 1,
    493       "justification": "No obvious drama or conflict angle; paper frames as neutral research contribution without sensationalizing the adversarial arms race."
    494     },
    495     "demo_ability": {
    496       "score": 2,
    497       "justification": "Framework could be demonstrated on a sample harmful query with access to DeepSeek-V3 API, but requires multi-agent setup and cost."
    498     },
    499     "brand_recognition": {
    500       "score": 0,
    501       "justification": "Single-author paper from University of Toronto, not from major AI lab (OpenAI, DeepMind, Meta). No brand recognition boost."
    502     }
    503   },
    504   "hn_data": {
    505     "threads": [
    506       {
    507         "hn_id": "43825422",
    508         "title": "Jetbrains actively deleting negative reviews for AI plugin",
    509         "points": 14,
    510         "comments": 6,
    511         "url": "https://news.ycombinator.com/item?id=43825422",
    512         "created_at": "2025-04-28T19:58:23Z"
    513       },
    514       {
    515         "hn_id": "45881371",
    516         "title": "Evaluating in Silico Creativity: An Expert Review of AI Chess Compositions",
    517         "points": 2,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=45881371",
    520         "created_at": "2025-11-10T21:46:55Z"
    521       },
    522       {
    523         "hn_id": "45743257",
    524         "title": "Linear effects, exceptions, resources: Curry-Howard destructors correspondence",
    525         "points": 2,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=45743257",
    528         "created_at": "2025-10-29T06:17:03Z"
    529       },
    530       {
    531         "hn_id": "46433603",
    532         "title": "Training AI Co-Scientists Using Rubric Rewards [Meta Superintelligence Labs]",
    533         "points": 1,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=46433603",
    536         "created_at": "2025-12-30T14:25:11Z"
    537       }
    538     ],
    539     "top_points": 14,
    540     "total_points": 19,
    541     "total_comments": 6
    542   }
    543 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs