ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25254B)


      1 {
      2   "paper": {
      3     "title": "CCFC: Core & Core–Full–Core Dual-Track Defense for LLM Jailbreak Protection",
      4     "authors": ["Jiaming Hu", "Haoyu Wang", "Debarghya Mukherjee", "Ioannis Ch. Paschalidis"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2508.14128"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No GitHub link or code repository URL is provided in the paper. The only URL provided is to a HuggingFace dataset for attack prompts (https://huggingface.co/datasets/UWNSL/SafeDecoding-Attackers), which is a third-party resource, not the authors' code."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper references publicly available datasets: AdvBench (Zou et al., 2023) for harmful benchmarks and Just-Eval for utility evaluation (500 instructions). Attack prompts are linked via a HuggingFace dataset URL. All evaluation data is from public benchmarks."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications are provided—no requirements.txt, Dockerfile, conda environment, or library versions are listed. The paper does not describe the computational environment used."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method is described algorithmically but there are no concrete commands or instructions for reproducing experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables 1 and 2 report only point estimates (e.g., ASR percentages and Just-Eval scores). No confidence intervals, error bars, or ± notation are provided for any results."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CCFC 'consistently outperforms' baselines based on comparing raw numbers in Tables 1 and 2. No statistical significance tests (p-values, t-tests, etc.) are used to support these comparative claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper reports raw ASR percentages but does not provide standardized effect sizes (Cohen's d, odds ratios). The abstract claims '50-75% reduction' which provides some context, but the results tables lack any formal effect size measures."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper uses 50 attack prompts per jailbreak attack method and 500 Just-Eval instructions for utility evaluation, but provides no justification for why these sample sizes were chosen or whether they are sufficient for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be single-run numbers with no indication of result stability across multiple runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Six baseline defenses are compared: Self-Examination, Paraphrase, Retokenization, Self-Reminder, ICD, and DATDP. A 'No Defense' baseline is also included. Results are shown in Table 1."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include DATDP (Armstrong et al., 2025), ICD (Wei et al., 2023), Self-Reminder (Xie et al., 2023), SmoothLLM (Robey et al., 2023), and others. DATDP from 2025 is contemporary. The selection covers the main prompt-level defense approaches."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "CCFC has multiple components (core extraction, dual-track processing, double safety check), but no ablation study is presented to show the individual contribution of each component. For example, there is no comparison of core-only vs. CFC-only vs. full CCFC."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses ASR (Attack Success Rate) for robustness evaluation and five Just-Eval dimensions (helpfulness, clarity, factuality, depth, engagement) for utility evaluation, reported in Tables 1 and 2 respectively."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "All evaluation is automated: ASR is computed using Dic-Judge (a keyword-based classifier), and utility is measured via Just-Eval automated scores. No human evaluation of defense outputs is performed, which is relevant given claims about response quality preservation."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The paper does not describe any separation between development/tuning data and test data. The few-shot examples used in core extraction and the evaluation queries appear to use the same benchmark sets without explicit held-out splitting."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 1 provides per-attack-type breakdown (GCG, AutoDAN, PAIR, DeepInception, AdvBench) and Table 2 provides per-dimension utility scores (Helpfulness, Clear, Factual, Deep, Engaging), both separated by model (Vicuna, Llama-2)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The Limitations section (Section 6) discusses potential failure modes: the method may struggle with 'highly sophisticated attacks that seamlessly integrate malicious intent within semantically coherent contexts' and 'the extraction process could potentially miss subtle adversarial elements.'"
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Every experiment shows CCFC performing at or near the best level. No configurations or approaches that failed are reported. No negative results or unsuccessful design choices are mentioned."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims CCFC 'cuts attack success rates by 50-75% versus state-of-the-art defenses.' Looking at Table 1 for Vicuna, comparing CCFC to DATDP (the best baseline): GCG 6% vs 12%, AutoDAN 0% vs 2%, PAIR 2% vs 2%, DeepInception 2% vs 8%. The reductions relative to DATDP are not consistently 50-75%. The claim appears to be relative to other, weaker baselines rather than the 'state-of-the-art.'"
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims about why CCFC works (e.g., core extraction 'effectively implementing distraction elimination strategies,' CFC track 'disrupts structure-dependent attacks'), but without ablation studies, these causal mechanisms are not empirically validated. The improvement could come from any single component."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims CCFC provides 'universal applicability across attack types' and is 'a practical solution for enhancing LLM safety,' but tests only on two 7B open-source models (Vicuna-7B and LLaMA2-7B-chat) with four attacks. No testing on larger models, closed-source models, or broader attack categories. The title and abstract make broad claims not bounded to the tested setting."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations for the results are discussed. The paper does not consider whether the improvements might be due to the additional inference passes (CCFC uses multiple forward passes), the specific few-shot examples chosen, or other confounds."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper specifies 'Vicuna-7B' and 'LLaMA2-7B-chat' but does not provide exact version identifiers, checkpoint hashes, or snapshot dates. These model names alone are insufficient as multiple versions exist."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The few-shot core extraction prompt is provided in full in Appendix Tables 3 and 4. The prompt includes the system instruction and demonstration examples used for the extraction phase."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters are reported—no temperature, top-p, max tokens, or other sampling parameters for the LLM inference. The paper does not describe any configuration details for the model calls."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The CCFC framework is described in detail in Section 4 with formal notation: core extraction via few-shot prompting (Eq. 2), dual-track parallel defense (Eqs. 3-4), and double safety check logic (Eq. 5). The workflow is illustrated in Figure 1."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not describe how the attack prompts were generated or preprocessed, beyond pointing to a HuggingFace dataset URL. No details on how the 50 prompts per attack were selected from AdvBench or how Just-Eval instructions were chosen."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 includes a 'Limitations' subsection that discusses the method's reliance on few-shot core extraction quality and potential struggles with sophisticated attacks."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations section is brief and somewhat generic. It mentions that core extraction 'may struggle with highly sophisticated attacks' but does not discuss specific threats like the limited model selection (only 7B models), reliance on keyword-based ASR evaluation, or the small attack sample size."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to 7B open-source models or to the specific attack types tested. The conclusion claims 'universal applicability' without stating scope boundaries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw model outputs, intermediate core extractions, and safety check decisions are not available. Only aggregated ASR percentages and Just-Eval scores are reported in tables."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The paper describes using AdvBench (Zou et al., 2023) with 50 harmful queries per attack method and 500 Just-Eval instructions for utility evaluation. Attack prompts are sourced from a referenced HuggingFace dataset."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved in this study. All evaluation uses automated benchmarks and models."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline from attack prompt generation to final ASR computation is not fully documented. It is unclear how Dic-Judge keyword matching works exactly, what the safe phrases list is, or how edge cases in classification are handled."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information, acknowledgments section, or grant numbers are provided in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Boston University (Dept. of Math & Statistics, Dept. of ECE & Systems Eng., Dept. of Biomedical Eng., Faculty of Computing & Data Sciences) and University at Albany (Dept. of Computer Science)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence of funder from outcomes cannot be assessed. The absence of funding disclosure is itself a concern."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial disclosure is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper evaluates a defense mechanism against jailbreak attacks, not a pre-trained model's capability on a benchmark. The defense operates at the prompt level and does not depend on model training data knowledge."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper tests a defense strategy, not model knowledge, so train/test overlap for the underlying model is not relevant to the defense evaluation."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The evaluation is about whether the defense blocks attacks, not whether the model has memorized benchmark answers, so contamination is not relevant to this evaluation setup."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper claims 'minimal overhead' and 'a constant number of additional forward passes' but does not quantify inference cost, latency, tokens consumed, or wall-clock time. No concrete cost comparison with baselines is provided."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No information about GPU hours, hardware used, total compute time, or computational resources is provided anywhere in the paper."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CCFC cuts attack success rates by 50-75% versus state-of-the-art defenses against strong adversaries.",
    286       "evidence": "Table 1 shows ASR results. On Vicuna, CCFC achieves 6% (GCG), 0% (AutoDAN), 2% (PAIR), 2% (DeepInception). Compared to DATDP (best baseline): 12%, 2%, 2%, 8%. The 50-75% claim is not consistently supported against the best baseline.",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "CCFC maintains helpfulness of LLMs when handling queries from benign users.",
    291       "evidence": "Table 2 shows Just-Eval scores. CCFC achieves 4.353 avg on Vicuna (vs 4.339 No Defense) and 4.398 avg on Llama-2 (vs 4.432 No Defense). CCFC is close to undefended performance.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "CCFC achieves 0% ASR on AdvBench harmful benchmark for both models.",
    296       "evidence": "Table 1 shows 0% ASR on AdvBench for both Vicuna and Llama-2 with CCFC.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "CCFC consistently outperforms state-of-the-art prompt-level defenses.",
    301       "evidence": "Table 1 shows CCFC achieves lowest or tied-lowest ASR across most attack types. On Llama-2, ICD also achieves 0% across all attacks. DATDP is very close to CCFC on several metrics. The claim of consistent superiority is overstated.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "CCFC provides universal applicability across attack types without model modifications.",
    306       "evidence": "The paper tests on only two 7B open-source models and four attack types. 'Universal applicability' is not demonstrated—no closed-source models, larger models, or broader attack categories are tested.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CCFC proposes a dual-track prompt-level defense against LLM jailbreak attacks that combines semantic core extraction with a Core-Full-Core structural disruption approach and a double safety check. On Vicuna-7B and LLaMA2-7B-chat, CCFC achieves near-zero attack success rates against GCG, AutoDAN, PAIR, and DeepInception attacks, matching or exceeding DATDP (the strongest baseline) while maintaining utility scores close to undefended models. However, the evaluation is limited to two small open-source models, uses no ablation study, and lacks statistical rigor (no error bars, significance tests, or multi-run variance).",
    312   "red_flags": [
    313     {
    314       "flag": "No ablation study",
    315       "detail": "CCFC has three distinct components (core extraction, dual-track processing, double safety check) but no ablation is performed to determine which component drives the improvement. The causal claims about why each component works are not empirically validated."
    316     },
    317     {
    318       "flag": "Overstated abstract claims",
    319       "detail": "The abstract claims '50-75% reduction versus state-of-the-art defenses' but DATDP (the actual best baseline) shows very similar performance. The 50-75% figure appears cherry-picked against weaker baselines rather than the best comparators."
    320     },
    321     {
    322       "flag": "No uncertainty quantification",
    323       "detail": "All results are single-run point estimates with no error bars, confidence intervals, or variance. With only 50 attack prompts per condition, individual results could vary significantly between runs."
    324     },
    325     {
    326       "flag": "Very narrow model scope with broad claims",
    327       "detail": "Only two 7B open-source models are tested, yet the paper claims 'universal applicability' and 'a practical solution for safer LLM deployment.' No testing on larger, more capable, or closed-source models."
    328     },
    329     {
    330       "flag": "Keyword-based ASR evaluation",
    331       "detail": "ASR is computed using Dic-Judge, a keyword-based classifier detecting refusal phrases. This is a known weak evaluation method that can be easily fooled by responses that avoid standard refusal phrases but are still safe, or responses that include refusal-like phrases but still contain harmful content."
    332     },
    333     {
    334       "flag": "Missing computational cost analysis",
    335       "detail": "The paper claims 'minimal overhead' but provides no quantitative cost comparison. CCFC requires at least 3 forward passes (core extraction, core track, CFC track) plus safety checking, which could be significant compared to single-pass baselines."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Universal and transferable adversarial attacks on aligned language models",
    341       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    342       "year": 2023,
    343       "arxiv_id": "2307.15043",
    344       "relevance": "Introduces the GCG attack and AdvBench benchmark, foundational to evaluating LLM jailbreak defenses."
    345     },
    346     {
    347       "title": "Jailbreaking black box large language models in twenty queries",
    348       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J Pappas", "Eric Wong"],
    349       "year": 2025,
    350       "relevance": "Introduces the PAIR attack method, a key black-box jailbreak approach tested in this evaluation."
    351     },
    352     {
    353       "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models",
    354       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"],
    355       "year": 2023,
    356       "arxiv_id": "2310.04451",
    357       "relevance": "Proposes genetic-algorithm-based jailbreak prompt generation, one of the attack methods evaluated."
    358     },
    359     {
    360       "title": "DeepInception: Hypnotize large language model to be jailbreaker",
    361       "authors": ["Xuan Li", "Zhanke Zhou", "Jianing Zhu", "Jiangchao Yao", "Tongliang Liu", "Bo Han"],
    362       "year": 2023,
    363       "arxiv_id": "2311.03191",
    364       "relevance": "Introduces a narrative-based jailbreak attack that proved challenging for most baselines in this evaluation."
    365     },
    366     {
    367       "title": "SmoothLLM: Defending large language models against jailbreaking attacks",
    368       "authors": ["Alexander Robey", "Eric Wong", "Hamed Hassani", "George J Pappas"],
    369       "year": 2023,
    370       "arxiv_id": "2310.03684",
    371       "relevance": "A perturbation-based defense against jailbreaks, representing the prompt-level defense approach."
    372     },
    373     {
    374       "title": "Defending ChatGPT against jailbreak attack via self-reminders",
    375       "authors": ["Yueqi Xie", "Jingwei Yi", "Jiawei Shao", "Justin Curl", "Lingjuan Lyu", "Qifeng Chen", "Xing Xie", "Fangzhao Wu"],
    376       "year": 2023,
    377       "relevance": "Proposes the Self-Reminder defense baseline used in this evaluation."
    378     },
    379     {
    380       "title": "Gradient Cuff: Detecting jailbreak attacks on large language models by exploring refusal loss landscapes",
    381       "authors": ["Xiaomeng Hu", "Pin-Yu Chen", "Tsung-Yi Ho"],
    382       "year": 2024,
    383       "relevance": "A gradient-based model-level defense method contrasted with the prompt-level approach of CCFC."
    384     },
    385     {
    386       "title": "SafeDecoding: Defending against jailbreak attacks via safety-aware decoding",
    387       "authors": ["Zhangchen Xu", "Fengqing Jiang", "Luyao Niu", "Jinyuan Jia", "Bill Yuchen Lin", "Radha Poovendran"],
    388       "year": 2024,
    389       "arxiv_id": "2402.08983",
    390       "relevance": "A decoding-level defense approach that also provides the attack prompt dataset used in this paper's experiments."
    391     },
    392     {
    393       "title": "Defense against the dark prompts: Mitigating best-of-n jailbreaking with prompt evaluation",
    394       "authors": ["Stuart Armstrong", "Matija Franklin", "Connor Stevens", "Rebecca Gorman"],
    395       "year": 2025,
    396       "arxiv_id": "2502.00580",
    397       "relevance": "Introduces DATDP, the strongest baseline defense compared against CCFC in this evaluation."
    398     },
    399     {
    400       "title": "Robust prompt optimization for defending language models against jailbreaking attacks",
    401       "authors": ["Andy Zhou", "Bo Li", "Haohan Wang"],
    402       "year": 2024,
    403       "relevance": "Proposes gradient-based defensive suffix optimization (RPO), a model-level defense approach."
    404     },
    405     {
    406       "title": "Certifying LLM safety against adversarial prompting",
    407       "authors": ["Aounon Kumar", "Chirag Agarwal", "Suraj Srinivas", "Aaron Jiaxun Li", "Soheil Feizi", "Himabindu Lakkaraju"],
    408       "year": 2023,
    409       "arxiv_id": "2309.02705",
    410       "relevance": "Proposes Erase-and-check defense method using external safety filters, a model-level defense baseline."
    411     },
    412     {
    413       "title": "Tree of attacks: Jailbreaking black-box LLMs automatically",
    414       "authors": ["Anay Mehrotra", "Manolis Zampetakis", "Paul Kassianik", "Blaine Nelson", "Hyrum Anderson", "Yaron Singer", "Amin Karbasi"],
    415       "year": 2024,
    416       "relevance": "Extends black-box jailbreak attacks with tree-structured search, relevant to understanding advanced attack strategies."
    417     }
    418   ]
    419 }

Impressum · Datenschutz