scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29814B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks",
      6     "authors": [
      7       "Maksym Andriushchenko",
      8       "Francesco Croce",
      9       "Nicolas Flammarion"
     10     ],
     11     "year": 2024,
     12     "venue": "ICLR 2025",
     13     "arxiv_id": "2404.02151",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims 100% ASR on all listed models via adaptive attacks; Tables 1–4 and Table 22 provide per-model empirical confirmation across 21 models using GPT-4 judge.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The causal claim that 'adaptivity is crucial' is supported by controlled ablation studies comparing prompt-only vs. prompt+RS vs. prompt+RS+self-transfer (Table 2), with clear incremental improvement at each stage.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Results are explicitly bounded to 50 AdvBench behaviors curated by Chao et al. (2023) and specific named model API versions; the paper does not claim the attacks generalize to arbitrary harmful requests or future model variants.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper discusses GPT-4 judge false positives (~20% for Claude 2.1, Table 19), compares four judges (GPT-4, rule-based, Llama-3-70B, Llama Guard 2, Table 20), and acknowledges that 10/10 scores do not guarantee practical attacker utility.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Section 6 explicitly states 'Even a perfect jailbreak score (10/10) from the GPT-4 judge does not always imply that the generated content is actually beneficial for an attacker,' clearly distinguishing the proxy measure from real-world harm.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 6 is titled 'Discussion, Recommendations, and Limitations' and contains a dedicated Limitations paragraph addressing judge reliability, false positive rates, and unevaluated defenses.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are quantified: ~20% false positive rate for Claude 2.1 (Table 19), Llama Guard 2 shows only 86% ASR on Phi-3 vs. GPT-4 judge's 100% (Table 20), and adaptive attacks against test-time defenses like SmoothLLM are explicitly noted as out of scope.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states evaluation is limited to 50 AdvBench behaviors, specific API snapshots (e.g., gpt-3.5-turbo-1106), and notes that test-time defenses require separate work; Section A.2 clarifies the adaptive attack definition boundary.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Acknowledgements disclose: Google Fellowship, Open Phil AI Fellowship, unrestricted gift from Google, and Swiss National Science Foundation grant 212111.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are listed as affiliated with EPFL; no author is employed by any of the companies whose models are evaluated.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Google funds the paper via an unrestricted gift but Google's own Gemma model is shown 100% vulnerable; the paper attacks all major labs equally and the gift is explicitly described as unrestricted.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement is present; the acknowledgements list funding sources but do not declare whether authors have equity, patents, or consulting relationships with OpenAI, Anthropic, Meta, or Google.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section A.2 formally defines 'adaptive attacks' as 'attacks that are specifically designed to target a given defense'; Section 3.1 gives a mathematical formulation of the jailbreaking objective; success is defined as GPT-4 judge score of 10/10.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The 'Contributions' paragraph explicitly enumerates four algorithmic contributions: manually designed prompt template, random search on logprobs, self-transfer, and prefilling attack for Claude.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 systematically categorizes prior work into manual, direct search, and LLM-assisted attacks; the paper directly compares ASR against GCG, PAIR, TAP, PAP on the same models and task in each results table.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The abstract provides a GitHub URL (https://github.com/tml-epfl/llm-adaptive-attacks) with code, logs, and jailbreak artifacts in JailbreakBench format.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Evaluation uses 50 behaviors from AdvBench (Zou et al., 2023) curated by Chao et al. (2023), which are publicly available standard benchmarks used unmodified.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions 'a single A100 GPU with an implementation based on HuggingFace transformers' but provides no requirements.txt, Dockerfile, or versioned dependency specification.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Algorithm 1 gives a formal random search procedure; full prompt templates are in Figure 1 and Tables 6–14; code with logs is released on GitHub enabling end-to-end reproduction.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No confidence intervals or error bars are reported for any ASR results; all results are presented as single-run point estimates.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are applied when comparing attack success rates across methods; differences are reported as raw percentage point changes without any hypothesis testing.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Effect sizes are implicitly reported as percentage point improvements with baseline context (e.g., 0% prompt alone → 50% prompt+RS → 100% prompt+RS+self-transfer on Llama-2-Chat-7B vs. prior best 92%).",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The 50-behavior test set is adopted from Chao et al. (2023) for comparability but no power analysis or sample size justification is provided to confirm it reliably estimates 100% ASR.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "ASR results are single-run point estimates; Figure 3 shows logprob variance for GPT-4 Turbo queries but variance across repeated attack runs is not reported for any ASR metric.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Extensive baselines included throughout: GCG, PAIR, TAP, PAP, AIM, and persona modulation are compared per-model in Tables 2–4 and Table 22.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "All major baselines (GCG 2023, PAIR 2023, TAP 2023, PAP 2024) are recent and state-of-the-art at time of writing; no suspiciously weak or outdated baselines.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Table 2 ablates attack components (prompt only vs. prompt+RS vs. prompt+RS+self-transfer); Figure 4 ablates suffix length (5–60 tokens); Tables 15–17 ablate restart count and request structure for Claude.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Four evaluation judges are used: GPT-4 semantic judge (primary), rule-based judge from Zou et al., Llama-3-70B judge, and Llama Guard 2; Table 20 compares all four across representative models.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "No systematic human evaluation; manual inspection of generations is mentioned only to flag false positives for Claude 2.1, which is quality control rather than evaluation.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "Not applicable: adversarial suffixes are optimized per-request on the same 50 behaviors used for evaluation; this is not a prediction task requiring train/test splits.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": false,
    216           "justification": "Results are broken down by model and attack method but no breakdown by behavior category (violence vs. hate speech vs. misinformation) is provided within the 50 AdvBench behaviors.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Failure cases are explicitly analyzed: transfer attacks get 0% on Claude 2.1 and Claude 3 Opus (Table 4), long suffixes cause off-topic generation (Figure 4), and false positive failure modes are documented (Table 19).",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Negative results are prominently reported: prompt alone achieves 0% on Llama-2-Chat (Table 2), the standard template is completely ineffective on GPT-4o (0%), and self-transfer is explicitly noted as ineffective against R2D2.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Exact API checkpoint names are specified: gpt-3.5-turbo-1106, gpt-4-1106-preview, gpt-4o-2024-05-13; open-weight models are specified with parameter count and release context.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Full prompt templates are provided in Figure 1 (main template), Table 6 (in-context prompt), Table 7 (GPT-4o custom prompt), and Tables 8–14 (system prompts for each model family) with all fill values specified.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Key hyperparameters reported: suffix length 25 tokens (ablated in Figure 4), up to 10,000 iterations, up to 10 random restarts, target token 'Sure', contiguous token modification schedule.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used; the attack is direct API interaction with random search optimization, fully described in Algorithm 1.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The 50-behavior subset of AdvBench is cited to Chao et al. (2023) with the selection criterion noted; no further preprocessing beyond behavior selection is required.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The GitHub repository includes 'logs and jailbreak artifacts in the JailbreakBench format,' providing per-behavior raw results for independent verification.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "The 50 AdvBench behaviors are referenced from Chao et al. (2023) without re-describing the curation process; the paper treats them as a given dataset without documenting its own collection procedure.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants; the study uses a standard benchmark with no recruitment.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline from harmful behavior → attack optimization → judge evaluation is described in Section 3 and Algorithm 1, with model-specific variations detailed in Section 4.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": false,
    293           "answer": false,
    294           "justification": "This is an adversarial attack paper, not a benchmark capability evaluation; training data contamination with attack behaviors does not affect the validity of demonstrating attack success rates.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "NA — the paper evaluates attack success rates, not model knowledge; the concept of train/test overlap does not apply to this attack evaluation setting.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "NA — models are being attacked, not evaluated on knowledge; whether models saw AdvBench behaviors in training is irrelevant to whether adversarial attacks succeed.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants; the paper has an ethics/responsible disclosure statement (Appendix A.1) but no IRB approval.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Section C.2 reports: '4000 iterations of random search on Llama-3-8B take 20.9 minutes on a single A100 GPU'; total experiment time stated as 'a few hours.'",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Single A100 GPU specified for open-weight experiments; OpenAI Researcher Access Program credits and Anthropic free evaluation access are acknowledged, giving a rough cost picture.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Adaptive jailbreaking attacks achieve 100% attack success rate on all 21 tested safety-aligned LLMs including GPT-4o and Claude 3.5 Sonnet.",
    373       "evidence": "Tables 1 and 22 show 100% ASR via GPT-4 judge across 21 models using combinations of prompt templates, random search, self-transfer, and prefilling.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Random search on logprobs can find adversarial suffixes without gradient information, outperforming gradient-based GCG in API-access scenarios.",
    378       "evidence": "Table 2 shows Prompt+RS+Self-Transfer achieves 100% vs. GCG's 54% on Llama-2-Chat-7B; Section 6 notes random search requires only logprobs and has lower memory demands.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Self-transfer—initializing random search with adversarial suffixes from simpler requests—significantly improves query efficiency and attack success rates.",
    383       "evidence": "Figure 2 convergence curves show self-transfer achieves 100% ASR far faster than RS alone; Table 2 shows RS alone achieves 50% while RS+Self-Transfer achieves 100% on Llama-2-Chat-7B.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Claude models, which do not expose logprobs, can be jailbroken with 100% ASR using the API prefilling feature.",
    388       "evidence": "Table 4 shows 100% ASR on all seven Claude models (2.0 through 3.5 Sonnet) using Prompt+Prefilling Attack; Tables 16–17 provide detailed ablation across restart counts and prompt structures.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Static attack suites significantly overestimate LLM robustness compared to model-specific adaptive attacks.",
    393       "evidence": "Prior best ASRs on Llama-2-Chat-13B/70B were 30–38% (GCG via HarmBench); this paper achieves 100%; Section 6 argues this gap shows static evaluations give 'false sense of safety.'",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "No single attack method generalizes across all target models; model-specific vulnerabilities require custom attack designs.",
    398       "evidence": "R2D2 requires in-context learning prompts (90% in-context vs. 8% standard prompt); GPT-4o requires a custom prompt (0% default vs. 100% custom); Claude requires prefilling unavailable to other models.",
    399       "supported": "strong"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "case-study"
    405   ],
    406   "key_findings": "All 21 tested safety-aligned LLMs, including GPT-4o and Claude 3.5 Sonnet, are jailbroken with a 100% attack success rate (ASR) by simple adaptive attacks combining manually designed prompt templates, random search on logprobs, self-transfer initialization, and model-specific techniques such as prefilling for Claude. The central methodological insight is that adaptive attacks—specifically designed for each model's unique API and training vulnerabilities—are necessary for accurate robustness evaluation; static attack suites (e.g., GCG, PAIR) dramatically overestimate robustness, reaching as little as 30–38% ASR on models this paper breaks completely. The paper also demonstrates that random search without gradient access and prefilling without any optimization are surprisingly effective, lowering the barrier for adversarial evaluation.",
    407   "red_flags": [
    408     {
    409       "flag": "Tiny evaluation set",
    410       "detail": "Only 50 AdvBench behaviors are used; 100% ASR on 50 specific prompts may not generalize to the full distribution of harmful requests, and the small N makes statistical reliability impossible to assess."
    411     },
    412     {
    413       "flag": "Judge reliability undermines headline claim",
    414       "detail": "GPT-4 judge exhibits ~20% false positive rate on Claude 2.1 (Table 19 shows clearly non-harmful responses scoring 10/10), which partially invalidates the 100% ASR claim for that model and raises questions about other models."
    415     },
    416     {
    417       "flag": "No variance or CIs reported",
    418       "detail": "All ASR results are presented as single-run point estimates with no confidence intervals, error bars, or repeated-run variance, making result stability unassessable."
    419     },
    420     {
    421       "flag": "Baseline comparisons confounded",
    422       "detail": "Many baselines (marked with *) are from different request sets and/or different judges, making direct numerical comparison misleading despite being placed in the same tables."
    423     },
    424     {
    425       "flag": "No competing interests declaration",
    426       "detail": "While funding sources are disclosed in acknowledgements, no formal competing interests statement addresses potential equity, patents, or consulting relationships with OpenAI, Anthropic, Meta, or Google."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    432       "relevance": "Introduces GCG attack and AdvBench benchmark; both used as primary baseline and evaluation dataset in this paper"
    433     },
    434     {
    435       "title": "Jailbreaking Black Box Large Language Models in Twenty Queries",
    436       "relevance": "Introduces PAIR attack and curates the 50-behavior AdvBench subset that serves as the primary evaluation set"
    437     },
    438     {
    439       "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal",
    440       "relevance": "Primary source of baseline ASR numbers across multiple attack methods; main comparison target demonstrating improved ASR"
    441     },
    442     {
    443       "title": "Jailbroken: How Does LLM Safety Training Fail?",
    444       "relevance": "Categorizes jailbreak failure modes (competing objectives, mismatched generalization) that motivate the adaptive attack design philosophy"
    445     },
    446     {
    447       "title": "Tree of Attacks: Jailbreaking Black-Box LLMs Automatically",
    448       "relevance": "Key baseline attack (TAP) that this paper significantly outperforms across all evaluated models"
    449     },
    450     {
    451       "title": "JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models",
    452       "relevance": "Standardized evaluation infrastructure; code is released in JailbreakBench format for comparability"
    453     },
    454     {
    455       "title": "On Adaptive Attacks to Adversarial Example Defenses",
    456       "relevance": "Provides the formal definition of adaptive attacks used throughout the paper, adapted from image classification adversarial robustness"
    457     },
    458     {
    459       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    460       "relevance": "Primary open-weight model family attacked; provides safety system prompts used in evaluation setup"
    461     },
    462     {
    463       "title": "Universal Jailbreak Backdoors from Poisoned Human Feedback",
    464       "relevance": "Introduces trojan backdoor attack setup motivating the SaTML'24 Trojan Detection Competition described in Section 5"
    465     },
    466     {
    467       "title": "How Johnny Can Persuade LLMs to Jailbreak Them: Rethinking Persuasion to Challenge AI Safety by Humanizing LLMs",
    468       "relevance": "PAP baseline achieving prior best 92% ASR on Llama-2-Chat-7B that this paper surpasses with 100%"
    469     }
    470   ],
    471   "engagement_factors": {
    472     "practical_relevance": {
    473       "score": 3,
    474       "justification": "Code, logs, and prompt templates are released; practitioners deploying any of the 21 tested LLMs can immediately assess vulnerability using the provided attacks."
    475     },
    476     "surprise_contrarian": {
    477       "score": 3,
    478       "justification": "Directly contradicts safety claims from Anthropic, OpenAI, and Meta by demonstrating 100% ASR on models previously described as among the most robust, including Claude 3 family and Llama-2."
    479     },
    480     "fear_safety": {
    481       "score": 3,
    482       "justification": "Demonstrates that every frontier safety-aligned LLM is completely non-robust to simple adaptive attacks, with direct implications for any safety-critical deployment or harm mitigation policy."
    483     },
    484     "drama_conflict": {
    485       "score": 3,
    486       "justification": "Attacks flagship products from multiple competing AI labs simultaneously and argues HarmBench-style evaluation gives 'a false sense of safety'; also discloses prefilling vulnerability to Anthropic before publication."
    487     },
    488     "demo_ability": {
    489       "score": 3,
    490       "justification": "GitHub repo provides runnable code, pre-computed adversarial suffixes, and logs; the prefilling attack on Claude requires only API access with no GPU needed."
    491     },
    492     "brand_recognition": {
    493       "score": 2,
    494       "justification": "Authors are from EPFL (not a frontier AI lab), but the paper attacks GPT-4o and Claude 3.5 Sonnet by name and won the SaTML'24 Trojan Detection Competition."
    495     }
    496   },
    497   "hn_data": {
    498     "threads": [
    499       {
    500         "hn_id": "39941576",
    501         "title": "Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks",
    502         "points": 3,
    503         "comments": 1,
    504         "url": "https://news.ycombinator.com/item?id=39941576",
    505         "created_at": "2024-04-05T12:30:05Z"
    506       },
    507       {
    508         "hn_id": "40310614",
    509         "title": "The AI Review Lottery: Widespread AI-Assisted Peer Reviews Boost Paper Scores",
    510         "points": 2,
    511         "comments": 1,
    512         "url": "https://news.ycombinator.com/item?id=40310614",
    513         "created_at": "2024-05-09T17:31:58Z"
    514       },
    515       {
    516         "hn_id": "39987108",
    517         "title": "Text-to-SQL that asks the LLM to predict the result set",
    518         "points": 2,
    519         "comments": 1,
    520         "url": "https://news.ycombinator.com/item?id=39987108",
    521         "created_at": "2024-04-10T04:58:41Z"
    522       },
    523       {
    524         "hn_id": "43703965",
    525         "title": "LLMs, Syntax, and Semantics: Long-Distance Binding of Chinese Reflexive Ziji",
    526         "points": 2,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=43703965",
    529         "created_at": "2025-04-16T11:22:09Z"
    530       },
    531       {
    532         "hn_id": "40663653",
    533         "title": "Poco: Policy Composition from and for Heterogeneous Robot Learning",
    534         "points": 2,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=40663653",
    537         "created_at": "2024-06-12T21:57:06Z"
    538       },
    539       {
    540         "hn_id": "41203499",
    541         "title": "Information preservation in Kerr-Newman spacetime using closed timelike curves",
    542         "points": 1,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=41203499",
    545         "created_at": "2024-08-09T16:54:20Z"
    546       },
    547       {
    548         "hn_id": "39257382",
    549         "title": "CamPro: Camera-Based Anti-Facial Recognition",
    550         "points": 1,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=39257382",
    553         "created_at": "2024-02-05T04:34:54Z"
    554       },
    555       {
    556         "hn_id": "39253694",
    557         "title": "CamPro: Camera-Based Anti-Facial Recognition",
    558         "points": 1,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=39253694",
    561         "created_at": "2024-02-04T19:50:49Z"
    562       },
    563       {
    564         "hn_id": "38889786",
    565         "title": "Learning Bimanual Mobile Manipulation with Low-Cost Whole-Body Teleoperation",
    566         "points": 1,
    567         "comments": 0,
    568         "url": "https://news.ycombinator.com/item?id=38889786",
    569         "created_at": "2024-01-06T08:58:51Z"
    570       },
    571       {
    572         "hn_id": "35714138",
    573         "title": "Human-Centric Latent Diffusion Models for Fashion Image Editing",
    574         "points": 1,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=35714138",
    577         "created_at": "2023-04-26T14:25:14Z"
    578       }
    579     ],
    580     "top_points": 3,
    581     "total_points": 16,
    582     "total_comments": 3
    583   }
    584 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs