scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27334B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Defensive Prompt Patch: A Robust and Generalizable Defense of Large Language Models against Jailbreak Attacks",
      6     "authors": [
      7       "Chen Xiong",
      8       "Xiangyu Qi",
      9       "Pin-Yu Chen",
     10       "Tsung-Yi Ho"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2405.20099",
     15     "doi": "10.48550/arXiv.2405.20099"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The abstract claims 'negligible impact on utility,' but Mistral-7B-Instruct-v0.2 Win-Rate drops from 90.31% to 75.06% (~15pp), and the claim of universal applicability is tested on only two primary models.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Ablation studies (Appendix B, Tables 6–12) isolate contributions of the defense objective, helpful objective, HGA vs. RLPrompt, and synonym substitution, supporting causal claims about what drives DPP effectiveness.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The conclusion claims DPP is a 'universal defensive solution' scalable to 'various LLM platforms,' but the primary evaluation covers only two 7B-scale open-source models, with limited appendix experiments on Vicuna-13B and Llama-3-8B.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not consider alternative explanations for DPP's effectiveness, such as whether the suffix works via prompt dilution, semantic priming, or overfitting to the keyword-based ASR evaluation metric.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "ASR is measured primarily via substring keyword matching ('I'm sorry', 'I cannot', etc.) as a proxy for actual harm prevention; while LLaMA-Guard evaluation is added in appendices, main claims rest on the keyword proxy without adequately discussing its limitations.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "A dedicated 'Limitation' section discusses computational cost of HGA, GPT-4 training API cost, limitations of defense baselines, and vulnerability when open-weight models are run locally.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats are identified: HGA requires ~$75 USD in GPT-4 API calls per training run, and DPP can be trivially removed by users running open-weight models locally — concrete, non-boilerplate constraints.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No explicit statements about what results do not generalize to (e.g., larger models, closed-source APIs, non-English jailbreaks, attacks outside the evaluation set); the limitations section omits these scope boundaries.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The acknowledgment states 'Chen Xiong and Tsung-Yi Ho...are funded by the Hong Kong Jockey Club Charities Trust.'",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations — CUHK, Princeton University, IBM Research — are disclosed on the title page.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The Hong Kong Jockey Club Charities Trust is a philanthropic organization with no stake in LLM products or the defense methods being evaluated.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial disclosure statement is present; only the funding source is mentioned, with no declaration of patents, equity, or consulting relationships.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 3.1 formally defines jailbreak attack, jailbreak defense, and utility degradation with mathematical notation; ASR is defined in Appendix I and Win-Rate in Appendix J.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three explicit contributions are bullet-pointed in the introduction: improved defense with minimal utility tradeoff, robustness against adaptive attacks, and clarity/stability of the prompt mechanism.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 covers both jailbreak attack methods (GCG, AutoDAN, PAIR, TAP, ICA) and defenses (Self-Reminder, RPO, Goal Prioritization), with Table 1 explicitly showing how DPP addresses deficiencies of each prior approach.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Code is released via HuggingFace demo space (TrustSafeAI/Defensive-Prompt-Patch-Jailbreak-Defense) and an anonymous repository at anonymous.4open.science/r/DPP-23FF.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Evaluation relies entirely on publicly available datasets: AdvBench harmful behaviors, Alpaca dataset, and JailbreakBench — all accessible without restriction.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Appendix C mentions 'single A800 GPU with 80GB of memory' and lists hyperparameters, but no requirements.txt, Dockerfile, or software version list is provided.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Algorithm pseudocode and hyperparameters are detailed in appendices, but there are no step-by-step instructions for running the full training-and-evaluation pipeline end-to-end.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No confidence intervals or error bars are reported for any main results in Tables 2–5; all values are single point estimates.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to comparative claims (e.g., 'outperforms RPO by 42% for ICA attack'); results are compared as raw percentages without hypothesis testing.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Effect sizes are expressed as absolute ASR differences with baseline context throughout (e.g., DPP 3.8% average ASR vs. RPO 16.8% vs. no defense 51.5%).",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The use of 100 harmful queries from AdvBench for training and evaluation is not justified with power analysis or explanation of why 100 is sufficient.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance or standard deviation across runs is reported; Table 7 shows results from three initializations but aggregates them only as averages without spread statistics.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Self-Reminder, RPO, Goal Prioritization, and Default System Prompt serve as baselines throughout Tables 2–5.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Baselines include RPO (2024), Goal Prioritization (2023), and Self-Reminder (2023) — all recent prompt-based defenses in the same category as DPP.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Appendix B provides five ablation studies: objective functions (Table 6), prefix vs. suffix format (Table 7), prototype initialization sensitivity (Tables 9–10), HGA vs. RLPrompt solver (Table 11), and synonym substitution necessity (Table 12).",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Both ASR (safety) and Win-Rate via AlpacaEval (utility) are reported as primary metrics, with a secondary Min Over Prompt metric added in Appendix Y.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Win-Rate uses automated AlpacaEval comparison against Davinci003, not human judges; human evaluation is not relevant to the algorithmic optimization contribution.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Section 4.3 tests on 'another 100 harmful queries from AdvBench dataset which are independent from the Adversarial Dataset' used during training/optimization.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down per individual jailbreak attack type (Base64, ICA, AutoDAN, GCG, PAIR, TAP, Catastrophic) in all main result tables.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Table 2 explicitly shows DPP has 10% ASR against AutoDAN (worse than Self-Reminder's 0%), and the Mistral results acknowledge higher utility degradation than simpler baselines.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper directly reports that Mistral-7B-Instruct-v0.2 has worse defense-utility tradeoff than Llama-2-7B-Chat, and that DPP's adaptive ASR on Mistral (46.9%) is substantially higher than on Llama (13.0%).",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Exact model versions are specified: 'Llama-2-7B-Chat' and 'Mistral-7B-Instruct-v0.2' with citations to their respective technical reports.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "All DPP suffixes are shown in Appendix E and all defense baseline prompts (Self-Reminder, Goal Prioritization, System Prompt) are shown verbatim in Appendix H.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Appendix C lists all hyperparameters: α=1, β=10 (Llama) or α=10, β=1 (Mistral), num_steps=100, batch_size=64, crossover_rate=0.5, mutation_rate=0.01, plus sentence/paragraph iteration counts.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used; DPP is a static suffix appended at inference time, not an agentic pipeline.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": false,
    261           "justification": "Dataset sampling is briefly noted (100 queries from AdvBench) but preprocessing steps for generating jailbreak prompts via each attack method are not documented beyond external GitHub links.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "While source datasets (AdvBench, Alpaca) are public, the generated jailbreak prompts and model responses used in the actual evaluation are not released.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 4.1 describes sampling 100 harmful queries from AdvBench and 100 benign queries from Alpaca; jailbreak generation procedures with hyperparameters and GitHub links are detailed in Appendix F.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; all evaluation uses automated benchmarks.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "The optimization pipeline is described via algorithms, but the full end-to-end pipeline from raw AdvBench queries through attack generation to ASR calculation is not documented cohesively in one place.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Training cutoffs for Llama-2 and Mistral are not stated; relevant because AdvBench (the evaluation set) was published before both models' RLHF training, potentially contaminating baseline alignment behavior.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "AdvBench was publicly released in 2023 before Llama-2 and Mistral's training cutoffs; the possibility that models' RLHF training incorporated these refusal patterns — artificially lowering baseline ASR — is never discussed.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "The paper does not acknowledge that AdvBench examples were available before the evaluated models were trained, potentially making the baseline ASR measurements unrepresentative of real-world alignment.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants involved.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants involved.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants involved.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants involved.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants involved.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants involved.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants involved.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "The paper notes DPP adds only a suffix at inference time (O(1) overhead) but does not report actual latency or inference cost numbers.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": true,
    365           "justification": "Appendix C specifies a single A800 GPU (80GB), 15.32 seconds per training epoch, 100 epochs per training instance, and approximately $75 USD GPT-4 API cost per DPP training run.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "DPP achieves the lowest average ASR (3.8%) among prompt-based defenses on Llama-2-7B-Chat while maintaining the highest Win-Rate (82.98%).",
    374       "evidence": "Table 2: DPP at 3.8% average ASR and 82.98% Win-Rate vs. Self-Reminder (6.3% ASR, 64.84%), RPO (16.8% ASR, 79.23%), Goal Prioritization (10.0% ASR, 34.29%).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "DPP generalizes to less-aligned models, achieving 2.0% average non-adaptive ASR on Mistral-7B-Instruct-v0.2.",
    379       "evidence": "Table 4 shows DPP at 2.0% vs. Goal Prioritization (22.2%), Self-Reminder (48.2%), System Prompt (52.7%), though Win-Rate drops to 75.06% from baseline 90.31%.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "HGA outperforms RLPrompt as an optimizer for the DPP objective, achieving lower ASR and higher utility.",
    384       "evidence": "Table 11: HGA achieves 4% GCG ASR and 82.98% Win-Rate vs. RLPrompt 15% GCG ASR and 47.89% Win-Rate on Llama-2-7B-Chat.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Using DPP as a suffix outperforms using it as a prefix, particularly under adaptive attacks.",
    389       "evidence": "Table 7: Average adaptive GCG ASR of 57% for Prefix DPP vs. 15% for Suffix DPP; Win-Rate also higher for suffix (76.09% vs. 73.05%) on Llama-2-7B-Chat.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "DPP produces more human-readable prompts than gradient-based approaches like RPO.",
    394       "evidence": "Table 34: DPP perplexity 56.57 vs. RPO perplexity 8780.94 as measured by GPT-4 next-token log-probabilities.",
    395       "supported": "weak"
    396     },
    397     {
    398       "claim": "DPP maintains defense effectiveness against unforeseen jailbreak queries not used during training.",
    399       "evidence": "Table 37: DPP achieves 7.5% average ASR on held-out AdvBench queries on Llama-2-7B-Chat vs. Self-Reminder (15.5%), RPO (29.0%), Goal Prioritization (30.3%).",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval"
    405   ],
    406   "key_findings": "DPP, trained via a bi-objective Hierarchical Genetic Algorithm balancing refusal likelihood and helpfulness, achieves the lowest average ASR (3.8%) and highest utility (82.98% Win-Rate) among prompt-based defenses on Llama-2-7B-Chat, outperforming RPO, Goal Prioritization, and Self-Reminder. The method generalizes to less-aligned Mistral-7B-Instruct-v0.2 (2.0% non-adaptive ASR) but with considerably greater utility degradation than simpler baselines. Ablation studies confirm that both defense and utility objectives are essential, and HGA substantially outperforms RLPrompt as the search algorithm. However, adaptive ASR on Mistral reaches 46.9%, the primary evaluation metric relies on keyword matching rather than actual harm assessment, and 'universal' scalability claims rest on only two primary 7B-scale models.",
    407   "red_flags": [
    408     {
    409       "flag": "Keyword-matching ASR metric",
    410       "detail": "Attack Success Rate is determined by checking whether model responses contain refusal keywords like 'I'm sorry' or 'I cannot.' A model generating harmful content that incidentally includes such a phrase is counted as defended; this is known to be an unreliable proxy for actual safety."
    411     },
    412     {
    413       "flag": "No statistical significance testing",
    414       "detail": "All comparisons (e.g., 'outperforms RPO by 42% for ICA attack') are made on 100-query point estimates without confidence intervals, error bars, or significance tests, making it impossible to assess whether differences are reliable."
    415     },
    416     {
    417       "flag": "Small, unjustified evaluation sample",
    418       "detail": "Only 100 harmful queries from AdvBench are used for both training and evaluation. Sample size is not justified, and no sensitivity analysis examines whether results hold with more queries or different source datasets."
    419     },
    420     {
    421       "flag": "Overstated universality claims",
    422       "detail": "The conclusion describes DPP as a 'universal defensive solution' scalable to 'various LLM platforms,' but primary evaluation covers only two 7B-scale open-source chat models."
    423     },
    424     {
    425       "flag": "AdvBench contamination not addressed",
    426       "detail": "AdvBench was published in 2023 before Llama-2 and Mistral's RLHF training cutoffs, making it plausible that these models' alignment training incorporated these specific harmful query patterns, artificially depressing baseline ASR measurements."
    427     },
    428     {
    429       "flag": "GPT-4 training dependency limits reproducibility",
    430       "detail": "DPP training uses GPT-4 for LLM-based prompt revisions (~$75 per run), creating a dependency on proprietary API access that cannot be reproduced without ongoing OpenAI billing and risks non-determinism across API versions."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models (GCG)",
    436       "relevance": "Primary gradient-based attack method evaluated; DPP is specifically designed to defend against adversarial suffixes generated by GCG."
    437     },
    438     {
    439       "title": "AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Language Models",
    440       "relevance": "DPP directly adapts AutoDAN's Hierarchical Genetic Algorithm as its optimization backbone — the direct methodological ancestor of DPP."
    441     },
    442     {
    443       "title": "Jailbreaking Black Box Large Language Models in Twenty Queries (PAIR)",
    444       "relevance": "Key black-box attack baseline; tests DPP robustness against automated adversarial prompt generation via a secondary attacker LLM."
    445     },
    446     {
    447       "title": "Tree of Attacks: Jailbreaking Black-box LLMs Automatically (TAP)",
    448       "relevance": "Attack baseline using tree-structured prompt refinement; one of six attacks used in the primary evaluation."
    449     },
    450     {
    451       "title": "Defending ChatGPT Against Jailbreak Attack via Self-Reminders",
    452       "relevance": "Primary prompt-based defense baseline; DPP is positioned as superior to Self-Reminder in both defense and utility."
    453     },
    454     {
    455       "title": "Robust Prompt Optimization for Defending Language Models Against Jailbreaking Attacks (RPO)",
    456       "relevance": "Gradient-based prompt defense baseline; comparison with RPO demonstrates DPP's advantage of producing human-readable prompts with higher utility."
    457     },
    458     {
    459       "title": "Defending Large Language Models Against Jailbreaking Attacks Through Goal Prioritization",
    460       "relevance": "Defense baseline achieving low ASR at high utility cost; DPP is framed as solving this safety-utility tradeoff."
    461     },
    462     {
    463       "title": "Llama Guard: LLM-Based Input-Output Safeguard for Human-AI Conversations",
    464       "relevance": "Used as an alternative LLM-based judge to validate keyword-based ASR measurements across appendix experiments."
    465     },
    466     {
    467       "title": "JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models",
    468       "relevance": "Provides out-of-distribution harmful queries for generalization testing in Appendix P."
    469     },
    470     {
    471       "title": "SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks",
    472       "relevance": "Representative non-prompt-based defense contextualizing the prompt-based defense paradigm DPP operates within."
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 2,
    478       "justification": "DPP requires no model retraining and deploys as a simple suffix prompt; a live HuggingFace demo exists, making adoption concrete and low-friction for practitioners."
    479     },
    480     "surprise_contrarian": {
    481       "score": 1,
    482       "justification": "The finding that a short, human-readable suffix outperforms gradient-optimized unreadable prompts (RPO perplexity 8780 vs. DPP 56) is mildly counterintuitive."
    483     },
    484     "fear_safety": {
    485       "score": 2,
    486       "justification": "Addresses concrete jailbreak vulnerabilities in deployed LLMs; demonstrating that widely-used defenses (RPO: 45.7% adaptive ASR) are largely defeated by adaptive attackers raises genuine safety concerns."
    487     },
    488     "drama_conflict": {
    489       "score": 1,
    490       "justification": "Framed as an attack-defense arms race with competitive benchmarking; adaptive attack evaluation where attackers know the defense mechanism creates adversarial tension."
    491     },
    492     "demo_ability": {
    493       "score": 2,
    494       "justification": "Interactive HuggingFace demo exists at TrustSafeAI/Defensive-Prompt-Patch-Jailbreak-Defense; users can test the defense directly."
    495     },
    496     "brand_recognition": {
    497       "score": 1,
    498       "justification": "Pin-Yu Chen (IBM Research) is well-known in adversarial ML and Xiangyu Qi (Princeton) has security credibility, but the paper is ArXiv-only with no top-tier venue affiliation."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [
    503       {
    504         "hn_id": "38853706",
    505         "title": "Possible Meissner effect near room temperature: copper-substituted lead apatite",
    506         "points": 729,
    507         "comments": 318,
    508         "url": "https://news.ycombinator.com/item?id=38853706"
    509       },
    510       {
    511         "hn_id": "38850232",
    512         "title": "LK99: Possible Meissner effect near room temperature",
    513         "points": 6,
    514         "comments": 2,
    515         "url": "https://news.ycombinator.com/item?id=38850232"
    516       },
    517       {
    518         "hn_id": "42387852",
    519         "title": "LLM Synthetic Conversations Unlock?",
    520         "points": 1,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=42387852"
    523       }
    524     ],
    525     "top_points": 729,
    526     "total_points": 736,
    527     "total_comments": 320
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs