ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19530B)


      1 {
      2   "paper": {
      3     "title": "ConceptGuard: Neuro-Symbolic Safety Guardrails via Sparse Interpretable Jailbreak Concepts",
      4     "authors": ["Darpan Aswal", "Céline Hudelot"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2508.16325"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "Footnote 1 states 'The code will be released upon publication.' A promise of future release counts as NO."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available benchmark datasets: TechHazardQA, CatQA, AdvBench, HarmfulQA, and Alpaca-Cleaned. All are standard public benchmarks."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using SAELens toolkit and Llama-3.2-1B-Instruct but does not provide a requirements.txt, Dockerfile, or detailed environment setup with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No README, reproduction scripts, or step-by-step instructions are provided. The methodology is described but not in a form that enables direct reproduction."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Table 1 reports point estimates (TPR, FPR, Precision, F1) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims guardrails outperform baselines but provides no statistical significance tests. Comparisons are based solely on raw numbers."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Only raw metric values are reported. No effect sizes such as Cohen's d or relative improvement with baseline context are provided."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper uses 500 test prompts per class and 1500 training prompts per class. No justification or power analysis is provided for these sample sizes."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Results appear to be from single runs. No standard deviation, variance across seeds, or multiple-run statistics are reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 4.3 describes four baselines: Model-only refusal, LLM-as-a-Judge, Llama-Guard-3-1B, and a linear classifier on raw activations."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include Llama-Guard-3-1B (2024) and the underlying Llama-3.2-1B-Instruct model, which are recent and relevant."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper compares Full-SAE Guardrail vs Pruned-SAE Guardrail vs Raw Guardrail, effectively ablating the feature pruning and SAE components."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 1 reports four metrics: True Positive Rate/Recall, False Positive Rate, Precision, and F1-score."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the guardrail's outputs is performed. Evaluation is entirely automated using metrics and GPT-5.1 as a judge. For a safety system making subjective blocking decisions, human evaluation of edge cases would be relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Training uses TechHazardQA, CatQA, and AdvBench; testing uses a separate dataset (HarmfulQA) with independently generated safe prompts. Clear train/test separation."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 1 provides breakdowns across four attack configurations (Base, Hypothetical, BoN-Base, BoN-Hypothetical). Table 3 shows feature distribution by category."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5.2 discusses false positives in BoN attacks, analyzes why benign prompts trigger blocking via UMAP visualization (Figure 3), and identifies model confusion from perturbed prompts."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that the raw guardrail sometimes outperforms the SAE guardrails in TPR, and discusses elevated FPRs for activation-based guardrails in BoN attacks."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims SAEs extract interpretable jailbreak concepts and enable generalizable defenses. Table 1 and the feature analysis in Section 5 support these claims."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The ablation comparing full-SAE, pruned-SAE, and raw guardrails constitutes controlled single-variable manipulation. Claims about feature pruning reducing FPR are supported by the ablation design."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper tests only on Llama-3.2-1B-Instruct but makes broad claims about 'LLM internals' and 'jailbreak attacks' generally. The title and abstract do not bound findings to this single small model."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5.2 discusses alternative explanations for elevated FPRs in BoN attacks, analyzing whether model confusion from perturbed prompts rather than guardrail failure explains the results."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies 'Llama-3.2-1B-Instruct' (Section 4.2) which is a specific model version. GPT-5.1 is named for the judge/labeling role."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes using GPT-5.1 for label categorization and as an LLM judge but does not provide the actual prompt text used for these tasks. The appendix mentions details but no full prompts are shown."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.1 reports SAE training parameters: JumpReLU SAE, latent expansion factor of 4, L0 coefficient λs=0.5, L0 sparsity ~197, k=100 for top-k selection, Otsu threshold τ*≈0.3433."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The approach is a pipeline of SAE training, feature extraction, and linear classification."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.2 describes sampling 500 prompts from each of 3 datasets for training (1500 harmful + 1500 benign) and 500 from HarmfulQA + 500 generated for testing. The feature pruning pipeline is documented in Section 3.2."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A 'Limitations and Future Work' section appears at the end of Section 6, discussing multiple specific limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations section identifies specific threats: single hook point limitation, potentially aggressive pruning heuristic, dependence on initial safety fine-tuning, and linear classifier expressiveness limitations."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While the limitations section discusses what could be improved, it does not explicitly state what the results do NOT show (e.g., no claim about working on larger models, no claim about real-world deployment). The broad framing in abstract/title is not bounded."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (activations, SAE features, classifier weights) is released for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.2 describes the data sources (TechHazardQA, CatQA, AdvBench, HarmfulQA, Alpaca-Cleaned) and the sampling procedure (500 per dataset)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data comes from standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: activation extraction → SAE training → feature classification (junk ratio + TTR filtering) → feature categorization → linear classifier training. Sections 3.1-3.3 and 4.1-4.2 provide details including counts (3488 rich features, 4704 junk features)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Université Paris-Saclay and CentraleSupélec. No product being evaluated is affiliated with the authors."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It trains guardrails using model activations and tests the guardrails' classification ability, not the LLM's knowledge."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — the study tests a defense mechanism (guardrail classifier), not the LLM's benchmark performance."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — contamination of the LLM's training data is not relevant since the evaluation is of the guardrail classifier trained on activation features."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or computational overhead of the guardrail is reported, despite the approach adding SAE encoding and classification on top of normal inference."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No information about GPU hours, hardware used, SAE training time, or total computational budget is provided."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "SAEs extract human-interpretable jailbreak concepts from LLM internals, with ~43% of features being semantically rich.",
    286       "evidence": "Section 5.1 and Section 3.2 describe the feature pruning yielding 3488 rich features (~43%) and 4704 junk features. Figure 2 shows example activations.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "ConceptGuard's Full-SAE guardrail achieves 0.95 F1 on base attacks and 0.91 F1 on BoN-Hypothetical, outperforming Llama-Guard-3-1B (0.72 and 0.26 respectively).",
    291       "evidence": "Table 1 reports these metrics directly across all four attack configurations.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Concept-guided guardrails trained only on direct attack samples generalize to stealthy jailbreaks (hypothetical framing, BoN perturbations).",
    296       "evidence": "Table 1 shows SAE guardrails maintain high TPR across attack configurations while baselines degrade. Section 5.2 discusses this generalization.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Jailbreak attacks share a common activation geometry in the representation space.",
    301       "evidence": "Section 5.2 and Figure 3 UMAP visualization show clustering of harmful vs benign activations. The generalization from base to BoN attacks provides indirect evidence.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "ConceptGuard uses Sparse Autoencoders to extract interpretable jailbreak-related concepts from Llama-3.2-1B-Instruct activations, identifying ~43% of SAE features as semantically rich. A linear classifier operating on these rich features achieves competitive F1 scores (0.84-0.95) across four attack types while maintaining interpretability. The guardrails generalize from direct attacks to stealthy jailbreaks (BoN, hypothetical framing) despite training on only 1500 samples per class, providing evidence for shared activation geometry across jailbreak types.",
    307   "red_flags": [
    308     {
    309       "flag": "Single small model only",
    310       "detail": "All experiments use only Llama-3.2-1B-Instruct. Generalizability claims about 'LLM internals' are not bounded to this single 1B-parameter model. Larger models may have fundamentally different activation structures."
    311     },
    312     {
    313       "flag": "No uncertainty quantification",
    314       "detail": "All results appear to be single-run with no error bars, confidence intervals, or variance across seeds. The small test set (500 per class) makes point estimates unreliable without uncertainty measures."
    315     },
    316     {
    317       "flag": "GPT-5.1 used as ground truth judge without validation",
    318       "detail": "GPT-5.1 is used as an LLM judge for response classification, relevance assessment, and label categorization. No validation of GPT-5.1's accuracy on these tasks is provided, creating a dependency on an unvalidated oracle."
    319     },
    320     {
    321       "flag": "No inference latency or cost analysis",
    322       "detail": "For a guardrail system intended for deployment, the paper does not report inference overhead (SAE encoding + classification latency) relative to baseline approaches."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    328       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    329       "year": 2024,
    330       "arxiv_id": "2401.05566",
    331       "relevance": "Studies deceptive capabilities in LLMs that persist through safety training, directly relevant to AI safety evaluation."
    332     },
    333     {
    334       "title": "Jailbroken: How does LLM safety training fail?",
    335       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    336       "year": 2023,
    337       "relevance": "Analyzes failure modes of LLM safety training, foundational to understanding jailbreak attacks."
    338     },
    339     {
    340       "title": "Tree of attacks: Jailbreaking black-box LLMs automatically",
    341       "authors": ["Anay Mehrotra"],
    342       "year": 2024,
    343       "relevance": "Automated jailbreak attack method (TAP) relevant to LLM safety evaluation."
    344     },
    345     {
    346       "title": "Best-of-n jailbreaking",
    347       "authors": ["John Hughes", "Sara Price"],
    348       "year": 2024,
    349       "arxiv_id": "2412.03556",
    350       "relevance": "BoN jailbreaking method used as a key attack configuration in this paper's evaluation."
    351     },
    352     {
    353       "title": "Improving alignment and robustness with circuit breakers",
    354       "authors": ["Andy Zou", "Long Phan"],
    355       "year": 2024,
    356       "relevance": "Circuit breaker approach to LLM safety, alternative defense mechanism to guardrails."
    357     },
    358     {
    359       "title": "Refusal in language models is mediated by a single direction",
    360       "authors": ["Andy Arditi", "Oscar Obeso"],
    361       "year": 2024,
    362       "relevance": "Mechanistic interpretability finding about refusal geometry in LLMs, directly related to this paper's hypothesis about shared activation geometry."
    363     },
    364     {
    365       "title": "Building guardrails for large language models",
    366       "authors": ["Yi Dong", "Ronghui Mu"],
    367       "year": 2024,
    368       "arxiv_id": "2402.01822",
    369       "relevance": "Survey of LLM guardrail approaches, providing context for safety mechanism design."
    370     },
    371     {
    372       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    373       "authors": ["Lianmin Zheng"],
    374       "year": 2023,
    375       "relevance": "Foundation work on LLM-as-a-judge evaluation methodology used in this paper."
    376     },
    377     {
    378       "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset",
    379       "authors": ["Jiaming Ji"],
    380       "year": 2023,
    381       "relevance": "Safety alignment dataset and methodology for LLM safety evaluation."
    382     },
    383     {
    384       "title": "Steering language model refusal with sparse autoencoders",
    385       "authors": ["Kyle O'Brien", "David Majercak"],
    386       "year": 2024,
    387       "arxiv_id": "2411.11296",
    388       "relevance": "Closely related work using SAEs for LLM safety steering, direct predecessor to ConceptGuard's approach."
    389     }
    390   ]
    391 }

Impressum · Datenschutz