ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (22886B)


      1 {
      2   "paper": {
      3     "title": "The Scales of Justitia: A Comprehensive Survey on Safety Evaluation of LLMs",
      4     "authors": [
      5       "Songyang Liu",
      6       "Chaozhuo Li",
      7       "Jiameng Qiu",
      8       "Xi Zhang",
      9       "Feiran Huang",
     10       "Litian Zhang",
     11       "Yiming Hei",
     12       "Philip S. Yu"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2506.11094",
     17     "doi": "10.48550/arXiv.2506.11094"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["survey_methodology"],
     21   "methodology_tags": ["meta-analysis"],
     22   "key_findings": "This survey organizes LLM safety evaluation along four dimensions: why to evaluate (distinguishing safety from capability evaluation), what to evaluate (toxicity, robustness, ethics, bias/fairness, truthfulness, plus privacy/copyright/code generation), where to evaluate (metrics, 31 datasets/benchmarks), and how to evaluate (human, rule-based, and model-based evaluators plus 7 integrated frameworks). The paper identifies six challenges, including the lack of unified evaluation standards, the need for dynamic benchmarks, and the gap between evaluating text-only LLMs and evaluating multimodal models and agents.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No code repository or analysis scripts are mentioned or linked anywhere in the paper."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The survey does not release a structured dataset of the papers reviewed, its taxonomy mapping, or any extracted data."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No environment or dependency specifications are provided. As a survey, analysis code and environment could still have been released."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No instructions for reproducing the survey's paper selection or analysis process are provided."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "Survey paper with no experiments; no quantitative results are produced."
     51       },
     52       "significance_tests": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "Survey paper with no experiments; no comparative claims requiring statistical tests."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "Survey paper with no experiments; no effect sizes to report."
     61       },
     62       "sample_size_justified": {
     63         "applies": false,
     64         "answer": false,
     65         "justification": "Survey paper with no experiments."
     66       },
     67       "variance_reported": {
     68         "applies": false,
     69         "answer": false,
     70         "justification": "Survey paper with no experiments."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table I compares this survey against five prior surveys (Guo et al. 2023, Dong et al. 2024, Yi et al. 2024, Ran et al. 2024, Röttger et al. 2025) on seven dimensions including taxonomy of tasks, metrics, datasets, evaluators, and frameworks."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The comparison surveys span 2023-2025, making them contemporary and relevant baselines for a 2025 survey."
     83       },
     84       "ablation_study": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Survey paper has no system components to ablate."
     88       },
     89       "multiple_metrics": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Survey paper does not run experiments with metrics."
     93       },
     94       "human_evaluation": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "Survey paper does not evaluate system outputs."
     98       },
     99       "held_out_test_set": {
    100         "applies": false,
    101         "answer": false,
    102         "justification": "Survey paper does not use test sets."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The survey provides detailed per-category breakdowns across toxicity (Section III.A), robustness (III.B), ethics (III.C), bias and fairness (III.D), truthfulness (III.E), and other domains (III.F), each with subcategories."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Each subsection in Sections III-V ends with a discussion of unresolved challenges and limitations of current approaches. Section VI is entirely devoted to challenges and future directions."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper reports negative findings throughout, e.g., that 'no attack or defense is universally effective' (Section III.B), that LLMs 'still exhibit systematic biases' (Section III.D), and that current evaluation frameworks 'focus more on integrating existing methods than on advancing evaluation methodology' (Section V.D)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims a 'structured overview' with a four-dimensional taxonomy, which is delivered in Sections II-V. The claimed coverage of tasks, metrics, datasets, benchmarks, evaluation methods, and future directions is fully present in the paper."
    125       },
    126       "causal_claims_justified": {
    127         "applies": false,
    128         "answer": false,
    129         "justification": "The paper is a survey that synthesizes existing work and makes no causal claims of its own."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper claims to be 'comprehensive and systematic' (abstract, contributions) but does not describe any search methodology, databases searched, time period covered, or inclusion/exclusion criteria. The title claims 'comprehensive survey' but without documented scope boundaries, the comprehensiveness claim cannot be verified. The paper acknowledges MLLMs and agents are underexplored (Section VI.E) but doesn't state this as a scope boundary upfront."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": false,
    138         "answer": false,
    139         "justification": "Pure survey/taxonomy paper presenting no empirical results of its own."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "Survey paper with no measurements of its own."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "Survey paper that does not run any models."
    152       },
    153       "prompts_provided": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "Survey paper that does not use prompting."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "Survey paper with no experiments."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "Survey paper uses no scaffolding."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "The paper does not describe any systematic paper selection pipeline. There are no search queries, no database descriptions, no screening stages, no filtering criteria, and no PRISMA-style flow diagram. How the approximately 215 references were identified and selected is entirely undocumented."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "Section VI 'Challenges and Future Directions' discusses challenges in the field of LLM safety evaluation, but there is no section discussing limitations of the survey itself — e.g., potential gaps in coverage, search methodology limitations, or reviewer bias."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No specific threats to the validity of the survey's own methodology or conclusions are discussed. The challenges in Section VI are about the field, not about the survey."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The paper does not explicitly state what is excluded from its scope. The focus on text-based LLMs (vs. MLLMs/agents) emerges only in Section VI.E as a future direction rather than being declared upfront as a scope boundary. No time period, language, or venue restrictions are stated."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The corpus of surveyed papers, the taxonomy mapping, and the comparative analysis data are not available for independent verification."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No description of how the surveyed papers were identified. No search databases, queries, or time period are mentioned. The paper jumps directly into its taxonomy without describing the literature search process."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "How papers were found and selected for inclusion is not described. For a survey, this is analogous to participant recruitment — the paper selection process should be documented to assess coverage and bias."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "No pipeline from initial search to final included papers is documented. There is no indication of how many candidate papers were found, how many were screened, or how many were excluded at each stage."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding sources, grants, or sponsorships are mentioned anywhere in the paper."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly stated: Beijing University of Posts and Telecommunications, Jinan University, Beihang University, China Academy of Information and Communications Technology, and University of Illinois at Chicago."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. This is non-disclosure rather than evidence of independence."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this survey paper."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this survey paper."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this survey paper."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this survey paper."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this survey paper."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this survey paper."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this survey paper."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "Survey paper with no method of its own to cost."
    294       },
    295       "compute_budget_stated": {
    296         "applies": false,
    297         "answer": false,
    298         "justification": "Survey paper with no computational experiments."
    299       }
    300     },
    301     "survey_methodology": {
    302       "prisma_or_structured_protocol": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper does not follow PRISMA or any structured review protocol. There is no search strategy, no reproducible queries, no flow diagram showing paper identification/screening/inclusion, and no protocol registration. The paper organizes by taxonomy but does not document its search methodology."
    306       },
    307       "quality_assessment_of_sources": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The survey does not assess the methodological quality of the papers it reviews. All cited works are treated equally regardless of their rigor. For example, Table II lists 31 datasets/benchmarks without any quality assessment of the underlying evaluation methodologies."
    311       },
    312       "publication_bias_discussed": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No discussion of publication bias. The survey does not consider whether the published literature on LLM safety evaluation skews toward positive results, novel attacks, or certain evaluation dimensions, nor does it use funnel plots or other bias detection methods."
    316       }
    317     }
    318   },
    319   "claims": [
    320     {
    321       "claim": "This is the first survey to present a systematic and holistic synthesis of recent advancements in safety evaluation of LLMs.",
    322       "evidence": "Section I contributions list and Table I comparing against five prior surveys (Guo 2023, Dong 2024, Yi 2024, Ran 2024, Röttger 2025) showing none cover all seven dimensions.",
    323       "supported": "moderate"
    324     },
    325     {
    326       "claim": "Existing safety evaluations are fragmented, addressing single aspects such as datasets or evaluators, lacking a holistic analysis of the entire pipeline.",
    327       "evidence": "Table I shows prior surveys covering at most 4 of 7 dimensions (including taxonomy, metrics, datasets, evaluators, frameworks, and challenges). Section I discusses this gap.",
    328       "supported": "moderate"
    329     },
    330     {
    331       "claim": "No attack or defense is universally effective, and robustness does not correlate with model size.",
    332       "evidence": "Cited from HarmBench results in Section III.B.2, referencing [31] and [138]. This is a re-stated finding, not original to this survey.",
    333       "supported": "moderate"
    334     },
    335     {
    336       "claim": "The research community has yet to reach consensus on standardized procedures for conducting LLM safety evaluations.",
    337       "evidence": "Section VI.A discusses inconsistencies in ASR definitions, benchmark usage, and evaluation methods across studies. Supported by observations throughout Sections IV-V showing varied metrics and frameworks.",
    338       "supported": "strong"
    339     },
    340     {
    341       "claim": "Current safety evaluation metrics, especially ASR, lack unified definitions making cross-study comparison difficult.",
    342       "evidence": "Section IV.A.1 states 'How to evaluate the success of a jailbreak attempt against an LLM has not been unified [17].' Sections IV.A.1-2 describe how different studies use different definitions and implementations.",
    343       "supported": "strong"
    344     },
    345     {
    346       "claim": "Safety evaluation should evolve from safe to responsible LLMs, encompassing transparency, interpretability, and auditability.",
    347       "evidence": "Section VI.F proposes this as a future direction but provides no empirical evidence or concrete framework for achieving it.",
    348       "supported": "weak"
    349     }
    350   ],
    351   "red_flags": [
    352     {
    353       "flag": "No systematic search methodology",
    354       "detail": "The paper claims to be 'comprehensive and systematic' but does not describe how papers were identified, what databases were searched, what search terms were used, or what inclusion/exclusion criteria were applied. Without a documented search methodology, the comprehensiveness claim is unverifiable."
    355     },
    356     {
    357       "flag": "No quality assessment of sources",
    358       "detail": "The survey treats all cited papers equally regardless of their methodological rigor. Papers with robust evaluation designs are summarized alongside papers with minimal evidence, with no indication of relative quality. This risks lending weak results the same credibility as strong ones."
    359     },
    360     {
    361       "flag": "No survey-specific limitations discussed",
    362       "detail": "Section VI discusses challenges in the field but never discusses limitations of the survey itself — potential gaps in coverage, language/venue biases in paper selection, or reviewer subjectivity in taxonomy classification."
    363     },
    364     {
    365       "flag": "Self-citation pattern",
    366       "detail": "Multiple papers from the authors' own groups are cited (references [5], [6], [10], [11], [12], [78], [113], [114], [146], [150]). While not inherently problematic, the volume of self-citations in a survey claiming objectivity warrants noting."
    367     },
    368     {
    369       "flag": "Narrative review claiming systematic status",
    370       "detail": "The paper reads as a well-organized narrative review but repeatedly calls itself 'systematic' and 'comprehensive.' Without PRISMA guidelines, structured search, or quality assessment, it does not meet the standard definition of a systematic review."
    371     }
    372   ],
    373   "cited_papers": [
    374     {
    375       "title": "A survey on evaluation of large language models",
    376       "authors": ["Y. Chang", "X. Wang", "J. Wang"],
    377       "year": 2024,
    378       "relevance": "Comprehensive survey on general LLM evaluation providing the baseline against which safety-specific evaluation is distinguished."
    379     },
    380     {
    381       "title": "GPT-4 technical report",
    382       "authors": ["J. Achiam", "S. Adler", "S. Agarwal"],
    383       "year": 2023,
    384       "arxiv_id": "2303.08774",
    385       "relevance": "Technical report for GPT-4, one of the primary LLMs evaluated across multiple safety dimensions in the surveyed papers."
    386     },
    387     {
    388       "title": "Universal and transferable adversarial attacks on aligned language models",
    389       "authors": ["A. Zou", "Z. Wang", "N. Carlini"],
    390       "year": 2023,
    391       "arxiv_id": "2307.15043",
    392       "relevance": "Introduces GCG adversarial attack and AdvBench benchmark, foundational work in LLM safety jailbreak evaluation."
    393     },
    394     {
    395       "title": "DecodingTrust: A comprehensive assessment of trustworthiness in GPT models",
    396       "authors": ["B. Wang", "W. Chen", "H. Pei"],
    397       "year": 2023,
    398       "relevance": "Comprehensive trustworthiness evaluation framework for GPT models covering robustness, bias, and safety dimensions."
    399     },
    400     {
    401       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    402       "authors": ["M. Mazeika", "L. Phan", "X. Yin"],
    403       "year": 2024,
    404       "arxiv_id": "2402.04249",
    405       "relevance": "Standardized red-teaming benchmark covering 18 attack methods; it finds that robustness does not correlate with model size."
    406     },
    407     {
    408       "title": "JailbreakBench: An open robustness benchmark for jailbreaking large language models",
    409       "authors": ["P. Chao", "E. Debenedetti", "A. Robey"],
    410       "year": 2024,
    411       "relevance": "Open benchmark for jailbreak evaluation with standardized pipeline and public leaderboard."
    412     },
    413     {
    414       "title": "Purple Llama CyberSecEval: A secure coding benchmark for language models",
    415       "authors": ["M. Bhatt", "S. Chennabasappa", "C. Nikolaidis"],
    416       "year": 2023,
    417       "arxiv_id": "2312.04724",
    418       "relevance": "Framework evaluating security of LLM-generated code, finding ~30% insecure code generation rate across tested models."
    419     },
    420     {
    421       "title": "Do users write more insecure code with AI assistants?",
    422       "authors": ["N. Perry", "M. Srivastava", "D. Kumar"],
    423       "year": 2023,
    424       "relevance": "Empirical study on AI coding assistant security implications, directly relevant to LLM code generation safety evaluation."
    425     },
    426     {
    427       "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations",
    428       "authors": ["H. Inan", "K. Upasani", "J. Chi"],
    429       "year": 2023,
    430       "arxiv_id": "2312.06674",
    431       "relevance": "Foundational open-source safety evaluation model fine-tuned from Llama2, widely used as automated safety judge."
    432     },
    433     {
    434       "title": "JailbreakEval: An integrated toolkit for evaluating jailbreak attempts against large language models",
    435       "authors": ["D. Ran", "J. Liu", "Y. Gong"],
    436       "year": 2024,
    437       "arxiv_id": "2406.09321",
    438       "relevance": "Toolkit for standardized jailbreak evaluation highlighting the lack of unified evaluation methods."
    439     },
    440     {
    441       "title": "SALAD-Bench: A hierarchical and comprehensive safety benchmark for large language models",
    442       "authors": ["L. Li", "B. Dong", "R. Wang"],
    443       "year": 2024,
    444       "arxiv_id": "2402.05044",
    445       "relevance": "Large-scale safety benchmark with 30K prompts across 66 dimensions, one of the most comprehensive safety datasets catalogued."
    446     },
    447     {
    448       "title": "A survey of large language models",
    449       "authors": ["W. X. Zhao", "K. Zhou", "J. Li"],
    450       "year": 2023,
    451       "arxiv_id": "2303.18223",
    452       "relevance": "Broad LLM survey providing context for distinguishing general evaluation from safety-specific evaluation."
    453     }
    454   ]
    455 }

Impressum · Datenschutz