scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31370B)
      1 {
      2   "paper": {
      3     "title": "SafeGenBench: A Benchmark Framework for Security Vulnerability Detection in LLM-Generated Code",
      4     "authors": [
      5       "Xinghang Li",
      6       "Jingzhe Ding",
      7       "Chao Peng",
      8       "Bing Zhao",
      9       "Xiang Gao",
     10       "Hongwan Gao",
     11       "Xinchen Gu"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2506.05692",
     16     "doi": "10.48550/arXiv.2506.05692"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "SafeGenBench evaluates 13 LLMs on 558 security-related code generation tasks across 44 CWE types and 13 languages. Under zero-shot conditions, average security accuracy is only 37.44%, rising to 58.01% with safety instructions and 61.14% with few-shot examples. Reasoning models outperform non-reasoning models, with o3 and DeepSeek-R1 leading. Models perform best on memory safety violations (76.16%) and worst on insecure configurations (12.50%). The dual-judge framework (SAST + LLM) catches complementary vulnerability types, with the LLM-Judge flagging vulnerabilities that SAST missed in 30.05% of instances.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The abstract states 'The data and code will be released soon.' A promise of future release does not constitute actual release."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Same as code — 'will be released soon' is stated but no working URL or archive is provided."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No requirements.txt, Dockerfile, or environment setup section is provided. The paper does not describe dependencies or library versions needed to run the evaluation framework."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are included. The evaluation pipeline (code extraction, SAST scanning, LLM judging) is described conceptually but no runnable scripts or commands are provided."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Table 3 and Table 4 report only point estimates (e.g., '37.44%', '46.42%') with no confidence intervals, error bars, or ± notation. The only CI reported is for the LLM-Judge validation (95% CI 81.2%-96.8%), not for the main results."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims 'reasoning models consistently outperform non-reasoning models' and that safety instructions cause 'more than 20%' improvement, but no statistical significance tests (t-test, bootstrap, etc.) are reported for any comparison."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports improvements with baseline context: 'average accuracy increases by more than 20%' (37.44% → 58.01% in Table 3), and per-category accuracy breakdowns in Table 4 provide absolute scores. The reader can compute effect magnitudes."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The benchmark contains 558 test cases but no justification is given for why this number is sufficient. No power analysis or sample size rationale is discussed."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "All results appear to be from single runs. No standard deviation, variance, or spread measures across experimental runs are reported in any table."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 1 compares SafeGenBench against three existing benchmarks (CodeLMSec, CWEval, SecRepoBench) on questions, CWEs, languages, scenario coverage, and evaluation method. 13 models are compared against each other."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The evaluated models include very recent releases: o3-high, Gemini-2.5-Pro, Claude-3.7-Sonnet, DeepSeek-R1, Qwen3, Llama4-Maverick/Scout. Comparison benchmarks include CWEval (2025) and SecRepoBench (2025)."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Table 3 separately reports SAST-Judge and LLM-Judge scores alongside the combined overall score, effectively ablating the dual-judge evaluation framework. Section 6.4 analyzes each judge's contribution independently."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Results are reported across Overall accuracy, SAST-Judge accuracy, and LLM-Judge accuracy (Table 3), plus per-category breakdowns across 8 vulnerability categories (Table 4), and per-CWE analysis (Figures 4-5)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Section 6.4 reports: 'A stratified random sample of 9% of the test cases was selected across all CWE categories. A security expert re-evaluated the cases blindly using standardized protocols. The LLM-Judge achieves 92% accuracy.'"
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "SafeGenBench is a newly created benchmark that models have not been trained or tuned on. The 558 test cases function as a held-out evaluation set with no model selection or tuning performed on them."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 4 provides per-category accuracy for all 13 models across 8 vulnerability categories. Figures 4-5 show per-CWE breakdowns for each judge type."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Appendix B provides three detailed case studies (CWE-89 SQL injection, CWE-494 code integrity, CWE-798 hardcoded credentials) showing specific vulnerability patterns. Appendix C discusses divergent judge detections. Section 6.3 identifies worst-performing categories."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 4.1 reports that SAST tools 'often suffer from limited detection effectiveness' and LLM judges 'performance degrades significantly when no vulnerability type is specified.' Section 6.4 quantifies complementary weaknesses: the LLM-Judge flagged vulnerabilities missed by SAST in 30.05% of instances, while the SAST-Judge caught issues overlooked by the LLM-Judge in 6.24% of instances."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims 'notable deficiencies in their ability to produce vulnerability-free code' — supported by Table 3 showing 37.44% average zero-shot accuracy. The claim about the dual-judge framework is supported by Section 6.4's complementarity analysis."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper claims 'reasoning models consistently outperform non-reasoning models' (Section 6.1), attributing this to reasoning capability. This is an observational comparison across models that differ in many dimensions (size, training data, architecture), not controlled single-variable manipulation. No confound analysis is provided."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title and abstract frame results as generalizing to 'LLM-Generated Code' broadly, but test questions are written in Chinese (acknowledged only in the Limitations section), which confounds code security ability with Chinese language understanding. This significant scope constraint is not reflected in the title, abstract, or main body claims."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "Section 6.3 speculates about training data distributions as an explanation for category-level variation but does not substantively consider alternatives: the Chinese language confound, model size effects, benchmark design bias, or whether CWE category difficulty varies intrinsically independent of training."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper measures vulnerability detection across 44 specific CWEs using a dual-judge framework, but frames the outcome as 'security of LLM-generated code' (abstract). The gap between 44 CWEs (out of thousands) and comprehensive code security is not discussed. The 8% error rate of the validated LLM-judge is not addressed as a measurement limitation."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Models are listed by marketing names only: 'GPT-4o', 'o1', 'o3-high', 'Gemini-2.5-pro', 'Claude-3.5-Sonnet', etc. No snapshot dates, API versions, or specific model identifiers (e.g., 'gpt-4o-2024-08-06') are provided."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The LLM-Judge system prompt is provided in Appendix D (Figure 6). However, the 558 test question prompts sent to the evaluated models are not provided — only three examples appear in Appendix B. The safety instruction and few-shot example prompts are described conceptually but not reproduced in full."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No generation hyperparameters (temperature, top-p, max tokens) are reported for any of the 13 evaluated models or for DeepSeek-R1 used as the LLM judge."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. Models receive prompts directly and generate code responses."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 3.1 describes the three-stage dataset construction pipeline: vulnerability type extraction/categorization (Stage 1), LLM-based test question generation (Stage 2), and human annotation/quality assurance (Stage 3). Section 4.2 describes the two-stage code extraction process from LLM outputs."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "A dedicated 'Limitations' section appears after Section 7 (Conclusion), discussing three specific limitations: single-function task difficulty, limited evaluation scope, and single-judge reliance."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The limitations are specific to this study: (1) test cases limited to single-function generation, (2) no assessment of functional correctness, (3) single LLM judge and single SAST tool, (4) Chinese-language test questions may affect models differently based on their Chinese language capabilities."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The Limitations section explicitly states what the results do NOT show: no project-level evaluation, no functional correctness assessment, and acknowledges the Chinese language constraint. These bound the scope of applicable conclusions."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The raw data (558 test cases, model outputs, judge scores) is not available. 'The data and code will be released soon' is stated but no current access is provided."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 3.1 describes the three-stage data collection process: CWE taxonomy construction from OWASP Top-10 and CWE Top 25 (Stage 1), LLM-based test question generation with specific design principles (Stage 2), and expert review with acceptance/revision/rejection criteria (Stage 3)."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "Appendix A describes annotator demographics (6 experts, Asian, ages 24-35, 4 male/2 female, >3 years experience) but does not describe how these specific individuals were selected from the available pool. 'We employ a team of six human experts' does not explain the selection process."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The three construction stages are described conceptually, but filtering counts are missing: how many test questions were generated before human review? How many were revised vs. discarded? The paper jumps from the process description to '558 meticulously curated test cases' without intermediate counts."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source is disclosed. All authors are affiliated with ByteDance, but no acknowledgments section lists grants or corporate funding for this research."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations with ByteDance are clearly listed in the paper header. Email addresses include bytedance.com domains."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "ByteDance is a commercial competitor to the companies whose models are evaluated (OpenAI, Google, Anthropic, Meta, Alibaba/Qwen). ByteDance develops its own AI products (MarsCode) that compete with the evaluated models, giving them a potential commercial interest in benchmark results showing competitor weaknesses."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement or financial disclosure is included in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the 13 evaluated models or for the DeepSeek-R1 judge model."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether the evaluated models may have seen similar CWE-specific coding scenarios during training. While the benchmark is newly created, the underlying vulnerability patterns and CWE examples are widely available online."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The benchmark uses well-known CWE patterns that exist extensively in training data. While the specific test questions are novel, the paper does not discuss whether models' familiarity with CWE-related coding patterns from training data affects results."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "The paper has no human participants as study subjects. The annotators are part of the research team constructing the benchmark, not study participants."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in the study. Appendix A mentions annotators were 'approved by their Ethics Review Committee' for the annotation work, but this is for the construction team, not study subjects."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the study. Annotator demographics are reported in Appendix A for transparency, but these are research team members, not study subjects."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in the study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in the study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in the study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in the study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No API costs, tokens consumed, or inference time are reported for any of the 13 models evaluated across 558 test cases in 3 settings, or for the DeepSeek-R1 judge evaluations."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No total computational budget is stated. The experiment involves 13 models × 558 test cases × 3 settings = 21,762 generation calls plus judge evaluations, but no cost or compute figures are reported."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be from single runs for all 13 models across all settings."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is not stated. It is unclear whether results are from single or multiple runs."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search is described or acknowledged. Generation parameters (temperature, sampling) are not reported, let alone any search over them."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The choice of DeepSeek-R1 as the unified LLM judge is not justified — no comparison with alternative judge models is provided. The selection of Semgrep as the SAST tool is not justified against alternatives."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The paper makes implicit comparisons across 13 models and 8 vulnerability categories but uses no statistical tests, let alone corrections for multiple comparisons."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors designed SafeGenBench and compare it favorably to prior benchmarks (Table 1) without acknowledging potential bias in this self-evaluation. No independent assessment of benchmark quality is conducted."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper notes reasoning models outperform non-reasoning models but does not discuss the compute cost difference. Reasoning models (o3, DeepSeek-R1) use substantially more inference compute than standard models, but performance is not normalized by compute."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether SafeGenBench actually measures 'code security' as claimed, versus measuring familiarity with specific CWE patterns. The 44 CWEs cover a fraction of known weaknesses, and the construct validity of this selection as representative of 'security' is not examined."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved. Models are queried directly with prompts."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of temporal leakage. The CWE-based vulnerability patterns used in test questions have existed online for years and are likely in all evaluated models' training data. While specific questions are novel, the underlying patterns are not."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Section 3.1.2 notes test questions deliberately avoid security terminology to prevent tipping off models, but this design choice is not discussed as a leakage concern. In the few-shot setting, explicit vulnerability examples are provided, which constitutes intentional feature provision but is not analyzed as a potential confound."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether the 558 test cases are independent of each other or of model training data. Multiple test cases per CWE type may share structural similarity, but this is not addressed."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, or n-gram overlap analysis."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Under zero-shot settings, the average overall accuracy across all 13 models is merely 37.44%, indicating substantial proportions of generated code contain vulnerabilities.",
    373       "evidence": "Table 3 shows per-model overall accuracy under zero-shot ranging from 27.78% (Llama4-Scout) to 46.42% (o3), averaging 37.44%.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Explicit safety instructions in prompts increase average accuracy by more than 20 percentage points.",
    378       "evidence": "Table 3 shows average overall accuracy increasing from 37.44% (zero-shot) to 58.01% (zero-shot with safety instruction) across all 13 models.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Few-shot examples with insecure code improve accuracy by an additional 3 percentage points beyond safety instructions.",
    383       "evidence": "Table 3 shows average overall accuracy at 58.01% (zero-shot+SI) vs 61.14% (few-shot). Individual model improvements vary widely.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Reasoning models consistently outperform non-reasoning models across all three experimental settings.",
    388       "evidence": "Table 3 shows reasoning models (marked with *: Gemini-2.5-Pro, o1, o3, DeepSeek-R1, QWQ-32B) generally score higher, but overlap exists — e.g., o1 (35.30%) scores below Qwen3-MOE (41.22%) in zero-shot. No statistical test supports 'consistently.'",
    389       "supported": "weak"
    390     },
    391     {
    392       "claim": "Models perform best on Memory Safety Violations (76.16% average) and worst on Insecure Configurations (12.50% average).",
    393       "evidence": "Table 4 provides per-category accuracy across all 13 models under zero-shot. Memory Safety: 76.16% average; Insecure Configuration: 12.50% average; Resource Issues: 14.69% average.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "SAST and LLM-based judges exhibit complementary detection strengths — the LLM-Judge flags vulnerabilities missed by SAST in 30.05% of instances.",
    398       "evidence": "Section 6.4 reports: 'The LLM-Judge flagged vulnerabilities missed by SAST in 30.05% of instances, while the SAST-Judge caught issues overlooked by the LLM in 6.24%. Only 2.59% were deemed vulnerable by both.'",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "The LLM-Judge achieves 92% accuracy validated against expert ground truth.",
    403       "evidence": "Section 6.4: 'A stratified random sample of 9% of the test cases was selected... The LLM-Judge achieves 92% accuracy, with a 95% binomial confidence interval ranging from 81.2% to 96.8%.'",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "Company evaluating competitors without conflict disclosure",
    410       "detail": "All authors are ByteDance employees. ByteDance develops competing AI products (MarsCode/Doubao). The paper evaluates only competitor models (OpenAI, Google, Anthropic, Meta, Alibaba, DeepSeek) without testing any ByteDance models and without a competing interests statement. The benchmark is designed to highlight security weaknesses in competitor products."
    411     },
    412     {
    413       "flag": "No error bars or multiple runs",
    414       "detail": "All results across 13 models × 3 settings appear to be single-run point estimates with no variance, confidence intervals, or reproducibility assessment. LLM generation is stochastic, so results could vary across runs."
    415     },
    416     {
    417       "flag": "Chinese-language prompts not disclosed prominently",
    418       "detail": "Test questions are written in Chinese, acknowledged only in the final paragraph of the Limitations section. This is a major confound — models' performance may reflect Chinese language ability rather than security awareness — yet the title and abstract frame results as general findings about LLM code security."
    419     },
    420     {
    421       "flag": "Unreleased artifacts prevent verification",
    422       "detail": "The 558 test cases, model outputs, and evaluation code are all unreleased ('will be released soon'). No reviewer or reader can currently verify any result."
    423     },
    424     {
    425       "flag": "Judge model choice not justified",
    426       "detail": "DeepSeek-R1 is used as the unified LLM judge with no justification for this choice over alternatives. If the judge has systematic biases, all LLM-Judge results are affected. The 92% validation accuracy has a wide 95% CI (81.2%-96.8%) from a small 9% sample."
    427     },
    428     {
    429       "flag": "Missing generation hyperparameters",
    430       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 13 models or the judge. These settings significantly affect output variability and security patterns in generated code."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    436       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    437       "year": 2025,
    438       "relevance": "Foundational study on security vulnerabilities in LLM-generated code, directly motivating SafeGenBench's research question."
    439     },
    440     {
    441       "title": "Evaluating large language models trained on code",
    442       "authors": ["Mark Chen", "Jerry Tworek"],
    443       "year": 2021,
    444       "arxiv_id": "2107.03374",
    445       "relevance": "Introduced HumanEval, the foundational code generation benchmark; contextualizes the shift from functional correctness to security evaluation."
    446     },
    447     {
    448       "title": "CodeLMSec Benchmark: Systematically Evaluating and Finding Security Vulnerabilities in Black-Box Code Language Models",
    449       "authors": ["Hossein Hajipour", "Keno Hassler", "Thorsten Holz", "Lea Schonherr", "Mario Fritz"],
    450       "year": 2024,
    451       "relevance": "Prior security benchmark for LLM-generated code using 280 insecure prompts; directly compared against in Table 1."
    452     },
    453     {
    454       "title": "CWEval: Outcome-driven evaluation on functionality and security of LLM code generation",
    455       "authors": ["Jinjun Peng", "Leyi Cui", "Kele Huang", "Junfeng Yang", "Baishakhi Ray"],
    456       "year": 2025,
    457       "arxiv_id": "2501.08200",
    458       "relevance": "Concurrent code security benchmark evaluating both functionality and security; directly compared against in Table 1."
    459     },
    460     {
    461       "title": "SecRepoBench: Benchmarking LLMs for Secure Code Generation in Real-World Repositories",
    462       "authors": ["Connor Dilgren", "Purva Chiniya", "Luke Griffith", "Yu Ding", "Yizheng Chen"],
    463       "year": 2025,
    464       "arxiv_id": "2504.21205",
    465       "relevance": "Concurrent benchmark for secure code generation in real-world repository contexts; directly compared against in Table 1."
    466     },
    467     {
    468       "title": "CYBERSECEVAL 2: A Wide-Ranging Cybersecurity Evaluation Suite for Large Language Models",
    469       "authors": ["Manish Bhatt", "Sahana Chennabasappa"],
    470       "year": 2024,
    471       "arxiv_id": "2404.13161",
    472       "relevance": "Meta's cybersecurity evaluation framework covering prompt injection and code security; contextualizes the behavioral vs code-level security distinction."
    473     },
    474     {
    475       "title": "LLMSecCode: Evaluating Large Language Models for Secure Coding",
    476       "authors": ["Anton Rydén", "Erik Näslund", "Elad Michael Schiller", "Magnus Almgren"],
    477       "year": 2024,
    478       "relevance": "Evaluation framework for secure code generation across multiple benchmarks; relevant prior work on evaluation methodology."
    479     },
    480     {
    481       "title": "GPT-4 technical report",
    482       "authors": ["Josh Achiam", "Steven Adler"],
    483       "year": 2023,
    484       "arxiv_id": "2303.08774",
    485       "relevance": "Technical report for GPT-4 family, one of the evaluated model families in SafeGenBench."
    486     },
    487     {
    488       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    489       "authors": ["Daya Guo", "Dejian Yang"],
    490       "year": 2025,
    491       "arxiv_id": "2501.12948",
    492       "relevance": "Technical report for DeepSeek-R1, both a top-performing evaluated model and the chosen LLM judge in SafeGenBench."
    493     },
    494     {
    495       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    496       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    497       "year": 2024,
    498       "relevance": "Key real-world code generation benchmark representing the shift from toy tasks to software engineering evaluation."
    499     },
    500     {
    501       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    502       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    503       "year": 2023,
    504       "relevance": "EvalPlus benchmark augmenting HumanEval with adversarial tests; relevant to methodology of rigorous code evaluation."
    505     },
    506     {
    507       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    508       "authors": ["Naman Jain", "King Han"],
    509       "year": 2024,
    510       "arxiv_id": "2403.07974",
    511       "relevance": "Addresses contamination in code benchmarks through temporal splits; relevant to benchmark design methodology."
    512     }
    513   ]
    514 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs