scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29873B)
      1 {
      2   "paper": {
      3     "title": "SecCodePRM: A Process Reward Model for Code Security",
      4     "authors": [
      5       "Weichen Yu",
      6       "Ravi Mangal",
      7       "Yinyi Luo",
      8       "Kai Hu",
      9       "Jingxuan He",
     10       "Corina S. Pasareanu",
     11       "Matt Fredrikson"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2602.10418"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "SecCodePRM is a process reward model that assigns step-level security scores to code trajectories for vulnerability detection and secure code generation. It reports large improvements over prior SOTA on vulnerability detection (72% vs 60% accuracy on SVEN, 96.8% vs 72.5% on PrimeVul) and achieves near-perfect partial-code VD (100% F1 on PreciseBugs). The model claims no safety-utility tradeoff in code generation, maintaining or improving functional correctness while enhancing security through inference-time scaling.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The abstract states 'Code is available at SecCodePRM' with what appears to be a hyperlinked repository reference. This indicates a code release, though the actual URL is embedded as a hyperlink in the PDF."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "All evaluation datasets are publicly available standard benchmarks: BigVul, SVEN, PrimeVul, ReposVul, PreciseBugs. The paper uses these without modification for evaluation."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions 'NVIDIA 4*80GB GPUs (e.g., A100 or H100)' and 'DeepSpeed ZeRO-2' but provides no requirements.txt, Dockerfile, or specific library versions. Hardware is described but the software environment is not specified in enough detail to recreate."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided in the paper. Training details are given (Table 8) but no commands, scripts, or README-level instructions for replicating the experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results in Tables 1-5 and 7-10 report point estimates only. No confidence intervals, error bars, or ± notation appear anywhere in the paper."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims SecCodePRM 'outperforms' baselines across all settings based solely on comparing raw numbers. No statistical significance tests (p-values, t-tests, bootstrap tests) are reported for any comparison."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper provides baseline context for improvements: 'surpasses LLMxCPG... by a margin of 12%' (Section 5.1), '30% gap in accuracy compared to SOTA methods' on PrimeVul, '11.09% on func@1 and increase of 12.98% on safe@1' (Section 5.3). Baselines and absolute values are given."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is provided for the evaluation sample sizes. Benchmark sizes are inherited from the original datasets but never discussed in terms of statistical power or adequacy for the claims made."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be from single runs. Table 11 shows dataset statistics but not experimental variance."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Extensive baselines across all experiments: VulSim, VulBERTA, ReGVD, LLMxCPG for VD; CodeT5, CodeBERT, UniXcoder, StarCoder2, GPT-3.5/4 variants for PrimeVul; multiple LLMs for partial VD; QwenPRM for CG."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include LLMxCPG (USENIX Security 2025), GPT-oss-120B, Llama-4-Scout, Qwen2.5 series, Gemma-3, and QwenPRM. These are contemporary and competitive."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Section B.2 and Table 6 provide ablation on reward design and aggregation methods (softmax+min, last pos+min/binary/ave, softmax+ave, softmax+binary). Figure 9 further ablates the choice of reward function r1 vs r2."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics are used: accuracy, F1, precision, recall for VD; pairwise metrics P-C/P-V/P-B/P-R for PrimeVul; SR@k for SVEN CG; func@k and safe@k for CWEval; pass@k for LiveCodeBench."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation of the model's outputs is included. All evaluation is automated through benchmark metrics. The paper motivates its approach by analogy to human expert VD patterns (Figure 2) but never has humans evaluate SecCodePRM's actual predictions."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Table 11 shows explicit train/test splits for all datasets. Standard benchmark test splits are used (SVEN, PrimeVul, ReposVul, PreciseBugs). PrimeVul uses chronological splitting."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table 9 provides per-benchmark breakdowns across SVEN, PrimeVul, ReposVul, and PreciseBugs. Table 1 reports on specific CWE types (CWE-125, CWE-190, CWE-416, CWE-476). Figure 5 shows per-length-bin accuracy."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "Section B.9 presents a case study showing correct detection, but no examples of where SecCodePRM fails are shown or discussed. The paper only demonstrates successes of its own method."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Figure 7 honestly shows QwenPRM achieves slight advantages over SecCodePRM at SR@5. Table 5 shows SecCodePRM slightly underperforms QwenPRM on some LiveCodeBench configurations (e.g., QC2.5-7B k=1: 41.49 vs 43.44)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims of outperforming prior approaches in all three settings (full VD, partial VD, CG) are supported by Tables 1-4, Table 9, and Figures 7/10. The claim of 'preserving code functional correctness' is supported by LiveCodeBench results in Table 5."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Claims like 'process-level supervision is more effective at capturing function-level semantics' are supported by the ablation study in Section B.2/Table 6 which systematically varies the reward design. The two-stage training (Section 5) is a controlled single-variable manipulation."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The Limitation section (Section A) explicitly bounds generalization: 'trained primarily on C, C++, and Python datasets... generalization to other programming languages and paradigms (e.g., functional languages, domain-specific languages) remains underexplored' and 'effectiveness on zero-day vulnerabilities or novel attack patterns outside the training distribution is uncertain.'"
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No alternative explanations for the performance gains are discussed. The paper attributes improvements to process-level supervision without considering confounds such as training data composition, base model choice, or the specific benchmarks used."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper's claims closely match its measurements: VD accuracy/F1 on established CWE benchmarks, SR@k for security rate, safe@k requiring both functional correctness and safety. No broader framing beyond what was measured (e.g., they don't claim to solve 'all code security')."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The primary model is specified as 'Qwen2.5-Coder-7B-Instruct' (Section 5). Baseline models are identified with specific sizes: QC2.5-7B, QC2.5-32B, QC3-30B, Llama4-17B, Codestral-22B, Gemma3-12B, GPT-oss-120B. The Qwen model name includes a specific version identifier."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Section B.6 provides the full prompt text used for baseline partial VD: 'Given the previous code ..., determine whether the current code ... is vulnerable or not. Reason step by step, and answer with Yes or No.' SecCodePRM itself is a fine-tuned classification model, not a prompting approach."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Table 8 lists all training hyperparameters: epochs (3+1), learning rate (1e-4 → 1e-6), optimizer (Adam, β1=0.9, β2=0.95), weight decay (0.1), batch size (8×NGPUs), max sequence length (128K), DeepSpeed ZeRO-2. Detection threshold 0.5 and scaling N values also stated."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. SecCodePRM is an end-to-end model with a classification head; CG uses simple Best-of-N ranking."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4 documents the full data pipeline: step segmentation by double newlines, compact/structural merging via heuristic patterns (suffix and prefix patterns), contrastive labeling via sequence alignment (Gestalt Pattern Matching), label propagation using AST analysis. Figure 4 visualizes the pipeline."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section A is titled 'Limitation' and provides substantive discussion of language generalization and zero-day vulnerability limitations."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The limitations identify specific threats: 'trained primarily on C, C++, and Python datasets' with uncertain generalization to 'functional languages, domain-specific languages,' and 'effectiveness on zero-day vulnerabilities or novel attack patterns outside the training distribution is uncertain.' These are specific to this study."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section A explicitly states what the results do not show: generalization to other languages/paradigms and effectiveness on zero-day/novel attack patterns. The evaluation scope is bounded to 'function-level and repository-level benchmarks with known CWE categories.'"
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "All evaluation datasets (BigVul, SVEN, PrimeVul, ReposVul, PreciseBugs) are publicly available standard benchmarks. The underlying data can be independently obtained and verified."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 4 describes the data construction pipeline in detail, and Section 4.1 analyzes training dataset properties. Table 11 provides comprehensive statistics (example counts, token ratios, step ratios, vulnerability ratios) for all datasets."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. All data comes from standard public vulnerability benchmarks."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 4 documents the full pipeline: raw code → step segmentation → filtering (no-op patterns) → structural merging (suffix/prefix patterns) → contrastive labeling (sequence alignment, Gestalt Pattern Matching) → label propagation (AST-based caller function labeling). Token length hacking bias is identified, and BigVul/PrimeVul Unpaired are excluded from training."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding sources are mentioned anywhere in the paper. No acknowledgments section is present in the provided text."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Carnegie Mellon University, Colorado State University, University of California Berkeley. No commercial affiliations."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding is disclosed, so independence cannot be assessed. Since funding_disclosed is NO, this question cannot be answered affirmatively."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The base model is Qwen2.5-Coder-7B-Instruct but its training data cutoff is never stated. This is relevant because several evaluation benchmarks (SVEN 2023, PrimeVul 2024, PreciseBugs 2023) could be in Qwen's training data."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No analysis of whether Qwen2.5-Coder's pretraining data includes any of the evaluation benchmarks. PrimeVul's chronological splitting (mentioned in Section 4.1) addresses the dataset's internal split but not pretrained model contamination."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "BigVul (2020), SVEN (2023), PrimeVul (2024), PreciseBugs (2023), and ReposVul (2024) were all published before Qwen2.5-Coder's likely training cutoff. No contamination analysis is performed despite this risk."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. All evaluation is automated on code benchmarks."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. The study uses publicly available code vulnerability datasets."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference cost or latency is reported despite the paper claiming suitability for 'real-time feedback' and 'dense, immediate reward signals.' For a method claiming real-time applicability, inference latency is critical and missing."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Hardware is mentioned ('NVIDIA 4*80GB GPUs') but no total training time, GPU hours, or computational budget is quantified. Only hyperparameters (epochs, learning rate) are given without wall-clock duration."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from a single training run."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is never stated. For CG, N=10/20 candidates are sampled per example, but the number of independent training/evaluation runs is not specified."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search budget is reported. The hyperparameters in Table 8 appear to be fixed choices with no discussion of how they were selected or how many configurations were tried."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "While Table 6 ablates reward aggregation strategies, the selection of the main model's training hyperparameters is not justified. No validation set selection procedure is described for choosing the final model configuration."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical tests are performed at all, so correction for multiple comparisons does not arise. The paper compares raw performance numbers without any hypothesis testing."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No acknowledgment of self-evaluation bias. The authors implement and evaluate their own system against baselines without discussing whether their baseline implementations or evaluation setup could systematically disadvantage competitors."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "While the paper notes SecCodePRM (7B) outperforms models 'nearly 17 times its size,' no explicit compute-matched comparison is provided. The inference compute differences between Best-of-N with different N values and direct classification are not quantified."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether the evaluation benchmarks (SVEN, PrimeVul, PreciseBugs, ReposVul) actually measure real-world vulnerability detection capability. The paper assumes benchmark performance equals security effectiveness without questioning construct validity."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "For CG experiments, the same base generator (QC2.5-7B or QC2.5-32B) is used across all PRM comparisons (QwenPRM vs SecCodePRM vs probability-based ranking), properly isolating the PRM's contribution. For VD, SecCodePRM is an end-to-end model without scaffolding."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of temporal leakage. The benchmarks (BigVul 2020, SVEN 2023, PrimeVul 2024) predate the Qwen2.5-Coder model, meaning solutions could exist in the pretraining data. PrimeVul's chronological split addresses within-dataset temporal issues but not pretraining contamination."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of feature leakage. The training data construction uses vulnerability patches (vulnerable vs fixed code pairs), and it is not discussed whether evaluation test samples share structural features with training examples."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether training and test examples are independent. Multiple datasets draw from overlapping open-source repositories, and potential overlap between training data (e.g., SVEN train) and test benchmarks is not analyzed."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is applied. The token length hacking analysis (Section 4.1) addresses data quality bias but not test set leakage. No canary strings, membership inference, or decontamination pipelines are used."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "SecCodePRM outperforms prior SOTA methods on full-code vulnerability detection across multiple benchmarks.",
    372       "evidence": "Table 1: 72% accuracy and F1 on SVEN vs 60% for LLMxCPG. Table 2: 93.66% P-C on PrimeVul Paired vs 12.94% for GPT-4 CoT. Table 3: 96.83% accuracy vs 72.50% for LLMxCPG. Table 4: 0.59 F1 on PreciseBugs vs 0.48 for MSIVD.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "SecCodePRM (7B) outperforms models up to 17x its size on partial-code vulnerability detection.",
    377       "evidence": "Table 9: SecCodePRM achieves 50.17 F1 on SVEN, 55.71 on PrimeVul, 23.79 on ReposVul, and 90.00 on PreciseBugs, exceeding all baselines including GPT-oss-120B across all benchmarks.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "SecCodePRM improves secure code generation without sacrificing functional correctness (no safety-utility tradeoff).",
    382       "evidence": "Table 5: On LiveCodeBench, SecCodePRM maintains or slightly improves pass@k (55.19% vs 53.23% baseline for QC2.5-32B k=1). Table 7: On CWEval, func@1 increases by 11.09% and safe@1 by 12.98%.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Human experts can identify security flaws using only 40-60% of code tokens from the beginning.",
    387       "evidence": "Figure 2 box plot showing expert human VD patterns, based on analysis of vulnerability detection patterns measuring tokens required from the beginning of code.",
    388       "supported": "weak"
    389     },
    390     {
    391       "claim": "Process-level reward modeling provides a more effective inductive bias for security tasks than raw parameter scaling.",
    392       "evidence": "Section 5.2: SecCodePRM (7B) outperforms 120B models. Table 9 shows larger models (QC2.5-32B, QC3-30B) perform worse than SecCodePRM on partial VD F1.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "SecCodePRM's accuracy is robust to increasing code length.",
    397       "evidence": "Figure 5 and Section B.1/Figure 8 show accuracy remains stable as token counts increase on PrimeVul and PreciseBugs.",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "Suspiciously large performance gaps",
    404       "detail": "SecCodePRM achieves 96.83% accuracy on PrimeVul Paired vs 72.50% for SOTA LLMxCPG — a 24pp gap. On PrimeVul pairwise metrics, P-C jumps from 12.94% (GPT-4 CoT) to 93.66%, and both P-V and P-R are exactly 0%, meaning the model never makes certain types of errors. A 100% F1 on PreciseBugs partial VD (Table 9) is extraordinary. These results warrant skepticism without error bars or multiple-run verification."
    405     },
    406     {
    407       "flag": "No uncertainty quantification",
    408       "detail": "No error bars, confidence intervals, standard deviations, or multiple-run results are reported for any experiment. Given the extraordinary claims, this makes it impossible to assess result stability or statistical significance."
    409     },
    410     {
    411       "flag": "No contamination analysis",
    412       "detail": "The base model (Qwen2.5-Coder) was released in 2024, after all evaluation benchmarks were published (BigVul 2020, SVEN 2023, PrimeVul 2024, PreciseBugs 2023, ReposVul 2024). The pretraining data likely includes these benchmarks' code. No contamination analysis is performed."
    413     },
    414     {
    415       "flag": "Missing inference latency for real-time claims",
    416       "detail": "The paper repeatedly claims suitability for 'real-time feedback' and 'dense, immediate reward signals' during interactive coding, but never reports inference latency or throughput. Without timing data, the real-time applicability claim is unsubstantiated."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Evaluating large language models trained on code",
    422       "authors": ["M. Chen"],
    423       "year": 2021,
    424       "arxiv_id": "2107.03374",
    425       "relevance": "Foundational paper on LLM code generation evaluation (HumanEval benchmark)."
    426     },
    427     {
    428       "title": "SWE-bench: Can language models resolve real-world github issues?",
    429       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. R. Narasimhan"],
    430       "year": 2024,
    431       "relevance": "Major real-world code generation benchmark evaluating LLM capability on GitHub issues."
    432     },
    433     {
    434       "title": "LLMs cannot reliably identify and reason about security vulnerabilities (yet?): A comprehensive evaluation, framework, and benchmarks",
    435       "authors": ["S. Ullah", "M. Han", "S. Pujar", "H. Pearce", "A. Coskun", "G. Stringhini"],
    436       "year": 2024,
    437       "relevance": "Comprehensive evaluation showing LLMs struggle with vulnerability detection, directly motivating SecCodePRM."
    438     },
    439     {
    440       "title": "Asleep at the keyboard? Assessing the security of github copilot's code contributions",
    441       "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"],
    442       "year": 2025,
    443       "relevance": "Empirical assessment of Copilot-generated code security, a key prior work on LLM code safety."
    444     },
    445     {
    446       "title": "A comprehensive study of LLM secure code generation",
    447       "authors": ["S.-C. Dai", "J. Xu", "G. Tao"],
    448       "year": 2025,
    449       "arxiv_id": "2503.15554",
    450       "relevance": "Comprehensive study of LLM secure code generation, directly related to the paper's CG evaluation."
    451     },
    452     {
    453       "title": "Vulnerability detection with code language models: How far are we?",
    454       "authors": ["Y. Ding", "Y. Fu", "O. Ibrahim", "C. Sitawarin", "X. Chen", "B. Alomair", "D. Wagner", "B. Ray", "Y. Chen"],
    455       "year": 2024,
    456       "arxiv_id": "2403.18624",
    457       "relevance": "PrimeVul benchmark and evaluation framework used as a primary evaluation dataset in this paper."
    458     },
    459     {
    460       "title": "LLMxCPG: Context-Aware vulnerability detection through code property graph-guided large language models",
    461       "authors": ["A. Lekssays", "H. Mouhcine", "K. Tran", "T. Yu", "I. Khalil"],
    462       "year": 2025,
    463       "relevance": "SOTA vulnerability detection baseline using LLM+static analysis hybrid approach, outperformed by SecCodePRM."
    464     },
    465     {
    466       "title": "Large language models for code: Security hardening and adversarial testing",
    467       "authors": ["J. He", "M. Vechev"],
    468       "year": 2023,
    469       "relevance": "SVEN benchmark and security hardening approach used for both training data and evaluation."
    470     },
    471     {
    472       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    473       "authors": ["N. Jain", "K. Han", "A. Gu", "W.-D. Li", "F. Yan", "T. Zhang", "S. Wang", "A. Solar-Lezama", "K. Sen", "I. Stoica"],
    474       "year": 2024,
    475       "arxiv_id": "2403.07974",
    476       "relevance": "Contamination-free code generation benchmark used to verify no safety-utility tradeoff."
    477     },
    478     {
    479       "title": "CWEval: Outcome-driven evaluation on functionality and security of LLM code generation",
    480       "authors": ["J. Peng", "L. Cui", "K. Huang", "J. Yang", "B. Ray"],
    481       "year": 2025,
    482       "relevance": "Joint functionality and security code generation benchmark requiring both correctness and vulnerability-free code."
    483     },
    484     {
    485       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    486       "authors": ["T. Y. Zhuo", "V. M. Chien", "J. Chim"],
    487       "year": 2025,
    488       "relevance": "Major code generation benchmark referenced in the LLM code generation evaluation landscape."
    489     },
    490     {
    491       "title": "Can you really trust code copilots? Evaluating large language models from a code security perspective",
    492       "authors": ["Y. Mou", "X. Deng", "Y. Luo", "S. Zhang", "W. Ye"],
    493       "year": 2025,
    494       "arxiv_id": "2505.10494",
    495       "relevance": "Evaluation of LLM code copilot security, directly relevant to AI-assisted coding safety."
    496     }
    497   ]
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs