scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28581B)
      1 {
      2   "paper": {
      3     "title": "Inducing Vulnerable Code Generation in LLM Coding Assistants",
      4     "authors": [
      5       "Binqi Zeng",
      6       "Quan Zhang",
      7       "Chijin Zhou",
      8       "Gwihwan Go",
      9       "Yu Jiang",
     10       "Heyuan Shi"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2504.15867",
     15     "doi": "10.48550/arXiv.2504.15867"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "HACKODE demonstrates that LLM coding assistants can be induced to generate vulnerable code (buffer overflow, array violations, etc.) through adversarial attack sequences embedded in otherwise-correct referenced code examples, achieving 84.29% average attack success rate across four open-source LLMs (7b-15b). The attack transfers to unseen prompt templates and quantized models at 57.97% and 50.76% ASR, respectively, and achieves 75.92% ASR on a real-world coding assistant application. A two-phase progressive generation approach (preliminary + enhancement) improves transfer ASR by 25.49 percentage points over single-phase generation (57.97% vs 32.48%).",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper provides a GitHub repository: https://github.com/HACKODE11/HACKODE. Section I contributions: 'The source code is included at the repository.'"
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper describes constructing a dataset of 35 programming problems from StackOverflow but only explicitly states the 'source code' is released at the repository. The dataset itself is not explicitly stated as released."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Hardware is specified (AMD EPYC 7763 CPUs, 8 NVIDIA V100 GPUs, Ubuntu 22.04 LTS) but no software environment details are provided — no requirements.txt, library versions, framework versions, or dependency specifications."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided in the paper. The reader would need to infer the procedure from the methodology description."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results (Tables II-VII) report point estimate ASR percentages with no confidence intervals, error bars, or uncertainty measures."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper makes comparative claims (HACKODE vs HACKODE−, comment vs variable renaming, across LLMs) based solely on comparing raw percentages without any statistical significance tests."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Percentage differences are reported with baseline context: e.g., 'average increase of 25.49%' (HACKODE 57.97% vs HACKODE− 32.48% in Table VI), and per-model ASR breakdowns in Tables II-VII allow readers to assess magnitude."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The dataset comprises only 35 programming problems with no justification for this sample size, no power analysis, and no discussion of whether 35 is sufficient for the claims made."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported. ASR numbers appear to be from single attack generation attempts per problem without reporting variability across runs."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section V-D includes ablation baselines: HACKODE− (without progressive generation, Table VI) and variable renaming as alternative injection method (Table VII)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "This paper introduces a novel attack type (inducing vulnerable code via external references) with no direct prior work to compare against. The ablation-based baselines (HACKODE−, variable renaming) are appropriate given the novelty of the threat model."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Section V-D presents two ablation studies: (1) progressive generation vs non-progressive (Table VI) and (2) comment insertion vs variable renaming (Table VII), isolating the contribution of each design choice."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The paper reports multiple metrics: ASR (attack success rate), number of iterations, token lengths of attack sequences, responses, and assembled inputs (Table II)."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation of the generated vulnerable code is performed. Attack success is determined automatically by checking whether the target vulnerability appears in the generated code. Human evaluation of vulnerability exploitability or code quality would be relevant."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section V-B evaluates transferability on 'new instructions, queries, and prompt templates' that 'were not used in the generation process of the attack sequences' — a proper held-out evaluation."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by vulnerability type (5 categories) and by LLM (4 models) in Tables II-VII, showing substantial variation (e.g., 25%-100% ASR across categories)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper discusses failure cases: Incorrect Variable on Mistral achieves only 25% ASR, GPTQ Llama2 drops to 29.63%, and general LLMs sometimes generate text instead of code. Section V-B analyzes Llama2's poor transferability."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Several negative results are reported: 25% ASR for Incorrect Variable on Mistral (Table II), 29.63% for GPTQ Llama2 (Table IV), and 0% ASR for variable renaming on Incorrect Variable and Infinite Loop (Table VII)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims of '84.29% success rate' and '75.92% ASR' on real-world application are directly supported by Tables II and V respectively."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper's causal claim (attack sequences cause LLMs to generate vulnerable code) is supported by the controlled experimental design: the same LLM with the same reference produces safe code without the attack sequence and vulnerable code with it. The ablation study (Table VI) further supports causality."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The Limitations section explicitly bounds claims: 'HACKODE currently focuses solely on open-source LLMs,' 'validated on LLMs with parameter sizes ranging from 7b to 15b,' and 'may not be highly effective against certain LLMs that exhibit significant performance degradation following quantization.'"
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not discuss alternative explanations for why the attack works beyond the proposed gradient-based mechanism. No consideration of confounds such as whether the models would generate vulnerable code at some baseline rate without the attack, or whether results are driven by model memorization."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper directly measures what it claims: whether the target vulnerability code appears in the LLM's generated output. ASR is a direct measure of attack success, not a proxy for a broader claim. The real-world experiment (Section V-C) validates that lab results translate to practical scenarios."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model names and sizes are provided: Llama2-7b, Mistral-7b, CodeLlama-7b, and StarChat2-15b. For open-source models, these identify specific released model checkpoints."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Figure 2 shows example instructions (IN1, IN2), prompt templates (PT1-PT3), and queries (Q1-Q3), but these are samples from a larger set. The paper states they 'draw on instructions from several LLM-powered applications' and 'use LLMs to generate new instructions' without providing the complete set. Full prompts cannot be reconstructed from the paper alone."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Attack hyperparameters are stated (maxStep=500, k=3). However, LLM generation hyperparameters (temperature, top-p) are only described as 'All other hyperparameters were kept at their default values' without stating the actual values."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "The paper's own method (HACKODE) is a gradient-based attack optimization, not an agentic scaffolding system. The coding assistant being attacked uses a search-retrieve-generate pipeline, but the paper is evaluating an attack, not building scaffolding."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section V describes the data collection: 'utilized the StackOverflow API and the StackExchange library to collect the answers,' excluded problems solvable without external references, 'through manual validation' ensured code examples are present and referenceable. Target vulnerabilities for each problem are specified in Table I."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section VI Discussion includes a dedicated 'Limitations' paragraph discussing three specific limitations of the work."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The Limitations paragraph identifies specific threats: (1) only open-source LLMs tested, not closed-source, (2) only 7b-15b parameter range tested, (3) GPTQ quantization degrades some models' performance, reducing attack effectiveness."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Limitations section explicitly states what was not tested: closed-source LLMs, models smaller than 7b or larger than 15b, and acknowledges reduced effectiveness on heavily quantized models. Future work directions bound current claims."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "Source code is released but raw experimental data (generated attack sequences, model outputs, full response logs) is not mentioned as available for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section V describes data collection: StackOverflow API and StackExchange library used, most up-to-date answers collected, excluded problems solvable without domain knowledge, manual validation performed, 35 problems in 4 languages (Python, Java, C++, PHP)."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data source is StackOverflow problems (a standard public platform), not a recruited sample."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The paper describes the general pipeline (collect from StackOverflow → filter → manual validation → construct vulnerabilities) but does not report how many problems were initially collected, how many were filtered at each stage, or specific filtering criteria beyond 'require domain knowledge.'"
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding information, acknowledgments section, or grant numbers appear anywhere in the paper."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Central South University (Zeng, Shi) and Tsinghua University (Zhang, Zhou, Go, Jiang). The authors are not affiliated with the evaluated LLMs' developers."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding is disclosed, making it impossible to assess funder independence. Absence of disclosure is not equivalent to absence of funding."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement or financial interest declaration appears in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This paper tests an adversarial attack against LLMs, not model capability on benchmarks. The attack deliberately uses problems the models cannot solve, so training data contamination of benchmark problems is not the concern being evaluated."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "The paper evaluates attack effectiveness, not model knowledge. The attack works by manipulating inputs, not by exploiting memorized answers. Train/test overlap is structurally irrelevant to the attack evaluation."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "As with the other contamination items, the paper evaluates attack effectiveness rather than model knowledge. Contamination of the StackOverflow problems in training data would, if anything, make the attack harder (models might generate correct code on their own), which the paper already accounts for by selecting problems LLMs cannot solve."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference cost, wall-clock time, or per-attack cost is reported. Iteration counts are given (avg 179.17) but not translated to time or compute cost."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Hardware is listed (AMD EPYC 7763, 8 V100 GPUs) but total compute time, GPU hours, or training budget for attack generation is not quantified."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No random seed sensitivity analysis is reported. The attack generation involves random initialization and sampling, but results across different seeds are not shown."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of runs producing the main ASR results (Table II) is not explicitly stated. The transferability test uses 5 assembled inputs (Section V-B), and k=3 is stated for enhancement, but it's unclear how many independent attack generation attempts were made per problem."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "Key hyperparameters (maxStep=500, k=3) are stated but no justification for their selection, search budget, or sensitivity analysis is provided."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Fixed hyperparameters are used without justification for their selection. No validation set-based selection or reporting of alternative configurations."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own attack system (HACKODE) without acknowledging potential self-evaluation bias. No independent evaluation or discussion of this bias."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "While iteration counts are reported per attack (Table II), performance is not analyzed as a function of compute budget. No compute-matched comparisons between HACKODE and HACKODE−."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper constructs a benchmark of 35 StackOverflow problems but does not discuss whether this benchmark is representative of real-world attack scenarios. The real-world experiment (Section V-C) partially validates the benchmark, but construct validity is not explicitly analyzed."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "The paper explicitly addresses scaffold variation by testing attack transferability across different prompt templates, instructions, and queries (Section V-B). The attack is designed to be robust to these scaffold differences, and Table III quantifies transferability."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "While the paper selects 'most up-to-date' StackOverflow problems that LLMs cannot solve, there is no explicit discussion of temporal leakage — whether the models' training data could have included these problems or their solutions."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of feature leakage. The attack sequence itself could potentially leak information about the target vulnerability through the reference text, but this is not analyzed."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the 35 programming problems are independent or share structural similarities that could inflate attack success rates."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is applied. The paper verifies that LLMs cannot solve the problems directly (which is related but not a leakage detection method per se)."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "HACKODE achieves an average 84.29% attack success rate across four LLMs (Llama2-7b 77.14%, Mistral-7b 80.00%, CodeLlama-7b 94.29%, StarChat2-15b 85.71%).",
    372       "evidence": "Table II shows per-vulnerability and per-model ASR results across 35 programming problems with 5 vulnerability types.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Attack sequences transfer to randomly assembled inputs with an average 57.97% ASR, and 83.58% of data passes at least one of five transfer tests.",
    377       "evidence": "Table III shows transfer rates on 5 new assembled inputs per problem using unseen instructions, templates, and queries.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Attack sequences transfer to quantized LLMs with 48.07% ASR (GPTQ) and 53.45% ASR (BitsAndBytes).",
    382       "evidence": "Table IV shows ASR on 4-bit quantized versions of the four target LLMs.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "HACKODE achieves 75.92% ASR on a real-world coding assistant (ChatChat) powered by Mistral-7b (82.14%) and CodeLlama-7b (69.70%).",
    387       "evidence": "Table V shows per-vulnerability ASR on ChatChat application with StackOverflow API agent, tested on local pages to prevent dissemination.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "The progressive generation approach improves transfer ASR by 25.49 percentage points over non-progressive generation.",
    392       "evidence": "Table VI compares HACKODE (57.97% average) vs HACKODE− (32.48% average) on transferability tests.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Embedding attack sequences as code comments achieves 80.00% ASR vs 11.43% for variable renaming on Mistral-7b.",
    397       "evidence": "Table VII shows per-vulnerability comparison between comment insertion and variable renaming approaches.",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "Very small dataset",
    404       "detail": "Only 35 programming problems are used, with as few as 4 examples in some vulnerability categories (Incorrect Variable). This is too small for reliable ASR estimation, especially per-category breakdowns where a single problem flip changes the rate by 25 percentage points."
    405     },
    406     {
    407       "flag": "No uncertainty quantification",
    408       "detail": "All ASR numbers are reported as point estimates without confidence intervals, error bars, or variance across runs. With n=35, the 95% CI for an 84% ASR is roughly ±12 percentage points."
    409     },
    410     {
    411       "flag": "Only small open-source models tested",
    412       "detail": "All models are 7b-15b parameter open-source models. No testing on commercial models (GPT-4, Claude) or larger open-source models (70b+), limiting generalizability claims about 'LLM Coding Assistants' broadly."
    413     },
    414     {
    415       "flag": "Simulated real-world experiment",
    416       "detail": "The 'real-world experiment' (Section V-C) uses local web pages instead of actual StackOverflow, and only one coding assistant application (ChatChat). The ecological validity of the attack spreading through actual forums is not tested."
    417     },
    418     {
    419       "flag": "No baseline vulnerability rate",
    420       "detail": "The paper does not measure how often the target LLMs generate vulnerable code without the attack sequence. Without a baseline vulnerability rate, the 84.29% ASR cannot be interpreted as the attack's marginal contribution."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Evaluating large language models trained on code",
    426       "authors": ["Mark Chen", "Jerry Tworek"],
    427       "year": 2021,
    428       "arxiv_id": "2107.03374",
    429       "relevance": "Introduces Codex and HumanEval benchmark for evaluating LLM code generation capabilities."
    430     },
    431     {
    432       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    433       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan"],
    434       "year": 2022,
    435       "relevance": "Directly relevant: evaluates security vulnerabilities in Copilot-generated code across top 25 CWE categories."
    436     },
    437     {
    438       "title": "Do users write more insecure code with AI assistants?",
    439       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar"],
    440       "year": 2023,
    441       "relevance": "Studies how AI coding assistants impact security practices of users — directly relevant to LLM code security."
    442     },
    443     {
    444       "title": "Lost at C: A user study on the security implications of large language model code assistants",
    445       "authors": ["Gustavo Sandoval", "Hammond Pearce"],
    446       "year": 2023,
    447       "relevance": "User study examining security implications of LLM code assistants."
    448     },
    449     {
    450       "title": "Code Llama: Open foundation models for code",
    451       "authors": ["Baptiste Roziere", "Jonas Gehring"],
    452       "year": 2023,
    453       "arxiv_id": "2308.12950",
    454       "relevance": "Major open-source code LLM used as evaluation target in this paper."
    455     },
    456     {
    457       "title": "Universal and transferable adversarial attacks on aligned language models",
    458       "authors": ["Andy Zou", "Zifan Wang", "J Zico Kolter"],
    459       "year": 2023,
    460       "arxiv_id": "2307.15043",
    461       "relevance": "Key prior work on adversarial attacks against LLMs; HACKODE extends this to code generation domain."
    462     },
    463     {
    464       "title": "DeceptPrompt: Exploiting LLM-driven code generation via adversarial natural language instructions",
    465       "authors": ["Fangzhou Wu", "Xiaogeng Liu", "Chaowei Xiao"],
    466       "year": 2023,
    467       "arxiv_id": "2312.04730",
    468       "relevance": "Directly related attack on LLM code generation via adversarial prompts; HACKODE differs by targeting external references rather than user prompts."
    469     },
    470     {
    471       "title": "GitHub Copilot AI pair programmer: Asset or liability?",
    472       "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab"],
    473       "year": 2023,
    474       "relevance": "Evaluates Copilot's code generation quality including correctness and usability concerns."
    475     },
    476     {
    477       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    478       "authors": ["Jiawei Liu", "Chunqiu Steven Xia"],
    479       "year": 2024,
    480       "relevance": "Rigorous evaluation of LLM code correctness — relevant to understanding baseline code quality."
    481     },
    482     {
    483       "title": "AutoSafeCoder: A multi-agent framework for securing LLM code generation through static analysis and fuzz testing",
    484       "authors": ["Ana Nunez", "Nafis Tanveer Islam"],
    485       "year": 2024,
    486       "arxiv_id": "2409.10737",
    487       "relevance": "Proposes defenses for LLM code security through static analysis and fuzzing — complementary to HACKODE's attack perspective."
    488     },
    489     {
    490       "title": "SALLM: Security assessment of generated code",
    491       "authors": ["Mohammed Latif Siddiq", "Joanna Cecilia da Silva Santos"],
    492       "year": 2024,
    493       "relevance": "Benchmark for evaluating security of LLM-generated code."
    494     },
    495     {
    496       "title": "Competition-level code generation with AlphaCode",
    497       "authors": ["Yujia Li", "David Choi"],
    498       "year": 2022,
    499       "doi": "10.1126/science.abq1158",
    500       "relevance": "Demonstrates LLM capability for competitive programming, relevant to code generation capabilities assessment."
    501     }
    502   ]
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs