scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30061B)
      1 {
      2   "paper": {
      3     "title": "Large Language Model Unlearning for Source Code",
      4     "authors": [
      5       "Xue Jiang",
      6       "Yihong Dong",
      7       "Huangzhao Zhang",
      8       "Tangxinyu Wang",
      9       "Zheng Fang",
     10       "Yingwei Ma",
     11       "Rongyu Cao",
     12       "Binhua Li",
     13       "Zhi Jin",
     14       "Wenpin Jiao",
     15       "Yongbin Li",
     16       "Ge Li"
     17     ],
     18     "year": 2025,
     19     "venue": "AAAI 2026",
     20     "arxiv_id": "2506.17125",
     21     "doi": "10.48550/arXiv.2506.17125"
     22   },
     23   "scan_version": 2,
     24   "active_modules": ["experimental_rigor", "data_leakage"],
     25   "methodology_tags": ["benchmark-eval"],
     26   "key_findings": "PROD, a token-level distribution manipulation approach for LLM code unlearning, achieves superior trade-offs between forget quality and model utility compared to GA, DPO, NPO, and FLAT across three code unlearning tasks (copyrighted code, insecure code, deprecated APIs). Existing unlearning methods cause severe utility degradation on code due to the rigid syntax of programming languages, while PROD preserves code generation capability. PROD also demonstrates robustness against prefix injection adversarial attacks and generalizes across four ~7B-parameter code LLMs.",
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper states 'Our source code and data are available at https://github.com/jiangxxxue/PROD' in the Introduction section."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The paper claims data is available at the GitHub URL. The benchmark also uses publicly available datasets: Stack corpus, CyberSecEval, and VersiCode."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper mentions '4 NVIDIA A100 GPUs' and optimizer details (AdamW, weight decay 0.01) but provides no software environment specification (Python version, library versions, requirements.txt, or Dockerfile)."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but the paper itself contains no README-style reproduction guide or commands to run."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The main results in Table 1 are point estimates (PDR percentages) with no confidence intervals or ± notation. Figure 5 shows min/max ranges for adversarial attacks only, but the primary results lack uncertainty quantification."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No statistical significance tests are used. Claims like 'PROD achieves superior overall performance' and 'significantly outperforms' are based on comparing raw numbers without any formal tests (no p-values, t-tests, or bootstrap tests)."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper reports 'an average relative gain of 124% over the strongest competitor' for PDR. Table 1 provides absolute PDR values for all methods across tasks (e.g., PROD 41.8% vs GA 15.3% on copyright), giving baseline context for understanding effect magnitudes."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No justification for sample sizes: 100 copyrighted files, 1,916 insecure snippets, 3,449 deprecated API snippets, 5 random seeds, or 3 human evaluators. No power analysis."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The paper states 'unlearning training is conducted five times using different random seeds, and the final evaluation results are averaged across the five runs' but does not report standard deviation, variance, or any spread measure in Table 1 or elsewhere in the main results."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Four baselines are compared: Gradient Ascent (GA), Direct Preference Optimization (DPO), Negative Preference Optimization (NPO), and Forget Data Only Loss Adjustment (FLAT)."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Baselines are from 2023-2025: GA (Yao et al. 2024), DPO (Rafailov et al. 2023), NPO (Zhang et al. 2024), FLAT (Wang et al. 2025). The paper describes them as spanning 'current LLM unlearning paradigms.'"
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Figure 6 presents ablation studies: (a) alternative loss functions (KL, JS vs cross-entropy), (b) hyperparameter p for noise elimination (0.2, 0.4, 0.8, 0.9, 1.0), and (c) hyperparameter α for forget strength (-0.125 to 1)."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Multiple metrics are used: forget quality (1-BLEU for copyright, combined 1-BLEU + security pass rate for insecurity, exact-match accuracy for deprecated API), model utility (HumanEval functional correctness), and the composite PDR metric."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "A perceptual quality evaluation is conducted with 'three independent volunteer evaluators (each with two years or more of software development experience)' on 20 samples per task, plus GPT-4 evaluation on all samples. Results reported in Table 2."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Forget quality is measured on held-out portions of forget data (e.g., second half of copyrighted files). Model utility is measured on HumanEval, which is separate from the training/unlearning data. Deprecated API task uses a temporal split with deprecation boundary."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Table 1 breaks down PDR per task (copyright, insecurity, deprecation). Figure 3 shows per-task forget quality vs model utility curves. Figure 4 shows per-model results across 4 LLMs."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper discusses failure modes of baselines in detail (mute refusal, token collapse, syntactic incoherence in Figure 1) but does not show or discuss failure cases of PROD itself. No qualitative examples where PROD fails are provided."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The ablation study reports configurations that don't work well: KL and JS divergence underperform cross-entropy (Figure 6a), no noise elimination yields worst results (Figure 6b), and certain α values degrade performance (Figure 6c). The paper also notes 'short forget sequences yf often require more training steps since the supervisory signal is weak.'"
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Abstract claims of 'superior overall performance' are supported by Table 1 (highest PDR on all tasks). 'Broad applicability' is supported by Figure 4 (4 LLMs). 'Superior robustness against adversarial attacks' is supported by Figure 5."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper makes causal claims through ablation studies (removing noise elimination, changing loss functions, varying hyperparameters) which are controlled single-variable manipulations. The claim that PROD's token-level manipulation preserves utility is supported by the mechanism design and empirical ablations."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title claims 'Large Language Model Unlearning for Source Code' broadly, but all tested models are approximately 7B parameters (CodeLlama-7B, Qwen2.5-Coder-7B, Deepseek-coder-6.7B, Starcoder-7B). No larger or smaller models are tested. No discussion of whether results generalize beyond ~7B models or beyond the three specific tasks."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No discussion of alternative explanations for PROD's superior performance. For example, the distribution sculpting could benefit from the specific memorization training setup, or the improvements could be task/scale-specific. No confound analysis is provided."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper's claims match the granularity of its measurements. For copyright, they explicitly justify BLEU as a proxy: 'the law of copyright protects expression instead of ideas, textual similarity is employed as an indicator of potential copyright infringement.' Forget quality and model utility are directly measured, not framed as broader constructs."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Specific model names with sizes are given: CodeLlama-7B, Qwen2.5-Coder-7B, Deepseek-coder-6.7B, Starcoder-7B. These are specific open-source models identifiable by name and size."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The GPT-4 evaluation prompt used for the perceptual quality study is not provided. For the main evaluation, code prefixes serve as inputs, but only one example is shown in Figure 1. The actual evaluation inputs (100 copyrighted files, CyberSecEval prompts) are not included in the paper."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Comprehensive hyperparameters reported: learning rate grid {1e-4 to 1e-6}, batch size 32, 10 epochs, AdamW with weight decay 0.01, max sequence length 1024, greedy decoding (zero temperature), PROD p=0.8, α=0, NPO/DPO β=0.1."
    166       },
    167       "scaffolding_described": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No agentic scaffolding is used. PROD is a direct model fine-tuning/post-training method."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Preprocessing is documented for each task: copyrighted code (random draw from Stack, filtering test files/toy examples/templates), insecure code (from CyberSecEval, 1,916 snippets with 50 CWE types), deprecated API (three steps: version filtration, file filtration, temporal split, resulting in 252 packages and 3,449 snippets). Memorization training uses 1:9 forget-to-pretrain ratio."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No dedicated limitations section exists. The paper has only a Conclusion section with no discussion of limitations or threats to validity."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No threats to validity are discussed anywhere in the paper. No mention of specific weaknesses of the evaluation approach."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, e.g., whether results apply to models of different sizes, other programming languages, or real-world memorization (vs. the artificial memorization training setup)."
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The paper states 'Our source code and data are available at https://github.com/jiangxxxue/PROD.' The underlying benchmark datasets (Stack corpus, CyberSecEval, VersiCode) are publicly available."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Data collection is described for each task: 100 files randomly drawn from Stack corpus with filtering criteria, 1,916 CyberSecEval snippets covering 50 CWE types across 8 languages, and VersiCode-derived deprecated API data with explicit preprocessing pipeline."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "For the human evaluation: 'three independent volunteer evaluators (each with two years or more of software development experience)' — no description of how they were recruited, where they are from, or whether the recruitment could introduce bias."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The pipeline is documented: for copyrighted code, Stack → random draw → filter non-expressive code → 100 files. For deprecated API, VersiCode → version filtration → file filtration (≥3 snippets) → temporal split → 252 packages, 3,449 snippets. Memorization training pipeline (1:9 ratio mixing) is also described."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Acknowledgments section lists: 'National Key R&D Program under Grant No. 2023YFB4503801, the National Natural Science Foundation of China under Grant No. 62192733, 62192730, 62192731, and the Major Program (JD) of Hubei Province (No.2023BAA024).'"
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Author affiliations clearly listed: Peking University, Tongyi Lab (Alibaba Group), and Verdent AI. The note 'Work done during Xue Jiang and Yihong Dong's internship at Tongyi Lab' is also disclosed."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Funding is from Chinese government research programs (NSFC, National Key R&D Program, Hubei Province program), which have no financial stake in PROD outperforming baselines."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is provided. Some authors are from Alibaba's Tongyi Lab, which has commercial interests in code LLMs, but no declaration is made."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No training data cutoff dates are stated for any of the four models (CodeLlama-7B, Qwen2.5-Coder-7B, Deepseek-coder-6.7B, Starcoder-7B)."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of whether HumanEval (used for model utility) appeared in any model's pre-training data. For the forget tasks, memorization is intentionally induced through training, but overlap between the Stack-sourced forget data and model pre-training is not discussed."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "HumanEval was published in 2021 and all four models were trained after 2021, meaning they likely encountered HumanEval during pre-training. This contamination risk is not addressed."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "No pre-registration mentioned for the human perceptual quality evaluation."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "No IRB or ethics board approval is mentioned for the study involving 3 human evaluators."
    266       },
    267       "demographics_reported": {
    268         "applies": true,
    269         "answer": false,
    270         "justification": "Only minimal information: 'three independent volunteer evaluators (each with two years or more of software development experience).' No other demographics (education, role, programming languages known, etc.)."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The only criterion mentioned is 'two years or more of software development experience.' No formal inclusion/exclusion criteria or screening process described."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "Not an experimental study with condition assignment — all evaluators assessed all samples in a comparative manner."
    281       },
    282       "blinding_described": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "Explicitly stated: 'Each human evaluator assesses all samples without knowing which approach produced each sample.'"
    286       },
    287       "attrition_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No information about whether all 3 evaluators completed all evaluations, or any discussion of evaluator attrition or incomplete assessments."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No inference cost, latency, or wall-clock time is reported for PROD or any baseline. The distribution sculpting step adds overhead that is not quantified."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "Hardware is mentioned ('4 NVIDIA A100 GPUs') but total GPU hours, training time, or total compute budget is not stated for any experiment."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The paper states 'conducted five times using different random seeds, and the final evaluation results are averaged across the five runs' but does not report seed-to-seed variation or sensitivity analysis. Only averaged results are presented."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "Explicitly stated: 'The unlearning training is conducted five times using different random seeds, and the final evaluation results are averaged across the five runs.'"
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "The search procedure is described: 'we select the learning rate via grid search from the set {1e-4, 5e-5, 1e-5, 5e-6, 1e-6}, optimizing for the forget quality metric on forget data.' Search method (grid), search space (5 values), and selection criterion are stated."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Selection criterion is stated: learning rate selected by 'optimizing for the forget quality metric on forget data' via grid search. PROD hyperparameters (p=0.8, α=0) are justified by ablation results in Figure 6."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": false,
    328         "answer": false,
    329         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors implement all baselines and compare against their own system with no acknowledgment of author-evaluation bias. No independent evaluation or discussion of whether baseline reimplementations may underperform original implementations."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "No comparison of compute budgets between PROD and baselines. PROD requires an additional distribution collection step that may increase compute, but this cost difference is not analyzed."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "HumanEval is used for model utility measurement without discussing whether it adequately captures general code generation capability. The paper briefly justifies BLEU for copyright ('the law protects expression') but does not discuss construct validity of its benchmarks more broadly."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": false,
    348         "answer": false,
    349         "justification": "No scaffolding is used. PROD is a direct model fine-tuning method with standard autocompletion inference."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of temporal leakage. The copyrighted code from Stack corpus and HumanEval for utility evaluation may have been in the models' pre-training data. Training cutoff dates are not stated."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "For the copyright task, the model is prompted with the first half of a file to see if it reproduces the second half. No discussion of whether this context provides sufficient information for correct completion regardless of memorization."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of whether the Stack corpus files used as forget data overlap with the models' pre-training data (Stack is commonly used for training code LLMs). No verification of independence between forget set and pre-training data."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No concrete leakage detection or prevention method is used (no canary strings, membership inference tests, n-gram overlap analysis, or decontamination)."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "PROD achieves the best PDR score across all three unlearning tasks, with an average relative gain of 124% over the strongest competitor.",
    378       "evidence": "Table 1 shows PDR scores: PROD achieves 41.8% (copyright), 70.4% (insecurity), 58.0% (deprecation) vs best baselines of 15.3%, 24.5%, 51.6% respectively.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Existing unlearning methods (GA, DPO, NPO, FLAT) cause severe model utility degradation on code generation, making models practically unusable.",
    383       "evidence": "Figure 3 shows all baselines' model utility drops to near-zero when achieving ~90% forget quality. Figure 1 demonstrates three failure modes: mute refusal, token collapse, and syntactic incoherence.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "PROD generalizes across four distinct code LLMs (CodeLlama-7B, Qwen2.5-Coder-7B, Deepseek-coder-6.7B, Starcoder-7B).",
    388       "evidence": "Figure 4 shows PROD achieves near-perfect forget quality while maintaining utility above baselines' best on all four LLMs on the copyrighted code task.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "PROD is significantly more robust under prefix injection adversarial attack than all baselines.",
    393       "evidence": "Figure 5 shows PROD keeps similarity to copyrighted code below 0.05, while baselines exceed 0.3. PROD also shows smallest variance.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Human evaluators prefer PROD outputs over baselines with >70% win rate.",
    398       "evidence": "Table 2 shows win rates: 81% vs GA, 92% vs DPO, 76% vs NPO, 87% vs FLAT (human evaluation). GPT-4 evaluation confirms similar patterns.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "Tiny human evaluation sample",
    405       "detail": "Only 3 human evaluators assessed 20 samples per task. This is too few evaluators to establish reliability of human judgments, and no inter-rater agreement statistics are reported."
    406     },
    407     {
    408       "flag": "No limitations section",
    409       "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries. This is a significant omission for a methods paper making broad claims."
    410     },
    411     {
    412       "flag": "All models are ~7B parameters",
    413       "detail": "All four tested models are approximately 7B parameters. No testing on larger or smaller models, yet the title and claims generalize to 'Large Language Models' broadly."
    414     },
    415     {
    416       "flag": "Artificial memorization setup",
    417       "detail": "For copyright and insecurity tasks, models are first trained to memorize the forget data before unlearning is applied. This may not reflect realistic scenarios where memorization occurs naturally during pre-training on web-scale data, potentially inflating PROD's advantage."
    418     },
    419     {
    420       "flag": "No statistical significance tests",
    421       "detail": "Comparative claims like 'significantly outperforms' and 'superior overall performance' are made without any statistical significance testing, despite running 5 seeds per experiment."
    422     },
    423     {
    424       "flag": "HumanEval contamination unaddressed",
    425       "detail": "HumanEval (published 2021) is used for model utility measurement, but all four models were trained after 2021 and may have seen HumanEval during pre-training. This confounds the utility measurement."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Large Language Model Unlearning",
    431       "authors": ["Yuanshun Yao", "Xiaojun Xu", "Yang Liu"],
    432       "year": 2024,
    433       "arxiv_id": "2310.10683",
    434       "relevance": "Core LLM unlearning method (GA baseline) published at NeurIPS 2024; foundational to the field of LLM unlearning."
    435     },
    436     {
    437       "title": "Rethinking machine unlearning for large language models",
    438       "authors": ["Sijia Liu", "Yuanshun Yao", "Jinghan Jia"],
    439       "year": 2025,
    440       "relevance": "Nature Machine Intelligence review of LLM unlearning challenges and methods; establishes that traditional approaches are impractical for LLMs."
    441     },
    442     {
    443       "title": "Instruction tuning for secure code generation",
    444       "authors": ["Jingxuan He", "Mark Vero", "Gabriela Krasnopolska", "Martin Vechev"],
    445       "year": 2024,
    446       "relevance": "SafeCoder approach for security-focused code LLM fine-tuning; directly relevant to secure code generation evaluation."
    447     },
    448     {
    449       "title": "Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models",
    450       "authors": ["Yihong Dong", "Xue Jiang", "Huanyu Liu"],
    451       "year": 2024,
    452       "relevance": "Studies data contamination in LLM evaluation, directly relevant to benchmark integrity and contamination concerns."
    453     },
    454     {
    455       "title": "Code Llama: Open Foundation Models for Code",
    456       "authors": ["Baptiste Rozière"],
    457       "year": 2023,
    458       "arxiv_id": "2308.12950",
    459       "relevance": "Primary target model for evaluation; widely-used open-source code LLM."
    460     },
    461     {
    462       "title": "LiCoEval: Evaluating LLMs on License Compliance in Code Generation",
    463       "authors": ["Weiwei Xu", "Kai Gao", "Hao He", "Minghui Zhou"],
    464       "year": 2025,
    465       "relevance": "Evaluates copyright/license compliance in LLM-generated code; directly relevant to the copyrighted code unlearning task."
    466     },
    467     {
    468       "title": "How and Why LLMs Use Deprecated APIs in Code Completion? An Empirical Study",
    469       "authors": ["Chong Wang", "Kaifeng Huang"],
    470       "year": 2024,
    471       "arxiv_id": "2406.09834",
    472       "relevance": "Studies deprecated API usage in LLM code completion; provides context for the deprecated API unlearning task."
    473     },
    474     {
    475       "title": "VersiCode: Towards version-controllable code generation",
    476       "authors": ["Tongtong Wu", "Weigang Wu"],
    477       "year": 2024,
    478       "arxiv_id": "2406.07411",
    479       "relevance": "Source dataset for the deprecated API unlearning benchmark; addresses version-specific code generation."
    480     },
    481     {
    482       "title": "Negative Preference Optimization: From Catastrophic Collapse to Effective Unlearning",
    483       "authors": ["Ruiqi Zhang", "Licong Lin", "Yu Bai", "Song Mei"],
    484       "year": 2024,
    485       "relevance": "NPO baseline method that addresses GA's instability; key comparison point for LLM unlearning approaches."
    486     },
    487     {
    488       "title": "LLM Unlearning via Loss Adjustment with Only Forget Data",
    489       "authors": ["Yaxuan Wang", "Jiaheng Wei"],
    490       "year": 2025,
    491       "relevance": "FLAT baseline method using f-divergence for unlearning; addresses the challenge of using only forget data."
    492     },
    493     {
    494       "title": "A Survey on Code Generation with LLM-based Agents",
    495       "authors": ["Yihong Dong", "Xue Jiang"],
    496       "year": 2025,
    497       "arxiv_id": "2508.00083",
    498       "relevance": "Survey of LLM-based code generation agents; provides broader context for code generation capabilities and challenges."
    499     },
    500     {
    501       "title": "Security Weaknesses of Copilot-Generated Code in GitHub Projects: An Empirical Study",
    502       "authors": ["Yujia Fu", "Peng Liang"],
    503       "year": 2025,
    504       "relevance": "Empirical study of security vulnerabilities in Copilot-generated code; relevant to the motivation for insecure code unlearning."
    505     }
    506   ]
    507 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs