scan.json (31315B)
1 { 2 "paper": { 3 "title": "Memorize or Generalize? Evaluating LLM Code Generation with Code Rewriting", 4 "authors": [ 5 "Lizhe Zhang", 6 "Wentao Chen", 7 "Li Zhong", 8 "Letian Peng", 9 "Zilong Wang", 10 "Jingbo Shang" 11 ], 12 "year": 2025, 13 "venue": "arXiv preprint", 14 "arxiv_id": "2503.02296", 15 "doi": "10.48550/arXiv.2503.02296" 16 }, 17 "scan_version": 2, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "The paper proposes code rewriting and the Memorization Risk Index (MRI) to distinguish harmful memorization from benign code reuse in LLM code generation. Key findings: (1) harmful memorization does not increase with model scale and often decreases, primarily by reducing accuracy drops under semantic shifts rather than reducing surface similarity; (2) memorization persists on harder tasks (BIGCODEBENCH) even at scale; (3) SFT improves accuracy but substantially inflates MRI, while PPO achieves a better risk-accuracy trade-off with lower memorization. Evaluated on MBPP+ and BIGCODEBENCH across Qwen-2.5 and Llama-3.1/4 model families.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "The reproducibility statement says 'all evolved tasks and the prompts used during generation will be released publicly upon publication' — this is a promise of future release, not an actual release. No repository URL is provided." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": false, 31 "justification": "While the base benchmarks MBPP+ and BIGCODEBENCH are public, the paper's primary contribution — the code rewriting, mutation, and paraphrase variant datasets — are not yet released. 
The reproducibility statement promises release 'upon publication.'" 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "Section 4.2 mentions '4 NVIDIA A100 GPUs (80GB), using PyTorch and HuggingFace Transformers' but no specific library versions, requirements.txt, or Dockerfile are provided." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are provided. The reproducibility statement acknowledges details are in appendices but no runnable scripts or commands are given." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "Table 2's mean ± SD values are computed across different models (footnote 3: 'unweighted arithmetic average computed per column across models'), not across experimental runs. No confidence intervals or error bars from repeated runs are reported for individual model evaluations." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper makes numerous comparative claims (e.g., 'MRI falls from 0.0722 at 0.5B to 0.0113 at 14B') based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, etc.) are used anywhere." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "The paper consistently reports absolute values with baselines (e.g., 'accuracy was boosted from 0.3158 → 0.3772' for SFT, 'MRI 0.0799 → 0.0795 (PPO) vs 0.1747 (SFT)'). Readers can assess magnitude from the reported numbers with baseline context throughout." 
59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The paper acknowledges 'Due to the small size of MBPP+ test split (n = 78), estimation on this split may be imprecise and directional' (Section 4.1), but provides no formal justification for sample sizes or power analysis." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "The ± SD values in Table 2 are cross-model variability, not run-to-run variance (footnote 3). Section 5.1.1's 'mean ± SD over 6 seeds' refers to 6 model sizes in the series, not random seeds. No multi-run variance for individual model evaluations is reported." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "The paper compares multiple model families (Qwen-2.5, Llama-3.1, Llama-4), includes mutation and paraphrase as reference baselines for robustness comparison (Section 3.2), and compares SFT vs PPO vs base models (Section 5.2)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Models evaluated include Qwen-2.5 (2024), Llama-3.1 (2024), and Llama-4 (2025) — all very recent. GPT-5 (2025) is used for task generation." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper decomposes MRI into Sim(Trew) and RADrew and analyzes each component independently. It compares three perturbation types (rewriting, mutation, paraphrase) and studies how each component behaves under different conditions (scale, fine-tuning strategy)." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Multiple metrics are used: MRI, Sim(Trew), RADrew, Acc(Tori), RADmut, RADpar. Table 2 reports all of these across models and benchmarks."
91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "Two programmers manually reviewed 10% of generated perturbation tasks for quality (Section 3.1), but this validates the dataset, not the LLM outputs. The schema specifies 'manual classification of the benchmark or dataset itself does not count.' All LLM evaluation is automated (pass/fail on test suites)." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 4.1: 'We use 4:1 train/test split for fine-tuning. For models that undergo SFT and PPO, we train on the training split and evaluate on the test split.' For instruction-only models, the complete benchmark is used (no training on it)." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results are broken down by benchmark (MBPP+ vs BIGCODEBENCH), model family (Qwen vs Llama), model size (0.5B to 32B/70B), perturbation type (rewriting, mutation, paraphrase), and training strategy (base, SFT, PPO)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Appendix D provides 5 regressed task examples per dataset (tasks that PASSED originally but FAILED under code rewriting) from Qwen2.5-Coder-32B-Instruct. The paper also discusses where memorization persists (harder tasks, BIGCODEBENCH) and where scaling fails to help." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports that SFT substantially increases memorization (MRI 0.0799 → 0.1747), that scaling is non-monotonic on BIGCODEBENCH (MRI drops then rises at 32B for Qwen-Instruct), and that memorization persists on harder tasks despite scaling." 
116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "All three abstract claims are supported: (1) memorization doesn't increase with scale — Figure 2, Table 1; (2) SFT improves accuracy but introduces memorization — Figure 3; (3) PPO achieves better trade-off — Figure 3 shows lower MRI with maintained accuracy." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The causal claims about SFT and PPO effects are supported by controlled experiments where the same base model (Qwen-2.5-7B) undergoes different training strategies. The paper uses ablation-style comparisons (base vs SFT vs PPO) with controlled single-variable manipulation." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title 'Evaluating LLM Code Generation' is broad. The abstract claims 'memorization does not increase with larger models' as a general finding, but the paper tests only 2 model families (Qwen, Llama) on 2 Python benchmarks. The scaling claim is not bounded to these specific families or Python tasks." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper explicitly distinguishes benign reuse from harmful memorization (Section 2.2), discusses that high similarity alone is not harmful if the model solves the perturbed task correctly, and considers task difficulty as a factor affecting results (MBPP+ vs BIGCODEBENCH). Section 5.1 discusses how surface-form reuse without failure does not constitute harmful memorization." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper carefully defines what MRI measures (similarity × accuracy drop) and explicitly distinguishes this from what it claims to capture (harmful memorization). 
Section 1 defines harmful memorization behaviorally and Section 3.1.1 specifies exactly how MRI operationalizes this definition." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Specific model checkpoints are named: 'Qwen-2.5-0.5B-Instruct' through '32B', 'Qwen-2.5-Coder-0.5B-Instruct' through '32B', 'Llama-3.1-8B-Instruct', 'Llama-3.1-70B-Instruct', 'Llama-4-Scout-17B-Instruct (16E)', 'Llama-4-Maverick-17B-Instruct (128E)'. GPT-5 version 'gpt-5-2025-08-07' specified in Appendix G." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Full prompts for code rewriting, mutation, paraphrase, and LLM judging are provided in Appendix B. The fine-tuning prompt template with exact text is provided in Appendix F (instruction_prefix, response_prefix, chat template)." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "Training hyperparameters are detailed in Appendix F (Tables 3-6). GPT-5 settings in Appendix G state 'temperature: default; top-p: default', which does not specify actual values. Crucially, inference temperature/sampling settings for the evaluated Qwen and Llama models are not reported anywhere." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. Models are evaluated directly on code generation tasks without multi-step workflows, tools, or agent frameworks." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "The perturbation pipeline is documented: code rewriting process (Section 3.1), mutation noise budget formula (Appendix B.3), LLM judge validation (Section 3.1, Appendix B.2), manual review of 10% by two programmers, test case validation for rewritten tasks (Appendix B end). Train/test split (4:1) documented in Section 4.1."
170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": false, 176 "justification": "There is no dedicated limitations section. Section 6 'Conclusion and Future Works' mentions next steps (mitigation approach, evaluation transferability) but does not discuss limitations of the current work." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": false, 181 "justification": "No threats to validity are discussed. The paper acknowledges the small MBPP+ test split (n=78) may be 'imprecise and directional' (Section 4.1) but does not systematically discuss threats to validity such as GPT-5 task generation quality, benchmark representativeness, or whether code rewriting preserves difficulty." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "The conclusion mentions 'while our current evaluation metrics are tailored for code generation' as a hint at scope, but no explicit statements about what the results do NOT show (e.g., only Python tasks, only two model families, only two benchmarks)." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "Neither the evolved task datasets nor the model outputs are available. The reproducibility statement promises future release 'upon publication.'" 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "The data generation process is described in detail: code rewriting pipeline (Section 3.1, Appendix B.1), mutation formula (Appendix B.3), paraphrasing process (Appendix B.4), GPT-5 configuration (Appendix G), validation pipeline including LLM judge and manual review." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. Data sources are standard public benchmarks (MBPP+, BIGCODEBENCH)." 
204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The pipeline is documented: original task → GPT-5 perturbation (with specific prompts in Appendix B) → LLM-as-judge validation (Appendix B.2) → manual review of 10% by two programmers → test case validation → 4:1 train/test split. The generation cost (~$450 per round) is stated." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding or acknowledgments section is present in the paper. There is no disclosure of any funding sources." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "All authors are listed as affiliated with University of California, San Diego. No conflicts with the evaluated models (Qwen, Llama, GPT-5 are from other organizations)." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure does not confirm the absence of funding." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests statement is present in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "Training data cutoff dates are not stated for any of the evaluated models (Qwen-2.5, Llama-3.1, Llama-4), despite this being critical since the benchmarks MBPP+ and BIGCODEBENCH were published before these models were trained." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": true, 242 "justification": "The entire paper is about detecting and measuring memorization of benchmark solutions by LLMs. Section 2.2 discusses prior work on memorization detection. 
The code rewriting method is explicitly designed to test whether models recall training solutions rather than generalize." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": true, 247 "justification": "The paper's central contribution addresses benchmark contamination. Section 1 discusses that 'existing evaluations largely measure memorization via surface or structural overlap.' The code rewriting methodology is designed to reveal harmful memorization when models have seen benchmark solutions during training." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants. The study evaluates LLMs on code generation benchmarks." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in the study." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants. Two programmers reviewed data quality but are not study participants." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in the study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in the study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in the study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in the study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference cost, latency, or per-example cost is reported for the model evaluations. 
Appendix G reports ~$450 for generating one round of perturbations via GPT-5 but this covers data generation, not the evaluation itself." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": true, 296 "justification": "Section 4.2: 'a total computational budget of approximately 40 GPU hours' on '4 NVIDIA A100 GPUs (80GB)'. Appendix G: '~450 USD' for task generation per round." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No results across multiple random seeds are reported. Section 5.1.1's 'mean ± SD over 6 seeds' actually computes mean and SD across 6 model sizes in a series (0.5B through 32B), not random seeds — this is misleading terminology." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of evaluation runs per model is not stated. There is no indication of whether results are from single runs or averaged across multiple runs." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "Appendix F.1.1 states the learning rate was tuned 'with manual adjustments between 5×10⁻⁶ and 1×10⁻⁵ depending on model performance' but no search budget (configurations tried, compute spent on tuning) is reported." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "Checkpoint selection is clearly described: SFT uses 'the epoch immediately prior to overfitting' (Section 3.3.1, Table 5) based on validation loss. PPO uses 'the checkpoint with the highest test reward' (Section 3.3.2, Table 6). Both criteria are principled and stated in advance." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical significance tests are performed, so there are no p-values to correct for multiple comparisons." 
324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The paper proposes the MRI metric and evaluates it without acknowledging author-evaluation bias. No comparison against alternative memorization metrics (beyond surface similarity) is conducted, and no independent evaluation of MRI's validity is provided." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "Performance is compared across model sizes (0.5B to 70B) without normalizing for compute budget. Footnote 2 notes Llama-4 MoE models have 'similar per-token activated compute' but no systematic compute-performance analysis is provided." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": true, 338 "justification": "The paper explicitly discusses what MBPP+ (introductory) vs BIGCODEBENCH (harder) measure and how difficulty level affects memorization detection. Section 5.1 analyzes how benchmark difficulty interacts with MRI, and the paper's core contribution questions whether standard benchmarks actually measure generalization vs memorization." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is involved in the evaluation. Models are evaluated directly on code generation tasks." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": true, 350 "justification": "The paper's entire framework addresses temporal leakage: code rewriting creates novel perturbations of existing benchmarks to test whether models are recalling training-era solutions. Section 2.2 discusses that models may have 'seen near-identical problems during pre-training.'" 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "Feature leakage (whether the evaluation setup leaks answer information through context) is not discussed. 
The paper focuses on memorization of solutions but does not analyze whether the evaluation framework itself might provide hints." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether the code rewriting variants are sufficiently independent from training data. The perturbations preserve surface syntax (by design), which could overlap with training data patterns." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": true, 365 "justification": "MRI itself is a leakage detection method: it combines similarity to original solutions with accuracy drop under semantic perturbation to identify when models are recalling training data rather than generalizing. This is a concrete, applied method (Section 3.1.1)." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "Memorization does not increase with larger models and in many cases alleviates as they scale.", 372 "evidence": "Figure 2 shows MRI declining with scale for Qwen-2.5 Instruct on MBPP+ (0.0722 at 0.5B → 0.0000 at 32B). Table 1 shows similar trends for Llama families. On BIGCODEBENCH, the trend is milder and sometimes non-monotonic (Section 5.1).", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "SFT improves accuracy but substantially amplifies memorization.", 377 "evidence": "Figure 3 and Section 5.2: Qwen-2.5-7B-SFT accuracy boosted from 0.3158 → 0.3772 on BIGCODEBENCH, but MRI increased from 0.0799 → 0.1747. Similar pattern for Coder variant (0.1392 → 0.1921).", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "PPO achieves a more balanced trade-off between memorization and generalization than SFT.", 382 "evidence": "Section 5.2: PPO on Qwen-2.5-7B yields accuracy 0.3509 (vs 0.3772 SFT) with MRI 0.0795 (vs 0.1747 SFT). 
Similar for Coder: accuracy 0.3728 vs 0.4079, MRI 0.1336 vs 0.1921.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Memorization declines rapidly on simpler tasks but persists on more difficult ones.", 387 "evidence": "Section 5.1.1: On MBPP+ (simpler), MRI reaches effectively zero at 32B scale. On BIGCODEBENCH (harder), MRI remains 0.1178 even for Qwen-2.5-Coder-32B-Instruct. RADrew on BIGCODEBENCH stays high (0.37-0.66) while near zero on MBPP+ for large models.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Coder models encourage code reuse but do not substantially increase harmful memorization.", 392 "evidence": "Section 5.1.1: Coder models have higher Sim(Trew) (0.3237 ± 0.0086 vs 0.2670 ± 0.0268 on BIGCODEBENCH) but comparable RADrew, resulting in only slight MRI increase (0.1367 ± 0.0224 vs 0.1142 ± 0.0247).", 393 "supported": "moderate" 394 } 395 ], 396 "red_flags": [ 397 { 398 "flag": "No run-to-run variance reported", 399 "detail": "All results appear to be single-run evaluations. The '± SD' values in the paper are computed across model sizes or across models, not across experimental runs. There is no way to assess the stability of individual model evaluations." 400 }, 401 { 402 "flag": "Misleading 'seeds' terminology", 403 "detail": "Section 5.1.1 refers to 'mean ± SD over 6 seeds' when computing statistics across the 6 model sizes (0.5B through 32B) in a model series. This language is misleading as 'seeds' typically refers to random seeds in ML research." 404 }, 405 { 406 "flag": "No statistical tests for comparative claims", 407 "detail": "All claims about trends (scaling reduces MRI, SFT increases memorization, PPO is better than SFT) are based on comparing point estimates without any formal statistical testing." 408 }, 409 { 410 "flag": "GPT-5 dependency for task generation", 411 "detail": "All three perturbation types (code rewriting, mutation, paraphrase) are generated by GPT-5, and the quality judging is also performed by GPT-5.
Any systematic biases in GPT-5's generation could affect the evaluation. The paper does not analyze sensitivity to the generating model." 412 }, 413 { 414 "flag": "No limitations section", 415 "detail": "The paper lacks a dedicated limitations or threats-to-validity section despite several potential threats: narrow model families tested (only Qwen and Llama), Python-only evaluation, reliance on GPT-5 for data generation, and small test split for MBPP+ (n=78)." 416 }, 417 { 418 "flag": "Limited model diversity for broad claims", 419 "detail": "The scaling analysis covers only Qwen-2.5 and Llama families. Claims about memorization trends with model scale are presented broadly but tested on a limited set of architectures." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "Evaluating large language models trained on code", 425 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 426 "year": 2021, 427 "arxiv_id": "2107.03374", 428 "relevance": "Introduces HumanEval and Codex, foundational benchmark and model for LLM code generation evaluation." 429 }, 430 { 431 "title": "Is your code generated by chatGPT really correct? Rigorous evaluation of large language models for code generation", 432 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 433 "year": 2023, 434 "relevance": "Introduces EvalPlus/MBPP+, one of the two primary benchmarks used in this paper for evaluating code generation." 435 }, 436 { 437 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 438 "authors": ["Terry Yue Zhuo"], 439 "year": 2024, 440 "relevance": "Introduces BIGCODEBENCH, the harder benchmark used to study memorization persistence on complex tasks." 
441 }, 442 { 443 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 444 "authors": ["Naman Jain", "King Han", "Alex Gu"], 445 "year": 2024, 446 "arxiv_id": "2403.07974", 447 "relevance": "Contamination-free benchmark using temporally fresh problems, directly relevant to the memorization vs generalization debate." 448 }, 449 { 450 "title": "Unveiling memorization in code models", 451 "authors": ["Zhou Yang", "Zhipeng Zhao", "Chenyu Wang"], 452 "year": 2024, 453 "doi": "10.1145/3597503.3639074", 454 "relevance": "Studies general memorization in code models, finding it increases with model size — a claim this paper challenges for harmful memorization." 455 }, 456 { 457 "title": "Traces of memorisation in large language models for code", 458 "authors": ["Ali Al-Kaswan", "Maliheh Izadi", "Arie van Deursen"], 459 "year": 2024, 460 "doi": "10.1145/3597503.3639133", 461 "relevance": "Measures general memorization in code LLMs via regurgitation, finding it increases with scale — contrasted with this paper's findings on harmful memorization." 462 }, 463 { 464 "title": "Quantifying contamination in evaluating code generation capabilities of language models", 465 "authors": ["Martin Riddell", "Ansong Ni", "Arman Cohan"], 466 "year": 2024, 467 "arxiv_id": "2403.04811", 468 "relevance": "Quantifies benchmark contamination in code generation evaluation, directly relevant to the contamination problem this paper addresses." 469 }, 470 { 471 "title": "ReCode: Robustness evaluation of code generation models", 472 "authors": ["Shiqi Wang", "Zheng Li", "Haifeng Qian"], 473 "year": 2022, 474 "arxiv_id": "2212.10264", 475 "relevance": "Proposes robustness evaluation through semantic-preserving perturbations — the mutation and paraphrase baselines in this paper are adapted from ReCode."
476 }, 477 { 478 "title": "The pitfalls of memorization: When memorization hurts generalization", 479 "authors": ["Reza Bayat", "Mohammad Pezeshki", "Elvis Dohmatob"], 480 "year": 2024, 481 "arxiv_id": "2412.07684", 482 "relevance": "Theoretical framework for when memorization hurts generalization, directly informing this paper's definition of harmful memorization." 483 }, 484 { 485 "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models", 486 "authors": ["Yihong Dong", "Xue Jiang", "Huanyu Liu"], 487 "year": 2024, 488 "arxiv_id": "2402.15938", 489 "relevance": "Studies data contamination and its effect on trustworthy LLM evaluation, measuring memorization via surface/structural overlap." 490 }, 491 { 492 "title": "On memorization of large language models in logical reasoning", 493 "authors": ["Chulin Xie", "Yangsibo Huang", "Chiyuan Zhang"], 494 "year": 2024, 495 "arxiv_id": "2410.23123", 496 "relevance": "Studies LLM memorization in reasoning tasks, extending the memorization vs generalization question beyond code generation." 497 }, 498 { 499 "title": "Qwen2.5-coder technical report", 500 "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"], 501 "year": 2024, 502 "arxiv_id": "2409.12186", 503 "relevance": "Technical report for the Qwen-2.5-Coder model family, one of the primary model series evaluated in this paper." 504 }, 505 { 506 "title": "Measuring memorization in RLHF for code completion", 507 "authors": ["Aneesh Pappu", "Billy Porter", "Ilia Shumailov", "Jamie Hayes"], 508 "year": 2024, 509 "arxiv_id": "2406.11715", 510 "relevance": "Studies memorization specifically in RLHF-trained code models, relevant to this paper's comparison of SFT vs PPO effects on memorization." 511 } 512 ] 513 }