scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21318B)
      1 {
      2   "paper": {
      3     "title": "On the Edge of Memorization in Diffusion Models",
      4     "authors": ["Sam Buchanan", "Druv Pai", "Yi Ma", "Valentin De Bortoli"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2508.17689",
      8     "doi": "10.48550/arXiv.2508.17689"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["theoretical", "benchmark-eval"],
     13   "key_findings": "The paper introduces a theoretical 'laboratory' for studying memorization vs. generalization in diffusion models trained on Gaussian mixture data. The authors derive tight approximations of training losses for memorizing and generalizing denoisers and identify a crossover point M* (approximately 4/5 N) at which memorization becomes predominant. The crossover is validated experimentally, achieving prediction errors ≤ 2×10⁻⁴. A low-rank Gaussian model mimicking natural images shows qualitatively similar phase transition behavior.",
     14   "claims": [
     15     {
     16       "claim": "There exists a phase transition from generalization to memorization as model size M increases, analogous to observations in large-scale diffusion models.",
     17       "evidence": "Figure 2 shows memorization ratio transitioning from ~0 to ~1 as M/N increases, with corresponding training/test loss crossover. Section 4.1.",
     18       "supported": "strong"
     19     },
     20     {
     21       "claim": "The phase transition location can be predicted using theoretical loss approximations with extremely low error (train/test error ≤ 2×10⁻⁴).",
     22       "evidence": "Figure 3 and Section 4.1 report regression errors on the loss weighting optimization. The recovered crossover point is Mpt ≈ (4/5)N.",
     23       "supported": "strong"
     24     },
     25     {
     26       "claim": "The loss approximations derived in Theorems 3.1 and 3.2 agree remarkably well with empirical losses even at moderate dimensions.",
     27       "evidence": "Figure 1 shows tight agreement between theoretical approximations and empirical losses at d=50, K=12, N=200.",
     28       "supported": "strong"
     29     },
     30     {
     31       "claim": "The memorization phase transition persists in a low-rank Gaussian mixture model designed to mimic natural image structure.",
     32       "evidence": "Figure 5 shows qualitatively similar phase transition behavior with colored FashionMNIST templates. Section 4.2.",
     33       "supported": "moderate"
     34     }
     35   ],
     36   "checklist": {
     37     "artifacts": {
     38       "code_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Code is available at https://github.com/DruvPai/diffusion_mem_gen, stated in the abstract."
     42       },
     43       "data_released": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Data is synthetically generated from Gaussian mixture models with parameters fully specified in the paper and appendix. The code repository enables regeneration."
     47       },
     48       "environment_specified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Appendix H states: 'We run all experiments on several Nvidia A100 80GB GPUs using Jax 0.6.0 and Equinox 0.12.' Specific library versions are provided."
     52       },
     53       "reproduction_instructions": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "While code is released and experimental details are thorough in Appendix H, no explicit step-by-step reproduction instructions (e.g., README with commands) are described in the paper itself."
     57       }
     58     },
     59     "statistical_methodology": {
     60       "confidence_intervals_or_error_bars": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Figure 7 shows error bars (min/max across 3 seeds) for memorization ratio and loss plots. The paper states variance is 'extremely small.'"
     64       },
     65       "significance_tests": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No statistical significance tests are reported. Claims of agreement between theory and experiment rely on visual comparison and reported MSE values."
     69       },
     70       "effect_sizes_reported": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper reports specific prediction errors (≤ 2×10⁻⁴) and the crossover ratio (4/5 N), providing quantitative magnitudes of effects."
     74       },
     75       "sample_size_justified": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No justification for why N=200, d=50, K=12 were chosen as default parameters, or why the sweep grid uses [50,100,150,200]×[30,40,50,60]×[3,6,9,12]."
     79       },
     80       "variance_reported": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Figure 7 reports variance across 3 random seeds, noting 'extremely small' error bars. Appendix H.5 discusses seed sensitivity."
     84       }
     85     },
     86     "evaluation_design": {
     87       "baselines_included": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The paper compares trained denoisers against the generalizing denoiser (ground truth), the memorizing denoiser, and partially memorizing denoisers across all experiments."
     91       },
     92       "baselines_contemporary": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The baselines are inherent to the theoretical framework (optimal generalizing vs. memorizing denoisers). Related theoretical work from 2024-2025 is discussed in Section 5."
     96       },
     97       "ablation_study": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper systematically varies M (model capacity), N (samples), d (dimension), and K (modes) to study their effects on the phase transition. Figure 3 sweeps over 64 (N,d,K) tuples."
    101       },
    102       "multiple_metrics": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper uses memorization ratio (Definition 2.2), training loss, test loss, and 2-Wasserstein distance as evaluation metrics."
    106       },
    107       "human_evaluation": {
    108         "applies": false,
    109         "answer": false,
    110         "justification": "Human evaluation is irrelevant for this theoretical/synthetic experiment paper studying mathematical properties of diffusion models."
    111       },
    112       "held_out_test_set": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Test loss is computed over a held-out set of samples from π⋆ (Figure 2 right panel, Figure 5 right panel). Section 2 mentions estimating generalization error on a held-out set."
    116       },
    117       "per_category_breakdown": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Results are broken down across multiple (N,d,K) configurations in the sweep (Figure 3), and separate results shown for isotropic GMM (Section 4.1) vs. low-rank image model (Section 4.2)."
    121       },
    122       "failure_cases_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 6 (Conclusion) discusses limitations: the model does not capture intrinsic dimensionality or partial data replication. Figure 5 notes 'transient jaggedness' in the low-rank setting."
    126       },
    127       "negative_results_reported": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "All experiments validate the hypothesis. No failed approaches or negative findings are reported."
    131       }
    132     },
    133     "claims_and_evidence": {
    134       "abstract_claims_supported": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The abstract claims about theoretical characterization of the crossover point, experimental validation, and extremely low prediction error are all supported by the results in Sections 3-4."
    138       },
    139       "causal_claims_justified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The causal claim is that model underparameterization determines memorization vs. generalization. This is justified through controlled experiments varying M while holding other parameters fixed (Figures 2, 5), which constitutes adequate single-variable manipulation."
    143       },
    144       "generalization_bounded": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper is careful to bound claims to Gaussian mixture models and specific parameterizations. Section 6 explicitly states the framework needs extension for 'additional properties of larger and more realistic datasets.'"
    148       },
    149       "alternative_explanations_discussed": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5 discusses alternative theories of memorization/generalization: implicit bias of underparameterization (Vastola 2025), stochastic optimization landscape (Wu et al. 2025), and the distinction from benign overfitting (Appendix G)."
    153       },
    154       "proxy_outcome_distinction": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The paper carefully defines memorization (Definition 2.2) and generalization in precise mathematical terms, and acknowledges that their metric is a 'relatively strict' notion of memorization that does not fully capture copyright/privacy concerns."
    158       }
    159     },
    160     "setup_transparency": {
    161       "model_versions_specified": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "The paper does not use pre-trained LLMs or commercial models. The models are Gaussian mixture denoisers with analytically specified structure."
    165       },
    166       "prompts_provided": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No prompting is used. The paper trains mathematical denoiser models."
    170       },
    171       "hyperparameters_reported": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Appendix H.1 provides comprehensive hyperparameters: learning rate schedule (warmup-decay from 0 to 10⁻³ to 10⁻⁶), N_epochs (50,000 and 100,000), N_dup=100, L=25 timesteps, ε=10⁻³, Adam optimizer, initialization scheme."
    175       },
    176       "scaffolding_described": {
    177         "applies": false,
    178         "answer": false,
    179         "justification": "No agentic scaffolding is used."
    180       },
    181       "data_preprocessing_documented": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Appendix H.1 fully documents data generation: GMM means sampled uniformly on sphere of radius √d, σ²⋆=1, and for image model: FashionMNIST templates resized to 15×15 with specific color distributions."
    185       }
    186     },
    187     "limitations_and_scope": {
    188       "limitations_section_present": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 6 (Conclusion) contains substantial discussion of limitations: the model needs extension to capture intrinsic dimensionality, partial data replication, and more realistic datasets."
    192       },
    193       "threats_to_validity_specific": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The paper identifies specific limitations: Gaussian mixture models may not capture all complexities of natural images, the isotropic covariance assumption is simplifying, and the theoretical results require well-separated cluster centers."
    197       },
    198       "scope_boundaries_stated": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The paper explicitly states the framework is limited to Gaussian mixture models and specific parameterizations. Section 6 lists specific extensions needed: 'intrinsic dimensionality or partial data replication.'"
    202       }
    203     },
    204     "data_integrity": {
    205       "raw_data_available": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Data is synthetically generated with fully specified parameters. Code to regenerate all data is available at the GitHub repository."
    209       },
    210       "data_collection_described": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Data generation is fully specified: GMM with K components, means on sphere of radius √d, σ²⋆=1, N samples drawn i.i.d. Appendix H.1 provides all details."
    214       },
    215       "recruitment_methods_described": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No human participants. Data is synthetically generated."
    219       },
    220       "data_pipeline_documented": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "The full pipeline from data generation through noising process to training and evaluation is documented in Section 2, Section 4, and Appendix H."
    224       }
    225     },
    226     "conflicts_of_interest": {
    227       "funding_disclosed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Acknowledgements section lists specific grants: Simons Foundation-NSF DMS grant #2031899, ONR grant N00014-22-1-2102, NSF grant #2402951, and HKU startup fund."
    231       },
    232       "affiliations_disclosed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Author affiliations are clearly stated: TTIC, UC Berkeley, HKU, and Google DeepMind."
    236       },
    237       "funder_independent_of_outcome": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Funders are NSF, Simons Foundation, ONR, and HKU — none have a financial interest in whether diffusion models memorize or not. Google DeepMind affiliation of one author is notable but funding is from independent sources."
    241       },
    242       "financial_interests_declared": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No competing interests statement is present. One author is affiliated with Google DeepMind, which has commercial interest in diffusion models, but no financial interests declaration is made."
    246       }
    247     },
    248     "contamination": {
    249       "training_cutoff_stated": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "The paper does not evaluate a pre-trained model on any benchmark. All models are trained from scratch on synthetic data."
    253       },
    254       "train_test_overlap_discussed": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No pre-trained model benchmark evaluation. Train/test separation is inherent in the synthetic setup (held-out samples from π⋆)."
    258       },
    259       "benchmark_contamination_addressed": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No pre-trained model benchmark evaluation. Data is synthetically generated."
    263       }
    264     },
    265     "human_studies": {
    266       "pre_registered": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "irb_or_ethics_approval": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "demographics_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "inclusion_exclusion_criteria": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       },
    286       "randomization_described": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       },
    291       "blinding_described": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants."
    295       },
    296       "attrition_reported": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants."
    300       }
    301     },
    302     "cost_and_practicality": {
    303       "inference_cost_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No inference cost or wall-clock time reported despite running experiments on multiple A100 GPUs."
    307       },
    308       "compute_budget_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper states 'several Nvidia A100 80GB GPUs' but does not quantify total GPU hours or compute budget."
    312       }
    313     },
    314     "experimental_rigor": {
    315       "seed_sensitivity_reported": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "Figure 7 and Appendix H.5 report results across 3 random seeds with error bars, noting 'extremely small' variance."
    319       },
    320       "number_of_runs_stated": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "Appendix H.5 states 3 random seeds were used. Appendix H.1 specifies 20 models trained per (N,d,K) setting."
    324       },
    325       "hyperparameter_search_budget": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No hyperparameter search budget is reported. The paper uses fixed hyperparameters without discussing whether alternatives were tried."
    329       },
    330       "best_config_selection_justified": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "The paper uses fixed, theoretically motivated configurations. The loss weighting λ̃ is optimized via the regression problem in Eq. (15) with explicit train/test error reporting."
    334       },
    335       "multiple_comparison_correction": {
    336         "applies": false,
    337         "answer": false,
    338         "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable."
    339       },
    340       "self_comparison_bias_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The authors evaluate their own theoretical predictions against their own experiments without acknowledging potential bias in the experimental setup favoring their theory."
    344       },
    345       "compute_budget_vs_performance": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "Compute differences between configurations are negligible — the comparison is theoretical (loss approximations) not compute-dependent."
    349       },
    350       "benchmark_construct_validity": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "The paper extensively discusses whether Gaussian mixtures are an appropriate model for studying memorization (Section 2, Appendix A), referencing prior work using the same framework and discussing its limitations."
    354       },
    355       "scaffold_confound_addressed": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "No scaffolding is involved."
    359       }
    360     },
    361     "data_leakage": {
    362       "temporal_leakage_addressed": {
    363         "applies": false,
    364         "answer": false,
    365         "justification": "No pre-trained model evaluation. Models are trained from scratch on synthetic data with explicit train/test separation."
    366       },
    367       "feature_leakage_addressed": {
    368         "applies": false,
    369         "answer": false,
    370         "justification": "No pre-trained model evaluation."
    371       },
    372       "non_independence_addressed": {
    373         "applies": false,
    374         "answer": false,
    375         "justification": "No pre-trained model evaluation. Train/test independence is guaranteed by the synthetic i.i.d. data generation."
    376       },
    377       "leakage_detection_method": {
    378         "applies": false,
    379         "answer": false,
    380         "justification": "No pre-trained model evaluation."
    381       }
    382     }
    383   },
    384   "red_flags": [
    385     {
    386       "flag": "Only 3 random seeds",
    387       "detail": "Seed sensitivity analysis uses only 3 seeds (Figure 7), which is a minimal number for assessing variance. However, the reported variance is extremely small."
    388     },
    389     {
    390       "flag": "Synthetic-only validation",
    391       "detail": "All experiments use synthetic Gaussian mixture data. The low-rank image model (Section 4.2) is a step toward realism but remains far from actual diffusion model training on natural images. The practical relevance of the phase transition predictions to real-world models is not empirically tested."
    392     }
    393   ],
    394   "cited_papers": [
    395     {
    396       "title": "Extracting training data from diffusion models",
    397       "authors": ["Nicholas Carlini", "Jamie Hayes", "Milad Nasr"],
    398       "year": 2023,
    399       "relevance": "Empirical study of memorization and data extraction from diffusion models, directly relevant to AI model safety and privacy."
    400     },
    401     {
    402       "title": "Scalable extraction of training data from (production) language models",
    403       "authors": ["Milad Nasr", "Nicholas Carlini"],
    404       "year": 2023,
    405       "arxiv_id": "2311.17035",
    406       "relevance": "Training data extraction from production LLMs, relevant to AI safety and data privacy in deployed models."
    407     },
    408     {
    409       "title": "On provable copyright protection for generative models",
    410       "authors": ["Nikhil Vyas", "Sham M Kakade", "Boaz Barak"],
    411       "year": 2023,
    412       "relevance": "Theoretical framework for copyright protection in generative models, relevant to AI safety and governance."
    413     },
    414     {
    415       "title": "Differentially private diffusion models generate useful synthetic images",
    416       "authors": ["Sahra Ghalebikesabi"],
    417       "year": 2023,
    418       "arxiv_id": "2302.13861",
    419       "relevance": "Privacy-preserving training of diffusion models, relevant to AI safety and responsible deployment."
    420     },
    421     {
    422       "title": "An analytic theory of creativity in convolutional diffusion models",
    423       "authors": ["Mason Kamb", "Surya Ganguli"],
    424       "year": 2024,
    425       "arxiv_id": "2412.20292",
    426       "relevance": "Theoretical analysis of generalization/creativity in diffusion models, complementary theoretical contribution."
    427     }
    428   ]
    429 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs