scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17518B)
      1 {
      2   "paper": {
      3     "title": "Are GANs Created Equal? A Large-Scale Study",
      4     "authors": ["Mario Lucic", "Karol Kurach", "Marcin Michalski", "Olivier Bousquet", "Sylvain Gelly"],
      5     "year": 2018,
      6     "venue": "NeurIPS",
      7     "arxiv_id": "1711.10337"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states 'we open-sourced our experimental setup and model implementations at goo.gl/G8kf5J' (Section 1)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available datasets (MNIST, Fashion-MNIST, CIFAR10, CelebA) and proposes new synthetic datasets (convex polygons) with the open-sourced code."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is described in the paper. Only mentions NVIDIA P100 GPUs."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While code is released, the paper itself does not contain step-by-step reproduction instructions. Hyperparameter ranges are given in the appendix, but there is no explicit reproduction guide."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Table 2 reports mean FID ± standard deviation. Figure 5 shows 95% confidence intervals. Figure 3 shows one standard deviation intervals."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper notes that 'any statistically significant comparison of the models is unattainable' with limited budget but does not perform formal significance tests between models. Comparisons are based on overlapping distributions."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "FID scores with baselines are reported (e.g., NS GAN 6.8 vs WGAN 6.7 on MNIST, Table 2), giving concrete magnitudes of differences between models."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper explicitly discusses computational budget constraints ('approximately 6.85 GPU years') and uses 100 hyperparameter samples in wide search, 50 in narrow search, 50 random seeds for stability, and 5000 bootstrap resamples for confidence intervals."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Table 2 reports standard deviations across 50 random seeds. Figures 3 and 5 show variance via standard deviation intervals and confidence intervals respectively."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Seven GAN variants are compared (MM GAN, NS GAN, LSGAN, WGAN, WGAN GP, DRAGAN, BEGAN) plus VAE as a non-GAN baseline."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "All compared models were state-of-the-art or recent at the time (2017-2018 publications: WGAN GP, DRAGAN, BEGAN)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper systematically varies hyperparameters (learning rate, batch norm, discriminator iterations, beta1, lambda, clipping) and measures their individual impact on FID (Figures 6-7, scatter plots)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "FID, precision, recall, and F1 score are all used as evaluation metrics."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of generated samples is performed. All evaluation is automated via FID, precision, recall, and F1."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "FID is computed between generated samples and 10k samples from the test set (Section 6: 'compute the FID between the 10k samples generated by the model and the 10k samples from the test set')."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per dataset (MNIST, Fashion-MNIST, CIFAR10, CelebA) in Table 2 and per model across all metrics."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses mode collapse failures (asterisks in Table 2 indicating 'significant outlier runs, usually severe mode collapses or training failures'), and notes BEGAN and VAE's 'underwhelming performance' on the polygon task."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The central finding is a negative result: no GAN algorithm consistently outperforms the original NS GAN. The paper also reports that hyperparameter transfer does not always work (e.g., WGAN is sensitive)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that 'most models can reach similar scores with enough hyperparameter optimization' and 'we did not find evidence that any of the tested algorithms consistently outperforms the non-saturating GAN' — both supported by Table 2 and Figure 3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper's causal claims are modest and supported by controlled experiments. The claim that 'improvements can arise from a higher computational budget' is supported by Figure 3 showing budget-FID curves with controlled single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 7 (Limitations) explicitly states that results are limited to the tested architecture, datasets, and optimization settings, and acknowledges that the authors 'cannot exclude the possibility that some models significantly outperform others under currently unexplored conditions.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 7 discusses alternative explanations: architecture choice could change results, different optimization methods could matter, and FID limitations could affect conclusions."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The models compared are fully specified via their loss functions (Table 1). The architecture is specified as InfoGAN-based with exact details (latent code size 64, batch size 64, training epochs, etc.)."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This paper does not use prompting — it trains generative models from scratch."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Hyperparameter ranges are reported in Section 6 and Appendix A. Learning rate, beta1, batch norm, discriminator iterations, and model-specific parameters (lambda, gamma, clipping) are all specified."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a standard training and evaluation study."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The datasets used are standard benchmarks (MNIST, Fashion-MNIST, CIFAR10, CelebA). The paper describes how FID samples are computed (10k samples), how the precision/recall datasets are constructed (convex polygons), and the train/test partition procedure."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 'Limitations of the Study' is a dedicated section covering three categories of limitations: datasets/architectures, metrics, and hyperparameter exploration."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7 discusses specific threats: single architecture may not generalize to higher-resolution tasks, FID may be fooled by encoding-specific artifacts, FID cannot detect overfitting, hyperparameter ranges from one dataset may not transfer."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states 'We cannot exclude the possibility that some models significantly outperform others under currently unexplored conditions' and notes results are limited to the InfoGAN architecture and four specific datasets."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The open-sourced code at goo.gl/G8kf5J includes the experimental setup, and the datasets used are all publicly available standard benchmarks."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "All datasets are standard public benchmarks (MNIST, Fashion-MNIST, CIFAR10, CelebA). The novel convex polygon dataset generation procedure is described in Section 5."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data sources are standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: hyperparameter sampling → training → FID evaluation every 5 epochs → early stopping based on best FID → re-running best model 50 times with different seeds (Section 6)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding disclosure or acknowledgment of grants. The acknowledgments section only thanks individuals for discussions."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are listed as Google Brain affiliates in the paper header."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Google Brain is not evaluating its own GAN product. The study compares published GAN algorithms neutrally; Google has no financial stake in one algorithm winning over another."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper trains models from scratch on specified datasets. It does not evaluate pre-trained model capabilities on benchmarks."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Models are trained from scratch; no pre-trained model knowledge contamination applies."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Models are trained from scratch; benchmark contamination from pre-training is not applicable."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No per-model inference cost or training cost per run is reported, only the total computational budget."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "The paper states 'Reproducing these experiments requires approximately 6.85 GPU years (NVIDIA P100)' (Section 1, footnote 2)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Most GAN models can reach similar FID scores with enough hyperparameter optimization and random restarts.",
    286       "evidence": "Table 2 shows overlapping FID distributions across models. Figure 3 shows convergence of minimum FID as computational budget increases.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "No tested algorithm consistently outperforms the non-saturating GAN (NS GAN).",
    291       "evidence": "Table 2 shows NS GAN is competitive or best on multiple datasets. Figure 3 shows no model clearly dominates across budgets.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "A 'bad' model with higher computational budget can outperform a 'good' model with lower budget.",
    296       "evidence": "Figure 3 shows crossing curves where models with worse expected performance at low budgets achieve better FID at higher budgets.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "FID is sensitive to mode dropping and robust to encoding network choice.",
    301       "evidence": "Figure 1b shows FID increases sharply with mode dropping. Figure 1c shows Spearman's rank correlation of 0.9 between InceptionNet and VGG-based FID.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "GAN training is extremely sensitive to hyperparameter settings.",
    306       "evidence": "Figure 4 shows wide variance in FID across hyperparameter samples. Figure 14 shows even narrow-range search produces significant variance.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "This large-scale study (6.85 GPU years on P100s) finds that no GAN algorithm consistently outperforms the original non-saturating GAN when given fair hyperparameter optimization budgets. Performance differences between state-of-the-art GANs are largely attributable to computational budget and hyperparameter tuning rather than algorithmic innovations. The paper also proposes convex polygon datasets enabling precision/recall measurement and demonstrates that FID is a robust evaluation metric.",
    312   "red_flags": [],
    313   "cited_papers": [
    314     {
    315       "title": "Wasserstein generative adversarial networks",
    316       "authors": ["Martín Arjovsky", "Soumith Chintala", "Léon Bottou"],
    317       "year": 2017,
    318       "relevance": "Foundational GAN variant evaluated in the study; demonstrates importance of rigorous model comparison."
    319     },
    320     {
    321       "title": "Generalization and equilibrium in generative adversarial nets (GANs)",
    322       "authors": ["Sanjeev Arora", "Rong Ge", "Yingyu Liang", "Tengyu Ma", "Yi Zhang"],
    323       "year": 2017,
    324       "relevance": "Theoretical analysis of GAN convergence and generalization relevant to evaluation methodology."
    325     },
    326     {
    327       "title": "Many paths to equilibrium: GANs do not need to decrease a divergence at every step",
    328       "authors": ["William Fedus", "Mihaela Rosca", "Balaji Lakshminarayanan", "Andrew M. Dai", "Shakir Mohamed", "Ian Goodfellow"],
    329       "year": 2018,
    330       "relevance": "Concurrent GAN comparison study using IS; this paper extends it with FID and precision/recall."
    331     },
    332     {
    333       "title": "GANs trained by a two time-scale update rule converge to a local Nash equilibrium",
    334       "authors": ["Martin Heusel", "Hubert Ramsauer", "Thomas Unterthiner", "Bernhard Nessler", "Sepp Hochreiter"],
    335       "year": 2017,
    336       "relevance": "Proposes FID metric that this paper extensively evaluates and validates."
    337     },
    338     {
    339       "title": "Improved techniques for training GANs",
    340       "authors": ["Tim Salimans", "Ian Goodfellow", "Wojciech Zaremba", "Vicki Cheung", "Alec Radford", "Xi Chen"],
    341       "year": 2016,
    342       "relevance": "Proposes Inception Score metric whose limitations motivate this study's focus on FID."
    343     }
    344   ]
    345 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs