scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21438B)
      1 {
      2   "paper": {
      3     "title": "Competition-Level Code Generation with AlphaCode",
      4     "authors": ["Yujia Li", "David Choi", "Junyoung Chung", "Nate Kushman", "Julian Schrittwieser", "Rémi Leblond", "Tom Eccles", "James Keeling", "Felix Gimeno", "Agustin Dal Lago", "Thomas Hubert", "Peter Choy", "Cyprien de Masson d'Autume", "Igor Babuschkin", "Xinyun Chen", "Po-Sen Huang", "Johannes Welbl", "Sven Gowal", "Alexey Cherepanov", "James Molloy", "Daniel J. Mankowitz", "Esme Sutherland Robson", "Pushmeet Kohli", "Nando de Freitas", "Koray Kavukcuoglu", "Oriol Vinyals"],
      5     "year": 2022,
      6     "venue": "arXiv / Science",
      7     "arxiv_id": "2203.07814"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The CodeContests dataset is released on GitHub (https://github.com/deepmind/code_contests), referenced in Section 3.2. However, the AlphaCode model code itself is not released. The dataset release counts as a partial artifact release."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The CodeContests dataset is publicly released on GitHub (Section 3.2, footnote 1). The pre-training GitHub dataset is described but based on a public snapshot."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions JAX, Haiku, TPUv4, bfloat16 precision (Section 4.1), but no requirements.txt, Dockerfile, or detailed environment specification is provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The model weights are not released, and no README with commands to reproduce results is mentioned."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Table 8 reports 95% confidence intervals for ablation results, e.g., '6.7% (6.5-6.8)'. Bootstrapping is used for metric estimation (Section 2.2, Appendix A.3)."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No formal statistical significance tests (p-values, t-tests, etc.) are reported. Comparisons rely on confidence intervals and point estimates."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported with baseline context throughout, e.g., Table 8 shows ablation improvements from 15.2% to 24.1% (10@100k), and Table 5 shows solve rates across conditions."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The validation set has 117 problems and the test set has 165 problems. No justification is given for why these sizes are sufficient, nor is a power analysis discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Table 8 reports confidence intervals from multiple fine-tuning runs ('at least 3 different models from the same pre-trained checkpoint'). Codeforces evaluation was repeated 3 times (Section 5.1)."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Comparisons against GPT-Neo 2.7B and Codex 12B on APPS (Table 10), and against decoder-only and standard multi-head attention architectures (Table 6)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Codex (Chen et al., 2021) and GPT-Neo were the state-of-the-art code generation models at the time of writing."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Extensive ablation study in Section 5.3 and Table 8, showing the contribution of MLM, tempering, tags/ratings, value conditioning, GOLD, and clustering."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics used: pass@k, 10@k, n@k at various sample budgets. Also reports false positive rates (Table 2) and per-category breakdowns."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Table 2 reports manual examination of 50 solutions per dataset to check false positive rates. Section 6.1 includes qualitative analysis of 50 model-generated solutions for copying behavior."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Strict temporal split: training data released on or before 2021/07/14, validation 2021/07/15-2021/09/20, test after 2021/09/21 (Section 3.2). Results are reported on both validation and test sets (Table 5)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 11 shows solve rates across 10 problem types (greedy, math, DP, etc.) at different model sizes. APPS results broken down by difficulty level (Table 10). Appendix E.2 has difficulty rating buckets."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6 extensively discusses capabilities and limitations. Section 6.3 analyzes sensitivity to description changes. Table 11 shows weak areas (DP, constructive algorithms). Dead code analysis in Section 6.2."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.5 shows validation loss is a poor proxy for solve rate. The 41B model was undertrained due to resource limitations (Section 4.2). Top-k and nucleus sampling did not improve performance (Section 4.4)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of top 54.3% ranking are supported by Table 4. The three key components (dataset, architecture, sampling+filtering) are validated through ablations in Section 5.3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about component contributions are supported by controlled ablation studies (Table 8) where components are added one at a time. Architecture comparisons use matched compute (Table 6)."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper is specific about the evaluation setting: Codeforces competitions, CodeContests dataset, APPS benchmark. Section 8.1 explicitly notes 'there are few direct applications of this work outside of competitive programming.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 6.1 investigates whether the model simply copies from training data. Section 6.3 tests whether the model exploits problem structure shortcuts. Section 6.5 discusses the disconnect between loss and solve rate."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Custom models with exact architecture specifications in Table 3: AlphaCode 300M, 1B, 3B, 9B, 41B with detailed parameter counts, hidden dimensions, head counts, training steps, and tokens."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Figure 5 shows the metadata format. Appendix F provides complete examples of model prompts and samples. The exact input format is documented."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Extensive hyperparameter reporting: learning rates, β values, weight decay, temperature (T=0.2 training, T'=0.25 sampling), batch sizes, training steps (Sections 4.2, 4.3, 4.4, Table 3)."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The full pipeline is described in detail: pre-training → fine-tuning → large-scale sampling → filtering on example tests → clustering via generated test inputs → submission selection (Section 4, Figure 4)."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Pre-training data filtering documented (Section 3.1): files >1MB removed, lines >1000 chars removed, duplicates removed. CodeContests cleaning in Appendix B.2. Generated test case procedure in Section 3.2.1."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 is titled 'AlphaCode's capabilities & limitations' and discusses limitations extensively. Section 8 'Broader impact' discusses risks. Section 6.5 discusses loss as a poor proxy."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats discussed: false positive rates in evaluation (Table 2, Section 3.2.1), temporal split limitations, self-selected participant pool for Codeforces comparison (Section 5.1), 41B model being undertrained."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8.1 explicitly states 'there are few direct applications of this work outside of competitive programming.' The evaluation is bounded to Codeforces competitions with >5000 participants. Self-selection of competitors is noted."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The CodeContests dataset is publicly released on GitHub (Section 3.2, footnote 1), including problems, solutions, and test cases."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 describes data collection in detail: GitHub snapshot date (2021/07/14), Codeforces scraping, integration with Description2Code and CodeNet datasets, temporal split dates."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants recruited. Data comes from public GitHub repositories and Codeforces platform. The Codeforces evaluation uses existing competition infrastructure."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Full pipeline documented: GitHub filtering (Section 3.1), CodeContests curation with cleaning (Appendix B.2), generated test cases (Section 3.2.1), problem filtering criteria (≥5 hidden tests, ≥2 different outputs)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The paper is from DeepMind (Google). Affiliation is clearly stated: '© 2022 DeepMind. All rights reserved.' All authors are DeepMind employees/interns."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors affiliated with DeepMind. Footnotes note Igor Babuschkin moved to OpenAI, Xinyun Chen was a DeepMind intern from UC Berkeley."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "DeepMind/Google funds the research and has a commercial interest in demonstrating AI capabilities. The funder is not independent of the outcome."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interest disclosure is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Training data cutoff explicitly stated: GitHub snapshot on 2021/07/14, all CodeContests training data publicly released on or before 2021/07/14 (Section 3.2)."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section 3.2 describes temporal split to prevent leakage. Section 6.1 analyzes copying from training data with longest common substring analysis. Appendix B.3 provides further analysis."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Temporal split ensures all evaluation problems post-date training data (Section 3.2). Section 6.1 provides extensive analysis showing the model does not copy solutions. Problems are newly created for each competition."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the study. The paper evaluates an AI system on programming competitions."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. The system is evaluated against existing competition data."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants recruited for the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants recruited for the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants or experimental conditions involving humans."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants or experimental conditions involving humans."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 6 reports sampling speed in TPU seconds per sample. Figure 7(b) plots solve rate vs. sampling TPU-seconds per problem. Section 8.2 notes 'hundreds of petaFLOPS days' for sampling and training."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Training compute shown in Figure 7(a) in TPU-days. Table 3 lists training steps and tokens. Section 8.2 states 'Both sampling and training from our model required hundreds of petaFLOPS days.' TPUv4 accelerators mentioned."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "AlphaCode achieved an average ranking within the top 54.3% in simulated Codeforces competitions with more than 5,000 participants.",
    286       "evidence": "Table 4 shows estimated percent rankings across 10 contests. Average estimated ranking is 54.3%. Three evaluation runs were conducted to measure variance (Section 5.1).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "AlphaCode achieved an estimated Codeforces rating of 1238, within the top 28% of users who participated in a contest in the last 6 months.",
    291       "evidence": "Section 5.1 and Figure 1(b) show the rating estimate. The 28% figure refers to active users, a self-selected subset (footnote 4).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Three key components are critical for performance: clean dataset, efficient transformer architectures, and large-scale sampling with filtering.",
    296       "evidence": "Table 8 build-up ablation shows each component contributes (10@100k from 15.2% to 24.1%). Table 7 shows pre-training dataset impact. Figure 8 shows filtering/clustering importance.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "AlphaCode does not simply copy solutions from the training data.",
    301       "evidence": "Section 6.1 analyzes longest common substrings between model solutions and training data, finding similar distributions to human solutions. Figure 9 and qualitative analysis of 50 solutions in Figure 10.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "CodeContests reduces false positive rates from 30-60% in existing datasets to 4%.",
    306       "evidence": "Table 2 compares false positive rates: APPS 60%, HumanEval 30%, CodeContests raw 62%, CodeContests 4%. Based on manual examination of 50 problems per dataset.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Solve rates scale log-linearly with the number of samples.",
    311       "evidence": "Figure 6 shows approximately log-linear scaling for both 10@k and pass@k metrics across model sizes. However, 10@k curves bend slightly at high sample budgets.",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "AlphaCode is the first AI system to achieve competitive-level performance in programming competitions, ranking in the top 54.3% of Codeforces participants. The system relies on massive sampling (up to 1M samples per problem) followed by filtering and clustering to select 10 submissions, achieving 34.2% solve rate on the CodeContests validation set. Key innovations include a clean competitive programming dataset with temporal splits and generated test cases reducing false positives to 4%, and sampling-efficient encoder-decoder architectures with multi-query attention. Ablation studies show that each component (MLM, tempering, value conditioning, GOLD, clustering) contributes meaningfully, and solve rates scale log-linearly with both model parameters and sample budget.",
    317   "red_flags": [
    318     {
    319       "flag": "Company evaluating own system",
    320       "detail": "All authors are DeepMind employees. While the evaluation methodology is rigorous (using Codeforces platform and temporal splits), there is an inherent conflict of interest in DeepMind evaluating its own system's capabilities."
    321     },
    322     {
    323       "flag": "Self-selected comparison population",
    324       "detail": "The 'top 28% of users' claim is among users who participated in contests in the last 6 months, which is a self-selected group of active competitive programmers. The paper acknowledges this but the framing could overstate the achievement."
    325     },
    326     {
    327       "flag": "Massive compute requirements not fully quantified",
    328       "detail": "The paper mentions 'hundreds of petaFLOPS days' for both training and sampling but does not provide a precise total compute budget or cost estimate, making practical reproducibility assessment difficult."
    329     }
    330   ],
    331   "cited_papers": [
    332     {
    333       "title": "Evaluating Large Language Models Trained on Code",
    334       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    335       "year": 2021,
    336       "arxiv_id": "2107.03374",
    337       "relevance": "Introduces Codex and HumanEval benchmark, a key baseline and evaluation framework for code generation."
    338     },
    339     {
    340       "title": "Measuring Coding Challenge Competence With APPS",
    341       "authors": ["Dan Hendrycks"],
    342       "year": 2021,
    343       "relevance": "Introduces the APPS dataset of programming problems used as a comparison benchmark in this paper."
    344     },
    345     {
    346       "title": "Program Synthesis with Large Language Models",
    347       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    348       "year": 2021,
    349       "arxiv_id": "2108.07732",
    350       "relevance": "Early work on using large language models for program synthesis, establishing baseline capabilities."
    351     },
    352     {
    353       "title": "Language Models are Few-Shot Learners",
    354       "authors": ["Tom B. Brown"],
    355       "year": 2020,
    356       "arxiv_id": "2005.14165",
    357       "relevance": "GPT-3 paper establishing the foundation for large language model capabilities including code generation."
    358     },
    359     {
    360       "title": "Training Verifiers to Solve Math Word Problems",
    361       "authors": ["Karl Cobbe"],
    362       "year": 2021,
    363       "arxiv_id": "2110.14168",
    364       "relevance": "Demonstrates majority voting and verifier-based sample selection, related to AlphaCode's filtering approach."
    365     },
    366     {
    367       "title": "Extracting Training Data from Large Language Models",
    368       "authors": ["Nicholas Carlini"],
    369       "year": 2021,
    370       "relevance": "Addresses memorization and data extraction concerns in large language models, relevant to AlphaCode's copying analysis."
    371     },
    372     {
    373       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    374       "authors": ["Zhangyin Feng"],
    375       "year": 2020,
    376       "relevance": "Pre-trained model for code understanding, part of the foundation for transformer-based code generation."
    377     },
    378     {
    379       "title": "Project CodeNet: A Large-Scale AI for Code Dataset for Learning a Diversity of Coding Tasks",
    380       "authors": ["Ruchir Puri"],
    381       "year": 2021,
    382       "relevance": "Large-scale code dataset used as part of AlphaCode's CodeContests training set."
    383     }
    384   ]
    385 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs