scan.json (36044B)
1 { 2 "paper": { 3 "title": "Evaluating Large Language Models Trained on Code", 4 "authors": [ 5 "Mark Chen", 6 "Jerry Tworek", 7 "Heewoo Jun", 8 "Qiming Yuan", 9 "Henrique Ponde de Oliveira Pinto", 10 "Jared Kaplan", 11 "Harri Edwards", 12 "Yuri Burda", 13 "Nicholas Joseph", 14 "Greg Brockman", 15 "Alex Ray", 16 "Raul Puri", 17 "Gretchen Krueger", 18 "Michael Petrov", 19 "Heidy Khlaaf", 20 "Girish Sastry", 21 "Pamela Mishkin", 22 "Brooke Chan", 23 "Scott Gray", 24 "Nick Ryder", 25 "Mikhail Pavlov", 26 "Alethea Power", 27 "Lukasz Kaiser", 28 "Mohammad Bavarian", 29 "Clemens Winter", 30 "Philippe Tillet", 31 "Felipe Petroski Such", 32 "Dave Cummings", 33 "Matthias Plappert", 34 "Fotios Chantzis", 35 "Elizabeth Barnes", 36 "Ariel Herbert-Voss", 37 "William Hebgen Guss", 38 "Alex Nichol", 39 "Alex Paino", 40 "Nikolas Tezak", 41 "Jie Tang", 42 "Igor Babuschkin", 43 "Suchir Balaji", 44 "Shantanu Jain", 45 "William Saunders", 46 "Christopher Hesse", 47 "Andrew N. Carr", 48 "Jan Leike", 49 "Josh Achiam", 50 "Vedant Misra", 51 "Evan Morikawa", 52 "Alec Radford", 53 "Matthew Knight", 54 "Miles Brundage", 55 "Mira Murati", 56 "Katie Mayer", 57 "Peter Welinder", 58 "Bob McGrew", 59 "Dario Amodei", 60 "Sam McCandlish", 61 "Ilya Sutskever", 62 "Wojciech Zaremba" 63 ], 64 "year": 2021, 65 "venue": "arXiv", 66 "arxiv_id": "2107.03374" 67 }, 68 "scan_version": 3, 69 "active_modules": [ 70 "experimental_rigor", 71 "data_leakage" 72 ], 73 "checklist": { 74 "artifacts": { 75 "code_released": { 76 "applies": true, 77 "answer": true, 78 "justification": "The HumanEval evaluation framework is released at https://www.github.com/openai/human-eval. Alignment evaluation data is released at https://github.com/openai/code-align-evals-data. However, the Codex model itself is not released." 
79 }, 80 "data_released": { 81 "applies": true, 82 "answer": true, 83 "justification": "The HumanEval dataset of 164 hand-written programming problems is released at https://www.github.com/openai/human-eval, as stated in Section 2.2." 84 }, 85 "environment_specified": { 86 "applies": true, 87 "answer": false, 88 "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided. The paper mentions Python and numpy (Figure 3) but does not provide enough detail to recreate the environment." 89 }, 90 "reproduction_instructions": { 91 "applies": true, 92 "answer": false, 93 "justification": "No step-by-step reproduction instructions are provided. While the evaluation dataset is released, the model weights are not, and there are no scripts or README with commands to replicate the main experiments." 94 } 95 }, 96 "statistical_methodology": { 97 "confidence_intervals_or_error_bars": { 98 "applies": true, 99 "answer": false, 100 "justification": "All pass@k results in Tables 1 and 2 are reported as point estimates without confidence intervals or error bars. While the paper develops an unbiased estimator for pass@k (Equation 1, Appendix A), no uncertainty bounds on the estimates are provided." 101 }, 102 "significance_tests": { 103 "applies": true, 104 "answer": false, 105 "justification": "Comparative claims (e.g., Codex outperforms GPT-J, Codex-S outperforms Codex) are made by comparing raw pass@k numbers without any statistical significance tests." 106 }, 107 "effect_sizes_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Results are reported with full baseline context: Codex-12B solves 28.8% pass@1 vs GPT-3 at 0% and GPT-J at 11.4% (Section 1). Codex-S's improvement over Codex is quantified as 'an average margin of 6.5 percentage points on pass@1 and 15.1 percentage points on pass@100' (Section 4.5). The magnitude of differences is clear throughout." 
111 }, 112 "sample_size_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The HumanEval dataset contains 164 problems but there is no justification for why 164 was chosen, no power analysis, and no discussion of whether this sample size is sufficient for the claims made." 116 }, 117 "variance_reported": { 118 "applies": true, 119 "answer": false, 120 "justification": "Results are reported as single point estimates. While the unbiased pass@k estimator (Equation 1) accounts for sampling variance mathematically, no standard deviations, error bars, or spread measures across experimental runs are reported." 121 } 122 }, 123 "evaluation_design": { 124 "baselines_included": { 125 "applies": true, 126 "answer": true, 127 "justification": "Multiple baselines are included: GPT-3 (various sizes), GPT-Neo (125M, 1.3B, 2.7B), GPT-J-6B, and TabNine, all evaluated on HumanEval (Table 1). For APPS, GPT-Neo 2.7B fine-tuned results from Hendrycks et al. serve as baseline (Table 2)." 128 }, 129 "baselines_contemporary": { 130 "applies": true, 131 "answer": true, 132 "justification": "GPT-Neo, GPT-J, and TabNine were all contemporary at time of publication (2021). GPT-J was released in May 2021, the same year as this paper." 133 }, 134 "ablation_study": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper systematically ablates key components: GPT (no code fine-tuning) vs Codex (code fine-tuned) vs Codex-S (supervised fine-tuned), showing the contribution of each stage. Model size is varied across 8 scales (12M to 12B). Different sampling strategies (random, mean log-prob, back-translation) are compared (Figure 7)." 138 }, 139 "multiple_metrics": { 140 "applies": true, 141 "answer": true, 142 "justification": "Multiple evaluation metrics are used: pass@1, pass@10, pass@100 (Table 1), BLEU scores (Figure 8), test loss (Figure 4), and the APPS dataset metrics including raw and filtered pass@k (Table 2)." 
143 }, 144 "human_evaluation": { 145 "applies": true, 146 "answer": true, 147 "justification": "Codex-D docstring outputs are graded by hand: 'we grade sample docstrings by hand, considering a docstring correct if it uniquely and accurately specifies the code body. Due to the time consuming nature of this process, we only grade 10 samples per problem, for a total of 1640 problems' (Section 5, Table 3)." 148 }, 149 "held_out_test_set": { 150 "applies": true, 151 "answer": true, 152 "justification": "HumanEval is specifically designed as a held-out test set: 'It is important for these tasks to be hand-written, since our models are trained on a large fraction of GitHub, which already contains solutions to problems from a variety of sources' (Section 2.2)." 153 }, 154 "per_category_breakdown": { 155 "applies": true, 156 "answer": true, 157 "justification": "Table 1 breaks down results by model size across all baselines. Table 2 breaks APPS results by difficulty level (Introductory, Interview, Competition). Figure 11 shows performance degradation by chain length. Figure 5 breaks down results by temperature." 158 }, 159 "failure_cases_discussed": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 6 (Limitations) provides detailed failure analysis: difficulty with long chains of operations (Figure 11), variable binding errors (concrete code example in Section 6), misalignment producing buggy code when prompted with buggy code (Figure 12), and insecure code generation (Figure 15, Appendix G)." 
163 }, 164 "negative_results_reported": { 165 "applies": true, 166 "answer": true, 167 "justification": "Multiple negative results: 'we did not observe improvements when starting from a pre-trained language model' (Section 3.2); back-translation ranking 'underperforms mean log-probability ranking' (Section 5); 'choosing the sample based on sum log probability can perform slightly worse than picking randomly' (Section 3.3); misalignment worsens with scale (Figure 12)." 168 } 169 }, 170 "claims_and_evidence": { 171 "abstract_claims_supported": { 172 "applies": true, 173 "answer": true, 174 "justification": "All abstract claims are supported: 28.8% pass rate (Table 1), GPT-3 at 0% (Table 1), GPT-J at 11.4% (Table 1 lists 11.62%), and 70.2% with 100 samples (Section 1; the 77.5% shown in Figure 1 is the separate Codex-S result). The abstract's qualitative claims about limitations are supported by Section 6." 175 }, 176 "causal_claims_justified": { 177 "applies": true, 178 "answer": true, 179 "justification": "Causal claims about fine-tuning improving performance are supported by controlled comparisons: same base model architecture with and without code fine-tuning (GPT vs Codex), and with and without supervised fine-tuning (Codex vs Codex-S). Each comparison varies a single factor. The alignment analysis (Appendix E) carefully distinguishes capability from alignment." 180 }, 181 "generalization_bounded": { 182 "applies": true, 183 "answer": true, 184 "justification": "The paper explicitly bounds its scope: 'In this work, we focus on the task of generating standalone Python functions from docstrings' (Section 1). The abstract states 'study its Python code-writing capabilities.' While the broader impacts section discusses general code generation, the empirical claims are bounded to Python." 
185 }, 186 "alternative_explanations_discussed": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 7.2 and Appendix E explicitly distinguish misalignment from incompetence as alternative explanations for model failures. Appendix E.3 considers whether poor performance on buggy prompts could be a robustness failure rather than misalignment. Section 4 discusses data distribution mismatch as a factor." 190 }, 191 "proxy_outcome_distinction": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 2.1 explicitly argues for functional correctness over BLEU score as the evaluation metric, and Figure 8 demonstrates empirically that BLEU is a poor proxy for correctness. The paper measures pass@k and claims code generation capability — the measurement and claim are well-aligned with minimal proxy gap." 195 } 196 }, 197 "setup_transparency": { 198 "model_versions_specified": { 199 "applies": true, 200 "answer": true, 201 "justification": "Exact model sizes are specified for all models evaluated: Codex at 12M, 25M, 42M, 85M, 300M, 679M, 2.5B, and 12B parameters; GPT-Neo at 125M, 1.3B, 2.7B; GPT-J at 6B (Table 1). Since the Codex models are the authors' own, parameter counts uniquely identify each Codex variant; GPT-Neo and GPT-J are third-party baselines identified by name and size." 202 }, 203 "prompts_provided": { 204 "applies": true, 205 "answer": true, 206 "justification": "Figure 2 shows three complete example prompts with the exact format used (header, signature, docstring). Stop sequences are specified: '\\nclass', '\\ndef', '\\n#', '\\nif', '\\nprint' (Section 3.2). Appendix B provides 8 additional full prompt examples. Appendix E.5 shows alignment evaluation prompts." 207 }, 208 "hyperparameters_reported": { 209 "applies": true, 210 "answer": true, 211 "justification": "Detailed hyperparameters are reported: nucleus sampling with top p=0.95, temperatures tested (0.2, 0.4, 0.8), n=200 samples per task. 
Training: 175-step linear warmup, cosine learning rate decay, 100 billion tokens, Adam with β1=0.9, β2=0.95, ε=10⁻⁸, weight decay 0.1 (Section 3.2)." 212 }, 213 "scaffolding_described": { 214 "applies": false, 215 "answer": false, 216 "justification": "No agentic scaffolding is used. Codex performs direct model inference from prompts without any tool use, retry logic, or multi-step workflow." 217 }, 218 "data_preprocessing_documented": { 219 "applies": true, 220 "answer": true, 221 "justification": "Section 3.1 documents data preprocessing in detail: collected from 54 million GitHub repos (179 GB unique Python files under 1 MB), filtered out auto-generated files, average line length >100, max line length >1000, low alphanumeric percentage, resulting in 159 GB final dataset. Tokenizer adaptations for whitespace are described (Section 3.2)." 222 } 223 }, 224 "limitations_and_scope": { 225 "limitations_section_present": { 226 "applies": true, 227 "answer": true, 228 "justification": "Section 6 is a dedicated 'Limitations' section discussing specific shortcomings. Additionally, Section 7 (Broader Impacts) provides extensive discussion of risks and limitations across over-reliance, misalignment, bias, security, and economic impacts." 229 }, 230 "threats_to_validity_specific": { 231 "applies": true, 232 "answer": true, 233 "justification": "Section 6 discusses specific threats: Codex is 'not sample efficient to train,' struggles with 'docstrings describing long chains of operations' (quantified in Figure 11), and has 'difficulty with binding operations to variables' (concrete code example provided). Appendix E discusses specific alignment threats with empirical evidence." 234 }, 235 "scope_boundaries_stated": { 236 "applies": true, 237 "answer": true, 238 "justification": "The paper explicitly states it focuses on 'generating standalone Python functions from docstrings' (Section 1). 
Section 6 notes the model 'struggles to parse through increasingly long and higher-level or system-level specifications.' The broader impacts section (7.5) states 'at their current level of capability, Codex models do not materially lower the barrier to entry for malware development.'" 239 } 240 }, 241 "data_integrity": { 242 "raw_data_available": { 243 "applies": true, 244 "answer": false, 245 "justification": "The HumanEval evaluation dataset is released, but the 159 GB training dataset is not. The training data cannot be independently verified. The model weights are also not released, preventing independent replication of results." 246 }, 247 "data_collection_described": { 248 "applies": true, 249 "answer": true, 250 "justification": "Section 3.1 describes data collection in detail: '54 million public software repositories hosted on GitHub, containing 179 GB of unique Python files under 1 MB,' collected in May 2020. Filtering criteria are specified. Section 4.1-4.2 describe supervised fine-tuning data collection from competitive programming sites and CI-traced projects." 251 }, 252 "recruitment_methods_described": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants are involved. Training data comes from public GitHub repositories. HumanEval problems were hand-written by the authors. The Codex-D docstring grading (Section 5) is done by the authors, not recruited participants." 256 }, 257 "data_pipeline_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "The data pipeline is documented: GitHub scraping (179 GB) → filtering by auto-generation, line length, alphanumeric content (159 GB final). For Codex-S training data: competitive programming problems (10,000 curated) + CI-traced functions (~40,000) → quality filtering using Codex-12B to remove ambiguous/stateful problems (Section 4.3)." 
261 } 262 }, 263 "conflicts_of_interest": { 264 "funding_disclosed": { 265 "applies": true, 266 "answer": true, 267 "justification": "The acknowledgments section states 'we thank GitHub for partnering to build GitHub Copilot and Microsoft Azure for supporting model training with infrastructure management,' disclosing corporate support for the research." 268 }, 269 "affiliations_disclosed": { 270 "applies": true, 271 "answer": true, 272 "justification": "Author affiliations are clearly listed: '1 OpenAI, San Francisco, California, USA. 2 Anthropic AI, San Francisco, California, USA. Work performed while at OpenAI. 3 Zipline, South San Francisco, California, USA. Work performed while at OpenAI.'" 273 }, 274 "funder_independent_of_outcome": { 275 "applies": true, 276 "answer": false, 277 "justification": "OpenAI has a direct financial interest in Codex's performance — the paper states 'A distinct production version of Codex powers GitHub Copilot.' OpenAI is evaluating its own commercial product. Microsoft Azure (infrastructure provider) and GitHub (partner for Copilot) also have financial stakes." 278 }, 279 "financial_interests_declared": { 280 "applies": true, 281 "answer": false, 282 "justification": "No competing interests or financial disclosure statement is present. The paper does not include a standard conflicts-of-interest declaration, despite OpenAI's commercial interest in Codex through GitHub Copilot." 
283 } 284 }, 285 "contamination": { 286 "training_cutoff_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Section 3.1 states 'Our training dataset was collected in May 2020 from 54 million public software repositories hosted on GitHub.'" 290 }, 291 "train_test_overlap_discussed": { 292 "applies": true, 293 "answer": true, 294 "justification": "Section 2.2 explicitly discusses this concern: 'It is important for these tasks to be hand-written, since our models are trained on a large fraction of GitHub, which already contains solutions to problems from a variety of sources. For example, there are more than ten public repositories containing solutions to Codeforces problems, which make up part of the recently proposed APPS dataset.'" 295 }, 296 "benchmark_contamination_addressed": { 297 "applies": true, 298 "answer": true, 299 "justification": "HumanEval was specifically designed to avoid contamination: hand-written problems created after the training data collection (May 2020). The paper notes 'Though not a guarantee for problem novelty, all problems were hand-written and not programmatically copied from existing sources' (Figure 2 caption). For APPS, contamination risk is acknowledged." 300 } 301 }, 302 "human_studies": { 303 "pre_registered": { 304 "applies": false, 305 "answer": false, 306 "justification": "No human participants in this study. The paper evaluates models on benchmarks." 307 }, 308 "irb_or_ethics_approval": { 309 "applies": false, 310 "answer": false, 311 "justification": "No human participants in this study." 312 }, 313 "demographics_reported": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants in this study." 317 }, 318 "inclusion_exclusion_criteria": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants in this study." 
322 }, 323 "randomization_described": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants in this study." 327 }, 328 "blinding_described": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants in this study." 332 }, 333 "attrition_reported": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants in this study." 337 } 338 }, 339 "cost_and_practicality": { 340 "inference_cost_reported": { 341 "applies": true, 342 "answer": false, 343 "justification": "No inference costs are reported. The paper generates 200 samples per problem for 164 problems across multiple model sizes and temperatures, but does not report the total inference cost, tokens consumed, or wall-clock time." 344 }, 345 "compute_budget_stated": { 346 "applies": true, 347 "answer": true, 348 "justification": "Section 7.6 states 'The original training of GPT-3-12B consumed hundreds of petaflop/s-days of compute, while fine-tuning it to create Codex-12B consumed a similar amount of compute.' Section 3.2 states training was for 100 billion tokens. The platform (Azure) is identified." 349 } 350 }, 351 "experimental_rigor": { 352 "seed_sensitivity_reported": { 353 "applies": true, 354 "answer": false, 355 "justification": "No seed sensitivity analysis is reported. Results are generated from a single set of 200 samples per problem with no discussion of how results vary across random seeds." 356 }, 357 "number_of_runs_stated": { 358 "applies": true, 359 "answer": true, 360 "justification": "The number of samples is clearly stated: 'we generate n ≥ k samples per task (in this paper, we use n = 200 and k ≤ 100)' (Section 2.1). For APPS evaluation, 1000 samples are generated per task." 
361 }, 362 "hyperparameter_search_budget": { 363 "applies": true, 364 "answer": false, 365 "justification": "While Figure 5 shows pass@k at different temperatures (0.2, 0.4, 0.8), no formal hyperparameter search budget is stated (number of configurations tried, search method, or total compute spent on search)." 366 }, 367 "best_config_selection_justified": { 368 "applies": true, 369 "answer": true, 370 "justification": "The temperature selection process is transparent: Figure 5 plots pass@k against temperature for various k values, and the optimal temperature is selected from the upper hull. They report 'the optimal temperature for pass@1 is T*=0.2 and the optimal temperature for pass@100 is T*=0.8' for the 679M model (Section 3.3)." 371 }, 372 "multiple_comparison_correction": { 373 "applies": false, 374 "answer": false, 375 "justification": "No formal statistical tests with p-values are performed, so multiple comparison correction is not applicable." 376 }, 377 "self_comparison_bias_addressed": { 378 "applies": true, 379 "answer": false, 380 "justification": "The authors compare their own Codex against GPT-Neo, GPT-J, and TabNine without acknowledging the potential bias of authors evaluating their own system. There is no discussion of how having full control over one system but not others might affect the comparison." 381 }, 382 "compute_budget_vs_performance": { 383 "applies": true, 384 "answer": true, 385 "justification": "Figures 1, 4, and 6 show performance as a function of model size (a proxy for compute). Figure 4 shows test loss follows a power law with model size. Figure 6 shows pass@1 and pass@100 scale as sigmoids in log-parameters." 
386 }, 387 "benchmark_construct_validity": { 388 "applies": true, 389 "answer": true, 390 "justification": "Section 2.1 provides extensive discussion of construct validity: argues functional correctness is superior to BLEU for measuring code generation, shows empirically that BLEU fails to distinguish correct from incorrect code (Figure 8), and discusses how functional correctness mirrors real software development practice (test-driven development)." 391 }, 392 "scaffold_confound_addressed": { 393 "applies": false, 394 "answer": false, 395 "justification": "No scaffolding is involved. All models are evaluated via direct inference with the same prompting approach, so there is no scaffold confound." 396 } 397 }, 398 "data_leakage": { 399 "temporal_leakage_addressed": { 400 "applies": true, 401 "answer": true, 402 "justification": "Training data was collected in May 2020 (Section 3.1). HumanEval was hand-written specifically for this evaluation and did not exist before the training data collection, inherently addressing temporal leakage. For APPS, the paper acknowledges that GitHub contains Codeforces solutions." 403 }, 404 "feature_leakage_addressed": { 405 "applies": true, 406 "answer": false, 407 "justification": "The paper does not explicitly discuss whether the evaluation setup leaks information through features. For example, function signatures and docstring style in HumanEval may provide implicit cues not available in real usage scenarios." 408 }, 409 "non_independence_addressed": { 410 "applies": true, 411 "answer": true, 412 "justification": "Section 2.2 addresses non-independence by hand-writing HumanEval: 'It is important for these tasks to be hand-written, since our models are trained on a large fraction of GitHub, which already contains solutions to problems from a variety of sources.' This ensures test problems are independent of training data." 
413 }, 414 "leakage_detection_method": { 415 "applies": true, 416 "answer": false, 417 "justification": "No concrete leakage detection method is applied (no canary strings, membership inference, or n-gram overlap analysis). While hand-writing HumanEval is a prevention strategy, the paper acknowledges 'Though not a guarantee for problem novelty' (Figure 2) and no verification method confirms the problems do not overlap with training data." 418 } 419 } 420 }, 421 "claims": [ 422 { 423 "claim": "Codex-12B solves 28.8% of HumanEval problems with a single sample, while GPT-3 solves 0% and GPT-J solves 11.4%.", 424 "evidence": "Table 1 reports pass@1 for Codex-12B at 28.81%, GPT-Neo models at 0.75-6.41%, and GPT-J at 11.62%. GPT-3 models achieve near 0% (Section 3.4).", 425 "supported": "strong" 426 }, 427 { 428 "claim": "With 100 samples per problem, Codex-S solves 77.5% of HumanEval problems.", 429 "evidence": "Figure 1 shows Codex-S pass@100 at 77.5%. Section 4.5 confirms the result and shows consistent improvements over Codex across model sizes.", 430 "supported": "strong" 431 }, 432 { 433 "claim": "Supervised fine-tuning on correctly implemented functions (Codex-S) improves pass@1 by an average of 6.5 percentage points and pass@100 by 15.1 percentage points over Codex.", 434 "evidence": "Section 4.5 reports these margins averaged across model sizes, with Figure 10 showing the comparison visually.", 435 "supported": "strong" 436 }, 437 { 438 "claim": "BLEU score is not a reliable indicator of functional correctness for code generation.", 439 "evidence": "Figure 8 shows significant overlap between BLEU score distributions of correct and incorrect solutions for 4 HumanEval tasks, demonstrating that functionally inequivalent programs can have higher BLEU scores than correct ones (Section 3.3).", 440 "supported": "strong" 441 }, 442 { 443 "claim": "Model performance on chained operations degrades exponentially with docstring complexity.", 444 "evidence": "Figure 11 shows 
pass rate dropping by roughly a factor of 2-3 per additional chained component in synthetic tasks (Section 6).", 445 "supported": "strong" 446 }, 447 { 448 "claim": "Test loss after code fine-tuning follows a power law with model size.", 449 "evidence": "Figure 4 shows the power law relationship with functional form (N / 5.92×10^7)^-0.13, closely fitting the empirical data across model sizes (Section 3.3).", 450 "supported": "strong" 451 }, 452 { 453 "claim": "Mean token log probability is an effective heuristic for selecting the best sample from multiple generations.", 454 "evidence": "Figure 7 shows mean log-probability ranking significantly outperforms random selection and back-translation ranking, reaching 44.5% when selecting from 100 samples (Section 3.3).", 455 "supported": "strong" 456 }, 457 { 458 "claim": "Codex models frequently generate insecure cryptographic code configurations.", 459 "evidence": "Figure 15 shows Codex produces insecure RSA keys (<2048 bits) or AES contexts (ECB mode) in a significant fraction of samples across model sizes (Appendix G.3). However, the study covers only two cryptographic scenarios.", 460 "supported": "moderate" 461 }, 462 { 463 "claim": "When prompted with subtly buggy code, Codex produces worse code than it is capable of, and this gap increases with model size.", 464 "evidence": "Figure 12 shows the gap between performance with correct context vs buggy context grows with model size. Adding an instruction to write correct code helps partially but does not eliminate the gap (Appendix E).", 465 "supported": "strong" 466 } 467 ], 468 "methodology_tags": [ 469 "benchmark-eval" 470 ], 471 "key_findings": "Codex, a GPT model fine-tuned on 159 GB of GitHub Python code, solves 28.8% of hand-written HumanEval problems with a single sample, vastly outperforming GPT-3 (0%) and GPT-J (11.4%). Repeated sampling is surprisingly effective: 100 samples per problem yield 77.5% pass rate with Codex-S. 
The paper establishes functional correctness (pass@k) as the appropriate metric over BLEU, demonstrates power law scaling with model size, and provides extensive analysis of limitations including exponential degradation with docstring complexity, misalignment that worsens with scale, and frequent generation of insecure code.", 472 "red_flags": [ 473 { 474 "flag": "Company evaluating own product", 475 "detail": "OpenAI authors evaluate Codex, which powers their commercial product GitHub Copilot. The paper acknowledges the relationship ('A distinct production version of Codex powers GitHub Copilot') but does not include a formal conflicts-of-interest statement. The funder (OpenAI) has direct financial interest in demonstrating strong Codex performance." 476 }, 477 { 478 "flag": "No uncertainty quantification on main results", 479 "detail": "Despite developing an unbiased estimator for pass@k, the paper reports all main results as point estimates without confidence intervals, error bars, or variance across runs. With 164 problems in HumanEval, the standard error on aggregate pass rates could be substantial." 480 }, 481 { 482 "flag": "Training data not released", 483 "detail": "The 159 GB training dataset and model weights are not released. While the evaluation dataset (HumanEval) is public, the core claims about Codex's performance cannot be independently replicated because neither the training data nor the model is available." 484 } 485 ], 486 "cited_papers": [ 487 { 488 "title": "Language Models are Few-Shot Learners", 489 "authors": [ 490 "Brown, T. B.", 491 "Mann, B.", 492 "Ryder, N.", 493 "et al." 494 ], 495 "year": 2020, 496 "arxiv_id": "2005.14165", 497 "relevance": "Foundation GPT-3 model that Codex is fine-tuned from; baseline for code generation capability comparison." 498 }, 499 { 500 "title": "Measuring Coding Challenge Competence with APPS", 501 "authors": [ 502 "Hendrycks, D.", 503 "Basart, S.", 504 "Kadavath, S.", 505 "et al." 
506 ], 507 "year": 2021, 508 "arxiv_id": "2105.09938", 509 "relevance": "Coding challenge benchmark used to evaluate Codex alongside HumanEval; measures functional correctness on competitive programming tasks." 510 }, 511 { 512 "title": "GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model", 513 "authors": [ 514 "Wang, B.", 515 "Komatsuzaki, A." 516 ], 517 "year": 2021, 518 "relevance": "Open-source language model baseline for code generation, trained on The Pile with 8% GitHub code." 519 }, 520 { 521 "title": "GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow", 522 "authors": [ 523 "Black, S.", 524 "Gao, L.", 525 "Wang, P.", 526 "Leahy, C.", 527 "Biderman, S." 528 ], 529 "year": 2021, 530 "relevance": "Open-source GPT-style model serving as baseline for code generation; trained on The Pile." 531 }, 532 { 533 "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", 534 "authors": [ 535 "Feng, Z.", 536 "Guo, D.", 537 "Tang, D.", 538 "et al." 539 ], 540 "year": 2020, 541 "relevance": "Pre-trained code representation model using BERT objective on docstring-function pairs." 542 }, 543 { 544 "title": "Scaling Laws for Neural Language Models", 545 "authors": [ 546 "Kaplan, J.", 547 "McCandlish, S.", 548 "Henighan, T.", 549 "et al." 550 ], 551 "year": 2020, 552 "arxiv_id": "2001.08361", 553 "relevance": "Establishes power law scaling relationships that Codex's performance also follows after code fine-tuning." 554 }, 555 { 556 "title": "SPoC: Search-based Pseudocode to Code", 557 "authors": [ 558 "Kulal, S.", 559 "Pasupat, P.", 560 "Chandra, K.", 561 "et al." 562 ], 563 "year": 2019, 564 "relevance": "Introduced the pass@k metric for evaluating functional correctness of synthesized code." 565 }, 566 { 567 "title": "Unsupervised Translation of Programming Languages", 568 "authors": [ 569 "Lachaux, M.-A.", 570 "Rozière, B.", 571 "Chanussot, L.", 572 "Lample, G." 
573 ], 574 "year": 2020, 575 "arxiv_id": "2006.03511", 576 "relevance": "Demonstrated functional correctness as a better evaluation metric than BLEU for code translation." 577 }, 578 { 579 "title": "Extracting Training Data from Large Language Models", 580 "authors": [ 581 "Carlini, N.", 582 "Tramèr, F.", 583 "Wallace, E.", 584 "et al." 585 ], 586 "year": 2021, 587 "relevance": "Demonstrates privacy risks of training data memorization in large language models, applicable to code models trained on public repositories." 588 }, 589 { 590 "title": "You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion", 591 "authors": [ 592 "Schuster, R.", 593 "Song, C.", 594 "Tromer, E.", 595 "Shmatikov, V." 596 ], 597 "year": 2020, 598 "relevance": "Demonstrates data poisoning attacks on code completion models, a supply chain security risk for code generation." 599 }, 600 { 601 "title": "In-IDE Code Generation from Natural Language: Promise and Challenges", 602 "authors": [ 603 "Xu, F. F.", 604 "Vasilescu, B.", 605 "Neubig, G." 606 ], 607 "year": 2021, 608 "arxiv_id": "2101.11149", 609 "relevance": "Evaluates capabilities and challenges of code generation in IDE settings, directly relevant to code generation evaluation." 610 }, 611 { 612 "title": "Learning Autocompletion from Real-World Datasets", 613 "authors": [ 614 "Aye, G. A.", 615 "Kim, S.", 616 "Li, H." 617 ], 618 "year": 2021, 619 "relevance": "Reports on Facebook's internal code autocomplete tool, providing industry perspective on code generation deployment." 620 }, 621 { 622 "title": "The Pile: An 800GB Dataset of Diverse Text for Language Modeling", 623 "authors": [ 624 "Gao, L.", 625 "Biderman, S.", 626 "Black, S.", 627 "et al." 628 ], 629 "year": 2020, 630 "relevance": "Training dataset for GPT-Neo and GPT-J baselines, containing 8% GitHub code that enables programming capabilities." 
631 } 632 ], 633 "engagement_factors": { 634 "practical_relevance": { 635 "score": 3, 636 "justification": "Codex directly powers GitHub Copilot, a tool millions of developers use daily, and the paper introduces HumanEval which became a standard benchmark." 637 }, 638 "surprise_contrarian": { 639 "score": 2, 640 "justification": "The finding that drawing 100 samples per problem lifts the solve rate from 28.8% with a single sample to 70.2% (and to 77.5% with Codex-S) was genuinely surprising and counterintuitive to most practitioners." 641 }, 642 "fear_safety": { 643 "score": 2, 644 "justification": "Extensive analysis of insecure code generation, misalignment that worsens with scale, and potential for malware generation makes safety a major theme." 645 }, 646 "drama_conflict": { 647 "score": 1, 648 "justification": "OpenAI evaluating its own commercial product raises mild conflict, but the paper is more celebratory than controversial." 649 }, 650 "demo_ability": { 651 "score": 2, 652 "justification": "HumanEval benchmark is publicly released and Codex was available via API, though the model weights and training data were not released." 653 }, 654 "brand_recognition": { 655 "score": 3, 656 "justification": "From OpenAI, powers GitHub Copilot used by millions, authored by figures including Dario Amodei, Sam McCandlish, and Ilya Sutskever." 657 } 658 } 659 }