scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32413B)
      1 {
      2   "paper": {
      3     "title": "MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation",
      4     "authors": [
      5       "Federico Cassano",
      6       "John Gouwar",
      7       "Daniel Nguyen",
      8       "Sydney Nguyen",
      9       "Luna Phipps-Costin",
     10       "Donald Pinckney",
     11       "Ming-Ho Yee",
     12       "Yangtian Zi",
     13       "Carolyn Jane Anderson",
     14       "Molly Q Feldman",
     15       "Arjun Guha",
     16       "Michael Greenberg",
     17       "Abhinav Jangda"
     18     ],
     19     "year": 2023,
     20     "venue": "IEEE Transactions on Software Engineering",
     21     "arxiv_id": "2208.08227",
     22     "doi": "10.1109/TSE.2023.3267446"
     23   },
     24   "scan_version": 2,
     25   "active_modules": ["experimental_rigor", "data_leakage"],
     26   "methodology_tags": ["benchmark-eval"],
     27   "key_findings": "MultiPL-E translates Python code generation benchmarks (HumanEval, MBPP) to 18 additional programming languages using lightweight compilers. Codex matches or exceeds Python performance on JavaScript, C++, TypeScript, and Scala. Performance correlates with language popularity, but some niche languages (Lua, Julia) perform surprisingly well. Perplexity does not correlate with code correctness, and static type annotations neither help nor hinder performance overall.",
     28   "checklist": {
     29     "artifacts": {
     30       "code_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper states 'The MultiPL-E system, dataset, and tutorial are available at github.com/nuprl/MultiPL-E' (§1, §9). A working repository URL is provided."
     34       },
     35       "data_released": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The translated benchmark datasets are released at the same GitHub repository. The underlying HumanEval and MBPP benchmarks are also publicly available. Appendix B confirms 'The dataset is publicly available at https://github.com/nuprl/MultiPL-E.'"
     39       },
     40       "environment_specified": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper describes a containerized sandbox for evaluation (§1) and specifies exact language versions for all 18 target languages in Appendix A (e.g., Python 3.10, C++17 with g++17, Java OpenJDK 17, Rust 1.59.0, etc.). The repository includes the containerized evaluation framework."
     44       },
     45       "reproduction_instructions": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Appendix A states 'Technical information regarding running experiments and evaluating generated programs can be found at github.com/nuprl/MultiPL-E.' The paper also includes a tutorial reference. Specific experimental parameters (temperature, top-p, sample count) are stated in §4.2."
     49       }
     50     },
     51     "statistical_methodology": {
     52       "confidence_intervals_or_error_bars": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper uses binomial mixed-effects models (Appendix C) that report coefficient estimates with standard errors (e.g., 'β̂ = -2.59 (+/- 0.3)'). These standard errors provide uncertainty quantification for all comparisons."
     56       },
     57       "significance_tests": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Extensive mixed-effects models with p-values are reported throughout §5-6 and Appendix C. For example, differences between language frequency categories are tested with specific p-values (§5.1.3: 'p = 0.006; p < 0.001; p = 0.002'). The lme4 library in R is used for fitting (Appendix C)."
     61       },
     62       "effect_sizes_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Pass@1 rates are reported as absolute percentages with differences. For example, §5.1.2: 'Codex's performance on JavaScript is better than its performance on Python... (+2.3%; p = 0.43).' Ablation results report percentage differences (§6.1: 'translating natural language terminology has a small but reliable effect'). Absolute rates provide baseline context."
     66       },
     67       "sample_size_justified": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper justifies the 200-sample size: 'we take 200 completions at each temperature and calculate average pass rate using the unbiased sampling estimator' (§4.2). Footnote 6 adds: 'pass@1 rates appear to stabilize around 20 samples, suggesting that future work could achieve a stable estimate with a less computationally costly sample size.'"
     71       },
     72       "variance_reported": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The unbiased sampling estimator from Chen et al. [1] computes expected pass rates from 200 stochastic completions per problem, inherently accounting for sampling variance. The mixed-effects models in Appendix C report standard errors for all coefficients and treat problem number as a random effect to account for per-problem variability."
     76       }
     77     },
     78     "evaluation_design": {
     79       "baselines_included": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Three models are compared against each other (Codex, CodeGen, InCoder) and against previously reported results. The paper replicates Python results from prior work (§5.1.1, §5.2.2) as a validation baseline."
     83       },
     84       "baselines_contemporary": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Codex (codex-davinci-002), CodeGen (16.1B), and InCoder (6.7B) were state-of-the-art code generation models at the time of evaluation (2022). These represent the best available models from three different research groups."
     88       },
     89       "ablation_study": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "§6.1 presents a systematic ablation study of MultiPL-E's translation components with four conditions: Original Prompt, Test-Only Translation, Full Translation, and No Doctests. Additional ablations in §6.2 (type annotations), §6.3 (comment style, argument naming), and §6.4 (language features) isolate individual factors."
     93       },
     94       "multiple_metrics": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper reports pass@1, pass@10, and pass@100 (§4.2). It also examines perplexity (§5.1.4) and provides fine-grained error categorization (§6.5, Appendix D)."
     98       },
     99       "human_evaluation": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "§6.5 and Appendix D present a manual error analysis by language experts for four languages (Python, C#, Swift, Racket). 'A language expert then performed a manual investigation of a subset of the completions to derive a set of common error types' (Appendix D.1). This constitutes human evaluation of model outputs."
    103       },
    104       "held_out_test_set": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Unit tests are explicitly hidden from the model. For MBPP, the paper modifies the original format: 'We therefore remove the assertions from the MBPP prompts so that we can use them as hidden unit tests' (§3.1). HumanEval also uses hidden tests by design."
    108       },
    109       "per_category_breakdown": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Results are broken down by language (Fig 6), language frequency category (Fig 7, 9), data type features (Fig 11), translation component (Fig 10), and individual error categories (Tables 25-28). Statistical models are also fitted per-language (Appendix C)."
    113       },
    114       "failure_cases_discussed": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "§6.5 and Appendix D provide extensive error analysis across four languages with specific error categories (RUNTIME, STATIC, TYPE, LANGUAGE, MODEL), concrete code examples of failures (Figures 17-28), and counts of each error type."
    118       },
    119       "negative_results_reported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Several negative results are reported: perplexity does not correlate with code correctness (§5.1.4); type annotations have no overall effect on performance (§6.2, p=0.33); Codex generates Markdown instead of Racket code (§6.5); Python type annotations don't improve Python performance (§6.2.3)."
    123       }
    124     },
    125     "claims_and_evidence": {
    126       "abstract_claims_supported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The abstract claims are supported: multi-language evaluation of three models (§5), Codex matching/exceeding Python performance for several languages (§5.1.2, Fig 6), exploration of language frequency and features (§5.1.3, §6.4), and scalability of the approach (18 languages with ~200 LOC compilers, Table 1)."
    130       },
    131       "causal_claims_justified": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Causal claims from ablation studies (e.g., 'translating doctests and Python-specific terminology has little impact on better-performing languages' §6.1) are justified by controlled single-variable manipulation. Correlational findings (language frequency vs performance) are appropriately described as correlations, not causes."
    135       },
    136       "generalization_bounded": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper bounds its generalizations: §7 notes 'the (translated) benchmarks may not be representative of the kinds of problems that programmers typically solve in each language' and 'performance on benchmarks may not accurately represent real-world performance.' Claims are tied to specific models, benchmarks, and languages tested."
    140       },
    141       "alternative_explanations_discussed": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "§7 discusses multiple alternative explanations: prompts may not be optimally designed for each language, sampling configurations may not be optimal for non-Python, Codex training set is not public so contamination cannot be ruled out, and benchmark tasks may not be representative of real-world programming."
    145       },
    146       "proxy_outcome_distinction": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The paper clearly distinguishes between what it measures (pass@k on translated benchmarks) and the broader goal (multi-language code generation capability). §7 explicitly warns that benchmark performance may not represent real-world performance. The paper frames its results in terms of benchmark pass rates, not broad capability claims."
    150       }
    151     },
    152     "setup_transparency": {
    153       "model_versions_specified": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Specific model versions are provided: 'codex-davinci-002' (§4.1, §5.1.1), InCoder 6.7B parameter model (§4.1), and 'CodeGen 16.1B parameter' multilingual model (§4.1). These are specific enough to identify the exact models used."
    157       },
    158       "prompts_provided": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Example prompts are shown in Figures 1, 3, 4, 5. Full translation details with stop tokens and language-specific decisions are documented in Appendix A. The complete prompt generation system is released as open-source code at the repository."
    162       },
    163       "hyperparameters_reported": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "§4.2 states: 'We calculate pass@1 with temperature 0.2, and use temperature 0.8 for pass@10 and pass@100.' Top-p sampling is discussed in §2.2. Stop sequences are detailed per-language in Appendix A."
    167       },
    168       "scaffolding_described": {
    169         "applies": false,
    170         "answer": false,
    171         "justification": "No agentic scaffolding is used. The models are queried directly with prompts and generate completions — no tools, feedback loops, or multi-step workflows."
    172       },
    173       "data_preprocessing_documented": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "§3 documents the entire compilation pipeline: how unit tests are translated (§3.3.1), type translation and inference (§3.3.2), doctest translation (§3.3.3), terminology translation (§3.3.4). §3.4 documents what was excluded (3 HumanEval benchmarks) and modified (2 with randomized tests). MBPP modifications are described in §3.1."
    177       }
    178     },
    179     "limitations_and_scope": {
    180       "limitations_section_present": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "§7 'Threats to Validity' provides substantive discussion of three categories of threats: benchmark representativeness, prompt design sensitivity, and sampling configuration."
    184       },
    185       "threats_to_validity_specific": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "§7 discusses threats specific to this study: 'we evaluate both scripting languages and systems languages on the same task, but programmers frequently use these languages for very different tasks'; prompt sensitivity could improve individual language results; sampling settings optimized for Python may not be optimal for others."
    189       },
    190       "scope_boundaries_stated": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "§7 explicitly states scope limitations: benchmarks are 'a mix of basic programming problems and straightforward interview questions' that 'may not accurately represent real-world performance.' §3.4 states which languages cannot be supported (C, SQL) and why. Footnote 10 acknowledges sampler configuration exploration is 'beyond the scope of an academic group.'"
    194       }
    195     },
    196     "data_integrity": {
    197       "raw_data_available": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The full dataset, benchmark translations, and evaluation framework are released at github.com/nuprl/MultiPL-E. Appendix B confirms: 'The dataset is publicly available' and 'The original data for both datasets is publicly available.'"
    201       },
    202       "data_collection_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The data comes from two existing benchmarks (HumanEval, MBPP) whose provenance is described. §3 documents how translations were produced via the compilation pipeline. Appendix B provides a full datasheet following Gebru et al. [44]."
    206       },
    207       "recruitment_methods_described": {
    208         "applies": false,
    209         "answer": false,
    210         "justification": "No human participants. Data comes from standard benchmarks (HumanEval, MBPP) that are publicly available."
    211       },
    212       "data_pipeline_documented": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The pipeline from Python benchmarks to translated versions is fully documented in §3. §3.4 documents what was excluded (3 HumanEval problems) and modified (2 problems). Appendix B states the timeframe (May-October 2022) and that preprocessing was manual with original data preserved."
    216       }
    217     },
    218     "conflicts_of_interest": {
    219       "funding_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "§10 states: 'This work was partially supported by the National Science Foundation grant CCF-2052696.' GPU loans from Steven Holtzen and Joydeep Biswas and support from Northeastern Research Computing are acknowledged."
    223       },
    224       "affiliations_disclosed": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Authors are listed with a note that they are alphabetically ordered with students first, then faculty. The authors are academic researchers, not affiliated with the companies whose models they evaluate."
    228       },
    229       "funder_independent_of_outcome": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "The National Science Foundation is the funder and has no financial interest in the comparative performance of Codex, CodeGen, or InCoder. This is independent academic funding."
    233       },
    234       "financial_interests_declared": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No competing interests or financial interests statement is present in the paper. While the authors appear to be academic researchers with no obvious conflicts, absence of a declaration is not the same as absence of conflict."
    238       }
    239     },
    240     "contamination": {
    241       "training_cutoff_stated": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "Training cutoff dates are not stated for any of the three models. For Codex: 'Details of its training set and size are not public' (§4.1). InCoder and CodeGen training data is described in terms of sources but no cutoff dates are given."
    245       },
    246       "train_test_overlap_discussed": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "§5.1.2 explicitly discusses train/test overlap: 'The Codex training set is not public; it is possible that the latest model has been trained on solutions to the HumanEval benchmarks in Python, and this could be inflating its performance.' They argue the translated versions are new datasets, mitigating this concern."
    250       },
    251       "benchmark_contamination_addressed": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "§5.1.2 addresses contamination: 'MultiPL-HumanEval is a new dataset for the 18 other languages. That Codex matches or exceeds its Python performance on some of these new languages suggests a negligible impact of any train/test overlap.' The novelty of the translated benchmarks is presented as a natural contamination mitigation."
    255       }
    256     },
    257     "human_studies": {
    258       "pre_registered": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study. The evaluation is entirely automated using code generation models and unit tests."
    262       },
    263       "irb_or_ethics_approval": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants. Appendix B confirms: 'Not applicable. The dataset adapts an open source dataset released under the terms of the MIT license.'"
    267       },
    268       "demographics_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "inclusion_exclusion_criteria": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "randomization_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "blinding_described": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       },
    288       "attrition_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "No human participants in this study."
    292       }
    293     },
    294     "cost_and_practicality": {
    295       "inference_cost_reported": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No inference costs, API costs, or wall-clock times are reported despite generating 200 completions per problem across 19 languages for 3 models (over 2 million completions total). Footnote 10 acknowledges resource intensity but does not quantify it."
    299       },
    300       "compute_budget_stated": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No total compute budget is stated. The paper thanks individuals for GPU loans and Northeastern Research Computing for support (§10), but does not quantify GPU hours, API costs, or total computation."
    304       }
    305     },
    306     "experimental_rigor": {
    307       "seed_sensitivity_reported": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "While 200 stochastic completions per problem are sampled, the paper does not report variation across independent runs or random seeds. The unbiased estimator provides a point estimate but no seed sensitivity analysis is presented."
    311       },
    312       "number_of_runs_stated": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "§4.2 clearly states: 'we take 200 completions at each temperature.' Two temperature settings are used (0.2 for pass@1, 0.8 for pass@10 and pass@100)."
    316       },
    317       "hyperparameter_search_budget": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Temperature values (0.2, 0.8) and top-p sampling are adopted from prior work [1] without reporting any search budget. Footnote 10 acknowledges that 'The original experiment on sampler configurations by Chen et al. [1] has not been repeated by any lab' but does not attempt their own search."
    321       },
    322       "best_config_selection_justified": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "The sampling configuration follows established practice from Chen et al. [1]: 'We use the same sampling configuration that is used in most prior work on code generation' (§7). The paper explicitly states temperature 0.2 for pass@1 (§4.2) rather than selecting the best of multiple configurations."
    326       },
    327       "multiple_comparison_correction": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Appendix C reports dozens of statistical comparisons across languages, models, and experimental conditions (Tables 2-24) without applying Bonferroni, Holm, or any other multiple comparison correction. Many comparisons are reported as significant at p < 0.05."
    331       },
    332       "self_comparison_bias_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The authors evaluate their own benchmark translation system but do not acknowledge potential bias from evaluating their own work. They compare against MBXP (§8) and note differences in methodology, but do not discuss how authorship of MultiPL-E might influence their evaluation."
    336       },
    337       "compute_budget_vs_performance": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The three models differ substantially in size (InCoder 6.7B, CodeGen 16.1B, Codex unknown) but performance is not analyzed as a function of compute budget or model size. The compute disparity is not controlled for in comparisons."
    341       },
    342       "benchmark_construct_validity": {
    343         "applies": true,
    344         "answer": true,
    345         "justification": "§7 explicitly discusses construct validity: 'the (translated) benchmarks may not be representative of the kinds of problems that programmers typically solve in each language. For example, we evaluate both scripting languages and systems languages on the same task, but programmers frequently use these languages for very different tasks.' §5.2.1 also compares HumanEval and MBPP difficulty."
    346       },
    347       "scaffold_confound_addressed": {
    348         "applies": false,
    349         "answer": false,
    350         "justification": "No scaffolding is involved. Models are queried directly with prompts and produce completions without any agentic framework or tool use."
    351       }
    352     },
    353     "data_leakage": {
    354       "temporal_leakage_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "§5.1.2 discusses temporal leakage: the original HumanEval Python solutions may be in Codex's training data since HumanEval was published before the model was trained. The paper argues the translated 18-language versions are new and thus not contaminated."
    358       },
    359       "feature_leakage_addressed": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "The paper explicitly addresses feature leakage in MBPP: the original benchmark 'prompts the model with the same unit tests it uses to test the generated code' (§3.1). MultiPL-E fixes this by hiding the test cases. They also critique MBXP for this same leakage (§8)."
    363       },
    364       "non_independence_addressed": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "The paper does not discuss whether translated problems are statistically independent from training data beyond the Python contamination discussion. No analysis of structural similarity between benchmark problems and training code is provided."
    368       },
    369       "leakage_detection_method": {
    370         "applies": true,
    371         "answer": false,
    372         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The paper argues that translated benchmarks are new datasets as a mitigation strategy, but does not empirically verify the absence of leakage."
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "Codex matches or exceeds Python performance on JavaScript, C++, Scala, and TypeScript on MultiPL-HumanEval",
    379       "evidence": "§5.1: Mixed-effects model finds no reliable differences between Python and these 4 languages (p=0.43 for JavaScript, p=0.77 for C++, p=0.10 for Scala, p=0.39 for TypeScript). Figure 6 shows pass@1 rates. JavaScript actually exceeds Python by 2.3%.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "There is no strong correlation between model perplexity and correctness of generated code",
    384       "evidence": "§5.1.4, Figure 8: Perplexity is highest for JavaScript and TypeScript while Codex performs best on these. However, the comparison uses perplexity scores from an older Codex model reported by Xu et al. [7].",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Code generation performance correlates with language popularity, with some niche language exceptions",
    389       "evidence": "§5.1.3: Mixed-effects model finds reliable differences between HIGH and other frequency categories (MEDIUM p=0.006, LOW p<0.001, NICHE p=0.002). However, Lua (NICHE) is 9th-best and Scala (LOW) performs well.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Static type-checking neither helps nor hinders code generation performance",
    394       "evidence": "§6.2: Mixed-effects model finds no significant effect of type annotations on Codex pass@1 for HumanEval (p=0.33) or MBPP (p=0.23). Removing Python type annotations has no effect (p=0.23). However, type precision does matter for TypeScript: replacing precise types with Any lowers pass@1 (-2.5%, p<0.001).",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Code generation performance is sensitive to prompt design choices for both niche and popular languages",
    399       "evidence": "§6.1 ablation: Significant differences between Full Translation and Test-Only (p=0.03) and between No Doctests and Test-Only (p<0.001). §6.3: Multi-line comments improve Racket (+1.9%, p<0.001) but hurt PHP (-3.1%, p=0.001). Argument naming improves Perl by 8% (p<0.001).",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "MBPP is a less challenging benchmark than HumanEval",
    404       "evidence": "§5.2.1: MultiPL-MBPP pass@1 is higher than MultiPL-HumanEval for all but 6 of 57 model/language pairs, despite MBPP lacking doctests which are shown to help performance.",
    405       "supported": "strong"
    406     }
    407   ],
    408   "red_flags": [
    409     {
    410       "flag": "Codex training data opacity",
    411       "detail": "Codex's training data and model size are not public (§4.1). The paper cannot rule out that codex-davinci-002 was trained on HumanEval solutions or the translated benchmark problems. The contamination analysis relies on the argument that translations are new, but cannot verify this empirically."
    412     },
    413     {
    414       "flag": "No compute costs reported",
    415       "detail": "The evaluation generates on the order of 12 million completions (200 samples × ~560 problems × 19 languages × 3 models × 2 temperatures ≈ 12.8 million) but reports no API costs, GPU hours, or wall-clock time. This makes cost-benefit assessment impossible and hinders reproducibility for resource-constrained researchers."
    416     },
    417     {
    418       "flag": "No multiple comparison correction",
    419       "detail": "Appendix C contains dozens of statistical tests across languages, models, and experimental conditions without any family-wise error rate correction. With ~20 language comparisons per model, some significant results could be spurious."
    420     },
    421     {
    422       "flag": "Model size confound in cross-model comparisons",
    423       "detail": "InCoder (6.7B), CodeGen (16.1B), and Codex (unknown size) are compared directly without controlling for model size or compute. Performance differences could partly reflect scale rather than training approach."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Evaluating large language models trained on code",
    429       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    430       "year": 2021,
    431       "arxiv_id": "2107.03374",
    432       "relevance": "Introduces HumanEval benchmark and Codex model, the primary benchmark and one of three models evaluated in this paper."
    433     },
    434     {
    435       "title": "Program synthesis with large language models",
    436       "authors": ["J. Austin", "A. Odena", "M. Nye"],
    437       "year": 2021,
    438       "arxiv_id": "2108.07732",
    439       "relevance": "Introduces MBPP benchmark, one of the two benchmarks translated by MultiPL-E."
    440     },
    441     {
    442       "title": "CodeGen: An open large language model for code with multi-turn program synthesis",
    443       "authors": ["E. Nijkamp", "B. Pang", "H. Hayashi"],
    444       "year": 2022,
    445       "arxiv_id": "2203.13474",
    446       "relevance": "Introduces CodeGen, one of the three code generation models evaluated across 19 languages."
    447     },
    448     {
    449       "title": "InCoder: A generative model for code infilling and synthesis",
    450       "authors": ["D. Fried", "A. Aghajanyan", "J. Lin"],
    451       "year": 2022,
    452       "arxiv_id": "2204.05999",
    453       "relevance": "Introduces InCoder, one of the three code generation models evaluated. Demonstrates code infilling capability."
    454     },
    455     {
    456       "title": "A systematic evaluation of large language models of code",
    457       "authors": ["F. F. Xu", "U. Alon", "G. Neubig", "V. J. Hellendoorn"],
    458       "year": 2022,
    459       "relevance": "Evaluates code LLMs on 12 languages using perplexity (not unit tests), providing the perplexity scores used in MultiPL-E's correlation analysis."
    460     },
    461     {
    462       "title": "Productivity assessment of neural code completion",
    463       "authors": ["A. Ziegler", "E. Kalliamvakou", "X. A. Li"],
    464       "year": 2022,
    465       "relevance": "Evaluates developer productivity with neural code completion tools, relevant to understanding real-world code generation impact."
    466     },
    467     {
    468       "title": "Multi-lingual evaluation of code generation models",
    469       "authors": ["B. Athiwaratkun", "S. K. Gouda", "Z. Wang"],
    470       "year": 2022,
    471       "arxiv_id": "2210.14868",
    472       "relevance": "MBXP: concurrent multi-language benchmark effort. MultiPL-E critiques its methodology (visible test cases, greedy decoding, less faithful type translation)."
    473     },
    474     {
    475       "title": "CodeT: Code generation with generated tests",
    476       "authors": ["B. Chen", "F. Zhang", "A. Nguyen"],
    477       "year": 2022,
    478       "arxiv_id": "2207.10397",
    479       "relevance": "Uses generated tests to improve code generation, reports code-davinci-002 results used for Python replication comparison."
    480     },
    481     {
    482       "title": "Competition-level code generation with AlphaCode",
    483       "authors": ["Y. Li", "D. Choi", "J. Chung"],
    484       "year": 2022,
    485       "arxiv_id": "2203.07814",
    486       "relevance": "Demonstrates code generation on competitive programming problems, a related but distinct evaluation paradigm."
    487     },
    488     {
    489       "title": "Measuring coding challenge competence with APPS",
    490       "authors": ["D. Hendrycks", "S. Basart", "S. Kadavath"],
    491       "year": 2021,
    492       "arxiv_id": "2105.09938",
    493       "relevance": "APPS benchmark for code generation from competitive programming, identified as a potential future translation target for MultiPL-E."
    494     },
    495     {
    496       "title": "GPT-NeoX-20B: An open-source autoregressive language model",
    497       "authors": ["S. Black", "S. Biderman", "E. Hallahan"],
    498       "year": 2022,
    499       "arxiv_id": "2204.06745",
    500       "relevance": "Open-source large language model used for code generation tasks, relevant to the code generation model landscape."
    501     },
    502     {
    503       "title": "PaLM: Scaling language modeling with Pathways",
    504       "authors": ["A. Chowdhery", "S. Narang", "J. Devlin"],
    505       "year": 2022,
    506       "arxiv_id": "2204.02311",
    507       "relevance": "Large language model with code generation capabilities (PaLM-Coder), noted as unavailable for academic evaluation."
    508     }
    509   ]
    510 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs