scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30836B)
      1 {
      2   "paper": {
      3     "title": "Measuring The Impact Of Programming Language Distribution",
      4     "authors": [
      5       "Gabriel Orlanski",
      6       "Kefan Xiao",
      7       "Xavier Garcia",
      8       "Jeffrey Hui",
      9       "Joshua Howland",
     10       "Jonathan Malmaud",
     11       "Jacob Austin",
     12       "Rishabh Singh",
     13       "Michele Catasta"
     14     ],
     15     "year": 2023,
     16     "venue": "International Conference on Machine Learning",
     17     "arxiv_id": "2302.01973",
     18     "doi": "10.48550/arXiv.2302.01973"
     19   },
     20   "scan_version": 2,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "BabelCode is open-sourced at https://github.com/google-research/babelcode (footnote 1 and abstract). The paper states 'BabelCode is open-sourced, has an extensive test suite, and supports evaluating four benchmarks in 14 languages.'"
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The evaluation benchmarks (BC-HumanEval, BC-MBPP, BC-Transcoder, TP3) are released through the BabelCode framework. The underlying benchmarks (HumanEval, MBPP, Transcoder, P3) are publicly available. The curated training corpus is not released, but the evaluation data is."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions T5X and SeqIO frameworks but does not provide environment setup details sufficient to recreate the software environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. The BabelCode repo likely contains some instructions, but the paper itself does not include a 'Reproducing Results' section or specific commands to replicate the training and evaluation pipeline."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results in Tables 4-11 are point estimates (pass@k values) with no confidence intervals or error bars. No uncertainty quantification is provided for any result."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper makes many comparative claims (e.g., '48.17% improvement compared to PaLM-Coder 8B') based solely on comparing point estimates. No statistical significance tests are applied to any comparison."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Effect sizes are reported as relative improvements with baseline context throughout. For example, '12.34% higher pass@k across all tasks' (abstract), '66.48% better pass@k on low-resource languages' (abstract), and detailed per-language relative differences in Tables 12-13."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification is provided for the number of problems per benchmark (161 for BC-HumanEval, 855 for BC-MBPP, etc.), the number of programming languages (14), or the model sizes chosen. No power analysis is discussed."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Results are from single training runs per configuration. No standard deviations, variance across seeds, or spread measures are reported. The pass@k estimator uses n samples (200 or 50) per problem but this addresses sampling noise in the estimator, not experimental variance."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Models trained on the natural distribution serve as baselines for the balancing experiments. PaLM-Coder 8B and 62B are included as external baselines (Figure 4, Tables 4-11)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "PaLM-Coder (Chowdhery et al., 2022) was a contemporary state-of-the-art model at the time of writing. The paper also compares against PaLM (non-code-finetuned) as an additional reference."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper ablates the Unimax N parameter (N=1,2,3,4) across three model sizes (1B, 2B, 4B) on four benchmarks. This systematically evaluates the contribution of the data balancing strategy at different duplication limits. Results are in Figures 5-6 and Tables 4-13."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Multiple metrics are used: pass@1, pass@25, pass@100, plus qualitative prediction-level metrics including % error, % failed tests, % test cases passed per problem, % timed out (Section 5.3, Tables 16-19)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Evaluation is entirely automated through execution-based pass@k metrics on test suites. No human evaluation of code quality, readability, or correctness beyond test cases is performed."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are reported on established benchmarks (HumanEval, MBPP, Transcoder, TP3) that are separate from the training data. No dev/test set confusion — the benchmarks are used only for evaluation, and hyperparameters (T=0.8, top_p=0.95) are fixed rather than tuned on the test data."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Extensive per-language breakdowns are provided in all results tables (Tables 4-15), covering all 14 programming languages individually. Additional breakdowns by high-resource vs. low-resource groups are provided in Tables 12-13 and Figures 5-6."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 5.3 analyzes prediction-level failure modes: compilation errors, runtime errors, timeouts, and partial test case failures. Figures 7, 9, 10 and Tables 16-19 provide detailed failure analysis for both HR and LR languages."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper reports that balanced training hurts high-resource language performance (15.47%, 14.00%, 9.35% losses for 1B, 2B, 4B). It also reports that LR improvements from balanced data 'do not scale with model size' (Section 1). The 1B model on Unimax for BC-Transcoder Python shows an average change of -26.04% (Section 5.2)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract's specific numerical claims (12.34% overall improvement, 66.48% LR improvement, 12.94% HR cost, 30.77% LR translation improvement, 19.58% HR translation cost) are supported by the results in Section 5 and the appendix tables."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper makes causal claims about training distribution affecting model performance. The study design is adequate: models are trained from scratch with controlled variables (same architecture, same vocabulary, same total compute budget) varying only the training distribution. This controlled experiment supports causal claims about the effect of data balancing."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Claims are generally bounded to the tested setting. The title refers to 'programming language distribution' which matches the scope. Results are reported per-language, per-model-size, and per-dataset. The abstract specifies exact metrics and percentages rather than broad generalizations."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section 5.1 discusses transfer learning from similar languages as an explanation for scaling behavior. Section 5.3 discusses that NL2Code tasks depend on unique code-docstring pairs while translation does not, explaining differential effects of balancing. The paper also discusses that performance gains may stem from unique data rather than duplication (Section 5.3: 'less than 1% improvement when going from Unimax 1 to Unimax 2')."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures pass@k on code generation and translation benchmarks and claims pass@k performance. It does not inflate measurements into broader claims about 'code quality' or 'developer productivity.' The claims match the granularity of the measurements."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Full architecture details are provided in Table 2: layers, heads, d_model, training tokens for all model sizes (1B, 2B, 4B). PaLM-Coder specifications are also listed. Architecture follows Chowdhery et al. (2022). Vocabulary size (64k tokens) and training framework (T5X, SeqIO) are specified."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full prompt templates are provided in Appendix E (E.1 for generation, E.2 for translation) with concrete examples showing the exact text sent to the model, including the filled-in fields for specific languages."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4.1 reports: learning rate 0.01, Adafactor optimizer, batch size 256, context window 2048, warmup steps = 10% of total, training steps per size (38K/77K/190K). Section 4.5 reports: T=0.8, top_p=0.95, no top_k, n=200/50 samples."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. Models generate code directly via sampling with no tool use, retry logic, or multi-step reasoning."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 4.2 describes the data pipeline: publicly available web code → license filtering (similar to Kocetkov et al.) → heuristic quality filtering → near-deduplication → language segmentation by GitHub Linguist file extensions. Table 3 provides the final language distribution percentages. The Unimax balancing algorithm is described in Section 3."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion (Section 7) is brief and forward-looking without discussing limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No specific threats to validity are discussed anywhere in the paper. There is no acknowledgment of potential issues with single-run evaluations, benchmark contamination risk, or the limited model size range."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of scope limitations such as: results are specific to decoder-only models, the 14 chosen languages, or the specific training data source."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The curated training corpus is not released. Only the BabelCode evaluation framework is open-sourced. The raw training data (filtered GitHub code) cannot be independently verified."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 4.2 describes data collection: 'Our curated source code corpus was obtained by collecting publicly available code data on the web using a custom code data collection system.' License filtering, quality heuristics, and near-deduplication are described. Language segmentation by file extension is detailed in Table 3."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data sources are publicly available code from the web and standard benchmarks."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The general pipeline steps are described (collection → license filter → quality filter → deduplication → language segmentation) but intermediate counts at each stage are not provided. The paper does not state how many files were removed at each filtering step — only the final distribution percentages are given in Table 3."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No explicit funding statement is provided. The acknowledgments section thanks individuals but does not mention grants, corporate funding, or sponsorship. Author affiliations (Google Labs, Google Brain) imply corporate funding but this is not explicitly disclosed."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: Google Labs, Google Brain, and NYU Department of Computer Science. The footnote notes 'Work Done While At Google' for the first author."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Most authors are Google employees (Google Labs, Google Brain). Google has a commercial interest in code LLM capabilities and the training strategies investigated. The funder is not independent of the outcomes."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper does not state when the GitHub training data was collected or what time period it covers. No training data cutoff date is provided."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether HumanEval, MBPP, or Transcoder solutions appeared in the curated GitHub training corpus. Given that training data comes from public GitHub and these benchmarks are widely forked/solved on GitHub, this is a significant omission."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "HumanEval (published 2021), MBPP (2021), and Transcoder (2020) were publicly available, presumably before the training data was collected. No contamination analysis or decontamination was performed or discussed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study. It is a purely computational benchmark evaluation."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. IRB approval is not applicable."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference costs, latency, or wall-clock time for evaluation is reported despite generating 200 programs per problem for HumanEval and 50 for translation tasks across 14 languages and 15 model configurations."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Training tokens are stated (20.2B, 40.4B, 100B for 1B, 2B, 4B models) but no GPU hours, hardware type, wall-clock training time, or total compute budget (e.g., FLOPs) is reported. Fifteen models were trained (5 distributions × 3 sizes) — the total compute footprint is not quantified."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Results appear to be from single training runs per configuration. No multi-seed experiments or seed sensitivity analysis is reported."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of generated samples per problem is stated (n=200 for HumanEval, n=50 for translation) for the pass@k estimator, but each model configuration appears to be trained and evaluated once. The number of independent experimental replications is not stated."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search is reported. The learning rate (0.01), optimizer (Adafactor), batch size (256), and sampling parameters (T=0.8, top_p=0.95) appear to be fixed choices without reporting how they were selected or how many configurations were tried."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "All Unimax N configurations (N=1,2,3,4) and the natural distribution are reported for every model size and dataset, rather than cherry-picking the best configuration. Figures 5-6 and Tables 4-13 show all results transparently."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The paper makes hundreds of comparisons across 14 languages, 5 distributions, 3 model sizes, and 4 datasets without any multiple comparison correction. No Bonferroni, Holm, or other family-wise error rate corrections are applied."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors compare their BC models against PaLM-Coder, but both are from Google. There is no acknowledgment that comparing their own models/implementations against baselines may introduce systematic bias per Lucic et al. (2018)."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Section 5.1 notes that the BC-4B model saw 53B more tokens than PaLM-Coder 62B, but performance is not systematically reported as a function of compute budget. The comparison between a 4B model with 100B tokens vs. PaLM-Coder 62B with 46.8B tokens conflates model size and training compute."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether pass@k on HumanEval, MBPP, Transcoder, or TP3 actually measures 'programming language capability' or 'code generation ability' in a meaningful sense. No construct validity analysis is provided."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved. Models generate code directly via sampling without any agentic scaffold, tool use, or multi-step reasoning."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "HumanEval (2021), MBPP (2021), and Transcoder (2020) were published, presumably before the training data was collected from GitHub. Solutions to these benchmarks are widely available on GitHub. No temporal leakage analysis is performed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Section 4.5 notes that translated signatures are provided in prompts and dismisses this as 'trivial to translate,' but does not systematically analyze whether the prompt construction leaks information that would not be available in real usage."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether the curated GitHub training corpus contains code structurally similar to the benchmark problems. GitHub repositories frequently contain solutions to HumanEval and MBPP problems."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines are mentioned."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Training on a balanced corpus (Unimax) results in 12.34% higher pass@k across all tasks and languages compared to the natural distribution baseline.",
    373       "evidence": "Section 5.2 and Tables 4-11 provide pass@k results across 4 benchmarks, 14 languages, and 3 model sizes for natural vs. Unimax distributions.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Balanced training achieves 66.48% better pass@k on low-resource languages at the cost of only 12.94% decrease on high-resource languages.",
    378       "evidence": "Section 5.2, Tables 12-13 show per-language relative changes. The LR/HR tradeoff is documented across all model sizes and datasets.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Scaling model size mitigates the high-resource performance loss from balanced training, going from 39.70% loss with 1B to 2.47% loss with 4B.",
    383       "evidence": "Section 5.2 and Figure 5 show that increasing model size reduces the HR performance gap between natural and Unimax distributions.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "The 2B model trained on natural data outperforms PaLM-Coder 8B on BC-HumanEval pass@100 for 12 of 14 languages, with average improvements of 48.17%.",
    388       "evidence": "Section 5.1 and Table 8 show pass@100 comparison. The authors attribute this partly to more training tokens and a filtered training dataset.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "For translation tasks, uniformly balancing languages is the optimal strategy, as translation does not require natural language understanding from the training corpus.",
    393       "evidence": "Section 5.3 shows consistent improvements in test cases passed for both HR and LR languages on TP3 with balanced data. The paper argues this is because translation does not depend on diverse NL-code pairs.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "For generation tasks, Unimax N=3 is optimal with respect to the tradeoff between LR gains and HR losses.",
    398       "evidence": "Section 5.2 reports Unimax 3 differences of 130.17%, 87.80%, and 36.00% (LR gain minus HR loss) for 1B, 2B, and 4B models on generation tasks.",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "methodology_tags": ["benchmark-eval"],
    403   "key_findings": "BabelCode enables execution-based evaluation of code models across 14 programming languages and 4 benchmarks. Training decoder-only models (1B-4B) on Unimax-balanced data distributions significantly improves low-resource language performance (average 66.48% better pass@k) at the cost of moderate high-resource degradation (12.94%), a tradeoff that improves with model scale. Translation tasks benefit more uniformly from balanced data than generation tasks, suggesting different dependencies on training corpus diversity.",
    404   "red_flags": [
    405     {
    406       "flag": "No uncertainty quantification",
    407       "detail": "All results are point estimates from single training runs with no error bars, confidence intervals, or variance across seeds. With 15 model configurations and hundreds of comparisons, the statistical reliability of observed differences is unknown."
    408     },
    409     {
    410       "flag": "No contamination analysis",
    411       "detail": "HumanEval, MBPP, and Transcoder are publicly available benchmarks whose solutions are widely shared on GitHub. The training corpus is curated from public GitHub code, but no decontamination or overlap analysis is performed."
    412     },
    413     {
    414       "flag": "Confounded baseline comparison",
    415       "detail": "The comparison between BC models and PaLM-Coder conflates multiple variables: different training data (filtered 14-language corpus vs. all GitHub), different training objectives (UL2 + causal LM vs. standard LM), different vocabulary (code-specific 64K vs. multilingual NL), and different training token counts. The 48.17% improvement claim does not isolate any single factor."
    416     },
    417     {
    418       "flag": "No limitations discussion",
    419       "detail": "The paper has no limitations or threats-to-validity section. Issues such as single-seed results, proprietary training data, limited model size range (1-4B), and potential benchmark contamination are not acknowledged."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Evaluating large language models trained on code",
    425       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    426       "year": 2021,
    427       "arxiv_id": "2107.03374",
    428       "relevance": "Introduces Codex and the HumanEval benchmark, a foundational code generation evaluation used as one of the four benchmarks in this paper."
    429     },
    430     {
    431       "title": "PaLM: Scaling language modeling with pathways",
    432       "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
    433       "year": 2022,
    434       "arxiv_id": "2204.02311",
    435       "relevance": "PaLM-Coder serves as the primary external baseline for comparison; architecture decisions in this paper follow PaLM."
    436     },
    437     {
    438       "title": "Program synthesis with large language models",
    439       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    440       "year": 2021,
    441       "arxiv_id": "2108.07732",
    442       "relevance": "Introduces the MBPP benchmark used as one of the four evaluation datasets in this paper."
    443     },
    444     {
    445       "title": "A scalable and extensible approach to benchmarking nl2code for 18 programming languages",
    446       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    447       "year": 2022,
    448       "arxiv_id": "2208.08227",
    449       "relevance": "MultiPL-E is a concurrent multilingual code evaluation framework compared against BabelCode in Table 1."
    450     },
    451     {
    452       "title": "Multi-lingual evaluation of code generation models",
    453       "authors": ["Ben Athiwaratkun", "Sanjay Krishna Gouda", "Zijian Wang"],
    454       "year": 2023,
    455       "relevance": "MBXP is the most similar concurrent framework to BabelCode; detailed comparison in Sections 2.1-2.2."
    456     },
    457     {
    458       "title": "Competition-level code generation with alphacode",
    459       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    460       "year": 2022,
    461       "relevance": "Demonstrates decoder-only code models achieving competitive programming performance, relevant to the broader code LLM landscape."
    462     },
    463     {
    464       "title": "The Stack: 3 TB of permissively licensed source code",
    465       "authors": ["Denis Kocetkov", "Raymond Li", "Loubna Ben Allal"],
    466       "year": 2022,
    467       "arxiv_id": "2211.15533",
    468       "relevance": "Major open-source code dataset referenced for license filtering methodology and language distribution statistics."
    469     },
    470     {
    471       "title": "SantaCoder: don't reach for the stars!",
    472       "authors": ["Loubna Ben Allal", "Raymond Li", "Denis Kocetkov"],
    473       "year": 2023,
    474       "arxiv_id": "2301.03988",
    475       "relevance": "Contemporary open-source code LLM demonstrating decoder-only training on permissively licensed code."
    476     },
    477     {
    478       "title": "Unsupervised translation of programming languages",
    479       "authors": ["Baptiste Roziere", "Marie-Anne Lachaux", "Lowik Chanussot", "Guillaume Lample"],
    480       "year": 2020,
    481       "relevance": "Introduces the TransCoder benchmark and approach for code translation, used as one of the four evaluation datasets."
    482     },
    483     {
    484       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    485       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    486       "year": 2020,
    487       "relevance": "First encoder-only model pre-trained on code, establishing the paradigm of pre-trained code models."
    488     },
    489     {
    490       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    491       "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven C.H. Hoi"],
    492       "year": 2021,
    493       "relevance": "Encoder-decoder code model examining different pre-training strategies for code understanding and generation."
    494     },
    495     {
    496       "title": "Unified pre-training for program understanding and generation",
    497       "authors": ["Wasi Ahmad", "Saikat Chakraborty", "Baishakhi Ray", "Kai-Wei Chang"],
    498       "year": 2021,
    499       "relevance": "PLBART investigates data balancing strategies for code pre-training, the most directly related prior work on language distribution effects."
    500     },
    501     {
    502       "title": "Measuring coding challenge competence with APPS",
    503       "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
    504       "year": 2021,
    505       "relevance": "Execution-based code generation benchmark testing competitive programming capability of LLMs."
    506     }
    507   ]
    508 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs