scan.json (29908B)
1 { 2 "paper": { 3 "title": "HumanEval-XL: A Multilingual Code Generation Benchmark for Cross-lingual Natural Language Generalization", 4 "authors": [ 5 "Qiwei Peng", 6 "Yekun Chai", 7 "Xuhong Li" 8 ], 9 "year": 2024, 10 "venue": "International Conference on Language Resources and Evaluation", 11 "arxiv_id": "2402.16694", 12 "doi": "10.48550/arXiv.2402.16694" 13 }, 14 "scan_version": 3, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "HumanEval-XL establishes a multilingual code generation benchmark spanning 23 natural languages and 12 programming languages with 22,080 prompts. GPT-4 consistently outperforms all other models across languages, while GPT-3.5 unexpectedly lags behind the code-specialized CodeGen2-16B on most PLs except Python. LLMs within the same family show high correlation across language resource levels (Pearson 0.8–0.87), but models from different families do not correlate. Low-resource and non-Latin-script languages (Afro-Asiatic, Greek, Iranian, Turkic families) consistently yield lower performance.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract states: 'We make our evaluation code and data publicly available at https://github.com/FloatAI/humaneval-xl.'" 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The benchmark data (22,080 prompts across 23 NLs and 12 PLs) is released at the same GitHub URL mentioned in the abstract." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No environment specifications (requirements.txt, Dockerfile, library versions) are provided in the paper. Only inference hyperparameters are mentioned." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are included in the paper. The stopping criteria and sampling parameters are described, but no runnable reproduction guide is provided." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "The ± values in Tables 3 and 4 represent standard deviation across languages within a group, not uncertainty in the measurements. The main results in Tables 5–16 are point estimates with no confidence intervals or error bars on the pass@1 measurements." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "No statistical significance tests are used. The paper claims 'GPT-3.5 lags behind CodeGen2-16B' and makes other comparative claims based solely on comparing point estimates. Pearson correlations are reported (0.8, 0.87) but without significance tests." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Pass@1 scores are reported with sufficient context for comparison. For example, Table 3 shows GPT-4 at 78.54 vs GPT-3.5 at 62.50 on Class 5 Python, and all appendix tables provide per-model, per-language absolute scores enabling effect size computation." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The benchmark uses 80 parallel coding problems derived from Multilingual HumanEval. No justification is given for why 80 problems are sufficient for the claims being made across 23 NLs and 12 PLs." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Results appear to be from single runs. The paper states 'Due to constrained computing resources, we report pass@1 for all experiments' but does not mention multiple runs or report variance across experimental runs." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Multiple models are compared: CodeT5+ (220M, 770M, 2B), CodeGen2 (1B, 3.7B, 7B, 16B), GPT-3.5, and GPT-4, serving as baselines against each other." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "GPT-4 (2023) and GPT-3.5 (2022) were contemporary at the time of writing. CodeGen2 and CodeT5+ were recent code-specialized models." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": false, 82 "justification": "The benchmark construction has a multi-stage pipeline (extraction, translation, BERTScore quality check, manual quality control) but no ablation is provided to show the contribution of each stage (e.g., effect of removing BERTScore filtering)." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": false, 87 "justification": "Only pass@1 is reported. The paper explicitly states: 'Due to constrained computing resources, we report pass@1 for all experiments.' No other metrics (pass@k, BLEU, etc.) are used." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "Human evaluation was used only for quality control of the benchmark translations (Stage 4), not for evaluating the LLMs' code generation outputs. All system evaluation is automated via test case execution." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "The benchmark is used purely for evaluation with no model tuning. The translated prompts are new (created by GPT-4 translation), and models are evaluated zero-shot on these problems." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Extensive breakdowns are provided: per programming language (12 PLs, Tables 5–16), per natural language (23 NLs), per language resource class (Table 3), and per language family (Table 4)." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "No qualitative error analysis or failure case examples are provided. The paper reports aggregate pass@1 scores but does not examine what types of problems or prompts cause failures." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Several negative findings are reported: CodeT5+ 'struggles to solve problems across most scenarios,' GPT-3.5 'lags behind CodeGen2-16B in all PLs except Python,' and 'current LLMs struggle to capture the equivalent semantic meaning expressed in different languages.'" 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract's claims are supported: the benchmark does cover 23 NLs and 12 PLs with 22,080 prompts and avg 8.33 test cases. The experiments confirm that LLMs struggle with cross-lingual NL generalization as claimed." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper makes causal claims without adequate justification: 'specialized code pre-training plays a pivotal role in code generation' and 'the encoder-decoder structure of CodeT5+ hinders its ability to complete code generation tasks.' These attribute performance differences to specific causes (architecture, training data) without controlled experiments isolating these factors." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper generally bounds its claims to the tested setting. The title references their specific benchmark, and claims like 'significant challenge for current LLMs' are framed within the context of their benchmark evaluation. The conclusion states it as 'a pioneering step.'" 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations are discussed. Performance differences are attributed to model architecture and training data without considering confounds like prompt format sensitivity, translation quality variation across languages, or GPT-4's advantage from having generated the translations." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper measures pass@1 on code generation tasks and frames results in terms of code generation capability. The measurement (pass@1 across NL-PL pairs) matches the claimed scope (multilingual code generation proficiency) without overreaching." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "GPT-3.5 and GPT-4 are referred to by marketing names only, with no API version or snapshot date (references cite OpenAI 2022 and OpenAI 2023 but no model version like gpt-4-0613). CodeT5+ and CodeGen2 specify parameter counts but not exact checkpoints." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "The benchmark problems themselves ARE the prompts (function signatures + NL docstrings), and these are released at the GitHub repository. The evaluation follows HumanEval's code completion format where the prompt is the problem description." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Appendix A states: 'The specific parameters for the sampling process were set to a top-p value of 0.95 and a temperature of 0.2' applied consistently across all models." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. Models perform direct code completion from prompts without any scaffold, tool use, or iterative refinement." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 3.2 documents the 4-stage pipeline: NL extraction, GPT-4 translation with back-translation, BERTScore quality check (threshold 0.95, up to 3 iterations), and manual quality control with heuristic checks." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "There is no dedicated limitations or threats-to-validity section. The paper goes directly from Analysis and Discussion (Section 4.3) to Conclusion (Section 5) without any substantive discussion of limitations." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No specific threats to validity are discussed anywhere in the paper. Issues like translation quality, small problem set size (80), single-run results, and potential contamination are not addressed." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "No explicit scope boundaries are stated. The paper does not articulate what the results do NOT show or what settings are excluded from the claims." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "The benchmark data is released at https://github.com/FloatAI/humaneval-xl, allowing independent verification of the prompts, translations, and test cases." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 3.2 describes the complete data construction process: starting from Multilingual HumanEval, extracting NL portions, translating via GPT-4, back-translating for quality checking with BERTScore, and manual verification." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. The benchmark is derived from the existing Multilingual HumanEval dataset through automated translation." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The full pipeline is documented in Section 3.2 and Figure 1: extraction → translation → back-translation → BERTScore filtering (threshold 0.95, up to 3 iterations) → manual quality control. The final count of 80 parallel problems is stated." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "The Acknowledgments section states: 'This work was partially supported by DisAI... a project funded by European Union under the Horizon Europe, GA No. 101079164.'" 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly stated: Qiwei Peng from University of Copenhagen, Yekun Chai and Xuhong Li from Baidu Inc." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "EU Horizon Europe funding for combating disinformation is independent of the benchmark evaluation outcomes. However, two authors are from Baidu, which develops competing LLMs, though no Baidu models are evaluated." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is included in the paper. Two authors are affiliated with Baidu Inc., which has commercial interests in LLM development." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "No training data cutoff dates are stated for any of the evaluated models (GPT-3.5, GPT-4, CodeGen2, CodeT5+). This is critical since the benchmark derives from HumanEval (published 2021)." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of potential train/test overlap. The benchmark problems derive from HumanEval, which was published in 2021 and is widely available online. Models trained after 2021 may have seen the original problems." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "HumanEval has been publicly available since 2021. All evaluated models (GPT-3.5, GPT-4, CodeGen2, CodeT5+) were likely trained on data including HumanEval solutions. While the NL prompts are translated, the underlying coding problems and test cases are the same. This contamination risk is not addressed." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. It is a benchmark evaluation of LLMs." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No inference costs or latency are reported. The paper mentions 'constrained computing resources' as motivation for only reporting pass@1 but does not quantify the actual cost of running 22,080 evaluations across 9 model configurations." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No computational budget is stated. Neither GPU hours for open-source models nor API costs for GPT-3.5/GPT-4 evaluations are reported." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "No seed sensitivity analysis. Results appear to be from single runs with temperature=0.2 and top-p=0.95. No variation across random seeds is reported." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The number of experimental runs is never stated. The results in Tables 5–16 appear to be from single evaluations." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "The choice of temperature=0.2 and top-p=0.95 is stated but not justified. No hyperparameter search is mentioned." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "The sampling parameters (temperature=0.2, top-p=0.95) are presented without justification for why these values were chosen or how they affect results." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "The paper makes numerous comparisons across 9 model configurations, 23 NLs, and 12 PLs without any multiple comparison correction. Pearson correlations are reported without significance testing." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors created the benchmark and evaluate models on it without acknowledging the bias of evaluating on their own benchmark. Additionally, GPT-4 was used to create the translations and is also one of the evaluated models." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "Performance is not reported as a function of compute budget. Models of vastly different sizes and costs are compared on pass@1 without normalizing for compute." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "The paper does not discuss whether pass@1 on GPT-4-translated prompts actually measures 'cross-lingual NL generalization' as claimed, or whether translation artifacts could confound the results. The BERTScore quality check ensures translation similarity but does not validate the construct." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No scaffolding is involved. All models perform direct code completion without agentic scaffolding." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "Not addressed. HumanEval was published in 2021. All evaluated models (GPT-3.5/4, CodeGen2, CodeT5+) were trained on data potentially including HumanEval and its solutions. The translated NL prompts are new, but the underlying coding problems and test cases are identical." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Not discussed. The function signatures and test cases remain unchanged from HumanEval, meaning models that memorized HumanEval solutions could potentially benefit regardless of the NL prompt language." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "Not addressed. All 22,080 prompts derive from the same 80 HumanEval problems (translating each into 23 NLs × 12 PLs), meaning results across NLs for the same problem are not independent." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No leakage detection method is applied. No canary strings, membership inference, n-gram overlap analysis, or temporal splits are used." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "GPT-4 consistently demonstrates superior performance across different PLs and NLs (except C#), outperforming all other models significantly.", 369 "evidence": "Figure 2 and Tables 5–16 show GPT-4 achieving the highest pass@1 scores across nearly all PL-NL combinations, e.g., 82.50% on English-Python vs 66.25% for GPT-3.5 (Table 5).", 370 "supported": "strong" 371 }, 372 { 373 "claim": "GPT-3.5 lags behind CodeGen2-16B in all PLs except Python, despite having considerably more parameters, underscoring the crucial role of training on coding data.", 374 "evidence": "Tables 5–16 show CodeGen2-16B outperforming GPT-3.5 on Java, JavaScript, C#, Go, Kotlin, PHP, Ruby, Scala, Swift, and TypeScript. The causal attribution to 'coding data' is speculative.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "LLMs within the same family but with different parameter scales exhibit high correlation across different levels of language resources (Pearson 0.8 for CodeGen2, 0.87 for GPT).", 379 "evidence": "Section 4.3 reports Pearson correlations computed from Table 3 data, comparing performance patterns across NL resource classes.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "The encoder-decoder architecture of CodeT5+ hinders its ability to complete code generation tasks compared to decoder-only models.", 384 "evidence": "Tables 5–16 show CodeT5+ (2B) performing near-zero across most scenarios while CodeGen2 (3.7B) with a decoder-only structure performs much better. However, the causal attribution to architecture alone ignores differences in training data and scale.", 385 "supported": "weak" 386 }, 387 { 388 "claim": "Languages from Afro-Asiatic, Indo-European (Greek), Indo-European (Iranian), and Turkic families generally yield lower results compared to other language families.", 389 "evidence": "Table 4 shows these language families scoring lower, e.g., Greek yields 71.25% for GPT-4 vs 80.31% for Indo-European (Germanic) on Python.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Current LLMs struggle to capture the equivalent semantic meaning expressed in different languages for code generation.", 394 "evidence": "Performance variation across NLs is shown in Figure 2 and Tables 3–4 with std dev across language groups (e.g., GPT-3.5 Python: 62.50±5.06 for Class 5 vs 60.42±2.86 for Class 3). The variation is modest for GPT-4 (~77-79% across groups).", 395 "supported": "moderate" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "GPT-4 translation circularity", 401 "detail": "GPT-4 was used to translate the benchmark prompts into 23 languages (Section 3.2) AND is one of the evaluated models. GPT-4 may have an advantage on prompts it generated, creating a systematic bias in its favor. This confound is never discussed." 402 }, 403 { 404 "flag": "No contamination analysis on HumanEval-derived problems", 405 "detail": "The benchmark's 80 coding problems are directly derived from HumanEval (published 2021). All evaluated models were likely trained on data containing HumanEval solutions. While prompts are translated, the function signatures and test cases remain identical, allowing memorized solutions to still work. This fundamental contamination risk is completely ignored." 406 }, 407 { 408 "flag": "Single metric with single runs", 409 "detail": "Only pass@1 is reported from apparently single runs. With only 80 problems per NL-PL pair, individual pass@1 values have a granularity of 1.25 percentage points (1/80), meaning small reported differences may not be meaningful." 410 }, 411 { 412 "flag": "No limitations section", 413 "detail": "The paper contains no limitations, threats to validity, or scope boundary discussion despite significant methodological concerns (small problem set, contamination risk, translation quality, single metric)." 414 }, 415 { 416 "flag": "Unjustified causal attributions", 417 "detail": "Performance differences are attributed to specific causes (model architecture, code-specific training) without controlled experiments. Multiple confounds exist: training data size and composition, instruction tuning, RLHF, and model scale all differ simultaneously." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Evaluating Large Language Models Trained on Code", 423 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 424 "year": 2021, 425 "relevance": "Introduced HumanEval, the foundational benchmark from which HumanEval-XL is derived; central to LLM code generation evaluation." 426 }, 427 { 428 "title": "Multi-lingual Evaluation of Code Generation Models", 429 "authors": ["Ben Athiwaratkun", "Sanjay Krishna Gouda", "Zijian Wang"], 430 "year": 2023, 431 "relevance": "Extended HumanEval to multiple programming languages (Multilingual HumanEval), the direct predecessor to HumanEval-XL." 432 }, 433 { 434 "title": "Program Synthesis with Large Language Models", 435 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 436 "year": 2021, 437 "arxiv_id": "2108.07732", 438 "relevance": "Introduced MBPP benchmark for code generation evaluation, one of the key benchmarks in this space." 439 }, 440 { 441 "title": "ERNIE-Code: Beyond English-Centric Cross-Lingual Pretraining for Programming Languages", 442 "authors": ["Yekun Chai", "Shuohuan Wang", "Chao Pang"], 443 "year": 2023, 444 "relevance": "Pioneered multilingual NL+PL pretraining across 116 NLs and 6 PLs, directly relevant to cross-lingual code generation." 445 }, 446 { 447 "title": "CodeGen2: Lessons for Training LLMs on Programming and Natural Languages", 448 "authors": ["Erik Nijkamp", "Hiroaki Hayashi", "Caiming Xiong"], 449 "year": 2023, 450 "arxiv_id": "2305.02309", 451 "relevance": "One of the primary models evaluated in this paper; code-specialized LLM for multilingual code generation." 452 }, 453 { 454 "title": "CodeT5+: Open Code Large Language Models for Code Understanding and Generation", 455 "authors": ["Yue Wang", "Hung Le", "Akhilesh Deepak Gotmare"], 456 "year": 2023, 457 "arxiv_id": "2305.07922", 458 "relevance": "Encoder-decoder code LLM evaluated in the paper, provides architecture comparison for code generation." 459 }, 460 { 461 "title": "Execution-Based Evaluation for Open-Domain Code Generation", 462 "authors": ["Zhiruo Wang", "Shuyan Zhou", "Daniel Fried", "Graham Neubig"], 463 "year": 2022, 464 "arxiv_id": "2212.10481", 465 "relevance": "Introduced ODEX dataset with 4 NLs and Python, a direct multilingual code generation predecessor." 466 }, 467 { 468 "title": "Starcoder: May the Source Be With You!", 469 "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"], 470 "year": 2023, 471 "arxiv_id": "2305.06161", 472 "relevance": "Major open-source code LLM relevant to multilingual code generation capabilities." 473 }, 474 { 475 "title": "CodeGeeX: A Pre-trained Model for Code Generation with Multilingual Benchmarking on HumanEval-X", 476 "authors": ["Qinkai Zheng", "Xiao Xia", "Xu Zou"], 477 "year": 2023, 478 "relevance": "Extended HumanEval to multiple PLs (HumanEval-X), directly related benchmark for multilingual code generation." 479 }, 480 { 481 "title": "BLOOM: A 176B-Parameter Open-Access Multilingual Language Model", 482 "authors": ["Teven Le Scao", "Angela Fan", "Christopher Akiki"], 483 "year": 2022, 484 "arxiv_id": "2211.05100", 485 "relevance": "Large multilingual model pre-trained on 46 NLs and 13 PLs, relevant to multilingual code generation evaluation." 486 }, 487 { 488 "title": "Measuring Coding Challenge Competence with APPS", 489 "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"], 490 "year": 2021, 491 "relevance": "Code generation benchmark with 10,000 problems from competitions, part of the evaluation landscape HumanEval-XL extends." 492 } 493 ], 494 "engagement_factors": { 495 "practical_relevance": { 496 "score": 2, 497 "justification": "The benchmark is directly usable by practitioners evaluating multilingual code generation models; code and data are released." 498 }, 499 "surprise_contrarian": { 500 "score": 1, 501 "justification": "Finding that LLMs struggle with non-English prompts is somewhat expected; the GPT-3.5 vs CodeGen2 finding is mildly surprising." 502 }, 503 "fear_safety": { 504 "score": 0, 505 "justification": "No safety or security concerns are raised by this benchmark paper." 506 }, 507 "drama_conflict": { 508 "score": 0, 509 "justification": "No controversy or conflict; straightforward benchmark contribution." 510 }, 511 "demo_ability": { 512 "score": 2, 513 "justification": "Code and data are publicly available at the GitHub URL for immediate use and evaluation." 514 }, 515 "brand_recognition": { 516 "score": 1, 517 "justification": "Evaluates GPT-4 and GPT-3.5 which are well-known, but the benchmark itself is from less prominent authors (Copenhagen/Baidu)." 518 } 519 } 520 }