scan.json (28748B)
1 { 2 "paper": { 3 "title": "NALA_MAINZ at BLP-2025 Task 2: A Multi-agent Approach for Bangla Instruction to Python Code Generation", 4 "authors": [ 5 "Hossain Shaikh Saadi", 6 "Faria Alam", 7 "Mario Sanz-Guerrero", 8 "Minh Duc Bui", 9 "Manuel Mager", 10 "Katharina von der Wense" 11 ], 12 "year": 2025, 13 "venue": "BLP-2025 Workshop (arXiv preprint)", 14 "arxiv_id": "2511.16787", 15 "doi": "10.48550/arXiv.2511.16787" 16 }, 17 "scan_version": 2, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "A two-stage multi-agent pipeline (code generation + selective debugging) wins BLP-2025 Bangla→Python code generation with GPT-5 at 95.4% Pass@1. The debugger stage provides the largest gains (GPT-5 jumps from 64.6% to 95.4%), driven primarily by exposing the model to unit tests and error traces. External test case augmentation is critical—without it, Pass@1 drops to 86.0%. Translation from Bangla to English generally hurts or has mixed effects, suggesting the models handle Bangla instructions adequately.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The abstract states 'We make our code publicly available' with a GitHub link provided in footnote 1 (https://github.com/shaikhsaadi999/blp25_code_genneration)." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The development and test datasets are provided by the shared task organizers (Raihan et al., 2025a,c), and the external dataset is the publicly available Austin et al. (2021). Standard public datasets used without modification." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment specifications are mentioned in the paper. Only the proprietary API providers are named." 
37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository is linked but the paper itself contains no instructions for replicating the experiments." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results are reported as point estimates (e.g., 95.4, 64.6, 82.60) with no confidence intervals, error bars, or uncertainty measures anywhere in the paper." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper claims GPT-5 'outperforms' other models and that Stage 2 'consistently boosts' performance based solely on comparing raw numbers. No statistical significance tests (t-tests, bootstrap, etc.) are used." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "The paper reports percentage improvements with baseline context: 'for GPT-5, the increment is 47.67%' (from 64.6 to 95.4), and similar for other models (Section 6). The effect of external data is also quantified: 86.00 → 95.4." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The dataset sizes (400 dev, 500 test) are dictated by the shared task organizers. No justification is given for whether these sizes are adequate for the claims being made, nor is any power analysis discussed." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or multiple-run statistics are reported anywhere in the paper." 
69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Four proprietary models are compared (GPT-5, GPT-4.1, Claude Sonnet 4, Gemini-2.5-Flash) across both pipeline stages, and Stage 1 (no debugging) serves as a baseline for Stage 2 (with debugging)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "All models compared are contemporary and competitive: GPT-5, GPT-4.1, Claude Sonnet 4, and Gemini-2.5-Flash are all current-generation proprietary models." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Multiple ablations are presented: Stage 1 vs Stage 2 (effect of debugging), with vs without external data (86.00 vs 95.4), generated vs external test cases (84.00 vs 95.4), and with vs without Bangla→English translation (Figure 2)." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": false, 90 "justification": "Only Pass@1 is used as the evaluation metric. Error rate is also reported but is just 100 - Pass@1, not an independent metric. No other metrics (e.g., code quality, efficiency, readability) are measured." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "Evaluation is entirely automated via pytest unit test pass/fail. No human evaluation of code quality, readability, or correctness beyond test passing is conducted." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "The final evaluation uses hidden unit tests on Codabench (Section 6, 'Overfitting to the unit tests'). The dev set (400 samples) and test set (500 samples) are clearly separated, and the shared task uses hidden tests for final scoring." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": false, 105 "justification": "Only aggregate Pass@1 scores are reported. 
No breakdown by problem difficulty, instruction type, or error category is provided. The paper notes some codes 'cannot handle edge cases' but doesn't quantify which categories fail." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": false, 110 "justification": "The paper notes generically that 'generated codes sometimes cannot handle edge cases, or they are not generalizing well' (Section 6) but provides no specific failure examples, error analysis, or qualitative examination of where the system breaks down." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Several negative results are reported: Gemini-2.5-Flash shows minimal improvement with debugging (only 13.68% gain), translation hurts GPT-5 performance in both stages, and generated test cases (84.00) underperform external tests (95.4). The dev→test performance gap (99.8→95.4) is also honestly reported." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims Pass@1 of 95.4 and first place, both supported by Table 2 and the shared task results. The claim of a 'simple but effective' multi-agent pipeline is supported by the two-stage architecture and results." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "Causal claims ('structured feedback consistently boosts model performance', 'targeted, test-driven refinement substantially improves code synthesis') are supported by controlled ablation studies: removing the debugging stage, removing external data, and comparing translation effects. These are single-variable manipulations within the pipeline." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "Claims are bounded to the tested setting. The title specifies 'BLP-2025 Task 2' and 'Bangla Instruction to Python Code Generation.' 
The conclusion says 'code synthesis in an underserved language,' appropriately scoped to the shared task context." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper discusses overfitting to unit tests as one alternative explanation, but does not consider other factors: whether GPT-5's superiority is due to better Bangla training data, whether the improvement from Stage 2 is simply due to having more context (tests) rather than the multi-agent architecture, or whether the external dataset advantage reflects data leakage rather than generalization." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper measures Pass@1 (passing unit tests) and claims Pass@1 scores. It does not overclaim beyond the measurement — no broader claims about 'code quality' or 'developer productivity.' The proxy matches the claim granularity." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "Models are identified by marketing names only: 'GPT-5', 'GPT-4.1', 'Gemini-2.5-Flash', 'Claude-Sonnet-4' (Section 5). No API versions, snapshot dates, or model IDs are provided. Model behavior changes across versions." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Full prompt templates are provided in Appendix A for all agents: coding agent user prompt, debugger agent system and user prompts, and test generation agent system and user prompts. The fill values are data-instance variables ({spec.name}, {spec.instruction_bn}, {code}, etc.) that are deterministic from the dataset." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "Only 'reasoning effort' settings are mentioned (low for coding agent, high for debugger, Section 5). 
No temperature, top-p, max tokens, or other sampling parameters are reported for any of the four proprietary APIs." 160 }, 161 "scaffolding_described": { 162 "applies": true, 163 "answer": true, 164 "justification": "The two-agent pipeline is described in detail in Section 5 with Algorithm 1 and Figure 1: code generation → test execution → selective debugging with error trace extraction. The feedback mechanism, retry logic (max 2 attempts in Stage 1), and agent roles are all specified." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Data handling is documented: dev set composition (400 instructions, 3 unit tests each), test set (500 instructions, 1 unit test), external dataset matching by function name (480/500 matched), and how non-overlapping external tests are appended (Section 4)." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "A dedicated 'Limitations' section is present at the end of the paper with substantive discussion of reproducibility concerns, reliance on external data, and model opacity." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Specific threats are discussed: proprietary-only models limiting reproducibility, a roughly 10pp score drop (95.4 → 86.0) without external data enrichment, and inability to attribute performance differences due to opaque training data. These are specific to this study, not generic disclaimers." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Explicit scope boundaries: 'focuses exclusively on proprietary models... does not evaluate any open-source models,' dependence on external dataset acknowledged with quantified impact (roughly 10pp), and 'we lack transparency into their training data' stated as a boundary on interpretability of results."
187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "The shared task input data is available through organizers, but the raw experimental outputs (generated code, intermediate debugging results, model responses) are not released for independent verification. Results depend on proprietary API outputs that cannot be exactly reproduced." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Data sources are well described in Section 4: organizer-provided development set (400 instructions, 3 unit tests each), test set (500 instructions, 1 unit test), and external dataset (Austin et al. 2021) with function name matching yielding 480 matches." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. The study uses shared task benchmark data and publicly available datasets." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The data pipeline is documented: function name matching against external dataset (480/500), non-overlapping test extraction and appending, two-stage processing with Algorithm 1 specifying how failed codes are forwarded to the debugger." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Funding is disclosed in the Acknowledgement section: 'This work was supported by the Carl Zeiss Foundation through the TOPML and MAINCE projects (grant numbers P2021-02-014 and P2022-08-009I).'" 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: Johannes Gutenberg University Mainz, Saarland University, University of Colorado Boulder. 
Authors are academic researchers with no disclosed affiliation to the proprietary model providers (OpenAI, Google, Anthropic)." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "The Carl Zeiss Foundation is a general research funder with no financial stake in the outcome of a Bangla code generation shared task or the relative performance of proprietary LLM APIs." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial disclosure statement is present in the paper. Absence of a declaration is not absence of conflict." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates are stated for any of the four proprietary models used (GPT-5, GPT-4.1, Claude Sonnet 4, Gemini-2.5-Flash). This is critical since the models may have been trained on similar programming problems." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of whether the BLP-2025 shared task problems, the Austin et al. (2021) dataset, or similar programming problems appeared in any model's training data." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "The Austin et al. (2021) dataset was published before all models' training cutoffs and is likely in their training data. The BLP-2025 task adapts existing problems to Bangla. Neither contamination vector is discussed." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study. It is a benchmark evaluation of code generation models." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 
260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No API costs, tokens consumed, wall-clock time, or cost per example reported despite using four different proprietary APIs across two stages with multiple attempts per sample." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No total computational budget, API spend, or resource usage is stated anywhere in the paper." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of random seeds or seed sensitivity analysis. All results appear to be from single runs with no variation assessment." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is never stated. Results are presented without indicating how many times the pipeline was executed." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "No hyperparameter search budget is reported. The 'reasoning effort' settings (low/high) appear chosen without documented exploration of alternatives." 
314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "The paper states 'For our primary submission, we use GPT-5 since it is the best-performing model in the dev set' (Section 5). Model selection is based on development set performance, not test set." 319 }, 320 "multiple_comparison_correction": { 321 "applies": true, 322 "answer": false, 323 "justification": "Multiple model comparisons are made across both stages without any statistical tests at all, let alone correction for multiple comparisons." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "No discussion of author-evaluation bias. The authors designed the pipeline and evaluated it themselves without acknowledging this potential bias or involving independent evaluation." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "No performance-vs-compute analysis despite comparing models of likely very different cost profiles (GPT-5 vs Gemini-2.5-Flash). GPT-5 wins on accuracy but the cost trade-off is never discussed." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "No discussion of whether Pass@1 on unit tests actually measures Bangla instruction-to-code generation ability, or whether the benchmark has construct validity issues (e.g., whether solutions that pass tests are actually correct for the Bangla instructions)." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "All four models are evaluated within the same two-stage pipeline with identical prompts, test augmentation, and debugging workflow. The scaffold is held constant across model comparisons, controlling for this confound." 
344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of temporal leakage. The Austin et al. (2021) external dataset was published years before the models' training, meaning solutions to those problems likely appear in training data. This is never addressed." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "The debugger stage provides unit tests and error traces as input, giving the model substantial information about expected behavior. While this is by design, the paper does not discuss whether this constitutes feature leakage relative to a real-world deployment scenario." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether the shared task problems are independent of examples in the models' training data, or whether the Austin et al. (2021) external test cases create overlap between training and evaluation." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination is mentioned." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "The multi-agent pipeline achieves first place in BLP-2025 Task 2 with Pass@1 of 95.4%", 372 "evidence": "Table 2 shows GPT-5 achieves 95.4 Pass@1 in Stage 2. 
The paper states this secured first place on the official leaderboard (Section 1, Section 7).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "The selective debugging stage (Stage 2) dramatically improves performance over code generation alone (Stage 1)", 377 "evidence": "Tables 1-2 (relative gains over Stage 1 in parentheses, not percentage points): GPT-5 jumps 64.6→95.4 (+47.67%), GPT-4.1 58.0→82.6 (+41.37%), Claude Sonnet 4 58.2→79.0 (+35.73%), Gemini-2.5-Flash 52.6→59.8 (+13.68%).", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "External test case augmentation is critical, improving GPT-5 from 86.00 to 95.4 Pass@1", 382 "evidence": "Section 6 ('Effect of external data'): 'without the external unit test for GPT-5, our proposed pipeline achieves an 86.00 Pass@1 score'.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Generated test cases (84.00 Pass@1) underperform external test cases (95.4) but still substantially improve over Stage 1 (64.6)", 387 "evidence": "Section 6 ('Effect of generated unit tests'): GPT-5 with generated tests achieves 84.00, lower than 95.4 with external tests but much higher than 64.60 without tests.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Translation from Bangla to English has mixed effects and generally does not help GPT-5", 392 "evidence": "Figure 2 and Section 6 ('Effect of translation'): GPT-5 Stage 1 drops 64.6→60.8, Stage 2 drops 95.8→94.8 with translation.
Other models show mixed results (some improve in Stage 2).", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "The system overfits to provided unit tests, with performance dropping from 99.8 (dev, all tests visible) to 95.4 (test, hidden tests)", 397 "evidence": "Section 6 ('Overfitting to the unit tests'): dev phase achieves 99.8 Pass@1 with all test cases visible; test phase with hidden tests drops to 95.4.", 398 "supported": "strong" 399 } 400 ], 401 "red_flags": [ 402 { 403 "flag": "No statistical tests for any comparative claims", 404 "detail": "All claims of model superiority and component effectiveness are based on comparing raw Pass@1 numbers without significance tests, confidence intervals, or multiple-run variance. With only point estimates, the reported differences could be within noise." 405 }, 406 { 407 "flag": "Single-run results with no variance assessment", 408 "detail": "All experiments appear to be single runs. LLM outputs are stochastic — the same pipeline could produce different results on a second run. Without multiple runs, the stability of the 95.4% result is unknown." 409 }, 410 { 411 "flag": "Contamination risk from Austin et al. (2021) dataset", 412 "detail": "The external dataset (Austin et al. 2021) was published in 2021, years before these models were trained. Solutions to these problems are very likely in the training data. The external test cases, matched to 480 of the 500 test problems, may be testing memorization rather than Bangla comprehension." 413 }, 414 { 415 "flag": "Proprietary-only evaluation limits reproducibility", 416 "detail": "All four models are proprietary APIs with opaque training data and changing behavior. Results cannot be exactly reproduced, and the paper acknowledges this in its Limitations section." 417 }, 418 { 419 "flag": "No cost analysis despite multi-stage API usage", 420 "detail": "The pipeline calls proprietary APIs multiple times per sample (generation + retry + debugging). GPT-5 with high reasoning effort is expensive.
No cost data is provided, making practical deployment assessment impossible." 421 } 422 ], 423 "cited_papers": [ 424 { 425 "title": "Evaluating large language models trained on code", 426 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 427 "year": 2021, 428 "arxiv_id": "2107.03374", 429 "relevance": "Introduces HumanEval and Pass@k metrics that are the standard for code generation evaluation, directly relevant to benchmark methodology." 430 }, 431 { 432 "title": "Program synthesis with large language models", 433 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 434 "year": 2021, 435 "arxiv_id": "2108.07732", 436 "relevance": "Provides the external dataset (MBPP) used for test case augmentation; foundational LLM code generation benchmark." 437 }, 438 { 439 "title": "A survey on code generation with LLM-based agents", 440 "authors": ["Yihong Dong", "Xue Jiang", "Jiaru Qian"], 441 "year": 2025, 442 "arxiv_id": "2508.00083", 443 "relevance": "Recent survey on LLM-based agent code generation approaches, directly relevant to the survey scope." 444 }, 445 { 446 "title": "Large language models for code generation: A comprehensive survey of challenges, techniques, evaluation, and applications", 447 "authors": ["Nam Huynh", "Beiyu Lin"], 448 "year": 2025, 449 "arxiv_id": "2503.01245", 450 "relevance": "Comprehensive survey on LLM code generation covering evaluation methods and challenges." 451 }, 452 { 453 "title": "A survey on large language models for code generation", 454 "authors": ["Juyong Jiang", "Fan Wang", "Jiasi Shen"], 455 "year": 2024, 456 "arxiv_id": "2406.00515", 457 "relevance": "Survey on LLM code generation documenting the English-centric bias in benchmarks and systems." 
458 }, 459 { 460 "title": "Qwen2.5-coder technical report", 461 "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"], 462 "year": 2024, 463 "arxiv_id": "2409.12186", 464 "relevance": "Technical report for a major open-source code generation model, relevant to code LLM evaluation." 465 }, 466 { 467 "title": "StarCoder: may the source be with you!", 468 "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"], 469 "year": 2023, 470 "arxiv_id": "2305.06161", 471 "relevance": "Open-source code generation model; relevant to the landscape of code LLMs being evaluated." 472 }, 473 { 474 "title": "mHumanEval - a multilingual benchmark to evaluate large language models for code generation", 475 "authors": ["Nishat Raihan", "Antonios Anastasopoulos", "Marcos Zampieri"], 476 "year": 2025, 477 "relevance": "Multilingual code generation benchmark directly relevant to evaluating LLMs on non-English code generation tasks." 478 }, 479 { 480 "title": "MConala: A benchmark for code generation from multiple natural languages", 481 "authors": ["Zhiruo Wang", "Grace Cuenca", "Shuyan Zhou"], 482 "year": 2023, 483 "arxiv_id": "2203.08388", 484 "relevance": "Multilingual code generation benchmark highlighting the English-centric limitation of existing evaluations." 485 }, 486 { 487 "title": "Qwen3 technical report", 488 "authors": ["An Yang", "Anfeng Li", "Baosong Yang"], 489 "year": 2025, 490 "arxiv_id": "2505.09388", 491 "relevance": "Technical report for a major LLM family; relevant to the landscape of models used for code generation." 492 } 493 ] 494 }