scan.json (25546B)
1 { 2 "paper": { 3 "title": "Scaling Laws for Code: Every Programming Language Matters", 4 "authors": ["Jian Yang", "Shawn Guo", "Lin Jing", "Wei Zhang", "Aishan Liu", "Chuan Hao", "Zhoujun Li", "Wayne Xin Zhao", "Xianglong Liu", "Weifeng Lv", "Bryan Dai"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2512.13472", 8 "doi": "10.48550/arXiv.2512.13472" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Different programming languages exhibit distinct scaling behaviors: interpreted languages (e.g., Python) show larger scaling exponents than compiled languages (e.g., Rust), and irreducible loss orders languages by intrinsic complexity (C# < Java ≈ Rust < Go < TypeScript < JavaScript < Python). Multilingual pre-training provides synergistic benefits for most languages, with syntactically similar pairs (Java-C#) showing the largest gains (20.5% improvement). Parallel pairing of code translations significantly enhances cross-lingual capabilities with favorable scaling properties. A proportion-dependent multilingual scaling law enables optimal token allocation that outperforms uniform distribution.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, code archive, or release link found anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The training corpus and custom translation evaluation set are described but no download links or public release is provided." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper describes model architecture (LLaMA-2 style with SwiGLU, RoPE, MHA, RMSNorm) but provides no environment specifications, dependency lists, or library versions." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No reproduction instructions, scripts, or step-by-step guides are provided." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results in Tables 1, 3, and Figure 6 report point estimates only. No confidence intervals or error bars are provided for any results." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "Claims like 'parallel pairing significantly outperforms baseline' and '20.5% improvement' are made based on comparing numbers directly, with no statistical significance tests applied." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Percentage improvements with baseline context are consistently reported, e.g., 'Java-C# combination achieves validation loss of 0.718 compared to 0.903 for Java self-repetition—a remarkable 20.5% improvement' (Section 4.2). Table 1 shows absolute values and relative improvement percentages." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for why 10 model sizes, 6 token budgets, or 7 programming languages were chosen. The choices appear pragmatic but are not justified." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Each experimental configuration appears to be a single training run. No variance, standard deviation, or spread measures across runs are reported." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include: monolingual self-repetition baseline (Section 4), random shuffling baseline (Section 5), and uniform allocation baseline (Section 6)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines represent standard practice (uniform allocation, monolingual training). The paper also references the recent code scaling law work [19] (2025). The baselines are appropriate for this type of study." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The study systematically varies individual factors: language-specific vs. mixed training (Section 4), different data organization strategies (random shuffling vs. parallel pairing, Section 5), and uniform vs. optimized allocation (Section 6)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Evaluation uses validation loss (cross-entropy), Pass@1 on MultiPL-E, and BLEU score for code translation (Table 3, Figure 6)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of generated code or translations. Evaluation is entirely automated (loss, Pass@1, BLEU)." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Section 5.1 describes a carefully curated held-out evaluation set: '50 Python files from GitHub' with manual translations to 6 target languages, yielding 2,100 translation instances. MultiPL-E is a separate benchmark." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per programming language throughout (Table 1 synergy matrix, Table 3 per-language MultiPL-E, Figure 6 per-language Pass@1 and BLEU). Figure 4 shows per-direction translation results." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses where multilingual training hurts: 'when Python is the target PL, mixing with most auxiliary PL produces small negative effects' (Section 4.2). Specific negative synergy values are reported." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Python's negative transfer from most languages is reported explicitly: 'JavaScript (Δ = −0.009), TypeScript (Δ = −0.007), C# (Δ = −0.013), Go (Δ = −0.016), and Rust (Δ = −0.021)' (Section 4.2)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims about interpreted vs. compiled language scaling (supported by Figure 2), multilingual synergistic benefits (Table 1), parallel pairing advantages (Figure 3-4, Table 3), and optimal allocation outperforming uniform (Figure 6) are all backed by experimental results." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims like 'parallel pairing significantly enhances cross-lingual abilities' are supported by controlled experiments comparing strategies under identical compute budgets and architectures. The experimental design isolates the variable being studied (Section 4.1 compares D_Li+D_Li vs D_Li+D_Lj)." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The Limitations section explicitly bounds generalization: 'only seven programming languages,' 'largest model reaches 14B parameters,' 'evaluation focuses on code translation and generation benchmarks,' and 'synergy coefficients are fitted to our specific corpus.'" 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper offers interpretations for findings (e.g., why Java-C# have high synergy) but does not discuss alternative explanations for the observed scaling patterns or consider confounding factors beyond the variables studied." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper's claims are well-matched to its measurements. It measures validation loss, Pass@1, and BLEU, and frames findings in terms of these specific metrics rather than making broader unmeasured claims about code quality or developer productivity." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Models are trained from scratch with specified architectures (LLaMA-2 style with SwiGLU, RoPE, MHA, RMSNorm) and exact parameter counts (10 sizes from 0.1B to 3.1B, plus 0.5B/1.5B/3B/7B for translation experiments). Since these are custom-trained models, architecture specification is appropriate." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "Section 5.4 mentions 'prompt-based concatenation' as a pre-training strategy but does not provide the actual prompt templates used. MultiPL-E evaluation prompts are not shown." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No learning rate, optimizer, batch size, warmup schedule, or other training hyperparameters are reported in the provided text. Only architecture and data volume are specified." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. This is a pre-training study." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper states 'We collect a high-quality training corpus' but provides no details on filtering, deduplication, or preprocessing steps applied to the training data." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "A dedicated 'Limitations' section appears at the end of the paper with five specific limitations discussed." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Limitations are specific to this study: 'only seven programming languages,' 'largest model reaches 14B parameters with 1T tokens, whereas state-of-the-art code LLMs exceed 100B,' 'synergy coefficients are fitted to our specific corpus; different data distributions may yield varying patterns.'" 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly states what is NOT shown: extending to low-resource languages, validation at extreme scales (>100B), complex tasks like program repair, and dynamic curriculum strategies." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data (training corpus, validation losses per run, fitted parameters datasets) is made available for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 5.1 describes the cross-lingual evaluation set construction: '50 Python files from GitHub' selected by 'three software engineers,' human annotators producing translations for 6 target languages. Training corpus composition is described (900B code + 100B FineWeb-Edu)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants are studied. The software engineers creating the evaluation set are annotators, not research subjects, and the data is model training runs." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from raw code data to training corpus is not documented. No filtering criteria, deduplication methods, or data processing steps are described." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding sources, grants, or sponsorships are disclosed in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are listed: Beihang University, Ubiquant (a quantitative finance firm), and Renmin University of China." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "Ubiquant is a quantitative finance company that likely uses code LLMs. Their financial interest in scaling law outcomes is not discussed. No funding disclosure makes independence assessment impossible." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial interest disclosures are present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "Models are trained from scratch on a custom corpus, but the paper does not state when the training data was collected or its temporal boundaries relative to MultiPL-E." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether MultiPL-E problems or solutions appear in the training corpus. Since they train from scratch with a custom corpus, they could verify this but do not." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "MultiPL-E is a public benchmark. The paper does not discuss whether its problems could appear in the training data." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are studied." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are studied." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are studied." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are studied." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are studied." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants are studied." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants are studied." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost or latency is reported for the trained models." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "The abstract states 'Equivalent to 336,000+ H800 hours' for the 1000+ experiments." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds. Each of the 1000+ experiments appears to be a single training run." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The total number of configurations is stated (420 for Section 3, 28 for Section 4) but it is not stated whether each configuration was run multiple times." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is described. Training hyperparameters (learning rate, optimizer, etc.) are not even reported, let alone search budgets." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The 'optimized allocation' in Section 6 is derived analytically from fitted scaling laws and synergy matrices, not cherry-picked from trial runs." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors train all models and evaluate them without acknowledging potential bias from implementing and tuning their own systems." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Performance is explicitly shown as a function of compute (model size and data size) throughout the paper. Figure 2 shows scaling surfaces. Section 6 compares strategies at identical compute budgets (400B tokens)." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "MultiPL-E is used without discussing whether Pass@1 on this benchmark adequately measures multilingual code generation capability." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. Models are evaluated directly." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. The training corpus could contain solutions to MultiPL-E problems that were published before data collection." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. The custom translation eval set is created by the authors, but overlap with training data is not analyzed." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether training and test data share structural similarities (e.g., same repositories, similar coding patterns)." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is described or applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Interpreted languages (e.g., Python) benefit more from increased model size and data than compiled languages (e.g., Rust), showing larger scaling exponents.", 365 "evidence": "Figure 2 shows fitted scaling parameters: Python has the highest αN and αD values, while Rust shows notably smaller exponents (Section 3.2).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Irreducible loss establishes a complexity ordering: C# < Java ≈ Rust < Go < TypeScript < JavaScript < Python.", 370 "evidence": "Fitted L∞ values from Figure 2 across 420 training runs with systematic variation in model size and data (Section 3.2).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Multilingual pre-training provides synergistic benefits, with Java-C# showing 20.5% improvement over monolingual baseline.", 375 "evidence": "Table 1 shows synergy gain matrix from 28 bilingual mixture experiments. Java-C# achieves validation loss of 0.718 vs 0.903 for self-repetition (Section 4.2).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Python suffers negative transfer when mixed with most other languages during pre-training.", 380 "evidence": "Table 1 / Section 4.2: negative synergy for Python with JavaScript (Δ = −0.009), TypeScript (−0.007), C# (−0.013), Go (−0.016), Rust (−0.021). Only Java provides positive synergy.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Parallel pairing (concatenating code with translations) significantly enhances cross-lingual abilities compared to random shuffling.", 385 "evidence": "Figure 3 shows parallel pairing achieves lower validation loss on unseen translation directions across all model sizes (0.2B-7B). Table 3 shows better MultiPL-E scores (Section 5.3-5.4).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Optimized token allocation achieves higher average performance across all PLs compared to uniform distribution under the same compute budget.", 390 "evidence": "Figure 6: optimized allocation achieves Pass@1 21.34 vs 19.84 baseline and BLEU 13.9 vs 13.3. Both trained on 400B tokens (Section 6.4).", 391 "supported": "moderate" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No error bars or variance across runs", 397 "detail": "Over 1000 experiments are conducted but each appears to be a single run. No variance, standard deviation, or confidence intervals are reported. Given that neural network training is stochastic, the fitted scaling parameters could be sensitive to random seed." 398 }, 399 { 400 "flag": "Missing training hyperparameters", 401 "detail": "Critical training details (learning rate, optimizer, batch size, warmup schedule) are absent from the paper. This makes reproduction impossible even if code and data were released." 402 }, 403 { 404 "flag": "Small final evaluation", 405 "detail": "The key validation claim (Section 6) compares only two 1.5B models on 400B tokens. The improvement in Pass@1 (19.84 → 21.34) and BLEU (13.3 → 13.9) is modest and based on single runs with no error bars." 406 }, 407 { 408 "flag": "No contamination analysis despite custom training", 409 "detail": "The authors train from scratch with a custom corpus, giving them full control and ability to verify train/test overlap with MultiPL-E, yet no contamination analysis is performed." 410 }, 411 { 412 "flag": "Industry affiliation not discussed", 413 "detail": "Ubiquant is a quantitative finance company. The substantial compute resources (336K H800 hours) likely came from them, but no funding disclosure or conflict of interest statement is present." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "Evaluating large language models trained on code", 419 "authors": ["Mark Chen", "Jerry Tworek"], 420 "year": 2021, 421 "arxiv_id": "2107.03374", 422 "relevance": "Foundational Codex/HumanEval paper establishing code generation evaluation methodology." 423 }, 424 { 425 "title": "Training compute-optimal large language models", 426 "authors": ["Jordan Hoffmann", "Sebastian Borgeaud"], 427 "year": 2022, 428 "arxiv_id": "2203.15556", 429 "relevance": "Chinchilla scaling law paper that this work extends to multilingual code." 430 }, 431 { 432 "title": "DeepSeek-Coder: When the large language model meets programming", 433 "authors": ["Daya Guo", "Qihao Zhu"], 434 "year": 2024, 435 "arxiv_id": "2401.14196", 436 "relevance": "Major code LLM trained on multiple programming languages, directly relevant to multilingual code pre-training." 437 }, 438 { 439 "title": "StarCoder: May the source be with you!", 440 "authors": ["Raymond Li", "Loubna Ben Allal"], 441 "year": 2023, 442 "arxiv_id": "2305.06161", 443 "relevance": "Large-scale multilingual code model pre-training, relevant to understanding code LLM development." 444 }, 445 { 446 "title": "Scaling laws for code: A more data-hungry regime", 447 "authors": ["Xianzhen Luo", "Wenzhen Zheng"], 448 "year": 2025, 449 "arxiv_id": "2510.08702", 450 "relevance": "Direct predecessor establishing code-specific scaling laws that this paper extends to multilingual settings." 451 }, 452 { 453 "title": "Scaling laws for neural language models", 454 "authors": ["Jared Kaplan", "Sam McCandlish"], 455 "year": 2020, 456 "arxiv_id": "2001.08361", 457 "relevance": "Foundational scaling laws paper for language models." 458 }, 459 { 460 "title": "Code Llama: Open foundation models for code", 461 "authors": ["Baptiste Roziere", "Jonas Gehring"], 462 "year": 2023, 463 "relevance": "Major open-source code LLM relevant to understanding multilingual code model training." 464 }, 465 { 466 "title": "Qwen2.5-Coder technical report", 467 "authors": ["Binyuan Hui", "Jian Yang"], 468 "year": 2024, 469 "arxiv_id": "2409.12186", 470 "relevance": "Recent code LLM with multilingual pre-training, co-authored by this paper's first author." 471 }, 472 { 473 "title": "Are emergent abilities of large language models a mirage?", 474 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 475 "year": 2023, 476 "relevance": "Challenges emergent abilities narrative relevant to scaling law interpretation." 477 }, 478 { 479 "title": "CodeBERT: A pre-trained model for programming and natural languages", 480 "authors": ["Zhangyin Feng", "Daya Guo"], 481 "year": 2020, 482 "relevance": "Early code pre-training work establishing the field this paper studies." 483 } 484 ] 485 }