scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25546B)
      1 {
      2   "paper": {
      3     "title": "Scaling Laws for Code: Every Programming Language Matters",
      4     "authors": ["Jian Yang", "Shawn Guo", "Lin Jing", "Wei Zhang", "Aishan Liu", "Chuan Hao", "Zhoujun Li", "Wayne Xin Zhao", "Xianglong Liu", "Weifeng Lv", "Bryan Dai"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.13472",
      8     "doi": "10.48550/arXiv.2512.13472"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Different programming languages exhibit distinct scaling behaviors: interpreted languages (e.g., Python) show larger scaling exponents than compiled languages (e.g., Rust), and irreducible loss orders languages by intrinsic complexity (C# < Java ≈ Rust < Go < TypeScript < JavaScript < Python). Multilingual pre-training provides synergistic benefits for most languages, with syntactically similar pairs (Java-C#) showing the largest gains (20.5% improvement). Parallel pairing of code translations significantly enhances cross-lingual capabilities with favorable scaling properties. A proportion-dependent multilingual scaling law enables optimal token allocation that outperforms uniform distribution.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code archive, or release link found anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The training corpus and custom translation evaluation set are described but no download links or public release is provided."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper describes model architecture (LLaMA-2 style with SwiGLU, RoPE, MHA, RMSNorm) but provides no environment specifications, dependency lists, or library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions, scripts, or step-by-step guides are provided."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables 1 and 3 and Figure 6 report point estimates only. No confidence intervals or error bars are provided for any result."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Claims like 'parallel pairing significantly outperforms baseline' and '20.5% improvement' are made based on comparing numbers directly, with no statistical significance tests applied."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Percentage improvements with baseline context are consistently reported, e.g., 'Java-C# combination achieves validation loss of 0.718 compared to 0.903 for Java self-repetition—a remarkable 20.5% improvement' (Section 4.2). Table 1 shows absolute values and relative improvement percentages."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why 10 model sizes, 6 token budgets, or 7 programming languages were chosen. The choices appear pragmatic but are not justified."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Each experimental configuration appears to be a single training run. No variance, standard deviation, or spread measures across runs are reported."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include: monolingual self-repetition baseline (Section 4), random shuffling baseline (Section 5), and uniform allocation baseline (Section 6)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines represent standard practice (uniform allocation, monolingual training). The paper also references the recent code scaling law work [19] (2025). The baselines are appropriate for this type of study."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The study systematically varies individual factors: language-specific vs. mixed training (Section 4), different data organization strategies (random shuffling vs. parallel pairing, Section 5), and uniform vs. optimized allocation (Section 6)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Evaluation uses validation loss (cross-entropy), Pass@1 on MultiPL-E, and BLEU score for code translation (Table 3, Figure 6)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of generated code or translations. Evaluation is entirely automated (loss, Pass@1, BLEU)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 5.1 describes a carefully curated held-out evaluation set: '50 Python files from GitHub' with manual translations to 6 target languages, yielding 2,100 translation instances. MultiPL-E is a separate benchmark."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per programming language throughout (Table 1 synergy matrix, Table 3 per-language MultiPL-E, Figure 6 per-language Pass@1 and BLEU). Figure 4 shows per-direction translation results."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses where multilingual training hurts: 'when Python is the target PL, mixing with most auxiliary PL produces small negative effects' (Section 4.2). Specific negative synergy values are reported."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Python's negative transfer from most languages is reported explicitly: 'JavaScript (Δ = −0.009), TypeScript (Δ = −0.007), C# (Δ = −0.013), Go (Δ = −0.016), and Rust (Δ = −0.021)' (Section 4.2)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about interpreted vs. compiled language scaling (supported by Figure 2), multilingual synergistic benefits (Table 1), parallel pairing advantages (Figure 3-4, Table 3), and optimal allocation outperforming uniform (Figure 6) are all backed by experimental results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims like 'parallel pairing significantly enhances cross-lingual abilities' are supported by controlled experiments comparing strategies under identical compute budgets and architectures. The experimental design isolates the variable being studied (Section 4.1 compares D_Li+D_Li vs D_Li+D_Lj)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The Limitations section explicitly bounds generalization: 'only seven programming languages,' 'largest model reaches 14B parameters,' 'evaluation focuses on code translation and generation benchmarks,' and 'synergy coefficients are fitted to our specific corpus.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper offers interpretations for findings (e.g., why the Java-C# pair shows high synergy) but does not discuss alternative explanations for the observed scaling patterns or consider confounding factors beyond the variables studied."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's claims are well-matched to its measurements. It measures validation loss, Pass@1, and BLEU, and frames findings in terms of these specific metrics rather than making broader unmeasured claims about code quality or developer productivity."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Models are trained from scratch with specified architectures (LLaMA-2 style with SwiGLU, RoPE, MHA, RMSNorm) and exact parameter counts (10 sizes from 0.1B to 3.1B, plus 0.5B/1.5B/3B/7B for translation experiments). Since these are custom-trained models, architecture specification is appropriate."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Section 5.4 mentions 'prompt-based concatenation' as a pre-training strategy but does not provide the actual prompt templates used. MultiPL-E evaluation prompts are not shown."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No learning rate, optimizer, batch size, warmup schedule, or other training hyperparameters are reported in the provided text. Only architecture and data volume are specified."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. This is a pre-training study."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper states 'We collect a high-quality training corpus' but provides no details on filtering, deduplication, or preprocessing steps applied to the training data."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section appears at the end of the paper with five specific limitations discussed."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Limitations are specific to this study: 'only seven programming languages,' 'largest model reaches 14B parameters with 1T tokens, whereas state-of-the-art code LLMs exceed 100B,' 'synergy coefficients are fitted to our specific corpus; different data distributions may yield varying patterns.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states what is NOT shown: extending to low-resource languages, validation at extreme scales (>100B), complex tasks like program repair, and dynamic curriculum strategies."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data (training corpus, per-run validation losses, or datasets of fitted parameters) is made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.1 describes the cross-lingual evaluation set construction: '50 Python files from GitHub' selected by 'three software engineers,' human annotators producing translations for 6 target languages. Training corpus composition is described (900B code + 100B FineWeb-Edu)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are studied. The software engineers creating the evaluation set are annotators, not research subjects, and the study's data come from model training runs."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from raw code data to training corpus is not documented. No filtering criteria, deduplication methods, or data processing steps are described."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding sources, grants, or sponsorships are disclosed in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Beihang University, Ubiquant (a quantitative finance firm), and Renmin University of China."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Ubiquant is a quantitative finance company that likely uses code LLMs. Its financial interest in scaling-law outcomes is not discussed, and the absence of any funding disclosure makes it impossible to assess independence."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial interest disclosures are present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Models are trained from scratch on a custom corpus, but the paper does not state when the training data was collected or its temporal boundaries relative to MultiPL-E."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether MultiPL-E problems or solutions appear in the training corpus. Since they train from scratch with a custom corpus, they could verify this but do not."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "MultiPL-E is a public benchmark. The paper does not discuss whether its problems could appear in the training data."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are studied."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are studied."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are studied."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are studied."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are studied."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are studied."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are studied."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost or latency is reported for the trained models."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "The abstract states 'Equivalent to 336,000+ H800 hours' for the 1000+ experiments."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds. Each of the 1000+ experiments appears to be a single training run."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The total number of configurations is stated (420 for Section 3, 28 for Section 4) but it is not stated whether each configuration was run multiple times."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is described. Training hyperparameters (learning rate, optimizer, etc.) are not even reported, let alone search budgets."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The 'optimized allocation' in Section 6 is derived analytically from fitted scaling laws and synergy matrices, not cherry-picked from trial runs."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors train all models and evaluate them without acknowledging potential bias from implementing and tuning their own systems."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Performance is explicitly shown as a function of compute (model size and data size) throughout the paper. Figure 2 shows scaling surfaces. Section 6 compares strategies at identical compute budgets (400B tokens)."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "MultiPL-E is used without discussing whether Pass@1 on this benchmark adequately measures multilingual code generation capability."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. Models are evaluated directly."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. The training corpus could contain solutions to MultiPL-E problems that were published before data collection."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information. The custom translation eval set is created by the authors, but overlap with training data is not analyzed."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether training and test data share structural similarities (e.g., same repositories, similar coding patterns)."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is described or applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Interpreted languages (e.g., Python) benefit more from increased model size and data than compiled languages (e.g., Rust), showing larger scaling exponents.",
    365       "evidence": "Figure 2 shows fitted scaling parameters: Python has the highest αN and αD values, while Rust shows notably smaller exponents (Section 3.2).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Irreducible loss establishes a complexity ordering: C# < Java ≈ Rust < Go < TypeScript < JavaScript < Python.",
    370       "evidence": "Fitted L∞ values from Figure 2 across 420 training runs with systematic variation in model size and data (Section 3.2).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Multilingual pre-training provides synergistic benefits, with Java-C# showing 20.5% improvement over monolingual baseline.",
    375       "evidence": "Table 1 shows synergy gain matrix from 28 bilingual mixture experiments. Java-C# achieves validation loss of 0.718 vs 0.903 for self-repetition (Section 4.2).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Python suffers negative transfer when mixed with most other languages during pre-training.",
    380       "evidence": "Table 1 / Section 4.2: negative synergy for Python with JavaScript (Δ = −0.009), TypeScript (−0.007), C# (−0.013), Go (−0.016), Rust (−0.021). Only Java provides positive synergy.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Parallel pairing (concatenating code with translations) significantly enhances cross-lingual abilities compared to random shuffling.",
    385       "evidence": "Figure 3 shows parallel pairing achieves lower validation loss on unseen translation directions across all model sizes (0.2B-7B). Table 3 shows better MultiPL-E scores (Section 5.3-5.4).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Optimized token allocation achieves higher average performance across all PLs compared to uniform distribution under the same compute budget.",
    390       "evidence": "Figure 6: optimized allocation achieves Pass@1 21.34 vs 19.84 baseline and BLEU 13.9 vs 13.3. Both trained on 400B tokens (Section 6.4).",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No error bars or variance across runs",
    397       "detail": "Over 1000 experiments are conducted but each appears to be a single run. No variance, standard deviation, or confidence intervals are reported. Given that neural network training is stochastic, the fitted scaling parameters could be sensitive to random seed."
    398     },
    399     {
    400       "flag": "Missing training hyperparameters",
    401       "detail": "Critical training details (learning rate, optimizer, batch size, warmup schedule) are absent from the paper. This would prevent faithful reproduction even if code and data were released."
    402     },
    403     {
    404       "flag": "Small final evaluation",
    405       "detail": "The key validation claim (Section 6) compares only two 1.5B models on 400B tokens. The improvement in Pass@1 (19.84 → 21.34) and BLEU (13.3 → 13.9) is modest and based on single runs with no error bars."
    406     },
    407     {
    408       "flag": "No contamination analysis despite custom training",
    409       "detail": "The authors train from scratch with a custom corpus, giving them full control and ability to verify train/test overlap with MultiPL-E, yet no contamination analysis is performed."
    410     },
    411     {
    412       "flag": "Industry affiliation not discussed",
    413       "detail": "Ubiquant is a quantitative finance company. The substantial compute resources (336K H800 hours) likely came from them, but no funding disclosure or conflict of interest statement is present."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Evaluating large language models trained on code",
    419       "authors": ["Mark Chen", "Jerry Tworek"],
    420       "year": 2021,
    421       "arxiv_id": "2107.03374",
    422       "relevance": "Foundational Codex/HumanEval paper establishing code generation evaluation methodology."
    423     },
    424     {
    425       "title": "Training compute-optimal large language models",
    426       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud"],
    427       "year": 2022,
    428       "arxiv_id": "2203.15556",
    429       "relevance": "Chinchilla scaling law paper that this work extends to multilingual code."
    430     },
    431     {
    432       "title": "DeepSeek-Coder: When the large language model meets programming",
    433       "authors": ["Daya Guo", "Qihao Zhu"],
    434       "year": 2024,
    435       "arxiv_id": "2401.14196",
    436       "relevance": "Major code LLM trained on multiple programming languages, directly relevant to multilingual code pre-training."
    437     },
    438     {
    439       "title": "StarCoder: May the source be with you!",
    440       "authors": ["Raymond Li", "Loubna Ben Allal"],
    441       "year": 2023,
    442       "arxiv_id": "2305.06161",
    443       "relevance": "Large-scale multilingual code model pre-training, relevant to understanding code LLM development."
    444     },
    445     {
    446       "title": "Scaling laws for code: A more data-hungry regime",
    447       "authors": ["Xianzhen Luo", "Wenzhen Zheng"],
    448       "year": 2025,
    449       "arxiv_id": "2510.08702",
    450       "relevance": "Direct predecessor establishing code-specific scaling laws that this paper extends to multilingual settings."
    451     },
    452     {
    453       "title": "Scaling laws for neural language models",
    454       "authors": ["Jared Kaplan", "Sam McCandlish"],
    455       "year": 2020,
    456       "arxiv_id": "2001.08361",
    457       "relevance": "Foundational scaling laws paper for language models."
    458     },
    459     {
    460       "title": "Code Llama: Open foundation models for code",
    461       "authors": ["Baptiste Roziere", "Jonas Gehring"],
    462       "year": 2023,
    463       "relevance": "Major open-source code LLM relevant to understanding multilingual code model training."
    464     },
    465     {
    466       "title": "Qwen2.5-Coder technical report",
    467       "authors": ["Binyuan Hui", "Jian Yang"],
    468       "year": 2024,
    469       "arxiv_id": "2409.12186",
    470       "relevance": "Recent code LLM with multilingual pre-training, co-authored by this paper's first author."
    471     },
    472     {
    473       "title": "Are emergent abilities of large language models a mirage?",
    474       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    475       "year": 2023,
    476       "relevance": "Challenges emergent abilities narrative relevant to scaling law interpretation."
    477     },
    478     {
    479       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    480       "authors": ["Zhangyin Feng", "Daya Guo"],
    481       "year": 2020,
    482       "relevance": "Early code pre-training work establishing the field this paper studies."
    483     }
    484   ]
    485 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs