scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29669B)
      1 {
      2   "paper": {
      3     "title": "TigerCoder: A Novel Suite of LLMs for Code Generation in Bangla",
      4     "authors": [
      5       "Nishat Raihan",
      6       "Antonios Anastasopoulos",
      7       "Marcos Zampieri"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2509.09101",
     12     "doi": "10.48550/arXiv.2509.09101"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "TigerCoder, the first dedicated Bangla code-generation LLM family (1B and 9B parameters), achieves 11-13% Pass@1 improvements over prior models on Python benchmarks, with larger gains on other programming languages. Machine-translating Bangla prompts to English does not improve code generation performance, contradicting a common assumption. Most existing LLMs exhibit large performance drops (roughly 20-70 percentage points) when coding prompts are in Bangla rather than English, with TigerLLM being the only prior model maintaining parity. The combined use of self-instruct, synthetic, and translated instruction datasets yields the best fine-tuning results.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository link provided in abstract footnote: https://github.com/mraihan-gmu/TigerCoder/. The paper states 'We open-source all resources to advance further Bangla LLM research.'"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper claims open-source release of all three Bangla-Code-Instruct datasets (SI, Syn, TE — 300K total) and the MBPP-Bangla benchmark (974 problems) via the same GitHub repository. Licensed under CC-BY-SA-4.0 per Appendix D."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Hardware is mentioned (NVIDIA A100 40GB, 80GB RAM, Google Colab) and Python 3.13.0 is stated for code validation, but no requirements.txt, Dockerfile, or library version listing is provided. The fine-tuning framework is not specified."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are described. Appendix D mentions 'a Colab notebook offers a plug-and-play demo' but no reproduction scripts or README commands for replicating main experiments are provided in the paper itself."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 3, 4, and 7 are point estimates with no confidence intervals, error bars, or uncertainty measures. Pass@K values are reported as single numbers."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims 'significant 11-18% performance gains' but no statistical significance tests (p-values, t-tests, bootstrap tests) are used. All comparative claims are based solely on comparing point estimates."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Table 7 includes explicit Δ columns showing improvement over baseline (e.g., TigerCoder 9B: +0.11 P@1 on mHumanEval, +0.13 on MBPP). Baseline values are available in Table 3 for context."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification for dataset sizes (100K per subset, 974 benchmark problems). The choice of 974 MBPP problems is inherited from the original benchmark but no power analysis or sample size reasoning is given for any evaluation."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Table 6 states Seed=42, suggesting single-run results with no spread measures."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Extensive baselines in Tables 3-4: proprietary models (GPT-3.5, GPT-4o-mini, Gemini Flash 2.5), multilingual open-source (LLaMA 3.2, Gemma 3, Pangea, Phi-4), and Bangla-specific models (Titu-LLM, Bong-LLaMA, Bangla-LLaMA, Bangla-Gemma, TigerLLM)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include recent models: Gemma 3 (2025), Gemini Flash 2.5 (2025), GPT-4o-mini (2024), LLaMA 3.2 (2024). These represent the current state of the art."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Figure 2 provides a thorough ablation study testing all 7 combinations of the three instruction datasets (SI, Syn, TE) for both 1B and 9B model sizes, showing individual and synergistic contributions."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Three variations of Pass@K are used (K=1, 10, 100) across two benchmarks (mHumanEval-Bangla, MBPP-Bangla) and five programming languages."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of model-generated code outputs. All evaluation is automated via test case execution (Pass@K). Human involvement was limited to benchmark creation (translation and verification), not system output evaluation."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The evaluation benchmarks (MBPP-Bangla, mHumanEval-Bangla) are entirely separate from the training data (Bangla-Code-Instruct). The fine-tuning data consists of instruction-code pairs, while the test benchmarks are independently curated problem sets."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by programming language in Appendix E (C++, Java, JavaScript, Ruby in addition to Python), by benchmark (mHumanEval vs MBPP), and by dataset combination (Figure 2). MBPP-Bangla also has five topical classes, though per-class results are not reported."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Figure 1 shows specific code keyword mistranslation examples causing failures in the MT approach. Section 5 provides analysis of why machine translation produces poor results. However, no error analysis of where TigerCoder itself fails."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "RQ2 (Section 5) is entirely a negative result: machine translation does not help Bangla code generation. The Syn dataset alone being less effective than SI or TE is also reported in Figure 2."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract claims 'significant 11-18% performance gains at Pass@1' but Table 7 shows the 9B model achieves 11% and 13% at Pass@1 (range: 11-13%, not 11-18%). The 18% gain only appears at Pass@100 (MBPP, 9B model). The 1B model's Pass@1 gains are only 5%. The abstract conflates Pass@1 and higher-K metrics."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The main causal claim is that curated datasets improve performance. The ablation study (Figure 2) provides controlled single-variable manipulation of dataset combinations, and the fine-tuning comparison against the TigerLLM base model is a reasonable before/after controlled intervention."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The conclusion extends to 'other low-resource languages' ('an effective and replicable blueprint for the future of efficient, high-performance LLM development for Bangla and other low-resource languages') but results are only on Bangla. The title is appropriately scoped, but the conclusion overclaims."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No discussion of alternative explanations. The strong TigerLLM base model's contribution vs. the fine-tuning data is not disentangled — would these datasets help other base models equally? Potential overfitting to MBPP-style tasks is not considered."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures Pass@K on code generation benchmarks and claims code generation capability. The measurements match the claims at appropriate granularity — they don't overframe Pass@K as 'developer productivity' or 'software quality.' The scope is clearly code generation on benchmarks."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Models are identified by marketing names only: 'GPT-4o-mini', 'GPT-3.5', 'Gemini-Flash 2.5', 'Claude-3.5-Sonnet'. No API version strings, snapshot dates, or specific model identifiers (e.g., 'gpt-4o-mini-2024-07-18') are provided for any model."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No actual prompt text is provided for model evaluation or data generation. Appendix B describes generation prompts in natural language ('Provide only the Python code', 'Ensure the code is self-contained') but does not reproduce the full prompt templates used."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Table 6 provides comprehensive fine-tuning hyperparameters for both 1B and 9B models: learning rate, batch size, gradient accumulation, epochs, optimizer, scheduler, warmup, precision. However, inference/generation parameters (temperature, top-p, number of samples n for Pass@K) are not reported."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. Models are evaluated directly on code generation tasks via standard inference."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Data curation is extensively documented: Section 3 and Appendix D for MBPP-Bangla (5-step pipeline), Appendix A for SI subset (seed prompts, self-instruction loop, filtering), Appendix B for Syn (BERTScore filtering), Appendix C for TE (MT + quality filtering with CometKiwi/BERTScore thresholds)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "A dedicated 'Limitations' section is present after the conclusion, discussing scope constraints of the current work."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The limitations section contains only generic future work statements: 'expanding dataset diversity and task complexity', 'investigating larger model architectures and additional programming languages.' No specific threats like benchmark contamination risk, single-seed fragility, or selection bias in the benchmark adaptation process are discussed."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The limitations mention the work 'focuses on Bangla code generation using 1B and 9B parameter models' but do not explicitly state what the results do NOT show. No statements about what claims are NOT being made or what populations/settings are excluded."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "All datasets (Bangla-Code-Instruct: 300K pairs, MBPP-Bangla: 974 problems) are claimed to be released via GitHub under CC-BY-SA-4.0. The JSONL format with task ID, prompts, reference codes, and test cases is specified in Appendix D."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Data collection is described in detail: MBPP-Bangla curation (Section 3, Appendix D), SI subset (5000 expert seeds → self-instruction, Appendix A), Syn subset (GPT-4o + Claude 3.5 generation, Appendix B), TE subset (Evol-Instruct translation, Appendix C)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "Translator/verifier qualifications are stated (native Bangla speakers, TOEFL >100, polyglot programmer) but how these individuals were recruited or selected is not described. The relationship to the authors (e.g., co-authors, lab members, hired annotators) is not disclosed."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Each dataset's pipeline is documented with filtering criteria and stages: SI (cosine similarity <0.95 filter, syntax+execution validation, Appendix A), Syn (BERTScore ≥0.7 diversity filter, syntax check, Appendix B), TE (CometKiwi QE >0.85 + BERTScore F1 >0.95, Appendix C). Counts at each stage are implied by the final 100K per subset."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding information, acknowledgments section, or grant numbers appear anywhere in the paper. The authors are at George Mason University but no funding sources are disclosed."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly stated: all three authors are at George Mason University, Fairfax, VA, USA. They are not evaluating a commercial product they are affiliated with."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding information is disclosed, making it impossible to assess funder independence. The use of GPT-4o and Claude 3.5 Sonnet APIs for data generation implies some financial expenditure, but the source of these funds is not stated."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial disclosure statement appears in the paper. The authors evaluate their own models (TigerLLM, TigerCoder) but no declaration regarding related IP or commercial interests is made."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff dates are stated for any evaluated model (GPT-3.5, GPT-4o-mini, Gemini Flash, LLaMA 3.2, etc.) or for the TigerLLM base model. This is critical since MBPP was published in 2021."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of potential train/test overlap. MBPP-Bangla is derived from MBPP (2021), and the English version of these problems could appear in the training data of all evaluated models. The Bangla translation may partially mitigate this but this is never discussed."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "MBPP was published in 2021 and is widely available online. All evaluated models were trained after 2021 and could have seen these problems in English. The paper uses this benchmark without any contamination analysis or acknowledgment of this risk."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in the study. Translators and verifiers are part of the research team creating the benchmark, not study subjects."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in the study. The paper has an ethical considerations section focused on translation quality and open-source release, not IRB matters."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants. Translator qualifications are described (native Bangla, TOEFL >100) but these are annotators, not study subjects."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in the study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference costs, latency, tokens consumed, or cost per example are reported. The paper proposes models for practical use but provides no cost information for inference."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section 7 states: 'We conduct finetuning on a single NVIDIA A100 (40GB) through Google Colab, supported by 80GB RAM and 256GB storage. The process completes in approximately 96 hours.' Hardware and training time are quantified."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Table 6 reports Seed=42 for both models, indicating a single seed. No seed sensitivity analysis or results across multiple seeds are provided."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is not explicitly stated. Seed=42 in Table 6 implies a single training run, but this is not confirmed. The number of samples per problem for Pass@K computation is also not stated."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Table 6 caption says 'Empirically selected hyperparameters' implying some search was conducted, but no search budget, number of configurations tried, or search method is described."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Figure 2 reports all 7 dataset combinations for both model sizes, transparently showing which combinations work best. The final model uses all three datasets (SI+Syn+TE) based on these results."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Many comparisons are made across models, datasets, benchmarks, and programming languages, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors evaluate their own models (TigerLLM, TigerCoder) against baselines. No acknowledgment that self-evaluation may introduce bias, and no independent evaluation is mentioned."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper highlights that TigerCoder 1B 'surpasses systems 27× larger' but does not report performance as a function of compute budget. No compute-normalized comparisons or efficiency curves are provided."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No discussion of whether Pass@K on MBPP-Bangla and mHumanEval-Bangla actually measures real-world Bangla code generation capability. The benchmarks test basic-to-intermediate programming problems, which may not represent practical coding needs."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is used. All models are evaluated via direct inference on code generation benchmarks."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Not discussed. MBPP was published in 2021 and all evaluated models were trained after 2021. The English problems could be memorized by models, and the Bangla translation does not guarantee protection if models can cross-lingually transfer."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Not discussed. No analysis of whether evaluation setup provides information beyond what would be available in real Bangla code generation use cases."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Not discussed. The fine-tuning data (Bangla-Code-Instruct-TE) derives from Evol-Instruct, and the test benchmarks derive from MBPP. Whether there is topical overlap between Evol-Instruct and MBPP is not analyzed."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection or prevention method is used. No canary strings, decontamination pipeline, n-gram overlap analysis, or membership inference tests."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "TigerCoder achieves 11-18% Pass@1 improvements over existing multilingual and general-purpose Bangla LLMs",
    369       "evidence": "Table 7 shows 9B model gains of Δ=0.11 (mHumanEval P@1) and Δ=0.13 (MBPP P@1). The 1B model gains are only Δ=0.05 at P@1. The 18% figure appears only at P@100 (MBPP 9B: Δ=0.18), not at P@1 as the abstract claims.",
    370       "supported": "weak"
    371     },
    372     {
    373       "claim": "Machine-translating Bangla prompts to English does not improve code generation performance",
    374       "evidence": "Table 4 shows MT variants produce similar or worse results compared to direct Bangla prompts across all models and benchmarks. Figure 1 provides qualitative analysis of keyword mistranslation causing failures.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Most LLMs exhibit significant performance drops when coding prompts are in Bangla rather than English",
    379       "evidence": "Table 3 shows consistent drops across all models except TigerLLM: LLaMA drops from 0.73 to 0.15 on mHumanEval P@1, Phi-4 from 0.79 to 0.10, GPT-3.5 from 0.79 to 0.56.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Curated high-quality datasets empower smaller models to overcome low-resource limitations, challenging the notion that scale alone drives performance",
    384       "evidence": "Table 7 shows TigerCoder 1B (with curated data) outperforms Gemma-3 27B on Bangla benchmarks. Figure 2's ablation shows dataset combination effects.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Combined use of all three instruction datasets (SI+Syn+TE) yields the best fine-tuning results",
    389       "evidence": "Figure 2 shows all-three combination achieves top scores across both benchmarks and model sizes, with 9B reaching 0.82 P@1 on MBPP-Bangla.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Abstract overclaiming",
    396       "detail": "The abstract claims '11-18% performance gains at Pass@1' but the 18% gain is at Pass@100, not Pass@1. The 1B model achieves only 5% at Pass@1. The actual Pass@1 range for the 9B model is 11-13%."
    397     },
    398     {
    399       "flag": "No error bars or uncertainty quantification",
    400       "detail": "All results are from a single seed (seed=42) with no variance, confidence intervals, or repeated runs. Pass@K values can vary significantly with different random seeds and generation temperatures, neither of which is explored."
    401     },
    402     {
    403       "flag": "Benchmark contamination risk unaddressed",
    404       "detail": "MBPP was published in 2021 and is widely available. All evaluated models were trained after this date and could have memorized the English problems. Translation to Bangla does not eliminate contamination risk via cross-lingual transfer, and this is never discussed."
    405     },
    406     {
    407       "flag": "Self-evaluation bias",
    408       "detail": "The authors evaluate their own models (TigerLLM and TigerCoder) against external baselines. TigerLLM is presented as the strongest prior baseline. No independent evaluation or acknowledgment of self-evaluation bias."
    409     },
    410     {
    411       "flag": "Missing inference parameters",
    412       "detail": "Temperature, top-p, number of samples per problem (n), and other generation parameters for Pass@K evaluation are not reported. These critically affect Pass@K scores, especially at higher K values."
    413     },
    414     {
    415       "flag": "Generalization overclaim",
    416       "detail": "The conclusion claims results provide 'an effective and replicable blueprint for the future of efficient, high-performance LLM development for Bangla and other low-resource languages' but only Bangla was tested. No evidence that the approach transfers to other languages."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Evaluating large language models trained on code",
    422       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    423       "year": 2021,
    424       "arxiv_id": "2107.03374",
    425       "relevance": "Introduced HumanEval benchmark, foundational for code generation evaluation including this paper's mHumanEval-Bangla adaptation."
    426     },
    427     {
    428       "title": "Program synthesis with large language models",
    429       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    430       "year": 2021,
    431       "arxiv_id": "2108.07732",
    432       "relevance": "Introduced MBPP benchmark, the direct source of this paper's MBPP-Bangla benchmark."
    433     },
    434     {
    435       "title": "mHumanEval - a multilingual benchmark to evaluate large language models for code generation",
    436       "authors": ["Nishat Raihan", "Antonios Anastasopoulos", "Marcos Zampieri"],
    437       "year": 2025,
    438       "relevance": "Prior work by same authors creating multilingual code generation benchmarks; provides mHumanEval-Bangla used as evaluation benchmark."
    439     },
    440     {
    441       "title": "Qwen2.5-coder technical report",
    442       "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"],
    443       "year": 2024,
    444       "arxiv_id": "2409.12186",
    445       "relevance": "State-of-the-art code LLM demonstrating advances in code generation capabilities."
    446     },
    447     {
    448       "title": "Cross-lingual generalization through multitask finetuning",
    449       "authors": ["Niklas Muennighoff", "Thomas Wang", "Lintang Sutawika"],
    450       "year": 2023,
    451       "relevance": "Documents performance gaps in multilingual code generation, showing models perform poorly on non-English code tasks."
    452     },
    453     {
    454       "title": "Self-instruct: Aligning language models with self-generated instructions",
    455       "authors": ["Yizhong Wang", "Yeganeh Kordi", "Swaroop Mishra"],
    456       "year": 2023,
    457       "relevance": "Core methodology used for creating the Bangla-Code-Instruct-SI dataset; key technique for LLM data curation."
    458     },
    459     {
    460       "title": "WizardLM: Empowering large language models to follow complex instructions",
    461       "authors": ["Can Xu", "Qingfeng Sun", "Kai Zheng"],
    462       "year": 2023,
    463       "arxiv_id": "2304.12244",
    464       "relevance": "Source of Evol-Instruct dataset used as basis for the Bangla-Code-Instruct-TE translated subset."
    465     },
    466     {
    467       "title": "The llama 3 herd of models",
    468       "authors": ["Abhimanyu Dubey", "Abhinav Jauhri"],
    469       "year": 2024,
    470       "arxiv_id": "2407.21783",
    471       "relevance": "LLaMA 3 model evaluated as baseline; major open-source multilingual LLM with minimal Bangla support."
    472     },
    473     {
    474       "title": "Challenges to using large language models in code generation and repair",
    475       "authors": ["Liliana Pasquale", "Antonino Sabetta", "Marcelo d'Amorim"],
    476       "year": 2025,
    477       "relevance": "Discusses limitations and challenges of LLMs for code generation in software engineering."
    478     },
    479     {
    480       "title": "Scaling laws for neural language models",
    481       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    482       "year": 2020,
    483       "arxiv_id": "2001.08361",
    484       "relevance": "TigerCoder claims to challenge the 'scale alone drives performance' thesis from this foundational work."
    485     },
    486     {
    487       "title": "BabelCode: LLM as a polyglot programmer",
    488       "authors": ["Tianyi Wang", "Yang Ye", "Panupong Pasupat"],
    489       "year": 2023,
    490       "arxiv_id": "2303.03845",
    491       "relevance": "Multilingual code generation evaluation framework relevant to the multi-PL evaluation in this work."
    492     }
    493   ]
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs