scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30426B)
      1 {
      2   "paper": {
      3     "title": "StarCoder: may the source be with you!",
      4     "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi", "Niklas Muennighoff", "Denis Kocetkov", "Chenghao Mou", "Marc Marone", "Christopher Akiki", "Jia Li", "Jenny Chim", "Qian Liu", "Evgenii Zheltonozhskii", "Terry Yue Zhuo", "Thomas Wang", "Olivier Dehaene", "Mishig Davaadorj", "Joel Lamy-Poirier", "João Monteiro", "Oleh Shliazhko", "Nicolas Gontier", "Nicholas Meade", "Armel Zebaze", "Ming-Ho Yee", "Logesh Kumar Umapathi", "Jian Zhu", "Benjamin Lipkin", "Muhtasham Oblokulov", "Zhiruo Wang", "Rudra Murthy", "Jason Stillerman", "Siva Sankalp Patel", "Dmitry Abulkhanov", "Marco Zocca", "Manan Dey", "Zhihan Zhang", "Nour Fahmy", "Urvashi Bhattacharyya", "Wenhao Yu", "Swayam Singh", "Sasha Luccioni", "Paulo Villegas", "Maxim Kunakov", "Fedor Zhdanov", "Manuel Romero", "Tony Lee", "Nadav Timor", "Jennifer Ding", "Claire Schlesinger", "Hailey Schoelkopf", "Jan Ebert", "Tri Dao", "Mayank Mishra", "Alex Gu", "Jennifer Robinson", "Carolyn Jane Anderson", "Brendan Dolan-Gavitt", "Danish Contractor", "Siva Reddy", "Daniel Fried", "Dzmitry Bahdanau", "Yacine Jernite", "Carlos Muñoz Ferrandis", "Sean Hughes", "Thomas Wolf", "Arjun Guha", "Leandro von Werra", "Harm de Vries"],
      5     "year": 2023,
      6     "venue": "Transactions on Machine Learning Research",
      7     "arxiv_id": "2305.06161"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": true,
     16         "justification": "Section 11 states 'open-sourcing all code repositories for building the model on GitHub.' The Code LM Evaluation Harness is also released (Section 6). Models are released on Hugging Face."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The training data is The Stack v1.2 (Kocetkov et al., 2022), which is publicly available. The PII annotation dataset is available under gated access at https://hf.co/BigCode (Section 4.3)."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Hardware is described (512 A100 GPUs, Section 5.6) but software dependencies and library versions are not specified in sufficient detail for reproduction."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No step-by-step reproduction instructions or README with commands are provided in the paper. Training hyperparameters are given but there are no runnable scripts or instructions to replicate the training or evaluation."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Results throughout (Tables 12-27) are reported as point estimates without confidence intervals or error bars."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper makes numerous comparative claims ('StarCoder outperforms every open Code LLM') based solely on comparing numbers without any statistical significance tests."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Results are reported with absolute scores and baselines in context (e.g., 'StarCoder 33.6 vs code-cushman-001 33.5 on HumanEval', Table 12), allowing readers to assess the magnitude of differences."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The number of samples for pass@k estimation (n=200 for open models, n=20 for API models) is stated but not justified. No power analysis or discussion of whether these sample sizes are sufficient."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No variance, standard deviation, or spread measures are reported across runs. Table D.3 shows per-problem pass counts vary dramatically across checkpoints (e.g., 5 to 197) but no systematic variance reporting is done for the main results."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Extensive baselines including CodeGen-16B-Multi, CodeGen-16B-Mono, CodeGeeX, code-cushman-001, LLaMA family, PaLM, LaMDA (Tables 12-27)."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Baselines include contemporary models like LLaMA (2023), CodeGeeX (2023), and code-cushman-001. These were state-of-the-art at the time of evaluation."
     71       },
     72       "ablation_study": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No ablation study examining which architectural choices or training data components contribute to performance. StarCoder vs StarCoderBase shows the effect of Python fine-tuning but no systematic component ablation."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Multiple metrics are used: pass@1 on HumanEval/MBPP/DS-1000/MultiPL-E, BLEU for docstring generation, perplexity for long contexts, F1 for PII detection, stereotype/ICAT scores for bias, toxicity scores."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No human evaluation of the model's code generation quality. All evaluations are automated (pass@k on test cases, automated metrics). Section 8 has qualitative examples but no systematic human evaluation."
     86       },
     87       "held_out_test_set": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Training data was decontaminated by removing files containing docstrings or solutions from HumanEval, MBPP, APPS, GSM8K, and DS1000 (Section 5.2). Perplexity evaluation used GPL-licensed repos not in the training data (Section 6.4)."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Extensive per-category breakdowns: DS-1000 results by library (Table 13), MultiPL-E results by programming language (Table 15), StereoSet by bias domain (Table 25), PII detection by entity type (Tables 5, 8, 9)."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 6.3 discusses performance instability for low-resource languages (R's pass@1 drops between checkpoints). Table D.3 shows high variance in per-problem pass rates. Section E.3 discusses empty/pass solutions. Section 6.2.2 discusses security vulnerabilities."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Several negative results: performance drops for low-resource languages during training (Section 6.3, Figure 2), Ruby and Racket underperformance for StarCoder vs StarCoderBase (Table 15), higher toxicity than CodeGen (Table 26), the prompt that boosts StarCoder hurts CodeGen (Section E.3)."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Abstract claims about outperforming open Code LLMs and matching code-cushman-001 are supported by Tables 12-15. Claims about PII redaction are supported by Section 4. Attribution tool claims are supported by Section 9."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper makes implicit causal claims (e.g., Python fine-tuning 'resulting in the creation of StarCoder' which outperforms on Python) but lacks controlled ablations. The claim that code+NL training contributes to reasoning (Section 7.4: 'we speculate that the mixture of code and natural language...contributes') is acknowledged as speculation."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Claims are generally bounded to the tested settings. The paper specifies exact benchmarks and languages tested. Section 10.2 notes English-only evaluations as a limitation. Claims are made per-benchmark rather than about 'code generation' in general."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "Alternative explanations for performance differences are not systematically discussed. For example, the paper does not consider whether StarCoder's advantage comes from data quality, data quantity, architecture, or training approach. Section 6.3 notes performance variance but does not explain it."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper appropriately uses benchmark-specific metrics (pass@1, BLEU, F1) without overclaiming broader capabilities like 'programming ability.' Section 6.1.2 explicitly discusses how HumanEval/MBPP are 'simple programming puzzles that are not representative of the code that most programmers write' and uses DS-1000 as a more realistic complement."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Model sizes and architectures are specified in detail (Table 11). For external models: CodeGen-16B-Multi, CodeGen-16B-Mono, CodeGeeX-13B, code-cushman-001 (12B) are named with sizes. LLaMA sizes (7B, 13B, 33B, 65B) are specified."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Prompt templates are provided in Section 5.1 (training format), Section E.3 (HumanEval prompting), and Appendix F (full technical assistant prompt). The evaluation harness uses standard benchmark prompts which are publicly available."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Training hyperparameters detailed in Section 5.5 (learning rate, batch size, Adam parameters, warmup). Evaluation hyperparameters: temperature 0.2 for pass@1, temperature 0.8 for k>1, n=200 samples (Section 6.1.1). DS-1000: temperature=0.2, top_p=0.5, max_length=1024 (Table 13)."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. StarCoder is evaluated as a standalone code generation model."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 3 provides extensive documentation of data curation: language selection criteria (Section 3.1), visual inspection with 18 annotators (Section 3.1), XML/alpha/HTML/JSON/YAML filters with specific thresholds, deduplication pipeline (Section 3.5), Git commit filters (Table 4), and decontamination (Section 5.2)."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 10.2 'Limitations' provides substantive discussion covering dataset licensing limitations, opt-out process gaps, PII detection limitations, malicious code risks, model limitations, and English-only evaluations."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 10.2 discusses specific threats: the license detector may have misclassified repositories, PII model has false positives/negatives that vary across languages, duplicate code in other repos can bypass opt-out, potential training data overlap in some evaluations (Tables 18, 20)."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 10.2 explicitly states: 'We evaluated the performance of StarCoder solely on English-based benchmarks.' Specific attribution tool limitations are stated: 'will not find matches to code that was not included or that was removed from the dataset.' Code attribution tools 'do not attempt to distinguish between generic code or protected content.'"
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The training data (The Stack v1.2) is publicly available. The PII annotation dataset is available under gated access. Models are publicly released on Hugging Face."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 3 describes data collection in extensive detail: source (permissively licensed GitHub repos via The Stack), language selection criteria, filtering procedures, deduplication methods, and data volumes (Tables 1-2)."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "PII annotation recruitment is described in Section 4.1: 1,399 crowd-workers from 35 countries via Toloka platform, compensation of $7.30/hour based on purchasing power parity analysis, average 3.1 hours worked. Data inspection used 18 community annotators (Section 3.1)."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The full pipeline is documented with volumes at each stage: Tables 1-2 show file counts and volumes after deduplication and after filtering. Section 3.3 describes GitHub issues filtering with volume removed at each step (18%, 17%, 14%). Git commit filters in Table 4."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Acknowledgements section states: 'We would like to thank Hugging Face for providing the compute resources to train the StarCoder models.' NSF grants SES-2326174 and CCF-2102288 and an Adams Fellowship are also disclosed."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are listed in detail: Hugging Face, ServiceNow Research, multiple universities and research labs. The project is co-stewarded by two industry research labs (Section 1)."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "Hugging Face provided compute and many authors are affiliated with Hugging Face. Hugging Face hosts The Stack dataset and the StarCoder models, giving them a commercial interest in the models performing well."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests statement is provided. Several authors are from companies (Hugging Face, ServiceNow, IBM Research, etc.) that could benefit commercially from StarCoder's success."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper does not state a specific training data cutoff date. The Stack v1.2 is referenced but no date range for when the GitHub data was collected is provided in this paper."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Section 5.2 describes explicit decontamination: removing files containing docstrings/solutions from HumanEval, MBPP, APPS, GSM8K, and DS1000. The paper also acknowledges potential overlap for type prediction and docstring benchmarks (Tables 18, 20)."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "Section 5.2 directly addresses contamination by decontaminating the training data against HumanEval, MBPP, APPS, GSM8K, and DS1000. The paper notes that some evaluation datasets (type prediction, docstring generation) 'may overlap with the training data' (Tables 18, 20)."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human subjects study. The PII annotation is data collection, not a study of human participants."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human subjects study requiring IRB approval."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human subjects study. Crowd-worker demographics (countries) are reported for the annotation task but this is data collection, not a human study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human subjects study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human subjects study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human subjects study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human subjects study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No inference cost or latency is reported for StarCoder despite evaluating it extensively across benchmarks."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Section 5.6: 512 A100 80GB GPUs across 64 nodes. Section 5.7: 320,256 GPU-hours for StarCoderBase, 16.68 tonnes CO2eq. StarCoder fine-tuning adds 3.5% training time (0.58 tonnes CO2eq). PII detection: 800 GPU-hours (Section 4.3)."
    287       }
    288     },
    289     "experimental_rigor": {
    290       "seed_sensitivity_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No results reported across multiple random seeds. Table D.3 shows dramatic per-problem variance across training checkpoints for R but this is training dynamics, not seed sensitivity of the final model evaluation."
    294       },
    295       "number_of_runs_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Section 6.1.1 states n=200 samples for open models, n=20 for API models. DS-1000 reports 'mean pass@1 accuracy averaged over 40 samples' (Table 13). Security benchmark: 25 completions per scenario (Section 6.2.2)."
    299       },
    300       "hyperparameter_search_budget": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No hyperparameter search budget is reported. The training hyperparameters appear chosen but no search procedure or budget is described."
    304       },
    305       "best_config_selection_justified": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No justification for why the specific architecture size (15.5B) or training configuration was chosen. No comparison of configurations tried."
    309       },
    310       "multiple_comparison_correction": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No statistical tests are performed, so no multiple comparison correction is applied either. The paper compares across 19 languages and many benchmarks without any correction."
    314       },
    315       "self_comparison_bias_addressed": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The authors developed StarCoder and evaluate it against baselines using their own evaluation harness (Code LM Evaluation Harness, Section 6). There is no acknowledgment of self-comparison bias. However, they do use standard benchmarks, which mitigates this somewhat."
    319       },
    320       "compute_budget_vs_performance": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "No comparison at matched compute budgets. StarCoder (15.5B) is compared against models of very different sizes (1.1B to 540B) without normalizing for compute. Table D.1 shows code-davinci-002 (175B) substantially outperforms StarCoderBase but compute difference is not discussed."
    324       },
    325       "benchmark_construct_validity": {
    326         "applies": true,
    327         "answer": true,
    328         "justification": "Section 6.1.2 explicitly discusses construct validity: 'A major limitation of HumanEval and MBPP is that they are simple programming puzzles that are not representative of the code that most programmers write.' The paper uses DS-1000 as a more realistic complement and shows HumanEval/MBPP performance 'does not always correlate' with DS-1000 performance."
    329       },
    330       "scaffold_confound_addressed": {
    331         "applies": false,
    332         "answer": false,
    333         "justification": "No scaffolding is involved. Models are evaluated as standalone code generators."
    334       }
    335     },
    336     "data_leakage": {
    337       "temporal_leakage_addressed": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "Section 5.2 describes decontamination by removing files containing docstrings/solutions from evaluation benchmarks. The paper explicitly addresses temporal concerns by noting when benchmarks were published relative to data collection."
    341       },
    342       "feature_leakage_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "No explicit discussion of feature leakage. The evaluation setup (providing function signatures and docstrings) is standard but not analyzed for potential leakage of answer information."
    346       },
    347       "non_independence_addressed": {
    348         "applies": true,
    349         "answer": true,
    350         "justification": "Section 3.5 describes near-deduplication (MinHash with Jaccard similarity 0.7) to address duplicate data. The paper acknowledges potential overlap between evaluation datasets and training data for type prediction and docstring benchmarks (Tables 18, 20)."
    351       },
    352       "leakage_detection_method": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "Section 5.2 describes a concrete decontamination pipeline: removing files containing docstrings/solutions from HumanEval, MBPP, APPS, GSM8K, and DS1000. Section 9.1 describes a Bloom-filter-based membership inference tool for post-hoc detection."
    356       }
    357     }
    358   },
    359   "claims": [
    360     {
    361       "claim": "StarCoder outperforms every open Code LLM that supports multiple programming languages on HumanEval and MBPP.",
    362       "evidence": "Table 12: StarCoder achieves 33.6% pass@1 on HumanEval and 52.7% on MBPP, exceeding all open-access models listed. StarCoderBase achieves 30.4% and 49.0% respectively.",
    363       "supported": "strong"
    364     },
    365     {
    366       "claim": "StarCoder matches or outperforms OpenAI's code-cushman-001 model.",
    367       "evidence": "Table 12: StarCoder 33.6% vs code-cushman-001 33.5% on HumanEval; 52.7% vs 45.9% on MBPP. Table 13: StarCoder outperforms on DS-1000 overall (26.0% vs 18.1%).",
    368       "supported": "strong"
    369     },
    370     {
    371       "claim": "StarCoder substantially outperforms all other models on DS-1000 data science benchmarks.",
    372       "evidence": "Table 13: StarCoder achieves 26.0% overall vs code-cushman-001's 18.1% and CodeGen-16B-Mono's 11.7%. Improvement is consistent across all 7 library categories.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "StarCoderBase outperforms every open-access model on all 19 programming languages in MultiPL-E.",
    377       "evidence": "Table 15: StarCoderBase leads or is competitive on most languages. Ruby results are erratic, though: CodeGen-16B-Multi scores 0.00 and StarCoderBase 17.25, while the Python-fine-tuned StarCoder gets only 1.24, suggesting the Ruby numbers should be read with caution.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "The PII detection model outperforms regex baselines on email, IP address, and key detection.",
    382       "evidence": "Table 8: NER + pseudo labels achieves F1 of 98.15% (email), 91.94% (IP), 70.41% (key) vs regex baseline of 96.83%, 78.65%, 6.74%.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "StarCoderBase shows lower social bias than LLaMA-13B and CodeGen-16B-Multi.",
    387       "evidence": "Table 25: StarCoderBase overall stereotype score 55.53% (closest to ideal 50%) vs LLaMA-13B 63.40% and CodeGen 61.29%. ICAT score 76.65% vs 64.14% and 67.55%.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Model performance on HumanEval/MBPP does not always correlate with performance on DS-1000.",
    392       "evidence": "Section 6.1.2: CodeGen-Mono slightly outperforms code-cushman-001 on HumanEval/MBPP but is significantly worse on DS-1000 (11.7% vs 18.1%).",
    393       "supported": "strong"
    394     }
    395   ],
    396   "methodology_tags": ["benchmark-eval"],
    397   "key_findings": "StarCoder (15.5B parameters) is the highest-performing open-access code LLM at time of publication, matching or outperforming the closed-access code-cushman-001 across HumanEval, MBPP, and DS-1000 benchmarks. The paper introduces a comprehensive PII redaction pipeline with a custom encoder model achieving >90% F1 on most entity types, and novel attribution tools (Bloom filter membership checking and BM25 search index). StarCoderBase shows lower social bias than comparably sized models and benefits substantially from its 8K context window. Performance on simple benchmarks (HumanEval/MBPP) does not always correlate with performance on more realistic data science tasks (DS-1000).",
    398   "red_flags": [
    399     {
    400       "flag": "No statistical significance tests",
    401       "detail": "All performance comparisons across all benchmarks are raw number comparisons without any statistical tests. Claims like 'outperforms every open Code LLM' rest on comparing point estimates without uncertainty quantification."
    402     },
    403     {
    404       "flag": "Potential self-evaluation bias",
    405       "detail": "The authors developed the model, the evaluation harness (Code LM Evaluation Harness, Section 6), and ran all evaluations. No independent evaluation is included."
    406     },
    407     {
    408       "flag": "Non-independent funder",
    409       "detail": "Hugging Face provided compute resources and many authors are Hugging Face employees. Hugging Face commercially hosts the model and dataset, creating a financial interest in positive results."
    410     },
    411     {
    412       "flag": "Missing variance reporting",
    413       "detail": "Table D.3 reveals dramatic variance in pass rates across training checkpoints (e.g., 5 to 197 out of 200 for the same problem), yet no systematic variance reporting is done for the final model's evaluation results."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Evaluating large language models trained on code",
    419       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    420       "year": 2021,
    421       "relevance": "Introduces HumanEval benchmark and Codex, the foundational code generation evaluation framework."
    422     },
    423     {
    424       "title": "Program synthesis with large language models",
    425       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    426       "year": 2021,
    427       "relevance": "Introduces MBPP benchmark for evaluating code generation from natural language descriptions."
    428     },
    429     {
    430       "title": "SantaCoder: don't reach for the stars!",
    431       "authors": ["Loubna Ben Allal", "Raymond Li", "Denis Kocetkov"],
    432       "year": 2023,
    433       "relevance": "Predecessor open code model from BigCode community; establishes architecture and training approaches built upon by StarCoder."
    434     },
    435     {
    436       "title": "MultiPL-E: a scalable and polyglot approach to benchmarking neural code generation",
    437       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    438       "year": 2023,
    439       "arxiv_id": "2208.08227",
    440       "relevance": "Multi-language code generation benchmark translating HumanEval/MBPP to 18 languages."
    441     },
    442     {
    443       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    444       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    445       "year": 2022,
    446       "relevance": "Security benchmark for code generation models measuring vulnerability rates."
    447     },
    448     {
    449       "title": "DS-1000: a natural and reliable benchmark for data science code generation",
    450       "authors": ["Yuhang Lai", "Chengxi Li", "Yiming Wang"],
    451       "year": 2022,
    452       "relevance": "Realistic data science code generation benchmark addressing limitations of HumanEval/MBPP."
    453     },
    454     {
    455       "title": "CodeGen: an open large language model for code with multi-turn program synthesis",
    456       "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"],
    457       "year": 2023,
    458       "relevance": "Key baseline open code LLM (16B parameters) for multi-language code generation."
    459     },
    460     {
    461       "title": "The Stack: 3 TB of permissively licensed source code",
    462       "authors": ["Denis Kocetkov", "Raymond Li", "Loubna Ben Allal"],
    463       "year": 2022,
    464       "arxiv_id": "2211.15533",
    465       "relevance": "The training dataset for StarCoder with data governance framework for code LLM training."
    466     },
    467     {
    468       "title": "InCoder: a generative model for code infilling and synthesis",
    469       "authors": ["Daniel Fried", "Armen Aghajanyan", "Jessy Lin"],
    470       "year": 2022,
    471       "relevance": "Fill-in-the-middle code model, key baseline for infilling evaluation."
    472     },
    473     {
    474       "title": "Holistic evaluation of language models",
    475       "authors": ["Percy Liang", "Rishi Bommasani", "Tony Lee"],
    476       "year": 2022,
    477       "relevance": "HELM benchmark suite used for evaluating StarCoder on natural language reasoning tasks."
    478     },
    479     {
    480       "title": "PAL: Program-aided language models",
    481       "authors": ["Luyu Gao", "Aman Madaan", "Shuyan Zhou"],
    482       "year": 2022,
    483       "relevance": "Program-aided reasoning approach used to evaluate StarCoder's mathematical reasoning capabilities."
    484     },
    485     {
    486       "title": "The gradient of generative AI release: Methods and considerations",
    487       "authors": ["Irene Solaiman"],
    488       "year": 2023,
    489       "relevance": "Framework for responsible AI model release that informed StarCoder's release strategy."
    490     }
    491   ]
    492 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs