scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26527B)
      1 {
      2   "paper": {
      3     "title": "CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding & Reasoning Capabilities of CodeLLMs",
      4     "authors": [
      5       "Dung Nguyen Manh",
      6       "Thang Phan Chau",
      7       "Nam Le Hai",
      8       "Thong T. Doan",
      9       "Nam V. Nguyen",
     10       "Quang Pham",
     11       "Nghi D. Q. Bui"
     12     ],
     13     "year": 2024,
     14     "venue": "ICLR 2025",
     15     "arxiv_id": "2410.01999"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract states 'CodeMMLU is publicly available at: CodeMMLU' and the benchmark is distributed under the MIT license (Appendix A.3). Appendix C provides HuggingFace model IDs for all evaluated models."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The benchmark dataset is publicly released under the MIT license (Appendix A.3). The abstract and Section 1 state it is publicly available."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup with library versions is provided in the paper or appendix."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments are described in the paper. The paper describes the evaluation setup (Section 4.1) but does not provide executable reproduction instructions."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Results in Tables 3, 7, 9, 10, 11 report only point estimates (accuracy percentages) with no confidence intervals or error bars."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper makes comparative claims (e.g., 'GPT-4o outperformed all models', 'DeepSeek R1 performs significantly worse than its base model') based solely on comparing accuracy numbers without any statistical significance tests."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper provides percentage improvements with baseline context throughout, e.g., 'DeepSeek-Coder-33b surpasses its base model by approximately 29%' (Section 4.2), and Table 4 shows absolute and relative performance changes for each answer position. The Pearson correlation r=0.61 is reported for knowledge vs. real-world task alignment (Section 4.2)."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification is given for the number of questions per category (e.g., why 163 code completion, 76 code repair) or for the sample of 100 instances used in manual verification (Section 3.3). No power analysis is discussed."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Standard deviation (STD) is reported in Table 8 for the selection bias experiments across answer orderings. Across-run variance for the main evaluation results is not reported; this is common for MCQ benchmarks with deterministic answer extraction, which are typically single-run by design."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper compares CodeMMLU performance against results on established benchmarks including HumanEval, MBPP, MMLU, and GSM8k (Table 7). Multiple model families serve as baselines against each other."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The evaluation includes contemporary state-of-the-art models: GPT-4o, GPT o3-mini, Claude 3.7 Sonnet, DeepSeek R1/V3, Llama 3.3-70B, Phi-4, and QwenCoder 2.5 (Table 3, Figure 1). These are current as of early 2025."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper provides ablation-like analyses: comparing zero-shot vs few-shot vs CoT prompting strategies (Figure 3, Appendix B.2), examining the effect of answer order permutations (Tables 4, 8), and comparing base vs instruction-tuned models within families."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper uses accuracy as the primary metric but supplements it with Pearson correlation (r=0.61 for knowledge vs. real-world task correlation), perplexity scores (Table 5), n-gram accuracy for contamination analysis (Table 6), and standard deviation for selection bias analysis (Table 8)."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No human evaluation of model outputs is performed. The only human involvement is the manual verification of 100 instances per subject (Section 3.3), which is dataset quality assurance during construction rather than an evaluation of the benchmark's results or of model outputs."
     92       },
     93       "held_out_test_set": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "This is a benchmark paper that evaluates pre-existing models on a new test set. There is no model training or tuning involved, so the train/test split concept does not apply in the traditional sense."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Extensive per-category breakdowns are provided: Tables 9-11 show results for each of 9 task categories (API & Frameworks, PL Syntax, Software Principles, DBMS & SQL, Others, Code Completion, Fill in the Blank, Code Repair, Execution Prediction) across all models and prompt settings."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 4.2 and Appendix B.2 discuss failure cases in detail, including CoT failures with qualitative examples (Figures 11, 12, 14). The paper shows specific examples where reasoning models fail due to over-reasoning."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Several negative results are reported: CoT prompting consistently hurts performance (Section 4.2, Figure 3), reasoning models like DeepSeek R1 underperform their base models (Table 3), and the discrepancy between HumanEval and MCQ format performance (Section 4.2, Table 4, Figure 7)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims the benchmark has ~20,000 questions (Table 2 confirms 19,912), spans diverse domains and programming languages (Table 2 confirms 52 topics, 10+ languages), and that state-of-the-art models struggle (Table 3 shows GPT-4o at 56.40% overall). All claims are supported by the results."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes causal claims such as 'the additional complexity introduced by step-by-step reasoning does not align well with knowledge-seeking tasks' (Section 4.2) and that instruction tuning 'substantially' improves performance. These are inferred from observational comparisons without controlled experiments isolating the causal mechanisms."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title claims to assess 'Code Understanding & Reasoning Capabilities' broadly, but the benchmark is limited to multiple-choice format, which is a narrow proxy for understanding. The paper does not sufficiently bound its claims to the MCQ evaluation format. The conclusion states CodeMMLU evaluates 'a wide range of software knowledge and real-world programming tasks' without qualifying the MCQ limitation."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper discusses alternative explanations for key findings: the CoT performance decline might be due to 'overreasoning' rather than lack of capability (Section 4.2), MCQ bias could explain some performance variation rather than true understanding differences (Section 4.2, Tables 4, 8), and data leakage could explain open-ended benchmark performance (Section A.2)."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Appendix C, Table 12 provides specific model identifiers including API version dates for proprietary models (e.g., 'GPT-4o-2024-05-13', 'Claude-3.5-sonnet-20241022') and HuggingFace model IDs for open-source models (e.g., 'meta-llama/Meta-Llama-3.1-70B-Instruct')."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt text is provided in Appendix C.1 for all evaluation configurations: zero-shot, few-shot, and Chain-of-Thought prompts for each task type (knowledge MCQ, code completion, fill-in-the-blank, code repair, defect detection). Data creation prompts for distractor generation are also provided."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for any of the 43 models evaluated. Section 4.1 describes the answer extraction method but not the generation parameters."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The evaluation involves direct prompting of LLMs with MCQ questions and extracting answers via regex."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3.3 and Appendix A.1 document the data preprocessing pipeline in detail: rule-based filtering (removing HTML, images, links), LLM-based filtering (three criteria with threshold of 4, removing ~25.6% of raw data), MinHash LSH deduplication (256 permutations, 0.8 threshold), and execution-based filtering for fundamental tasks. Figure 2 provides a visual overview."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The paper acknowledges the difficulty of eliminating data leakage (Appendix A.2) but does not have a structured discussion of limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as the representativeness of MCQ format for measuring understanding, the quality of LLM-generated distractors, or the sensitivity of results to prompt format."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the MCQ evaluation format or acknowledge that MCQ performance may not generalize to actual code understanding in real-world settings."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The benchmark dataset is publicly released under the MIT license (Appendix A.3), which includes the questions, answer choices, and ground truth labels, allowing independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 3 describes data collection in detail: knowledge-based questions from W3Schools, GeeksforGeeks, and Common Crawl (Section 3.1); fundamental tasks derived from HumanEval, QuixBugs, LeetCode, and IBM CodeNet (Section 3.2). Specific Common Crawl portions (CC-MAIN-2021-41 to CC-MAIN-2024-46) are identified."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants are involved. Data sources are publicly available benchmark datasets and web resources."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Figure 2 provides a visual overview of the data creation pipeline. Section 3.3 and Appendix A.1 describe each filtering step with quantitative results (e.g., removing ~25.6% via LLM filtering). The process from raw collection through filtering to final benchmark is documented."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding sources or acknowledgments section is present in the paper. The authors are affiliated with FPT Software AI Center (a commercial entity) but funding is not disclosed."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: FPT Software AI Center, Hanoi University of Science and Technology, and Independent Researcher. These are prominently displayed on the first page."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information is provided. Since the authors are primarily from FPT Software AI Center (a commercial software company), and the paper evaluates models including those from companies FPT may partner with or compete with, the funding independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Training data cutoff dates are not stated for any of the 43 models evaluated. This is relevant because the knowledge-based questions are sourced from publicly available web content that could appear in training data."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Appendix A.2 directly addresses data contamination. The paper uses perplexity analysis (Table 5) and n-gram accuracy analysis (Table 6) across multiple model families (Mistral, DeepSeek, Llama) to assess potential data leakage, showing CodeMMLU has higher perplexity (less leakage) than comparable benchmarks."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Appendix A.2 addresses contamination risk using the methodology from Xu et al. (2024). Tables 5-6 compare CodeMMLU's contamination metrics (perplexity and 5-gram accuracy) against CodeScope and CodeApex, showing significantly lower contamination risk for CodeMMLU. The paper also notes that MCQ format transformation and synthetic distractors mitigate leakage risk."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants are involved in this benchmark evaluation study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved in this benchmark evaluation study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved in this benchmark evaluation study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants are involved in this benchmark evaluation study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants are involved in this benchmark evaluation study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved in this benchmark evaluation study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants are involved in this benchmark evaluation study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time are reported for the evaluation of 43 models across ~20,000 questions in multiple prompt configurations."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No GPU hours, total API spend, hardware used, or compute budget are stated for either benchmark construction or model evaluation."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Even state-of-the-art models struggle with CodeMMLU, with GPT-4o achieving only 56.40% overall accuracy and the best model (GPT o3-mini) reaching 62.36%.",
    294       "evidence": "Table 3 shows GPT-4o at 56.40% and GPT o3-mini at 62.36% CodeMMLU accuracy. Figure 1 provides a radar chart visualization across all task categories.",
    295       "supported": "strong"
    296     },
    297     {
    298       "claim": "Chain-of-Thought prompting consistently hurts performance on CodeMMLU compared to zero-shot and few-shot settings.",
    299       "evidence": "Figure 3 shows CoT underperforming zero-shot and few-shot for GPT-4o across all 9 task categories. Section 4.2 and Appendix B.2 provide detailed analysis showing this pattern across multiple models.",
    300       "supported": "strong"
    301     },
    302     {
    303       "claim": "DeepSeek R1 performs significantly worse than its base model DeepSeek V3 on CodeMMLU (43.91% vs 49.08%), despite being designed for reasoning tasks.",
    304       "evidence": "Table 3 shows DeepSeek R1 at 43.91% vs DeepSeek V3 at 49.08%. Section 4.2 attributes this to 'overreasoning'.",
    305       "supported": "strong"
    306     },
    307     {
    308       "claim": "There is a strong correlation (r=0.61) between software knowledge performance and real-world coding task performance.",
    309       "evidence": "Section 4.2 reports a Pearson correlation of r=0.61 computed from 43 LLMs across 15 families. Figure 5 visualizes the correlation.",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "LLMs exhibit significant selection bias in MCQ format, with performance varying dramatically based on answer position.",
    314       "evidence": "Table 4 shows extreme fluctuations (e.g., CodeLlama-7B-Python: 0% for position A, 90.24% for position B). Table 8 and Figure 9 provide STD analysis showing high-quality models are more resilient.",
    315       "supported": "strong"
    316     },
    317     {
    318       "claim": "Performance on open-ended code generation (HumanEval) does not reliably predict performance on MCQ code completion for the same questions.",
    319       "evidence": "Section 4.2 and Figure 7 show low correlation coefficients (r=0.10-0.13) between HumanEval pass rates and CodeMMLU code completion accuracy for the same questions.",
    320       "supported": "strong"
    321     },
    322     {
    323       "claim": "CodeMMLU is the first MCQ benchmark for software and coding-related tasks.",
    324       "evidence": "Section 1 claims this explicitly. Table 1 compares against existing benchmarks (APPS, MBPP, HumanEval, CRUXEval, LiveCodeBench, CodeApex), showing only CodeApex has MCQ format but with a more limited scope (2,056 vs 19,912 questions).",
    325       "supported": "moderate"
    326     }
    327   ],
    328   "methodology_tags": [
    329     "benchmark-eval"
    330   ],
    331   "key_findings": "CodeMMLU is a 19,912-question multiple-choice benchmark for evaluating LLM code understanding across 52 topics and 10+ programming languages, published at ICLR 2025. Evaluation of 43 LLMs reveals that even top models achieve modest accuracy (GPT o3-mini at 62.36%), Chain-of-Thought prompting consistently degrades performance, and reasoning models like DeepSeek R1 underperform their base models due to overreasoning. A notable discrepancy exists between open-ended code generation performance (HumanEval) and MCQ code comprehension for the same questions, suggesting generation benchmarks may overestimate code understanding.",
    332   "red_flags": [
    333     {
    334       "flag": "No limitations section",
    335       "detail": "An ICLR 2025 paper with no dedicated limitations section is unusual. The paper does not discuss threats to validity or bound its claims about 'code understanding' to what the MCQ format can actually measure."
    336     },
    337     {
    338       "flag": "No hyperparameters reported",
    339       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 43 models evaluated. These settings significantly affect output quality and could explain some performance variation."
    340     },
    341     {
    342       "flag": "No statistical significance tests",
    343       "detail": "All comparative claims are based on raw accuracy differences without confidence intervals or significance tests. Given that MCQ evaluation on a fixed test set is deterministic for a given model configuration, single-run comparisons may be acceptable, but the paper still claims 'significant' differences without statistical backing."
    344     },
    345     {
    346       "flag": "Overclaimed scope",
    347       "detail": "The paper claims to assess 'code understanding & reasoning capabilities' but only measures MCQ performance. Multiple-choice format is a narrow proxy for actual understanding; a model could exploit answer distributions or elimination strategies rather than genuinely understanding code."
    348     },
    349     {
    350       "flag": "No funding disclosure",
    351       "detail": "Authors are primarily affiliated with FPT Software AI Center (a commercial entity) but no funding or conflict of interest statement is provided."
    352     }
    353   ],
    354   "cited_papers": [
    355     {
    356       "title": "Evaluating large language models trained on code",
    357       "authors": ["Mark Chen", "Jerry Tworek"],
    358       "year": 2021,
    359       "arxiv_id": "2107.03374",
    360       "relevance": "Introduces HumanEval, a foundational code generation benchmark that CodeMMLU converts to MCQ format and directly compares against."
    361     },
    362     {
    363       "title": "Measuring massive multitask language understanding",
    364       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    365       "year": 2020,
    366       "arxiv_id": "2009.03300",
    367       "relevance": "Introduces the MMLU benchmark that directly inspired CodeMMLU's design philosophy of MCQ-based LLM evaluation."
    368     },
    369     {
    370       "title": "CRUXEval: A benchmark for code reasoning, understanding and execution",
    371       "authors": ["Alex Gu", "Baptiste Rozière"],
    372       "year": 2024,
    373       "arxiv_id": "2401.03065",
    374       "relevance": "A closely related code reasoning benchmark that evaluates LLMs' ability to predict code execution outcomes."
    375     },
    376     {
    377       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    378       "authors": ["Naman Jain", "King Han"],
    379       "year": 2024,
    380       "arxiv_id": "2403.07974",
    381       "relevance": "A contamination-aware code benchmark that uses temporal filtering, addressing the same data leakage concerns as CodeMMLU."
    382     },
    383     {
    384       "title": "Large language models are not robust multiple choice selectors",
    385       "authors": ["Chujie Zheng", "Hao Zhou"],
    386       "year": 2024,
    387       "relevance": "Directly relevant to CodeMMLU's analysis of MCQ selection bias in LLMs; the paper builds on this finding."
    388     },
    389     {
    390       "title": "On leakage of code generation evaluation datasets",
    391       "authors": ["Alexandre Matton", "Tom Sherborne"],
    392       "year": 2024,
    393       "arxiv_id": "2407.07565",
    394       "relevance": "Addresses benchmark contamination in code evaluation, a key methodological concern that CodeMMLU attempts to mitigate."
    395     },
    396     {
    397       "title": "Benchmarking benchmark leakage in large language models",
    398       "authors": ["Ruijie Xu", "Zengzhi Wang"],
    399       "year": 2024,
    400       "arxiv_id": "2404.18824",
    401       "relevance": "Provides the contamination detection methodology (perplexity and n-gram accuracy) that CodeMMLU uses to assess data leakage."
    402     },
    403     {
    404       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    405       "authors": ["Jiawei Liu", "Chunqiu Steven Xia"],
    406       "year": 2024,
    407       "relevance": "Questions reliability of automated code evaluation, motivating CodeMMLU's approach of MCQ format over execution-based testing."
    408     },
    409     {
    410       "title": "To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic reasoning",
    411       "authors": ["Zayne Sprague", "Fangcong Yin"],
    412       "year": 2024,
    413       "arxiv_id": "2409.12183",
    414       "relevance": "Supports CodeMMLU's finding that CoT prompting does not improve performance on non-mathematical tasks."
    415     },
    416     {
    417       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    418       "authors": ["Jason Wei", "Xuezhi Wang"],
    419       "year": 2023,
    420       "arxiv_id": "2201.11903",
    421       "relevance": "Foundational CoT prompting paper; CodeMMLU provides evidence against its effectiveness in code understanding domains."
    422     },
    423     {
    424       "title": "Program synthesis with large language models",
    425       "authors": ["Jacob Austin", "Augustus Odena"],
    426       "year": 2021,
    427       "arxiv_id": "2108.07732",
    428       "relevance": "Introduces MBPP, a widely-used code generation benchmark that CodeMMLU compares against for ranking consistency."
    429     },
    430     {
    431       "title": "Starcoder 2 and the stack v2: The next generation",
    432       "authors": ["Anton Lozhkov", "Raymond Li"],
    433       "year": 2024,
    434       "arxiv_id": "2402.19173",
    435       "relevance": "One of the open-source code LLM families evaluated on CodeMMLU, relevant to the survey's coverage of code generation models."
    436     }
    437   ]
    438 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs