scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21638B)
      1 {
      2   "paper": {
      3     "title": "CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding & Reasoning Capabilities of CodeLLMs",
      4     "authors": ["Dung Nguyen Manh", "Thang Phan Chau", "Nam Le Hai", "Thong T. Doan", "Nam V. Nguyen", "Quang Pham", "Nghi D. Q. Bui"],
      5     "year": 2024,
      6     "venue": "ICLR 2025",
      7     "arxiv_id": "2410.01999"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states 'CodeMMLU is publicly available at: CodeMMLU' with a GitHub link referenced, and mentions MIT license distribution."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "CodeMMLU benchmark dataset (~20,000 questions) is publicly released under MIT license as stated in Appendix A.3."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup with library versions is provided in the paper."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments are described in the paper."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (accuracy percentages) throughout Tables 3, 7, 8 with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., 'GPT-4o outperformed all models') based solely on comparing accuracy numbers without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Performance differences are reported with baseline context, e.g., 'DeepSeek-Coder-33b surpasses its base model by approximately 29%', and percentage improvements/drops for CoT and answer permutation experiments with deltas shown in Table 4."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for why ~20,000 questions is the right size, nor is there power analysis for the number of models evaluated."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviation is reported for selection bias experiments in Table 8 (STD column). However, no variance across multiple runs of the same model is reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against existing benchmarks (HumanEval, MBPP, MMLU, GSM8k) in Table 7 and evaluates 43 LLMs across 15 families."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include contemporary models like GPT-4o, GPT o3-mini, Claude 3.7 Sonnet, DeepSeek R1, Phi4, and Qwen2.5 — all recent at time of publication."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper examines the impact of different prompting strategies (zero-shot, few-shot, CoT) in Figure 3 and Appendix B.2, and studies answer permutation effects as a form of ablation on the MCQ format."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The primary and essentially only metric used is accuracy (percentage of correct answers). No additional metrics like F1 or per-difficulty breakdowns with separate scoring are used."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Manual verification was performed: 'we randomly selected 100 instances from each subject area for manual verification against the three criteria' (Section 3.3). This evaluates the benchmark quality, though not system outputs directly."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "There is no explicit discussion of a held-out test set separate from the reported evaluation set. The entire CodeMMLU appears to be both the benchmark and the reported test set."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Detailed per-category breakdowns are provided in Tables 3 and 9-11, with results split across syntactic knowledge, semantic knowledge, and fundamental tasks, plus per-subject results in Figure 4."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Failure cases are discussed qualitatively in Appendix B.2 with specific examples (Figures 11, 12, 14) showing where CoT reasoning fails, and Section 4.2 discusses where models struggle (execution prediction)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that CoT prompting consistently impairs performance (Section 4.2, Figure 3), and that reasoning models like DeepSeek R1 underperform compared to non-reasoning counterparts."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract's claim that 'even state-of-the-art models struggle with CodeMMLU' is supported by Table 3, which shows no model exceeding ~62% accuracy. Claims about benchmark scale (~20,000 questions) and diversity are verified in Table 2."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims like 'CoT consistently impair performance' and 'the additional complexity introduced by step-by-step reasoning does not align well with knowledge-seeking tasks' without controlled experiments isolating the causal mechanism. The CoT analysis is observational across models."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title 'Assessing Code Understanding & Reasoning Capabilities of CodeLLMs' is broad but the benchmark is MCQ-only, primarily English, and draws from specific sources (W3Schools, Common Crawl, HumanEval, LeetCode). The paper does not explicitly bound its generalization claims to these formats and sources."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations: the CoT performance drop may be due to 'overreasoning' (Section 4.2), and the HumanEval-MCQ discrepancy could reflect data leakage in open-ended benchmarks rather than genuine understanding. The data leakage analysis in Appendix A.2 considers whether results reflect memorization."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are listed by marketing names (e.g., 'GPT-4o', 'Claude3.7 Sonnet', 'DeepSeek R1') without API version strings or snapshot dates. No specific model IDs like 'gpt-4o-2024-05-13' are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper references prompt configurations in Appendix C.1 for the LLM-based filter and shows example prompts/configurations for zero-shot, few-shot, and CoT settings."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for the model evaluations."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. Models are evaluated directly on MCQ questions."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Data preprocessing is well-documented in Section 3.3 and Appendix A.1, including rule-based filtering, LLM-based filtering with threshold of 4, MinHash LSH deduplication with 256 permutations and 0.8 similarity threshold, and execution-based filtering. The pipeline is shown in Figure 2."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The data contamination analysis in Appendix A.2 acknowledges leakage concerns but does not constitute a threats-to-validity section."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show or what populations/settings are excluded from the claims."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The benchmark dataset is publicly released under MIT license, allowing independent verification of the questions and answers."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is described in detail in Section 3: knowledge-based questions from W3Schools, Common Crawl, GeeksForGeeks; fundamental tasks from HumanEval, QuixBugs, LeetCode, IBM CodeNet. Specific sources and processes are documented."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants — this is a benchmark evaluation paper using public data sources and automated evaluation."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full data pipeline is documented in Figure 2 and Sections 3.1-3.3, including crawling, filtering stages (rule-based, LLM-based, execution-based), deduplication, and distractor synthesis. Appendix A.1 provides additional detail including the ~25.6% removal rate from filtering."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: FPT Software AI Center, Hanoi University of Science and Technology, and Independent Researcher."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed. Authors are from FPT Software AI Center, which has a potential interest in code LLM evaluation tools."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff dates are stated for any of the evaluated models, despite this being critical for assessing whether models may have seen the benchmark data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Appendix A.2 directly addresses data leakage using perplexity analysis and n-gram accuracy (Tables 5, 6) following the methodology of Xu et al. (2024), showing CodeMMLU has higher perplexity and lower n-gram accuracy than other benchmarks."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Section A.2 acknowledges that 'creating a fully leakage-free benchmark is virtually impossible' and describes mitigation strategies: transforming seed data into MCQ format, synthetic distractors, and quantitative contamination analysis via perplexity and n-gram accuracy."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this benchmark evaluation study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this benchmark evaluation study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this benchmark evaluation study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this benchmark evaluation study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this benchmark evaluation study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this benchmark evaluation study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this benchmark evaluation study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, tokens consumed, or wall-clock time for running the benchmark across 43 models is reported, despite the paper's claim that CodeMMLU is 'cost-effective'."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, GPU hours, or API spend for the evaluation campaign is reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Even state-of-the-art models struggle with CodeMMLU, with the best model (GPT o3-mini) achieving only 62.36% accuracy.",
    286       "evidence": "Table 3 shows GPT o3-mini at 62.36% and GPT-4o at 56.40% overall accuracy across CodeMMLU tasks.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Chain-of-Thought prompting consistently impairs performance on CodeMMLU compared to zero-shot and few-shot settings.",
    291       "evidence": "Figure 3 shows GPT-4o performance declining with CoT across most task categories. Section 4.2 and Appendix B.2 provide analysis with specific examples of reasoning failures.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "There is a moderate correlation (Pearson r=0.61) between performance on knowledge-based tasks and real-world coding challenges.",
    296       "evidence": "Section 4.2 reports Pearson r=0.61 derived from accuracy of 43 LLMs across 15 families, shown in Figure 5.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "LLMs exhibit significant selection bias sensitivity in MCQ format, with performance fluctuating dramatically based on answer position.",
    301       "evidence": "Table 4 shows DeepSeek-Coder-33B with standard deviation of 36.66 across answer positions. Table 8 shows STD values ranging from 2.81 (Claude3 Opus) to 38.07 (CodeLlama-7B-Instruct).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Performance in open-ended code generation (HumanEval) does not reliably predict performance in MCQ code completion tasks.",
    306       "evidence": "Figure 7 shows weak correlation coefficients (0.097-0.134) between HumanEval and CodeMMLU code completion for several models. Table 4 shows large discrepancies.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Reasoning models like DeepSeek R1 tend to overreason, generating the longest responses while underperforming compared to non-reasoning counterparts.",
    311       "evidence": "Table 3 shows DeepSeek R1 (43.91%) underperforming DeepSeek V3 (49.08%). Figure 6 shows R1 generates ~1750 tokens on average vs ~200 for GPT-4o.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "CodeMMLU is a ~20,000 question MCQ benchmark for code understanding spanning 52 topics and 10+ programming languages. Evaluation of 43 LLMs reveals that even the best models achieve only ~62% accuracy, with significant performance gaps between knowledge tasks and real-world coding challenges. Chain-of-Thought prompting generally hurts performance on code knowledge tasks, and models show substantial selection bias sensitivity to answer ordering in MCQ format. The weak correlation between HumanEval and MCQ performance suggests open-ended benchmarks may overestimate code understanding.",
    317   "red_flags": [
    318     {
    319       "flag": "No limitations section",
    320       "detail": "The paper lacks any dedicated limitations or threats-to-validity section, which is unusual for an ICLR paper proposing a new benchmark."
    321     },
    322     {
    323       "flag": "No hyperparameters reported",
    324       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 43 models evaluated, making reproduction difficult and potentially affecting results."
    325     },
    326     {
    327       "flag": "No statistical significance tests",
    328       "detail": "All comparative claims are based on raw accuracy numbers without significance tests, despite small performance differences between some models."
    329     },
    330     {
    331       "flag": "Model versions unspecified",
    332       "detail": "All models are referenced by marketing names only (GPT-4o, Claude 3.7 Sonnet, etc.) without API versions or snapshot dates, making exact reproduction impossible."
    333     },
    334     {
    335       "flag": "Cost-effectiveness claimed without evidence",
    336       "detail": "The paper claims CodeMMLU is 'cost-effective' but provides no actual cost data for running the benchmark."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Evaluating large language models trained on code",
    342       "authors": ["Mark Chen", "Jerry Tworek"],
    343       "year": 2021,
    344       "arxiv_id": "2107.03374",
    345       "relevance": "HumanEval benchmark paper, foundational code generation evaluation that CodeMMLU extends to MCQ format."
    346     },
    347     {
    348       "title": "Measuring massive multitask language understanding",
    349       "authors": ["Dan Hendrycks"],
    350       "year": 2020,
    351       "arxiv_id": "2009.03300",
    352       "relevance": "MMLU benchmark that inspired CodeMMLU's multi-task MCQ design for code domains."
    353     },
    354     {
    355       "title": "CRUXEval: A benchmark for code reasoning, understanding and execution",
    356       "authors": ["Alex Gu", "Baptiste Roziere"],
    357       "year": 2024,
    358       "arxiv_id": "2401.03065",
    359       "relevance": "Code reasoning benchmark evaluating LLMs' ability to understand program execution."
    360     },
    361     {
    362       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    363       "authors": ["Naman Jain"],
    364       "year": 2024,
    365       "arxiv_id": "2403.07974",
    366       "relevance": "Contamination-aware code benchmark using temporal splits, relevant to benchmark methodology."
    367     },
    368     {
    369       "title": "Large language models are not robust multiple choice selectors",
    370       "authors": ["Chujie Zheng"],
    371       "year": 2024,
    372       "relevance": "Studies MCQ selection bias in LLMs, directly related to CodeMMLU's findings on answer order sensitivity."
    373     },
    374     {
    375       "title": "Benchmarking benchmark leakage in large language models",
    376       "authors": ["Ruijie Xu"],
    377       "year": 2024,
    378       "arxiv_id": "2404.18824",
    379       "relevance": "Provides the data contamination detection methodology (perplexity, n-gram accuracy) used by CodeMMLU."
    380     },
    381     {
    382       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    383       "authors": ["Terry Yue Zhuo"],
    384       "year": 2024,
    385       "arxiv_id": "2406.15877",
    386       "relevance": "Contemporary code generation benchmark for LLMs with complex evaluation criteria."
    387     },
    388     {
    389       "title": "On leakage of code generation evaluation datasets",
    390       "authors": ["Alexandre Matton"],
    391       "year": 2024,
    392       "arxiv_id": "2407.07565",
    393       "relevance": "Directly addresses data leakage risks in code benchmarks, motivating CodeMMLU's contamination mitigation."
    394     },
    395     {
    396       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    397       "authors": ["Jiawei Liu"],
    398       "year": 2024,
    399       "relevance": "Rigorous evaluation methodology for LLM code generation (HumanEval+), relevant to benchmark design quality."
    400     },
    401     {
    402       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    403       "authors": ["Jason Wei"],
    404       "year": 2023,
    405       "arxiv_id": "2201.11903",
    406       "relevance": "Foundational CoT prompting paper whose effectiveness CodeMMLU challenges for code knowledge tasks."
    407     }
    408   ]
    409 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs