scan.json (21681B)
1 { 2 "paper": { 3 "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution", 4 "authors": ["Alex Gu", "Baptiste Rozière", "Hugh Leather", "Armando Solar-Lezama", "Gabriel Synnaeve", "Sida I. Wang"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2401.03065" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper references code and data via links (e.g., 'checking the numsteps variable in our code here' in Appendix A.3), and provides HuggingFace checkpoints in Appendix B. The benchmark is publicly available." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The benchmark of 800 Python functions with input-output pairs is released. The paper references public availability and HuggingFace checkpoints for models in Appendix B." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions HuggingFace model checkpoints but not the software environment needed to reproduce experiments." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions or README with commands are described in the paper. Prompts are provided in appendices, but there is no guide for replicating the full evaluation pipeline." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "Bootstrap confidence intervals are reported. Fig. 2 shows (2.5, 97.5) percentile whiskers and (25, 75) percentile boxes from 10000 bootstrap samples. Section 3.3 discusses noise measurement." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "Paired bootstrap tests are used to compare models. Section 4 states 'if the median bar clears the whisker in Fig. 2, then the difference actually holds with >97.5% probability under paired bootstrap.' Section 3.3 reports significance at alpha=0.05 level." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Absolute performance differences are reported with context. E.g., 'GPT-4 with CoT achieves pass@1 of 75% and 81%' vs 'Code Llama 34B achieves pass@1 of 50% and 46%'. Fig. 1 shows performance differences between model pairs." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 3.3 explicitly justifies the 800-sample benchmark size via bootstrap analysis, measuring noise at various dataset sizes (200, 400, 800, 1600) and showing 800 is sufficient to detect meaningful differences." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section 3.3 reports type 1 noise ~1.5% and type 2 noise ~0.2% at N=800. Bootstrap percentile ranges are shown in figures. The Limitations section notes standard deviations of ~1.5% for individual models." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Twenty models are evaluated including base models (StarCoder, Code Llama, DeepSeek), instruction-tuned/distilled models (WizardCoder, Phind, Phi), and proprietary models (GPT-3.5, GPT-4), providing extensive baselines." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Models evaluated include GPT-4, Code Llama, DeepSeek Coder, and other models that were state-of-the-art at time of writing (late 2023/early 2024)." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper ablates CoT vs direct prediction (Sec 5.1), fine-tuning effects (Sec 5.2), and analyzes the impact of dataset size (Sec 3.3). Confusion matrices in Fig. 7 show where CoT helps vs hurts." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Both pass@1 (T=0.2) and pass@5 (T=0.8) are reported. Two separate tasks (input prediction CRUXEval-I and output prediction CRUXEval-O) are evaluated." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "The authors manually checked the 22 problems where GPT-4 scored 0/10 on both tasks to verify they pass the criteria of being simple problems (Sec 6). This constitutes manual inspection of system outputs." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The 800-sample benchmark is the test set. For fine-tuning experiments (Sec 5.2), training and testing accuracy are reported separately (Fig. 8), with decontamination applied." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by model family, size, base vs instruct, input vs output prediction. Correlation matrices (Fig. 5) show per-model-pair correlations. Dataset statistics show distributions by code length, step count." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 6 provides detailed qualitative analysis of GPT-4 failures with specific code examples (Listings 3-10), categorizing failure types (string concatenation errors, misleading variable names, counting limitations)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Several negative results: distilled models (WizardCoder, Phind, Phi) don't improve on CRUXEval despite HumanEval gains. CoT does not help Code Llama 13B or GPT-3.5 on input prediction. Fine-tuning plateaus at ~70%." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims about GPT-4 CoT achieving 75%/81%, Code Llama 34B achieving 50%/46%, and distilled models not showing the same improvements are all supported by results in Sections 4-5." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims are limited and well-supported. Claims like 'CoT improves performance' are backed by controlled comparisons (same model with/without CoT). The paper is careful to say 'suggests' for correlational findings." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The Limitations section (Sec 7) explicitly bounds generalization: 'our benchmark only measures the input and output prediction accuracy of relatively simple and self-contained Python functions distilled from a single model.' Prompt sensitivity is acknowledged." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 7 discusses alternative explanations: prompt sensitivity (Mizrahi et al. 2023), information loss from pass@k, potential contamination in fine-tuning, and that interpreters could replace model execution. Sec 6 considers tokenization as an alternative explanation for GPT-4 failures." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Model sizes are specified (e.g., 'Code Llama 34B', 'StarCoder 15.5B') and HuggingFace checkpoints listed in Appendix B, but GPT-3.5 and GPT-4 are referenced without API version or snapshot date." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "All prompts are provided in Appendix D.2 (evaluation prompts) and D.3 (CoT prompts). The benchmark generation prompt is shown in Listing 11 and Appendix A.2." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Temperature settings are reported: T=0.2 for pass@1, T=0.8 for pass@5. Number of samples: N=100 for non-GPT models, N=10 for GPT models. Generation temperature T=1 for benchmark construction." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. Models are evaluated via direct prompting and CoT prompting without any agent framework." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3.2 documents the filtering pipeline in detail: compile-time filters (argument usage, code length 75-300 chars, syntax), runtime filters (no floats, time limits), and quality filters (no imports, deterministic, no side effects)." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 7 'Limitations and Future Work' provides extensive discussion across multiple subsections covering benchmark scope, prompt sensitivity, pass@k limitations, fine-tuning caveats, and more." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Specific threats discussed: prompt sensitivity may favor certain models (Sec 7), standard deviations of ~1.5% mean close model pairs cannot be distinguished (e.g., Phind 47.9% vs CodeLlama 46.5%), weak decontamination in fine-tuning may inflate scores." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7 explicitly states what was NOT tested: longer/harder code, other programming languages, open-domain code, different prompting techniques, different temperatures. States the benchmark only measures 'relatively simple and self-contained Python functions.'" 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The benchmark data (800 functions with input-output pairs) is publicly released. The paper provides links to code and data." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 3 describes the full data collection process: Code Llama 34B generates candidate functions using 69 standard library functions as seeds, with 25 few-shot prompt combinations, producing 102,000 functions and 489,306 input-output pairs." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. The benchmark is synthetically generated from a language model." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The full pipeline is documented: generation (Sec 3.1, 102K functions, 489K pairs) → filtering (Sec 3.2, compile-time, runtime, quality filters) → statistical sizing (Sec 3.3, ~1700 → 800 samples via bootstrap analysis)." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 9 Acknowledgements discloses NSF Graduate Research Fellowship Grant No. 2141064 and NSF Grant CCF:2217064 with Intel Corporation." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: MIT CSAIL and Meta AI. The lead author's work was primarily done during a Meta AI internship." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funding is from NSF and Intel, neither of which has a direct stake in the benchmark results. Meta AI is an affiliation but the paper evaluates multiple companies' models, not just Meta's." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is provided. Authors from Meta AI evaluate Code Llama (a Meta model) but this potential conflict is not explicitly acknowledged." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No training data cutoff dates are stated for any of the evaluated models. The paper does not discuss when models' training data was collected." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "The benchmark is newly generated by Code Llama 34B, making overlap with existing training data unlikely. Section 5.2 explicitly addresses decontamination for fine-tuning experiments. The novel generation approach inherently mitigates this risk." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "The benchmark is newly created (not pre-existing public data), so contamination risk is inherently low. The generate-and-filter approach using Code Llama 34B produces novel functions. Section 5.2 discusses decontamination for fine-tuning." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in the study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in the study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in the study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, API spend, or tokens consumed are reported despite using GPT-3.5 and GPT-4 APIs with N=10 samples across 800 problems." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget, GPU hours, or hardware specifications are reported for running the 20 model evaluations or the fine-tuning experiments." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "GPT-4 with CoT achieves pass@1 of 74.8% on input prediction and 81.9% on output prediction, the best among all evaluated models.", 286 "evidence": "Section 5.1 and Table 3 in the Appendix report these numbers. Fig. 2 shows GPT-4 CoT as the top-performing configuration.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Models distilled on GPT-4 data (WizardCoder, Phind, Phi) show large HumanEval improvements over base models but do not show similar improvements on CRUXEval.", 291 "evidence": "Section 5 and Fig. 3 show WizardCoder 34B improves HumanEval from 53.7% to 73.2% over Code Llama 34B but shows no significant CRUXEval improvement. Statistical significance confirmed via paired bootstrap.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "800 benchmark examples are sufficient to reliably distinguish model performance differences.", 296 "evidence": "Section 3.3 provides bootstrap analysis at various dataset sizes (Fig. 1), showing that at N=800, type 1 noise is ~1.5% per model and paired differences are statistically significant at alpha=0.05 for many model pairs.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "CoT improves output prediction more than input prediction, and does not help all models equally.", 301 "evidence": "Section 5.1 shows CoT does not improve Code Llama 13B or GPT-3.5 on input prediction. Fig. 4b and confusion matrices in Fig. 7 show CoT can hurt individual samples for weaker models.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Simple fine-tuning on execution data improves CRUXEval performance but plateaus at under 70%, far from solving the benchmark.", 306 "evidence": "Section 5.2 and Fig. 8 show training vs test accuracy curves for Code Llama 34B fine-tuned on 140K samples, with test accuracy plateauing.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "CRUXEval is a benchmark of 800 Python functions testing code reasoning via input and output prediction tasks. Evaluating 20 code LMs reveals that GPT-4 with CoT achieves 75%/82% pass@1 but no model comes close to solving the benchmark. Models distilled on GPT-4 data (WizardCoder, Phind, Phi) show inflated HumanEval scores without corresponding CRUXEval improvements, suggesting HumanEval-focused training does not transfer to code understanding. The benchmark construction uses a rigorous generate-and-filter pipeline with bootstrap-justified sample sizes.", 312 "red_flags": [ 313 { 314 "flag": "Meta authors evaluating Meta model", 315 "detail": "Four of six authors are from Meta AI, and Code Llama (a Meta model) is both used to generate the benchmark and is one of the evaluated models. This potential conflict is not explicitly acknowledged." 316 }, 317 { 318 "flag": "No GPT model version specified", 319 "detail": "GPT-3.5 and GPT-4 are used without specifying API versions or snapshot dates. Model behavior changes across versions, making these results non-reproducible for the proprietary models." 320 } 321 ], 322 "cited_papers": [ 323 { 324 "title": "Evaluating large language models trained on code", 325 "authors": ["Mark Chen", "Jerry Tworek"], 326 "year": 2021, 327 "arxiv_id": "2107.03374", 328 "relevance": "Introduces HumanEval, the primary code generation benchmark that CRUXEval complements and compares against." 329 }, 330 { 331 "title": "Program synthesis with large language models", 332 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 333 "year": 2021, 334 "arxiv_id": "2108.07732", 335 "relevance": "Introduces MBPP benchmark and early work on measuring execution ability of code LMs." 336 }, 337 { 338 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 339 "authors": ["Carlos E. Jimenez", "John Yang"], 340 "year": 2023, 341 "arxiv_id": "2310.06770", 342 "relevance": "Major benchmark for evaluating LLM software engineering capabilities on real-world tasks." 343 }, 344 { 345 "title": "Code Llama: Open foundation models for code", 346 "authors": ["Baptiste Rozière", "Jonas Gehring"], 347 "year": 2023, 348 "arxiv_id": "2308.12950", 349 "relevance": "Open-source code LLM used both to generate the benchmark and as a key evaluated model." 350 }, 351 { 352 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 353 "authors": ["Jiawei Liu", "Chunqiu Steven Xia"], 354 "year": 2023, 355 "arxiv_id": "2305.01210", 356 "relevance": "HumanEval+ augments HumanEval with better test cases, related to benchmark quality concerns." 357 }, 358 { 359 "title": "Chain-of-thought prompting elicits reasoning in large language models", 360 "authors": ["Jason Wei", "Xuezhi Wang"], 361 "year": 2022, 362 "relevance": "Foundational CoT prompting method evaluated as an improvement technique on CRUXEval." 363 }, 364 { 365 "title": "The false promise of imitating proprietary LLMs", 366 "authors": ["Arnav Gudibande", "Eric Wallace"], 367 "year": 2023, 368 "arxiv_id": "2305.15717", 369 "relevance": "Questions whether distilling from proprietary models leads to genuine capability improvements, directly relevant to CRUXEval's findings on distilled models." 370 }, 371 { 372 "title": "WizardCoder: Empowering code large language models with Evol-Instruct", 373 "authors": ["Ziyang Luo"], 374 "year": 2023, 375 "arxiv_id": "2306.08568", 376 "relevance": "Key distilled model showing HumanEval gains without CRUXEval improvements, central to the paper's findings." 377 }, 378 { 379 "title": "Reasoning or reciting? Exploring the capabilities and limitations of language models through counterfactual tasks", 380 "authors": ["Zhaofeng Wu"], 381 "year": 2023, 382 "arxiv_id": "2307.02477", 383 "relevance": "Examines LLM reasoning failures through counterfactual tasks, related to CRUXEval's investigation of code reasoning failures." 384 }, 385 { 386 "title": "Show your work: Scratchpads for intermediate computation with language models", 387 "authors": ["Maxwell Nye"], 388 "year": 2021, 389 "arxiv_id": "2112.00114", 390 "relevance": "Introduces scratchpad approach for training LMs on execution traces, directly related to CRUXEval's code execution evaluation." 391 } 392 ] 393 }