scan.json (20148B)
1 { 2 "paper": { 3 "title": "CRUXEVAL-X: A Benchmark for Multilingual Code Reasoning, Understanding and Execution", 4 "authors": ["Ruiyang Xu", "Jialun Cao", "Yaojie Lu", "Ming Wen", "Hongyu Lin", "Xianpei Han", "Ben He", "Shing-Chi Cheung", "Le Sun"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2408.13001" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper states the benchmark is available and references a GitHub repository. The abstract and construction pipeline describe publicly available data." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The benchmark dataset (CRUXEVAL-X with 19 languages, 500 aligned entries, 19K test cases) is described as publicly available. Built on the public CruxEval dataset." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions using various LLM APIs and models but does not specify library versions or environment details." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. The construction pipeline is described at a high level but no README or runnable scripts are referenced." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results in Table 2 are point estimates (Pass@1 percentages) with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims (e.g., Deepseekcoder-V2 is better than GPT-4o-mini) based solely on comparing numbers without statistical significance tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports absolute Pass@1 scores with baselines for context (e.g., 'phi-1, trained solely on Python, scored 11.8% Pass@1 on Python input prediction and 23.6% on Perl'), providing magnitude context." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper states '500 entries are sufficient to distinguish the effectiveness of the LLMs' but provides no power analysis or formal justification for this claim." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Temperature is set to 0 with greedy decoding, so single-run results are reported. No variance across multiple runs is reported." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper evaluates 24 LLMs across multiple categories (general, multilingual code, instruction-tuned, single-language), providing extensive comparisons." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include GPT-4o, Deepseekcoder-V2, Qwen2, Llama-3, and other contemporary models from 2024." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 1 shows ablation of the construction pipeline (w/o Iter vs w/ Iter). Section 4.1 analyzes key factors affecting code reasoning. The data bias analysis in Appendix A also serves as an ablation." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Two tasks are evaluated: input reasoning and output reasoning, both measured with Pass@1. Additionally, syntactic correctness rate and semantic correctness rate are analyzed." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of LLM outputs is performed. 
Evaluation is entirely automated via test case pass/fail." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The benchmark is used as a test set for evaluating LLMs. The paper also checks for data contamination against Stack v2, finding only 0.8% overlap." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 2 provides per-language breakdowns for all 19 programming languages across all 24 models. Figure 3 breaks down by code complexity factors." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 4.4 provides case studies of failures (e.g., phi-1.5 failing on Racket due to distinct syntax). Appendix C discusses per-language translation difficulties." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that single-LLM translation has low success rates, that some languages (Racket) consistently have the worst results, and that the initial overlap was only 333 questions across all PLs." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims about 19 PLs, 600+ subjects per language, 19K tests, correlation between language pairs, and cross-language generalization (34.4% Pass@1) are all supported by Tables 1-2 and Figures 3-5." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper claims 'improvement in NL reasoning positively impacts code reasoning' (Section 4.2.3) based on comparing phi-1 to phi-1.5, but this is observational — the models differ in many ways beyond NL training data." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Claims are generally bounded to the tested models and languages. 
The title says 'Multilingual' which is accurate for 19 languages. The paper acknowledges that translated code cannot reflect language-specific features (Limitations section)." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for observed correlations or cross-language generalization. For instance, the phi-1.5 cross-language finding could be due to shared tokenization or natural language overlap rather than true code reasoning transfer." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Models are referred to by marketing names: 'GPT-4o', 'GPT-3.5-Turbo', 'GPT-4o-mini' without snapshot dates or API versions. Open-source models are specified by name and parameter count but not exact checkpoint versions." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper references prompts in Appendix D and shows prompt templates in Figure 2. The evaluation task format is clearly specified." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Temperature is set to 0 with greedy decoding for evaluation (Section 3.1). Construction pipeline also specifies temperature 0 for repair (Section 2.3.2)." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. Models are prompted directly for code reasoning." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "The full construction pipeline is documented in Section 2 with three steps: function signature translation, test suite translation, and iterative generation & repair. Table 1 shows counts at each stage." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 7 is a dedicated 'Limitations' section discussing three specific limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The limitations section discusses specific threats: model-generated data may introduce bias, translation cannot guarantee perfect accuracy (500 out of 800 aligned), and translated code cannot reflect language-specific features." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly states that code translated from Python cannot reflect language-specific features, that this is a trade-off for alignment, and that 500 entries were deemed sufficient though derived from 800." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The benchmark data (code subjects and test cases in 19 languages) is released publicly." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 2 describes the full construction pipeline: starting from CruxEval (800 Python subjects), translating through three steps, with specific LLMs (GPT-3.5-Turbo, DeepseekCoder-33B, GPT-4o) used at each stage." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. Data source is the existing CruxEval benchmark, a standard public dataset." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Table 1 documents the pipeline from 800 subjects through Step I, Step II, w/o Iter, and w/ Iter stages with exact counts per language at each stage. Section 2.3.3 documents the overlap-based refinement (333 → 462 → 500)." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section is visible in the paper text." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: Chinese Academy of Sciences, HKUST, Huazhong University. None of the authors appear to be affiliated with the companies whose models are evaluated." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is disclosed, so independence cannot be assessed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial disclosure statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No training data cutoff dates are stated for any of the 24 evaluated models." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section 2.4 (Quality Analysis - Data Leakage) compares the benchmark against Stack v2 (67.5TB of GitHub data), finding only 0.8% overlap, indicating minimal leakage risk." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper explicitly addresses contamination: the benchmark is constructed via automated translation rather than sourcing from contest websites, and the 0.8% overlap with Stack v2 is reported. The paper also notes contest website solutions suffer from contamination (Section 1)." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference costs or API costs are reported for evaluating 24 models across 19 languages, despite this being a substantial evaluation." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget is stated for either benchmark construction or evaluation." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CRUXEVAL-X contains 19 programming languages with at least 600 subjects each and 19K content-consistent test cases total.", 286 "evidence": "Table 1 shows the construction pipeline results with per-language counts, all reaching 600+ after iterative generation and repair.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Even GPT-4o can only achieve Pass@1 around 70% on CRUXEVAL-X, indicating the benchmark is challenging.", 291 "evidence": "Table 2 shows GPT-4o achieving 64.6-75.4% across languages for input reasoning and 70.8-77.6% for output reasoning.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "A model trained solely on Python (phi-1) can achieve up to 34.4% Pass@1 in other languages, demonstrating cross-language generalization.", 296 "evidence": "Table 2 shows phi-1 and phi-1.5 results across all 19 languages. phi-1.5 achieves up to 34.4% in shell scripting for input reasoning despite being trained only on Python.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "JavaScript and TypeScript show the strongest positive correlation among all PL pairs (0.87 and 0.91 on the two tasks, respectively).", 301 "evidence": "Figure 5 shows the correlation heatmap calculated via cosine similarity of LLM performance vectors.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "The benchmark has minimal data leakage risk, with only 0.8% overlap with Stack v2.", 306 "evidence": "Section 2.4 reports comparison against Stack v2 (67.5TB of GitHub data).", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Improvement in natural language reasoning positively impacts code reasoning, as shown by phi-1 to phi-1.5 comparison.", 311 "evidence": "Section 4.2.3 compares phi-1 (Python-only) to phi-1.5 (Python + NL) showing 10.7% vs 21.7% average input reasoning.", 312 "supported": "weak" 313 } 314 ], 315 "methodology_tags": ["benchmark-eval"], 316 "key_findings": "CRUXEVAL-X is a multilingual code reasoning benchmark 
spanning 19 programming languages, constructed via an automated test-guided translation pipeline from the Python-only CruxEval dataset. Evaluation of 24 LLMs reveals that code reasoning capabilities are highly correlated across programming languages, with JavaScript-TypeScript showing the strongest correlation and Racket the weakest. Models trained solely on Python (phi-1, phi-1.5) demonstrate unexpected cross-language generalization, achieving 16-26% success rates in unseen languages. The automated pipeline successfully translated 500 of 800 original subjects to all 19 languages with 0.8% overlap against Stack v2 training data.", 317 "red_flags": [ 318 { 319 "flag": "No statistical rigor on comparisons", 320 "detail": "All comparative claims between models are based on point estimates without confidence intervals, error bars, or significance tests, despite using greedy decoding (single run)." 321 }, 322 { 323 "flag": "Weak causal claim about NL-to-code transfer", 324 "detail": "The claim that NL training improves code reasoning (phi-1 vs phi-1.5) is based on comparing two models that differ in multiple dimensions, not just NL data." 325 }, 326 { 327 "flag": "No cost reporting despite large-scale evaluation", 328 "detail": "Evaluating 24 models across 19 languages and constructing benchmarks using GPT-3.5, DeepseekCoder, and GPT-4o involves substantial costs that are not reported." 329 } 330 ], 331 "cited_papers": [ 332 { 333 "title": "Evaluating large language models trained on code", 334 "authors": ["Mark Chen", "Jerry Tworek"], 335 "year": 2021, 336 "arxiv_id": "2107.03374", 337 "relevance": "Introduced HumanEval, the foundational code generation benchmark that CRUXEVAL-X addresses biases in." 
338 }, 339 { 340 "title": "CruxEval: A benchmark for code reasoning, understanding and execution", 341 "authors": ["Alex Gu", "Baptiste Rozière"], 342 "year": 2024, 343 "arxiv_id": "2401.03065", 344 "relevance": "The Python-only code reasoning benchmark that CRUXEVAL-X extends to 19 languages." 345 }, 346 { 347 "title": "MultiPL-E: A scalable and polyglot approach to benchmarking neural code generation", 348 "authors": ["Federico Cassano"], 349 "year": 2023, 350 "relevance": "Provided multilingual translation rules adopted by CRUXEVAL-X for test suite translation." 351 }, 352 { 353 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 354 "authors": ["Carlos E Jimenez"], 355 "year": 2023, 356 "arxiv_id": "2310.06770", 357 "relevance": "Major LLM code benchmark for real-world software engineering tasks." 358 }, 359 { 360 "title": "McEval: Massively multilingual code evaluation", 361 "authors": ["Linzheng Chai"], 362 "year": 2024, 363 "arxiv_id": "2406.07436", 364 "relevance": "Competing multilingual code benchmark using human annotation at $12K cost." 365 }, 366 { 367 "title": "Concerned with data contamination? Assessing countermeasures in code language model", 368 "authors": ["Jialun Cao"], 369 "year": 2024, 370 "arxiv_id": "2403.16898", 371 "relevance": "Addresses benchmark contamination risks in code LLM evaluation." 372 }, 373 { 374 "title": "Rectifier: Code translation with corrector via LLMs", 375 "authors": ["Xin Yin"], 376 "year": 2024, 377 "arxiv_id": "2407.07472", 378 "relevance": "LLM-based code translation approach showing low success rates that CRUXEVAL-X's pipeline improves upon." 379 }, 380 { 381 "title": "Deepseek-coder: When the large language model meets programming", 382 "authors": ["Daya Guo"], 383 "year": 2024, 384 "arxiv_id": "2401.14196", 385 "relevance": "One of the key code LLMs evaluated in the benchmark." 386 }, 387 { 388 "title": "Is your code generated by ChatGPT really correct? 
Rigorous evaluation of large language models for code generation", 389 "authors": ["Jiawei Liu"], 390 "year": 2024, 391 "relevance": "Addresses rigor in evaluating LLM-generated code, directly relevant to benchmark methodology." 392 }, 393 { 394 "title": "Code Llama: Open foundation models for code", 395 "authors": ["Baptiste Roziere"], 396 "year": 2023, 397 "arxiv_id": "2308.12950", 398 "relevance": "Major open-source code LLM family evaluated across multiple variants in the benchmark." 399 } 400 ] 401 }