scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20148B)
      1 {
      2   "paper": {
      3     "title": "CRUXEVAL-X: A Benchmark for Multilingual Code Reasoning, Understanding and Execution",
      4     "authors": ["Ruiyang Xu", "Jialun Cao", "Yaojie Lu", "Ming Wen", "Hongyu Lin", "Xianpei Han", "Ben He", "Shing-Chi Cheung", "Le Sun"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2408.13001"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states the benchmark is available and references a GitHub repository. The abstract and construction pipeline describe publicly available data."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The benchmark dataset (CRUXEVAL-X with 19 languages, 500 aligned entries, 19K test cases) is described as publicly available. Built on the public CruxEval dataset."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions using various LLM APIs and models but does not specify library versions or environment details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The construction pipeline is described at a high level but no README or runnable scripts are referenced."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results in Table 2 are point estimates (Pass@1 percentages) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., Deepseekcoder-V2 is better than GPT-4o-mini) based solely on comparing numbers without statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports absolute Pass@1 scores with baselines for context (e.g., 'phi-1, trained solely on Python, scored 11.8% Pass@1 on Python input prediction and 23.6% on Perl'), providing magnitude context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper states '500 entries are sufficient to distinguish the effectiveness of the LLMs' but provides no power analysis or formal justification for this claim."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Temperature is set to 0 with greedy decoding, so single-run results are reported. No variance across multiple runs is reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper evaluates 24 LLMs across multiple categories (general, multilingual code, instruction-tuned, single-language), providing extensive comparisons."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include GPT-4o, Deepseekcoder-V2, Qwen2, Llama-3, and other contemporary models from 2024."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 1 shows ablation of the construction pipeline (w/o Iter vs w/ Iter). Section 4.1 analyzes key factors affecting code reasoning. The data bias analysis in Appendix A also serves as an ablation."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Two tasks are evaluated: input reasoning and output reasoning, both measured with Pass@1. Additionally, syntactic correctness rate and semantic correctness rate are analyzed."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of LLM outputs is performed. Evaluation is entirely automated via test case pass/fail."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The benchmark is used as a test set for evaluating LLMs. The paper also checks for data contamination against Stack v2, finding only 0.8% overlap."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-language breakdowns for all 19 programming languages across all 24 models. Figure 3 breaks down by code complexity factors."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.4 provides case studies of failures (e.g., phi-1.5 failing on Racket due to distinct syntax). Appendix C discusses per-language translation difficulties."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that single-LLM translation has low success rates, that some languages (Racket) have consistently the worst results, and that the initial overlap was only 333 questions across all PLs."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about 19 PLs, 600+ subjects per language, 19K tests, correlation between language pairs, and cross-language generalization (34.4% Pass@1) are all supported by Tables 1-2 and Figures 3-5."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper claims 'improvement in NL reasoning positively impacts code reasoning' (Section 4.2.3) based on comparing phi-1 to phi-1.5, but this is observational — the models differ in many ways beyond NL training data."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Claims are generally bounded to the tested models and languages. The title says 'Multilingual' which is accurate for 19 languages. The paper acknowledges that translated code cannot reflect language-specific features (Limitations section)."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for observed correlations or cross-language generalization. For instance, the phi-1.5 cross-language finding could be due to shared tokenization or natural language overlap rather than true code reasoning transfer."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are referred to by marketing names: 'GPT-4o', 'GPT-3.5-Turbo', 'GPT-4o-mini' without snapshot dates or API versions. Open-source models are specified by name and parameter count but not exact checkpoint versions."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper references prompts in Appendix D and shows prompt templates in Figure 2. The evaluation task format is clearly specified."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Temperature is set to 0 with greedy decoding for evaluation (Section 3.1). Construction pipeline also specifies temperature 0 for repair (Section 2.3.2)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. Models are prompted directly for code reasoning."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The full construction pipeline is documented in Section 2 with three steps: function signature translation, test suite translation, and iterative generation & repair. Table 1 shows counts at each stage."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 is a dedicated 'Limitations' section discussing three specific limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations section discusses specific threats: model-generated data may introduce bias, translation cannot guarantee perfect accuracy (500 out of 800 aligned), and translated code cannot reflect language-specific features."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states that code translated from Python cannot reflect language-specific features, that this is a trade-off for alignment, and that 500 entries were deemed sufficient though derived from 800."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The benchmark data (code subjects and test cases in 19 languages) is released publicly."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2 describes the full construction pipeline: starting from CruxEval (800 Python subjects), translating through three steps, with specific LLMs (GPT-3.5-Turbo, DeepseekCoder-33B, GPT-4o) used at each stage."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data source is the existing CruxEval benchmark, a standard public dataset."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Table 1 documents the pipeline from 800 subjects through Step I, Step II, w/o Iter, and w/ Iter stages with exact counts per language at each stage. Section 2.3.3 documents the overlap-based refinement (333 → 462 → 500)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is visible in the paper text."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Chinese Academy of Sciences, HKUST, Huazhong University. None of the authors appear to be affiliated with the companies whose models are evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial disclosure statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff dates are stated for any of the 24 evaluated models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section 2.4 (Quality Analysis - Data Leakage) compares the benchmark against Stack v2 (67.5TB of GitHub data), finding only 0.8% overlap, indicating minimal leakage risk."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "The paper explicitly addresses contamination: the benchmark is constructed via automated translation rather than sourcing from contest websites, and the 0.8% overlap with Stack v2 is reported. The paper also notes contest website solutions suffer from contamination (Section 1)."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs or API costs are reported for evaluating 24 models across 19 languages, despite this being a substantial evaluation."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget is stated for either benchmark construction or evaluation."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CRUXEVAL-X contains 19 programming languages with at least 600 subjects each and 19K content-consistent test cases total.",
    286       "evidence": "Table 1 shows the construction pipeline results with per-language counts, all reaching 600+ after iterative generation and repair.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Even GPT-4o can only achieve Pass@1 around 70% on CRUXEVAL-X, indicating the benchmark is challenging.",
    291       "evidence": "Table 2 shows GPT-4o achieving 64.6-75.4% across languages for input reasoning and 70.8-77.6% for output reasoning.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "A model trained solely on Python (phi-1) can achieve at most 34.4% Pass@1 in other languages, demonstrating cross-language generalization.",
    296       "evidence": "Table 2 shows phi-1 and phi-1.5 results across all 19 languages. phi-1.5 achieves up to 34.4% in shell scripting for input reasoning despite being trained only on Python.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "JavaScript and TypeScript show the strongest positive correlation among all PL pairs (0.87 and 0.91 across the two tasks).",
    301       "evidence": "Figure 5 shows the correlation heatmap calculated via cosine similarity of LLM performance vectors.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "The benchmark has minimal data leakage risk, with only 0.8% overlap with Stack v2.",
    306       "evidence": "Section 2.4 reports comparison against Stack v2 (67.5TB of GitHub data).",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Improvement in natural language reasoning positively impacts code reasoning, as shown by phi-1 to phi-1.5 comparison.",
    311       "evidence": "Section 4.2.3 compares phi-1 (Python-only) to phi-1.5 (Python + NL) showing 10.7% vs 21.7% average input reasoning.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "CRUXEVAL-X is a multilingual code reasoning benchmark spanning 19 programming languages, constructed via an automated test-guided translation pipeline from the Python-only CruxEval dataset. Evaluation of 24 LLMs reveals that code reasoning capabilities are highly correlated across programming languages, with JavaScript-TypeScript showing the strongest correlation and Racket the weakest. Models trained solely on Python (phi-1, phi-1.5) demonstrate unexpected cross-language generalization, achieving 16-26% success rates in unseen languages. The automated pipeline successfully translated 500 of 800 original subjects to all 19 languages with 0.8% overlap against Stack v2 training data.",
    317   "red_flags": [
    318     {
    319       "flag": "No statistical rigor on comparisons",
    320       "detail": "All comparative claims between models are based on point estimates from a single greedy-decoding run, with no confidence intervals, error bars, or significance tests."
    321     },
    322     {
    323       "flag": "Weak causal claim about NL-to-code transfer",
    324       "detail": "The claim that NL training improves code reasoning (phi-1 vs phi-1.5) is based on comparing two models that differ in multiple dimensions, not just NL data."
    325     },
    326     {
    327       "flag": "No cost reporting despite large-scale evaluation",
    328       "detail": "Evaluating 24 models across 19 languages and constructing benchmarks using GPT-3.5, DeepseekCoder, and GPT-4o involves substantial costs that are not reported."
    329     }
    330   ],
    331   "cited_papers": [
    332     {
    333       "title": "Evaluating large language models trained on code",
    334       "authors": ["Mark Chen", "Jerry Tworek"],
    335       "year": 2021,
    336       "arxiv_id": "2107.03374",
    337       "relevance": "Introduced HumanEval, the foundational code generation benchmark that CRUXEVAL-X addresses biases in."
    338     },
    339     {
    340       "title": "CruxEval: A benchmark for code reasoning, understanding and execution",
    341       "authors": ["Alex Gu", "Baptiste Rozière"],
    342       "year": 2024,
    343       "arxiv_id": "2401.03065",
    344       "relevance": "The Python-only code reasoning benchmark that CRUXEVAL-X extends to 19 languages."
    345     },
    346     {
    347       "title": "MultiPL-E: A scalable and polyglot approach to benchmarking neural code generation",
    348       "authors": ["Federico Cassano"],
    349       "year": 2023,
    350       "relevance": "Provided multilingual translation rules adopted by CRUXEVAL-X for test suite translation."
    351     },
    352     {
    353       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    354       "authors": ["Carlos E Jimenez"],
    355       "year": 2023,
    356       "arxiv_id": "2310.06770",
    357       "relevance": "Major LLM code benchmark for real-world software engineering tasks."
    358     },
    359     {
    360       "title": "McEval: Massively multilingual code evaluation",
    361       "authors": ["Linzheng Chai"],
    362       "year": 2024,
    363       "arxiv_id": "2406.07436",
    364       "relevance": "Competing multilingual code benchmark using human annotation at $12K cost."
    365     },
    366     {
    367       "title": "Concerned with data contamination? Assessing countermeasures in code language model",
    368       "authors": ["Jialun Cao"],
    369       "year": 2024,
    370       "arxiv_id": "2403.16898",
    371       "relevance": "Addresses benchmark contamination risks in code LLM evaluation."
    372     },
    373     {
    374       "title": "Rectifier: Code translation with corrector via LLMs",
    375       "authors": ["Xin Yin"],
    376       "year": 2024,
    377       "arxiv_id": "2407.07472",
    378       "relevance": "LLM-based code translation approach showing low success rates that CRUXEVAL-X's pipeline improves upon."
    379     },
    380     {
    381       "title": "Deepseek-coder: When the large language model meets programming",
    382       "authors": ["Daya Guo"],
    383       "year": 2024,
    384       "arxiv_id": "2401.14196",
    385       "relevance": "One of the key code LLMs evaluated in the benchmark."
    386     },
    387     {
    388       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    389       "authors": ["Jiawei Liu"],
    390       "year": 2024,
    391       "relevance": "Addresses rigor in evaluating LLM-generated code, directly relevant to benchmark methodology."
    392     },
    393     {
    394       "title": "Code Llama: Open foundation models for code",
    395       "authors": ["Baptiste Roziere"],
    396       "year": 2023,
    397       "arxiv_id": "2308.12950",
    398       "relevance": "Major open-source code LLM family evaluated across multiple variants in the benchmark."
    399     }
    400   ]
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs