scan.json (31518B)
1 { 2 "paper": { 3 "title": "SolEval: Benchmarking Large Language Models for Repository-level Solidity Code Generation", 4 "authors": [ 5 "Zhiyuan Peng", 6 "Xin Yin", 7 "Rui Qian", 8 "Peiqin Lin", 9 "Yongkang Liu", 10 "Hao Zhang", 11 "Chenhao Ying", 12 "Yuan Luo" 13 ], 14 "year": 2025, 15 "venue": "arXiv.org", 16 "arxiv_id": "2502.18793", 17 "doi": "10.48550/arXiv.2502.18793" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "SolEval is the first repository-level benchmark for Solidity smart contract generation, containing 1,125 filtered function samples from 9 real-world projects. Evaluation of 16 LLMs shows the best model (DeepSeek-V3) achieves only 26.29% Pass@10, revealing significant room for improvement. A correctness-gas efficiency trade-off was observed: models excelling at Pass@k generate gas-inefficient contracts. Supervised fine-tuning on Qwen-7B improved Pass@5 from 16.67% to 58.33%, demonstrating the benchmark's utility for model improvement.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The abstract states 'We release our data and code at https://github.com/pzy2000/SolEval.' A specific GitHub URL is provided." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The benchmark data is released at the same GitHub URL. Appendix C.3 describes the dataset files (dataset.json, example.json, raw.json)." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "Section B.2 describes hardware (Intel Xeon Gold 6226R, 192GB RAM, 8 NVIDIA RTX A8000 GPUs, Ubuntu 20.04.1 LTS) and mentions PyTorch and HuggingFace, but no requirements.txt, Dockerfile, or library version numbers are specified in the paper." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper provides some reproduction hints (fuzzing seed 666, approximate compute times: ~1 week for Table 4, ~24 hours for Table 5) but no step-by-step reproduction instructions. The GitHub repo is linked but the paper itself lacks sufficient detail to reproduce without consulting external resources." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Tables 4, 5, 6, and 7 report only point estimates (e.g., '26.29%'). No confidence intervals, error bars, or ± notation appear despite results being averaged over five independent runs." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "No statistical significance tests are reported. Claims like 'DeepSeek-V3 performs best' and 'RAG improves performance' are based solely on comparing raw numbers without any hypothesis tests." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The SFT experiment reports effect sizes with baseline context: 'Pass@5 increasing from 16.67% to 58.33%' (Table 6). The ablation study (Table 5) reports results for each configuration with comparable baselines, providing effect magnitude context." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification for why 1,125 (or 1,507) benchmark samples is sufficient. No power analysis is discussed. The sample size is a result of filtering criteria, not a deliberate design choice." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "Section 4.2 states 'all experimental results are averaged over five independent runs' but no standard deviation, IQR, or spread measure is reported. Tables contain only point estimates." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "16 LLMs are compared against each other in Table 4, and the benchmark itself is compared against prior benchmarks (BenchSol, HumanEval, MBPP, etc.) in Table 1." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "The evaluated models include very recent releases: DeepSeek-V3, DeepSeek-R1, GPT-4o, QwQ-32B, and Qwen2.5-Coder, all from 2024-2025." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Table 5 presents an ablation study on RAG and Context for DeepSeek-V3 with four conditions (±RAG × ±Context). Figure 4 studies the impact of different shot numbers on Qwen2.5-Coder-7B." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper uses multiple evaluation metrics: Pass@k, Compile@k, Gas@k, Gas Fee, and Vul@k, covering functional correctness, compilation success, gas efficiency, and vulnerability rate." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "Evaluation of LLM-generated code is entirely automated through test case execution, compilation checking, gas measurement via Forge, and vulnerability detection via Slither. Human annotators were involved in benchmark construction but not in evaluating model outputs." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "For SFT, Section 4.3 states: 'we ensured that there were no overlapping functions between the training and test sets' and used a separate set of repositories as the test set. For the main evaluation, the benchmark is a fixed test set not used for any model tuning." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Table 4 provides per-model breakdowns across all metrics. 
Results are grouped by model size (6.7B-16B, 32B-671B). Table 2 provides per-project statistics." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 5.3 and Figure 9 show a failure case where DeepSeek-R1-Distill-Qwen-7B fails an easy boolean AND operation. The paper discusses how models 'can fail in some really easy cases' while succeeding on harder problems." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper reports that DeepSeek-R1-Distill-Qwen-7B, despite claiming o1-mini-level performance, underperforms CodeLlama-7B on Solidity. The correctness-gas trade-off is reported as a negative finding: the best-performing model (DeepSeek-V3) generates the most gas-inefficient contracts." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims are supported: 'best-performing LLM achieves only 26.29% Pass@10' matches DeepSeek-V3 in Table 4; 'Pass@5 increasing from 16.67% to 58.33%' matches Table 6; '1,507 samples from 28 different repositories' is stated (though internally inconsistent with the 1,125 filtered figure used in evaluation)." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "Causal claims about RAG and Context improving performance are supported by a controlled ablation study (Table 5) with four conditions (±RAG × ±Context). The SFT improvement claim is supported by before/after comparison on the same model. These are adequate designs for the causal claims made." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper bounds claims to Solidity code generation specifically. The Limitations section explicitly notes the monolingual limitation, inability to test GPT-5/Claude, and metric limitations. 
The title and claims focus on Solidity specifically." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper provides single interpretations for findings without exploring alternatives. DeepSeek-R1-Distill's poor performance is attributed to 'lack of knowledge of Solidity' without considering other explanations (e.g., instruction-following vs coding, prompt sensitivity). No alternative explanations for the gas-correctness trade-off are considered." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper measures Pass@k, Compile@k, Gas@k, and Vul@k, and frames these as measuring code generation correctness, compilability, gas efficiency, and security. The metrics directly measure the claimed outcomes. The paper also discusses the distinction between functional correctness and broader quality (gas, vulnerability)." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Open-source models are identified by name and size (e.g., 'DeepSeek-Coder-6.7B', 'CodeLlama-7B') but without exact checkpoint/HuggingFace IDs. Closed-source models use marketing names: 'GPT-4o' and 'GPT-4o-mini' without snapshot dates or API versions. 'DeepSeek-V3' is accessed via API without version specification." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full prompt templates with actual text are provided in Figures 5-6 and Appendix C. The prompts include exact markers ('// IMPLEMENT THE FUNCTIONALITY BASED ON THE PROVIDED REQUIREMENT', '// START_OF_REQUIREMENT', etc.) and complete examples." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 4.2 reports: n=10 samples, nucleus sampling with temperature=1, top-p=0.95, max generation length=512, greedy search for k=1. 
Section 4.3 reports SFT hyperparameters: max input length 2048, 3 epochs, 9:1 train/validation split, TRL library defaults." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The evaluation is direct LLM inference with few-shot prompting. RAG is used for example selection but is not agentic scaffolding." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 3 and Figure 3 document the full pipeline: project selection (64 projects from 6 GitHub organizations), function parsing (17,823 → 1,125 after filtering), test construction, human annotation by 5 annotators, and context parsing. Filtering criteria are explicit (LOC ≥ 5, exclude test/deprecated functions, etc.)." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "A dedicated 'Limitations' section lists four specific limitations: monolingual benchmark, inability to evaluate GPT-5/Claude Opus, metrics don't provide optimization mechanisms, and scope limited to tested models." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "The limitations are specific to this study: monolingual focus on Solidity excluding Vyper and Rust, funding constraints preventing GPT-5/Claude evaluation, gas/vulnerability metrics evaluate but don't optimize. These are concrete, not generic disclaimers." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "The paper explicitly states what was NOT tested: other smart contract languages (Vyper, Rust), newer models (GPT-5, Claude Opus), and optimization/remediation methods. Future work directions explicitly bound the current scope." 
189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "The benchmark data is released at https://github.com/pzy2000/SolEval. Appendix C.3 describes the data files: dataset.json (source information), example.json (RAG examples), and raw.json." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 3 describes the full data collection process in five phases: project selection from 6 GitHub organizations, function parsing with Tree-sitter, test construction with Forge, human annotation with inter-annotator agreement (Fleiss' κ = 0.8), and context parsing via static analysis." 201 }, 202 "recruitment_methods_described": { 203 "applies": true, 204 "answer": true, 205 "justification": "For annotators: Section 3.4 and Appendix E describe recruiting 'five master's students with at least three years of Solidity experience' compensated at $100/hour. For benchmark data: Section 3.1 describes selecting from 'six popular GitHub organizations' sorted by star count, filtering low-star projects." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "Figure 3 and Section 3 document the full pipeline with counts at each stage: 64 projects → 17,823 functions → 1,125 functions from 9 projects after filtering → 2,217 test cases → human annotation → context parsing → final SolEval benchmark." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Acknowledgements section lists: National Key Research and Development Program of China (Grant 2024YFB2705300), Shanghai Science and Technology Innovation Action Plan (Grant 23511100400), National Natural Science Foundation of China (Grant 62402313), and Open Research Fund of the State Key Laboratory of Blockchain and Data Security." 
218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are listed: Shanghai Jiao Tong University, Shanghai SimMed Technology Co, Zhejiang University, Fudan University, LMU Munich, Munich Center for Machine Learning, Northeastern University, and Universiti Sains Malaysia. Authors are not affiliated with any of the evaluated model providers." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": true, 227 "justification": "Funding comes from Chinese government research programs (National Key R&D, NSFC, Shanghai Action Plan) and a university state key laboratory. None of the funders have a financial interest in how specific LLMs perform on the benchmark." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial disclosure statement is present. One author is affiliated with 'Shanghai SimMed Technology Co, Ltd' but no competing interests statement addresses whether this creates a conflict." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "No training data cutoff dates are stated for any of the 16 evaluated models. The paper cites model papers but does not specify when their training data was collected." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "The paper uses human-annotated comments to reduce memorization of prompts (Section 3.4), but does not discuss whether the code functions themselves (the expected outputs from popular repos like OpenZeppelin) overlap with model training data. The code-level overlap is the primary contamination concern and is not addressed." 
245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "The benchmark code comes from popular, high-star GitHub repositories (OpenZeppelin, Solady, etc.) that are almost certainly in the training data of models like GPT-4o and DeepSeek-V3. The paper addresses prompt memorization through human annotation but does not address whether models have memorized the ground-truth code functions." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "The study evaluates LLMs on a benchmark, not human participants. Human annotators constructed the benchmark but are not study subjects." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are studied. The benchmark evaluates LLMs, not humans." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the evaluation. Annotator details (master's students with 3+ years Solidity experience) are for benchmark construction, not as study subjects." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are studied in the evaluation." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants or experimental conditions assigned to humans." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in the evaluation." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in the evaluation." 
287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "Figure 4 reports average runtime per example (~1.6s to ~2.8s depending on shot count) and average token length (733-1706 tokens). Section B.2 provides total reproduction time estimates: 'approximately one week' for Table 4, '24 hours' for Table 5." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": true, 298 "justification": "Section B.2 states the hardware: '16-core workstation equipped with an Intel(R) Xeon(R) Gold 6226R CPU @ 2.90GHz, 192GB RAM, and 8 NVIDIA RTX A8000 GPUs.' Approximate wall-clock time is given for each experiment. The section notes 'The computational budget, including GPU hours, the number of GPUs, and the total parallelism across them, is crucial.'" 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "Section 4.2 states 'all experimental results are averaged over five independent runs' but no per-run results, standard deviation, or seed sensitivity analysis is reported. Only averaged point estimates appear in the tables." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "Section 4.2 explicitly states: 'we set the total number (denoted as n) of samples generated by an LLM to 10' and 'all experimental results are averaged over five independent runs.'" 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search budget is reported. For SFT, Section 4.3 states 'All other hyperparameters were kept at the default values provided by the TRL library' suggesting no search was conducted. For inference, fixed hyperparameters are used without mentioning alternatives tried." 
316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "The 1-shot setting is justified via Figure 4, which shows diminishing returns with more shots alongside sharply rising cost. Qwen-7B is chosen for SFT 'due to its strong performance in initial evaluations.' All model configurations are reported transparently." 321 }, 322 "multiple_comparison_correction": { 323 "applies": false, 324 "answer": false, 325 "justification": "No statistical significance tests are performed in the paper, so correction for multiple comparisons is structurally inapplicable." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors create the benchmark and evaluate models on it without acknowledging potential biases in benchmark construction choices (e.g., function selection, test case design) that could favor certain models or coding patterns." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "Models ranging from 6.7B to 671B parameters are compared without normalizing for compute. DeepSeek-V3 (671B) is compared directly against 7B models without discussing the 100x compute difference. No performance-per-compute analysis is provided." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "The paper argues for repository-level evaluation over standalone functions but does not discuss whether Pass@k on their specific 1,125 functions (from 9 projects, mostly from a few large repos like Solady) actually measures general Solidity code generation capability. No discussion of construct validity or comparison with alternative evaluation approaches." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No agentic scaffolding is involved. 
All models are evaluated with the same direct prompting setup (same prompt template, same RAG/context configuration)." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The benchmark code comes from established GitHub repos (OpenZeppelin, Solady, etc.) that predate most evaluated models' training data. The paper does not discuss temporal leakage — whether the ground-truth code existed before the models' training cutoffs." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the evaluation setup (providing function signatures, requirements, and repository context) leaks information beyond what a developer would have in real-world code generation scenarios." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "For SFT, Section 4.3 ensures no overlapping functions and uses repository-level splits. However, for the main 16-model evaluation, there is no discussion of whether the benchmark functions from popular repos are independent of model training data. The code from OpenZeppelin et al. is almost certainly in training sets." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": true, 367 "justification": "Section 3.4 uses human-annotated comments as a concrete prevention method: 'to reduce the LLMs' memorization effects, as original comments are highly likely to have been encountered during the pre-training phase.' 
For SFT, Section 4.3 applies function-level deduplication and excludes 9/30 repos with 'potential data leakage.'" 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "The best-performing LLM (DeepSeek-V3) achieves only 26.29% Pass@10 on SolEval, indicating substantial room for improvement in Solidity code generation.", 374 "evidence": "Table 4 shows DeepSeek-V3 at 26.29% Pass@10, with all other models scoring between 5.91% and 23.70%.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Supervised fine-tuning on Qwen-7B using SolEval improves Pass@5 from 16.67% to 58.33%.", 379 "evidence": "Table 6 reports before/after SFT metrics: Pass@5 16.67%→58.33%, Compile@5 66.67%→100%, Gas@1 0%→19.84%, Vul@1 26.61%→7.35%.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "There is a fundamental trade-off between functional correctness and gas efficiency in Solidity code generation: DeepSeek-V3 ranks highest in Pass@10 but generates contracts with high gas fees.", 384 "evidence": "Tables 4 and 7 show DeepSeek-V3 highest in Pass@k but with gas fee of -7525, while smaller models like DeepSeek-R1-Distill-Qwen-7B (lowest Pass@10 at 5.91%) generate cheaper contracts (fee -3472).", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "RAG and context information improve LLMs' performance in Solidity code generation.", 389 "evidence": "Table 5 ablation on DeepSeek-V3: Pass@1 increases from 20.17% (no RAG, no context) to 21.72% (both RAG and context). 
Compile@1 increases from 50.32% to 53.35%.", 390 "supported": "weak" 391 }, 392 { 393 "claim": "DeepSeek-R1-Distill-Qwen-7B underperforms CodeLlama-7B on Solidity despite claiming comparable performance to ChatGPT-o1-mini on mainstream benchmarks.", 394 "evidence": "Table 4: DeepSeek-R1-Distill-Qwen-7B achieves 5.91% Pass@10 vs CodeLlama-7B at 14.26%.", 395 "supported": "strong" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "Benchmark contamination from popular repositories", 401 "detail": "The benchmark uses code from highly popular GitHub repositories (OpenZeppelin, Solady, Foundry) that are almost certainly in the training data of all evaluated models. The paper addresses prompt memorization through human annotation but does not address whether models have memorized the ground-truth code functions themselves." 402 }, 403 { 404 "flag": "No error bars despite multiple runs", 405 "detail": "Results are averaged over five independent runs but no standard deviation, confidence intervals, or variance measures are reported. The reader cannot assess result stability or determine whether performance differences between models are statistically meaningful." 406 }, 407 { 408 "flag": "Internal inconsistency in sample counts", 409 "detail": "The abstract and Table 1 claim 1,507 samples from 28 repositories, but Table 2 shows 1,125 filtered functions from 9 projects, and Figure 2 states 'SolEval benchmark is made up of 1,125 samples.' The discrepancy is not explained." 410 }, 411 { 412 "flag": "Small effect sizes for RAG/Context with no significance tests", 413 "detail": "The ablation study (Table 5) shows that RAG and context together improve Pass@1 by only ~1.5 percentage points (20.17%→21.72%), which could easily be within noise given no statistical tests or error bars are provided. The paper presents this as a meaningful improvement." 
414 }, 415 { 416 "flag": "SFT evaluation details incomplete", 417 "detail": "The SFT evaluation test set composition (21 repos after excluding 9 of 30 for data leakage) is not fully described: how many test functions, from which domains, and whether the test repos are representative. The dramatic improvement (16.67%→58.33%) could partly reflect overfitting or data characteristics." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Evaluating large language models trained on code", 423 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 424 "year": 2021, 425 "arxiv_id": "2107.03374", 426 "relevance": "Introduced HumanEval, the foundational benchmark for LLM code generation evaluation that SolEval extends to Solidity." 427 }, 428 { 429 "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models", 430 "authors": ["Hao Yu", "Bo Shen", "Dezhi Ran"], 431 "year": 2024, 432 "relevance": "Repository-level code generation benchmark for Python and Java that directly inspires SolEval's approach to non-standalone functions." 433 }, 434 { 435 "title": "DevEval: Evaluating code generation in practical software projects", 436 "authors": ["Jia Li", "Ge Li", "Yunfei Zhao"], 437 "year": 2024, 438 "arxiv_id": "2401.06401", 439 "relevance": "Large-scale repository-level code generation benchmark for Python with 1,825 samples, a direct comparison point for SolEval." 440 }, 441 { 442 "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation", 443 "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"], 444 "year": 2023, 445 "arxiv_id": "2308.01861", 446 "relevance": "Class-level code generation benchmark evaluating LLMs beyond standalone functions." 
447 }, 448 { 449 "title": "Code llama: Open foundation models for code", 450 "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"], 451 "year": 2023, 452 "arxiv_id": "2308.12950", 453 "relevance": "Major open-source code generation model family evaluated in SolEval, serving as a baseline for Solidity code generation." 454 }, 455 { 456 "title": "MultiPL-E: A scalable and polyglot approach to benchmarking neural code generation", 457 "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"], 458 "year": 2023, 459 "relevance": "First multi-language parallel code generation benchmark extending HumanEval to 18 languages, highlighting the gap in Solidity support." 460 }, 461 { 462 "title": "Benchmarking large language models for ethereum smart contract development", 463 "authors": ["Etienne Daspe", "Mathis Durand", "Julien Hatin"], 464 "year": 2024, 465 "relevance": "BenchSol: the only prior Solidity code generation benchmark (15 samples), which SolEval aims to supersede." 466 }, 467 { 468 "title": "DeepSeek-R1", 469 "authors": ["DeepSeek"], 470 "year": 2025, 471 "relevance": "Reasoning-focused LLM whose distilled versions are evaluated and found to underperform on Solidity despite strong mainstream benchmark scores." 472 }, 473 { 474 "title": "Magicoder: Source code is all you need", 475 "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"], 476 "year": 2023, 477 "arxiv_id": "2312.02120", 478 "relevance": "Open-source code generation model using synthetic data, evaluated as a baseline in SolEval." 479 }, 480 { 481 "title": "OpenCodeInterpreter: Integrating code generation with execution and refinement", 482 "authors": ["Tianyu Zheng", "Ge Zhang", "Tianhao Shen"], 483 "year": 2024, 484 "arxiv_id": "2402.14658", 485 "relevance": "Code generation system with execution-based refinement, evaluated as a baseline for Solidity code generation." 
486 }, 487 { 488 "title": "A3-CodGen: A repository-level code generation framework for code reuse with local-aware, global-aware, and third-party-library-aware", 489 "authors": ["Dianshu Liao", "Shidong Pan", "Xiaoyu Sun"], 490 "year": 2024, 491 "relevance": "Repository-level code generation framework addressing context dependencies, directly relevant to SolEval's approach to non-standalone functions." 492 }, 493 { 494 "title": "DS-1000: A natural and reliable benchmark for data science code generation", 495 "authors": ["Yuhang Lai", "Chengxi Li", "Yiming Wang"], 496 "year": 2023, 497 "relevance": "Data science code generation benchmark introducing non-standalone functions and addressing data leakage through function modification." 498 } 499 ] 500 }