scan.json (28394B)
1 { 2 "paper": { 3 "title": "AutoCodeBench: Large Language Models are Automatic Code Benchmark Generators", 4 "authors": [ 5 "Jason Chou", 6 "Ao Liu", 7 "Yuchi Deng", 8 "Zhiying Zeng", 9 "Tao Zhang", 10 "Haotian Zhu", 11 "Jianwei Cai", 12 "Yue Mao", 13 "Chenchen Zhang", 14 "Lingyun Tan", 15 "Ziyan Xu", 16 "Bohui Zhai", 17 "Hengyi Liu", 18 "Speed Zhu", 19 "Wiggin Zhou", 20 "Fengzong Lian" 21 ], 22 "year": 2025, 23 "venue": "arXiv", 24 "arxiv_id": "2508.09101", 25 "doi": "10.48550/arXiv.2508.09101" 26 }, 27 "checklist": { 28 "artifacts": { 29 "code_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper references a 'Homepage' link at the top (Section 1 header area) and states they 'open-source a multilingual sandbox that supports 20+ programming languages' (Section 1, contribution 3). The benchmark data and sandbox are described as released artifacts." 33 }, 34 "data_released": { 35 "applies": true, 36 "answer": true, 37 "justification": "AutoCodeBench is presented as a released benchmark comprising 3,920 problems. The paper references a homepage and the benchmark is intended for community use. The data (benchmark problems and test cases) is described as released." 38 }, 39 "environment_specified": { 40 "applies": true, 41 "answer": false, 42 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. The sandbox uses Docker containers (Appendix C) but no reproducible environment files for running the full pipeline or evaluation are specified." 43 }, 44 "reproduction_instructions": { 45 "applies": true, 46 "answer": false, 47 "justification": "No step-by-step reproduction instructions are provided. The paper describes the workflow conceptually but does not include a README or specific commands for reproducing the benchmark construction or evaluation results." 
48 } 49 }, 50 "statistical_methodology": { 51 "confidence_intervals_or_error_bars": { 52 "applies": true, 53 "answer": false, 54 "justification": "All results in Tables 4, 5, and 6 are reported as point estimates (Pass@1 percentages) with no confidence intervals or error bars." 55 }, 56 "significance_tests": { 57 "applies": true, 58 "answer": false, 59 "justification": "The paper makes multiple comparative claims (e.g., 'Claude Opus 4 Shows State-of-the-art Performance') but provides no statistical significance tests. Rankings are based solely on comparing raw numbers." 60 }, 61 "effect_sizes_reported": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper reports performance differences with baseline context. For example, multi-turn refinement: 'DeepSeek-V3-0324 achieves remarkable improvement from 48.1% to 59.7%' (Section 3.6), and multi-logic performance drops are quantified (e.g., '-3.4' for Claude Opus 4 in Figure 4)." 65 }, 66 "sample_size_justified": { 67 "applies": true, 68 "answer": false, 69 "justification": "The benchmark size of 3,920 problems (196 per language) is not justified through any formal analysis. The manual verification uses 6 annotators across 6 languages, but no justification is given for these sample sizes." 70 }, 71 "variance_reported": { 72 "applies": true, 73 "answer": false, 74 "justification": "No variance, standard deviation, or spread measures are reported. All evaluations appear to be single-run results (greedy decoding or single API calls). The only multi-sample experiment is the difficulty filtering (10 samples per problem) but no variance is reported for the final evaluation." 75 } 76 }, 77 "evaluation_design": { 78 "baselines_included": { 79 "applies": true, 80 "answer": true, 81 "justification": "The paper compares AutoCodeBench against existing benchmarks (HumanEval, MBPP, BigCodeBench, LiveCodeBench, FullStackBench, McEval) in Table 1, and evaluates 30+ models as baselines against each other."
82 }, 83 "baselines_contemporary": { 84 "applies": true, 85 "answer": true, 86 "justification": "The evaluated models include the most recent proprietary and open-source models as of mid-2025: Claude Opus 4 (20250514), o3-high (20250416), Grok-4, Gemini 2.5 Pro, DeepSeek-R1-0528, etc." 87 }, 88 "ablation_study": { 89 "applies": true, 90 "answer": true, 91 "justification": "Table 7 shows the effect of each filtering stage on model performance (initial stage, after simple problem filtering, after critic filtering), effectively ablating the pipeline stages. Sections 3.4-3.6 also analyze individual benchmark features (multi-logic, scaling, multi-turn refinement)." 92 }, 93 "multiple_metrics": { 94 "applies": true, 95 "answer": false, 96 "justification": "The paper uses Pass@1 as its sole headline evaluation metric. While Pass@K is briefly shown in Figure 5 (right), the main evaluation across all tables uses only Pass@1. No complementary metrics (e.g., partial credit, code quality, time-to-solution) are reported." 97 }, 98 "human_evaluation": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 4.1 describes manual verification by 6 professional annotators who assessed the quality of benchmark problems across 6 programming languages, finding 87.6% accuracy. This validates the automated benchmark construction process." 102 }, 103 "held_out_test_set": { 104 "applies": false, 105 "answer": false, 106 "justification": "This is a benchmark paper, not a model training paper. The benchmark itself serves as a test set for evaluating models, so the concept of a held-out test set does not apply in the traditional sense." 107 }, 108 "per_category_breakdown": { 109 "applies": true, 110 "answer": true, 111 "justification": "Extensive per-language breakdowns are provided in Tables 4, 5, and 6 (20 languages). Per-category analysis includes multi-logic vs. full dataset (Figure 4), popular vs. low-resource languages (Figure 3), and difficulty distributions."
112 }, 113 "failure_cases_discussed": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 4.1 and Appendix B discuss failure cases in the benchmark itself (12.4% inaccurate problems). The most common issue is 'incomplete problem descriptions' where test functions reference entities not in the problem statement. Section 3.4 discusses model failures on multi-logic problems." 117 }, 118 "negative_results_reported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper reports that the benchmark has a 12.4% error rate (Section 4.1), acknowledges model bias in the generation process (Section 4.2), and discusses performance drops on multi-logic problems (Figure 4). The overall finding that even top models score below 53% on ACB is itself a negative result." 122 } 123 }, 124 "claims_and_evidence": { 125 "abstract_claims_supported": { 126 "applies": true, 127 "answer": true, 128 "justification": "The abstract claims that AutoCodeBench comprises 3,920 problems across 20 languages (supported by Table 2), that even advanced LLMs struggle (supported by Tables 4-5 showing top scores below 53%), and that the method is fully automated (supported by workflow description in Section 2.2). All abstract claims are backed by results." 129 }, 130 "causal_claims_justified": { 131 "applies": true, 132 "answer": true, 133 "justification": "Causal claims are modest. The multi-turn refinement experiment (Section 3.6) uses controlled ablation (Turn 1 vs. 2 vs. 3). The pipeline ablation in Table 7 shows effects of each stage. The claim 'reasoning mode helps' (Section 3.2) compares the same models with and without reasoning, which is an adequate controlled comparison." 134 }, 135 "generalization_bounded": { 136 "applies": true, 137 "answer": false, 138 "justification": "The title claims LLMs are 'Automatic Code Benchmark Generators' generically, but the method is only demonstrated with DeepSeek-V3-0324 as the generator model. 
The paper does not bound claims to the specific models used for generation. The abstract says models 'struggle with the complexity, diversity, and multilingual nature of these tasks' without bounding to the specific 20 languages tested." 139 }, 140 "alternative_explanations_discussed": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 4.2 discusses model bias as an alternative explanation for performance differences, noting that using DeepSeek models in the pipeline may favor DeepSeek family models. They analyze this quantitatively in Table 7 and conclude the bias impact is 'minimal' but acknowledge it exists." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Specific model versions are provided throughout: 'Claude Opus 4 (20250514)', 'o3-high (20250416)', 'o4-mini (2025-04-16)', 'GPT4.1 (2025-04-14)', 'GPT4o (2024-11-20)', 'DeepSeek-V3-0324', 'DeepSeek-R1-0528', etc. (Tables 4-5, Section 3.1)." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Full prompts are provided: the system prompt for evaluation (Section 3.1), code solution generation (Figure 12), test function generation (Figure 13), programming problem generation (Figure 14), LLM-as-Critic (Figure 10), and language translation (Figure 15). The evaluation system prompt is quoted verbatim." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 3.1 states: 'for proprietary LLMs... we directly call their APIs without any additional parameters', 'remaining models use greedy decoding with temperature set to 0', and specific models use 'officially recommended parameters'. While not exhaustive, the inference parameters are described." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. 
Models are evaluated on direct code generation from a problem prompt, with no tool use, memory, or iterative agent loops (except the multi-turn refinement experiment in Section 3.6, which is a simple feedback loop rather than agentic scaffolding)." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 2.2 documents the full pipeline: code snippet extraction from Stack-Edu, solution generation, test function generation via sandbox, problem generation, and three-stage filtering (difficulty, quality, diversity). Section 2.2.5 describes the language translation process. Section 2.2.6 describes Lite construction." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "There is no dedicated limitations or threats-to-validity section. Section 4.2 discusses model bias but is framed as 'Hypotheses on Model Bias' rather than a limitations section. The paper lacks a systematic discussion of limitations." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No specific threats to validity are discussed. Section 4.2 on model bias is the closest, but it does not address other threats such as the 12.4% error rate's impact on rankings, the reliance on a single generator model, or the approximate translation approach for 14 languages." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound the generalizability of AutoCodeGen to other generator models, does not discuss what types of problems are excluded by the filtering, and does not explicitly state that the 20 languages may not represent all programming paradigms." 
188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "The benchmark data (problems, test cases, solutions) is described as publicly released through the project homepage. The raw benchmark data can be downloaded for independent verification." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 2.2 provides detailed description of data collection: seed code extracted from Stack-Edu (Section 2.2.1), processed through solution generation, test function generation via sandbox execution, problem generation, and three-stage filtering. The pipeline is well-documented." 200 }, 201 "recruitment_methods_described": { 202 "applies": true, 203 "answer": true, 204 "justification": "For the manual verification (Section 4.1), 6 professional annotators are mentioned, though their recruitment is described only as being from Tencent (Acknowledgements). For the benchmark data, the source (Stack-Edu from The Stack v2, real GitHub repositories) is clearly documented." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The full pipeline is documented in Section 2.2 with clear stages: seed extraction -> solution generation -> test function generation (input generation, output generation via sandbox, integration) -> problem generation -> three-stage filtering (difficulty, quality, diversity). Table 7 shows filtering effects." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding sources are disclosed. The paper lists Tencent affiliations but does not include a funding disclosure or acknowledgments section mentioning grants or sponsors." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "All authors are listed as 'Hunyuan Team, Tencent' on the first page. 
The affiliation is clearly stated." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "All authors are from Tencent, which produces the Hunyuan models evaluated in the benchmark. Tencent has a financial interest in the benchmark's adoption and in Hunyuan models' performance. The funder (Tencent, as the employer) is not independent of the outcome." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests statement or financial disclosure is provided. The authors are employees of Tencent, which produces Hunyuan models evaluated in the paper, but this potential conflict is not explicitly declared." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No training data cutoff dates are stated for any of the 30+ evaluated models. Since the benchmark problems are synthetically generated from existing code, the risk of contamination from the source data (Stack-Edu) appearing in model training data is not addressed." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of potential train/test overlap. The benchmark is generated from Stack-Edu (sourced from The Stack v2, which comes from GitHub). Models trained on GitHub code may have seen the seed code snippets. This contamination vector is not discussed." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "While the paper claims the benchmark is 'human-free' and novel, it does not address whether the seed code from Stack-Edu or the generated problems could overlap with model training data. The paper does not discuss contamination risk at all, which is a significant omission for a benchmark paper." 
249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "The manual verification (Section 4.1) involves annotators checking benchmark quality, not a human subjects study. This is quality assurance, not a human experiment." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "The annotators are professional employees performing quality assessment of code problems, not research participants in a human subjects study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human subjects study. The 6 annotators are described as 'professional annotators' but this is quality assurance work, not a study requiring demographic reporting." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human subjects study. Annotators are internal Tencent employees performing QA, not recruited participants." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human subjects experiment with conditions requiring randomization." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human subjects experiment requiring blinding." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human subjects study with participant attrition." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No inference costs are reported for any of the 30+ model evaluations. The paper does not report API costs, tokens consumed, or evaluation time despite extensive use of both proprietary APIs and locally deployed models." 
293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No computational budget is stated. The paper deployed models using VLLM but does not report GPU hours, hardware specifications, total API spend, or the computational cost of the benchmark construction pipeline (which involves multiple LLM calls per data point)." 298 } 299 } 300 }, 301 "claims": [ 302 { 303 "claim": "AutoCodeBench is a large-scale code generation benchmark with 3,920 problems evenly distributed across 20 programming languages, featuring high difficulty.", 304 "evidence": "Table 2 shows 3,920 problems with 37,777 test cases across 20 languages. Over 60% classified as hard (2,428/3,920). Table 1 compares with existing benchmarks showing AutoCodeBench has the highest difficulty rating.", 305 "supported": "strong" 306 }, 307 { 308 "claim": "Even the most advanced LLMs struggle with AutoCodeBench, with no model achieving above 53% average Pass@1.", 309 "evidence": "Table 4 shows the top model (Claude Opus 4 Think) achieves 52.4% average Pass@1 on ACB. All other models score lower. Table 5 shows top performance of 64.5% on ACB-Lite.", 310 "supported": "strong" 311 }, 312 { 313 "claim": "Claude Opus 4 shows state-of-the-art performance across both reasoning and non-reasoning modes.", 314 "evidence": "Tables 4 and 5 show Claude Opus 4 (Think) ranks first on both ACB (52.4%) and ACB-Lite (64.5%). In non-reasoning mode, Claude Opus 4 achieves 50.9% on ACB, also first among non-reasoning models (Section 3.2).", 315 "supported": "strong" 316 }, 317 { 318 "claim": "AutoCodeBench achieves 87.6% accuracy rate based on manual verification by professional annotators.", 319 "evidence": "Section 4.1 describes verification by 6 annotators across 6 languages (Python, C++, Java, JavaScript, Go, Shell). 
Table 8 shows accuracy rates ranging from 83.5% (Python) to 93.3% (Shell), averaging 87.6%.", 320 "supported": "moderate" 321 }, 322 { 323 "claim": "Multi-turn refinement with sandbox feedback substantially improves model performance.", 324 "evidence": "Figure 6 shows DeepSeek-V3-0324 improving from 48.1% to 59.7% (+11.6pp) over 3 turns. Qwen2.5-Coder-32B-Instruct improves from 35.8% to 47.4% (+11.6pp). Only 3 models tested (Section 3.6).", 325 "supported": "moderate" 326 }, 327 { 328 "claim": "The automated workflow may introduce favorable bias toward DeepSeek family models, but the impact is minimal.", 329 "evidence": "Section 4.2 and Table 7 analyze bias at different pipeline stages. The Critic process benefits DeepSeek-R1-0528 (+7.5) but also provides greater improvements to reasoning models like o3 (+6.8) and Gemini 2.5 Pro (+7.0) than DeepSeek-V3-0324 (+5.9).", 330 "supported": "weak" 331 }, 332 { 333 "claim": "Models show significantly worse performance on low-resource programming languages compared to popular ones.", 334 "evidence": "Figure 3 shows that the performance gap between models widens on low-resource languages (Racket, Shell, Elixir, TS), where scores range from 45.3 to 62.0, versus popular languages (Python, C++, Java, C#), where scores range from 50.4 to 53.8 (Section 3.3).", 335 "supported": "strong" 336 } 337 ], 338 "methodology_tags": [ 339 "benchmark-eval" 340 ], 341 "key_findings": "AutoCodeBench is a fully automated, large-scale code generation benchmark with 3,920 problems across 20 programming languages, generated through an LLM-sandbox interaction pipeline without human annotation. Even the most advanced LLMs (Claude Opus 4 achieving 52.4% Pass@1) struggle with the benchmark's complexity and multilingual diversity. Performance gaps between models widen significantly on low-resource programming languages, and all models show measurable degradation on multi-logic problems requiring implementation of multiple functions.
Manual verification by 6 annotators found 87.6% of generated problems are accurate, but this still implies ~12% noise in the benchmark.", 342 "red_flags": [ 343 { 344 "flag": "Conflict of interest: Tencent evaluating own models", 345 "detail": "All authors are from Tencent (Hunyuan Team). Tencent's Hunyuan models are among the evaluated models. While Hunyuan models don't top the leaderboard, the benchmark design choices (e.g., which languages to include, difficulty calibration) could subtly favor or disadvantage specific model families. This conflict is not disclosed." 346 }, 347 { 348 "flag": "No contamination analysis", 349 "detail": "The benchmark seeds come from Stack-Edu (derived from GitHub via The Stack v2). Many evaluated models were trained on GitHub code. The paper does not discuss whether seed code or generated problems could overlap with model training data, which is a critical omission for a benchmark paper." 350 }, 351 { 352 "flag": "Generator model bias insufficiently addressed", 353 "detail": "DeepSeek-V3-0324 was used to generate all code solutions and problems. The paper acknowledges potential bias (Section 4.2) but the analysis is limited to 6 languages and the conclusion that bias is 'minimal' is not well-supported. The analysis shows DeepSeek-R1-0528 benefits most from the Critic stage (+7.5 vs +5.9 for DeepSeek-V3-0324)." 354 }, 355 { 356 "flag": "12.4% benchmark error rate", 357 "detail": "Manual verification found only 87.6% accuracy (Section 4.1). For a benchmark used to rank models, a ~12% noise rate could affect ranking reliability, especially when performance differences between top models are small (e.g., 52.4% vs 51.1% between Claude Opus 4 and o3-high). The paper does not analyze how noise affects rankings." 358 }, 359 { 360 "flag": "No uncertainty quantification", 361 "detail": "All 30+ models are ranked based on single-run Pass@1 scores with no error bars, confidence intervals, or repeated evaluations. 
Small performance differences (e.g., 1-2 percentage points) are used to establish rankings without statistical testing." 362 }, 363 { 364 "flag": "Single evaluation metric", 365 "detail": "Only Pass@1 is used as the primary evaluation metric. No complementary metrics (e.g., partial credit, code quality, time-to-solution) are reported, limiting the evaluation's comprehensiveness." 366 } 367 ], 368 "cited_papers": [ 369 { 370 "title": "Evaluating Large Language Models Trained on Code", 371 "authors": ["Mark Chen", "Jerry Tworek"], 372 "year": 2021, 373 "arxiv_id": "2107.03374", 374 "relevance": "Introduced HumanEval, the foundational code generation benchmark that AutoCodeBench aims to supersede." 375 }, 376 { 377 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 378 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 379 "year": 2024, 380 "arxiv_id": "2310.06770", 381 "relevance": "Major benchmark for evaluating LLM code generation on real-world software engineering tasks, referenced for comparison." 382 }, 383 { 384 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 385 "authors": ["Naman Jain", "King Han", "Alex Gu"], 386 "year": 2025, 387 "relevance": "Contemporary benchmark addressing data contamination through continuous sourcing from competitive programming platforms." 388 }, 389 { 390 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 391 "authors": ["Terry Yue Zhuo"], 392 "year": 2025, 393 "relevance": "Large-scale code generation benchmark evaluated alongside AutoCodeBench in the comparison table." 
394 }, 395 { 396 "title": "Multi-SWE-bench: A Multilingual Benchmark for Issue Resolving", 397 "authors": ["Daoguang Zan", "Zhirong Huang"], 398 "year": 2025, 399 "arxiv_id": "2504.02605", 400 "relevance": "Multilingual extension of SWE-bench referenced for evaluating code agents across multiple programming languages." 401 }, 402 { 403 "title": "McEval: Massively Multilingual Code Evaluation", 404 "authors": ["Linzheng Chai", "Shukai Liu"], 405 "year": 2025, 406 "relevance": "Massively multilingual benchmark covering 40 languages, directly compared with AutoCodeBench in Table 1 and Figures 8-9." 407 }, 408 { 409 "title": "KodCode: A Diverse, Challenging, and Verifiable Synthetic Dataset for Coding", 410 "authors": ["Zhangchen Xu", "Yang Liu"], 411 "year": 2025, 412 "arxiv_id": "2503.02951", 413 "relevance": "Synthetic code benchmark using direct test case generation, contrasted with AutoCodeBench's sandbox-based approach." 414 }, 415 { 416 "title": "LLM Evaluators Recognize and Favor Their Own Generations", 417 "authors": ["Arjun Panickssery", "Samuel R. Bowman", "Shi Feng"], 418 "year": 2024, 419 "relevance": "Referenced regarding model self-preference bias, directly relevant to AutoCodeBench's use of DeepSeek models in the generation pipeline." 420 }, 421 { 422 "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence", 423 "authors": ["DeepSeek-AI", "Qihao Zhu"], 424 "year": 2024, 425 "arxiv_id": "2406.11931", 426 "relevance": "Key model used in AutoCodeBench's construction pipeline for difficulty filtering and as an evaluated model." 427 }, 428 { 429 "title": "Qwen2.5-Coder Technical Report", 430 "authors": ["Binyuan Hui", "Jian Yang"], 431 "year": 2024, 432 "arxiv_id": "2409.12186", 433 "relevance": "Code-specialized LLM series extensively evaluated on AutoCodeBench across multiple model sizes."
434 }, 435 { 436 "title": "FullStack Bench: Evaluating LLMs as Full Stack Coders", 437 "authors": ["Bytedance"], 438 "year": 2025, 439 "arxiv_id": "2412.00535", 440 "relevance": "Practical multilingual benchmark directly compared with AutoCodeBench, covering 16 languages in real-world scenarios." 441 }, 442 { 443 "title": "Seed-Coder: Let the Code Model Curate Data for Itself", 444 "authors": ["ByteDance Seed", "Yuyu Zhang"], 445 "year": 2025, 446 "arxiv_id": "2506.03524", 447 "relevance": "Self-curating code model evaluated on AutoCodeBench, representing the data synthesis approach to code LLM training." 448 } 449 ] 450 }