scan.json (30966B)
1 { 2 "paper": { 3 "title": "OJBench: A Competition Level Code Benchmark For Large Language Models", 4 "authors": [ 5 "Zhexu Wang", 6 "Yiping Liu", 7 "Yejie Wang", 8 "Wenyang He", 9 "Bofei Gao", 10 "Muxi Diao", 11 "Yanxu Chen", 12 "Kelin Fu", 13 "Flood Sung", 14 "Zhilin Yang", 15 "Tianyu Liu", 16 "Weiran Xu" 17 ], 18 "year": 2025, 19 "venue": "arXiv preprint (under review)", 20 "arxiv_id": "2506.16395", 21 "doi": "10.48550/arXiv.2506.16395" 22 }, 23 "scan_version": 3, 24 "active_modules": ["experimental_rigor", "data_leakage"], 25 "methodology_tags": ["benchmark-eval"], 26 "key_findings": "OJBench is a 232-problem competitive programming benchmark sourced from NOI and ICPC that is substantially harder than LiveCodeBench. Evaluation of 37 models shows the best model (Gemini-2.5-pro-exp-03-25) achieves only 38.91% Pass@1 overall and 9.48% on hard problems. Reasoning-oriented models significantly outperform non-reasoning models, and CPP outperforms Python for most reasoning models on these competition-level tasks. Iterative refinement with execution feedback helps fix compile errors but struggles with time-limit-exceeded errors requiring more efficient algorithms.", 27 "checklist": { 28 "artifacts": { 29 "code_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "A GitHub URL (https://github.com/He-Ren/OJBench) is listed, but footnote 4 states 'We are finalizing our code and website. The article will be updated once everything is ready.' This is a promise of future release, not a currently available artifact." 33 }, 34 "data_released": { 35 "applies": true, 36 "answer": false, 37 "justification": "The benchmark dataset of 232 problems with test cases is described but not yet released. The paper states the code and website are being finalized." 
38 }, 39 "environment_specified": { 40 "applies": true, 41 "answer": false, 42 "justification": "Appendix E mentions 'two computing clusters equipped with 8 NVIDIA A100-80GB GPUs' and use of VLLM, but no requirements.txt, Dockerfile, or detailed dependency specifications are provided." 43 }, 44 "reproduction_instructions": { 45 "applies": true, 46 "answer": false, 47 "justification": "No step-by-step reproduction instructions are provided. The code and evaluation framework are stated to be in preparation." 48 } 49 }, 50 "statistical_methodology": { 51 "confidence_intervals_or_error_bars": { 52 "applies": true, 53 "answer": false, 54 "justification": "All results in Table 2 are point estimates (e.g., '33.30' Pass@1 for o4-mini). No confidence intervals or error bars are reported anywhere in the paper." 55 }, 56 "significance_tests": { 57 "applies": true, 58 "answer": false, 59 "justification": "The paper makes numerous comparative claims (e.g., 'reasoning-oriented models significantly outperformed non-reasoning-oriented models') based solely on comparing raw numbers in Table 2 without any statistical significance tests." 60 }, 61 "effect_sizes_reported": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper reports absolute improvements with baseline context, e.g., 'Qwen3-235B-A22B and DeepSeek-R1 exhibit a pass@8 improvement of 14.55 and 11.05 in python, respectively, compared to pass@1' (Section 3.2). Table 3 compares OJBench vs LiveCodeBench scores for the same models." 65 }, 66 "sample_size_justified": { 67 "applies": true, 68 "answer": false, 69 "justification": "The benchmark contains 232 problems (36 Easy, 79 Medium, 117 Hard) but no justification is given for why this number is sufficient for the claims being made." 
70 }, 71 "variance_reported": { 72 "applies": true, 73 "answer": false, 74 "justification": "Pass@1 is computed from 8 candidate solutions per problem, but no variance, standard deviation, or confidence intervals across runs are reported. All results are single point estimates." 75 } 76 }, 77 "evaluation_design": { 78 "baselines_included": { 79 "applies": true, 80 "answer": true, 81 "justification": "The paper evaluates 37 models spanning non-reasoning open-source (CodeLlama, DeepSeek-Coder, Qwen2.5-Coder), reasoning open-source (DeepSeek-R1, QWQ, Qwen3), and closed-source (GPT series, Claude, Gemini, o-series) categories, providing extensive cross-model comparisons." 82 }, 83 "baselines_contemporary": { 84 "applies": true, 85 "answer": true, 86 "justification": "The evaluation includes state-of-the-art models released in 2025 such as o4-mini, Gemini-2.5-pro-exp-03-25, Qwen3-235B-A22B, and DeepSeek-R1." 87 }, 88 "ablation_study": { 89 "applies": false, 90 "answer": false, 91 "justification": "OJBench is a benchmark dataset, not a multi-component system. There are no components to ablate." 92 }, 93 "multiple_metrics": { 94 "applies": true, 95 "answer": true, 96 "justification": "The paper reports Pass@1 and Pass@8 for both Python and CPP, plus per-difficulty-level breakdowns (Easy, Medium, Hard pass rates). Table 2 presents all these metrics." 97 }, 98 "human_evaluation": { 99 "applies": true, 100 "answer": false, 101 "justification": "All evaluation is automated via test case pass/fail. No human evaluation of model outputs is conducted. Manual verification was performed only on the GPT-4o translations of problem descriptions, not on model solutions." 102 }, 103 "held_out_test_set": { 104 "applies": true, 105 "answer": true, 106 "justification": "OJBench serves entirely as a test set. Models are evaluated as-is without any fine-tuning or development on the benchmark problems." 
107 }, 108 "per_category_breakdown": { 109 "applies": true, 110 "answer": true, 111 "justification": "Table 2 provides breakdowns by difficulty level (Easy, Medium, Hard). Results are also categorized by model type (reasoning vs non-reasoning, open vs closed-source) and programming language (Python vs CPP)." 112 }, 113 "failure_cases_discussed": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 4.3 analyzes error types (CE, TLE, WA, MLE, RTE) and their distribution. Section 4.4 and Figure 5 provide a detailed case analysis of Qwen3-235B-A22B's reasoning process on a failed problem, showing repetitive restatements and incorrect strategy selection." 117 }, 118 "negative_results_reported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper reports that models struggle to resolve TLE errors through refinement (Section 4.3), that almost all non-reasoning models score 0% on hard problems (Section 3.2), and that the Qwen3 distilled models perform worse in CPP than Python counter to the general trend (Section 4.2)." 122 } 123 }, 124 "claims_and_evidence": { 125 "abstract_claims_supported": { 126 "applies": true, 127 "answer": true, 128 "justification": "The abstract claims that 'even state-of-the-art reasoning-oriented models, such as o4-mini and Gemini-2.5-pro-exp, struggle with highly challenging competition-level problems.' Table 2 confirms this: the best model achieves only 38.91% Pass@1 overall and 9.48% on hard problems." 129 }, 130 "causal_claims_justified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper makes causal claims without adequate study design. Section 3.2 states 'reinforcement learning and distillation from powerful reasoning-oriented models hold significant potential for improving models' code reasoning capabilities,' attributing performance differences to training methodology without controlling for confounds like model size, training data volume, or compute budget." 
134 }, 135 "generalization_bounded": { 136 "applies": true, 137 "answer": true, 138 "justification": "The title specifies 'Competition Level' and the paper explicitly bounds scope in Appendix A: 'The problem types of OJBench are mainly focused on the field of algorithm competitions' and acknowledges insufficient coverage of IoT, blockchain, and AI-code application scenarios." 139 }, 140 "alternative_explanations_discussed": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper offers single explanations for findings without considering alternatives. For example, CPP's advantage is attributed to 'CPP is inherently a high-performance programming language' (Section 4.2) without considering training data composition, tokenization differences, or prompt formatting effects." 144 }, 145 "proxy_outcome_distinction": { 146 "applies": true, 147 "answer": false, 148 "justification": "The paper measures Pass@n on 232 NOI/ICPC algorithmic competition problems but broadly frames this as assessing 'code reasoning abilities' and 'competitive-level code reasoning abilities of LLMs' throughout. Competitive programming is a narrow subset of code reasoning; this proxy gap is not acknowledged." 149 } 150 }, 151 "setup_transparency": { 152 "model_versions_specified": { 153 "applies": true, 154 "answer": false, 155 "justification": "Some models have version dates (Claude-3.5-sonnet-20241022, GPT-4o-20241120, o1-20241217, Gemini-2.5-pro-exp-03-25) but several lack specific versions: 'GPT3.5-Turbo', 'GPT4-Turbo', and 'o4-mini' are listed without snapshot dates or API versions. Open-source models are specified by name and size." 156 }, 157 "prompts_provided": { 158 "applies": true, 159 "answer": false, 160 "justification": "No prompts or system instructions are shown anywhere in the paper or appendix. Section 3.1 describes the evaluation setup but does not include the actual prompt text given to models." 
161 }, 162 "hyperparameters_reported": { 163 "applies": true, 164 "answer": false, 165 "justification": "Section 3.1 states 'we assess the models using the hyper parameters officially recommended by each model, including temperature, top_k, and top_p' and 'max_tokens to 64k' for reasoning models, but the actual per-model hyperparameter values are not listed." 166 }, 167 "scaffolding_described": { 168 "applies": false, 169 "answer": false, 170 "justification": "The main evaluation involves direct code generation without agentic scaffolding. The refinement experiment (Section 4.3) uses simple error feedback but is not an agentic scaffold." 171 }, 172 "data_preprocessing_documented": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 2.1 and Figure 1 document the pipeline: collection from Luogu and ICPC official website, validation with contestant code submissions, filtering out special-judge problems, GPT-4o translation of NOI problems with manual verification. Table 1 provides final statistics." 176 } 177 }, 178 "limitations_and_scope": { 179 "limitations_section_present": { 180 "applies": true, 181 "answer": true, 182 "justification": "Appendix A contains a dedicated 'Limitations and Broader Impact' section with substantive discussion of two specific limitations: limited coverage and insufficient diversity." 183 }, 184 "threats_to_validity_specific": { 185 "applies": true, 186 "answer": true, 187 "justification": "Appendix A identifies specific threats: '(1) Limited coverage: The data of OJBench mainly comes from open-source programming competition platforms' and '(2) Insufficient diversity: The problem types of OJBench are mainly focused on the field of algorithm competitions' with specific examples of uncovered areas." 
188 }, 189 "scope_boundaries_stated": { 190 "applies": true, 191 "answer": true, 192 "justification": "Appendix A explicitly states 'our evaluation cannot cover the entire scope of programming competitions' and lists specific excluded scenarios: 'development combining artificial intelligence and code, Internet of Things programming, and blockchain smart contract development.'" 193 } 194 }, 195 "data_integrity": { 196 "raw_data_available": { 197 "applies": true, 198 "answer": false, 199 "justification": "The benchmark problems and test cases are not yet publicly available. The paper states the code and website are being finalized." 200 }, 201 "data_collection_described": { 202 "applies": true, 203 "answer": true, 204 "justification": "Section 2.1 describes data sources (Luogu platform for NOI, ICPC official website), what was collected (problem descriptions in Markdown, comprehensive test cases from competition organizers), and the filtering process (validation with contestant code, removal of special-judge problems)." 205 }, 206 "recruitment_methods_described": { 207 "applies": false, 208 "answer": false, 209 "justification": "No human participants. Data consists of programming competition problems sourced from well-known competition platforms (NOI, ICPC)." 210 }, 211 "data_pipeline_documented": { 212 "applies": true, 213 "answer": true, 214 "justification": "Figure 1 shows the full pipeline. Section 2.1 documents each step: collection from two sources, validation against contestant solutions, filtering special-judge problems, translation with GPT-4o, and manual verification. Final count of 232 problems is given in Table 1." 215 } 216 }, 217 "conflicts_of_interest": { 218 "funding_disclosed": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding or acknowledgments section is present in the paper. No grants, sponsors, or funding agencies are mentioned." 
222 }, 223 "affiliations_disclosed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Author affiliations are clearly listed: Beijing University of Posts and Telecommunications, Tsinghua University, University of Chinese Academy of Sciences, Peking University, and Moonshot AI." 227 }, 228 "funder_independent_of_outcome": { 229 "applies": true, 230 "answer": false, 231 "justification": "No funding is disclosed, so independence cannot be assessed. Some authors are affiliated with Moonshot AI (an AI company that develops LLMs), though Moonshot AI models are not directly evaluated in the benchmark." 232 }, 233 "financial_interests_declared": { 234 "applies": true, 235 "answer": false, 236 "justification": "No competing interests or financial interests statement is present in the paper." 237 } 238 }, 239 "contamination": { 240 "training_cutoff_stated": { 241 "applies": true, 242 "answer": false, 243 "justification": "No training data cutoff dates are stated for any of the 37 evaluated models. This is critical since NOI and ICPC problems are publicly available and could appear in training data." 244 }, 245 "train_test_overlap_discussed": { 246 "applies": true, 247 "answer": false, 248 "justification": "The paper mentions selecting NOI/ICPC 'to minimize overlap with existing benchmarks' (Section 2.1) but does not discuss whether these publicly available competition problems appear in model training data." 249 }, 250 "benchmark_contamination_addressed": { 251 "applies": true, 252 "answer": false, 253 "justification": "NOI and ICPC problems are publicly available on competition platforms (Luogu, ICPC official website). Models trained on internet-scale data could have seen these problems and their solutions. No contamination analysis or mitigation is performed." 254 } 255 }, 256 "human_studies": { 257 "pre_registered": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study. 
It is a benchmark evaluation of LLMs on programming competition problems." 261 }, 262 "irb_or_ethics_approval": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants. The ethical statement (Appendix B) addresses data collection ethics and crowd worker compensation but no human subjects were studied." 266 }, 267 "demographics_reported": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this benchmark evaluation study." 271 }, 272 "inclusion_exclusion_criteria": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this benchmark evaluation study." 276 }, 277 "randomization_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this benchmark evaluation study." 281 }, 282 "blinding_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this benchmark evaluation study." 286 }, 287 "attrition_reported": { 288 "applies": false, 289 "answer": false, 290 "justification": "No human participants in this benchmark evaluation study." 291 } 292 }, 293 "cost_and_practicality": { 294 "inference_cost_reported": { 295 "applies": true, 296 "answer": false, 297 "justification": "Appendix E mentions hardware ('8 NVIDIA A100-80GB GPUs') and time ('about two hours' per model) for open-source models under 72B parameters, but no API costs for closed-source models are reported." 298 }, 299 "compute_budget_stated": { 300 "applies": true, 301 "answer": false, 302 "justification": "Partial compute information is given in Appendix E for open-source models only. No total compute budget across all 37 models is stated, and closed-source API costs are not reported." 
303 } 304 }, 305 "experimental_rigor": { 306 "seed_sensitivity_reported": { 307 "applies": true, 308 "answer": false, 309 "justification": "Eight candidate solutions are generated per problem but no seed sensitivity analysis is performed. Results are not reported across multiple random seeds or independent runs." 310 }, 311 "number_of_runs_stated": { 312 "applies": true, 313 "answer": true, 314 "justification": "Section 3.1 states 'we use API or VLLM to generate eight candidate solutions for each problem' for computing Pass@n metrics." 315 }, 316 "hyperparameter_search_budget": { 317 "applies": true, 318 "answer": true, 319 "justification": "Section 3.1 states 'we assess the models using the hyper parameters officially recommended by each model,' which implicitly indicates no hyperparameter search was conducted — defaults were used for all models." 320 }, 321 "best_config_selection_justified": { 322 "applies": true, 323 "answer": true, 324 "justification": "The paper uses officially recommended hyperparameters for each model (Section 3.1), which is a principled and justified approach for a benchmark evaluation paper." 325 }, 326 "multiple_comparison_correction": { 327 "applies": false, 328 "answer": false, 329 "justification": "No statistical tests are performed in the paper, so correction for multiple comparisons is moot. The absence of tests is captured by the statistical_methodology category." 330 }, 331 "self_comparison_bias_addressed": { 332 "applies": false, 333 "answer": false, 334 "justification": "The authors are not evaluating their own model or system. They evaluate third-party models on their benchmark. Self-comparison bias is not applicable." 335 }, 336 "compute_budget_vs_performance": { 337 "applies": true, 338 "answer": false, 339 "justification": "Models with vastly different compute requirements (1.5B to 671B parameters, open-source vs API) are compared without any discussion of performance relative to compute budget." 
340 }, 341 "benchmark_construct_validity": { 342 "applies": true, 343 "answer": false, 344 "justification": "The paper claims to measure 'code reasoning abilities' but only tests algorithmic competition problems. No discussion of whether competitive programming success equates to general code reasoning capability. The paper compares difficulty against LiveCodeBench (Table 3) but does not question construct validity." 345 }, 346 "scaffold_confound_addressed": { 347 "applies": false, 348 "answer": false, 349 "justification": "The main evaluation involves direct code generation without scaffolding. Models generate solutions directly from problem descriptions." 350 } 351 }, 352 "data_leakage": { 353 "temporal_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "NOI and ICPC problems were created for past competitions and published on public platforms. Models trained on internet data could have seen these problems and their solutions. No temporal leakage analysis is performed." 357 }, 358 "feature_leakage_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether the problem descriptions or input-output examples provide information that could leak answer structure. The use of GPT-4o for translating NOI problems introduces a potential channel of familiarity for GPT-family models." 362 }, 363 "non_independence_addressed": { 364 "applies": true, 365 "answer": false, 366 "justification": "No discussion of whether model training data contains NOI/ICPC problem solutions, related competition problems, or editorial discussions that could create non-independence between train and test data." 367 }, 368 "leakage_detection_method": { 369 "applies": true, 370 "answer": false, 371 "justification": "No concrete leakage detection or prevention methods are used. No canary strings, membership inference tests, n-gram overlap analysis, or temporal splits are applied." 
372 } 373 } 374 }, 375 "claims": [ 376 { 377 "claim": "OJBench is substantially harder than LiveCodeBench for state-of-the-art models.", 378 "evidence": "Table 3 compares the same models on both benchmarks: o4-mini scores 63.7% on LCB vs 33.3% on OJBench; Gemini-2.5-pro scores 65.9% on LCB vs 38.91% on OJBench; Qwen3-235B scores 56.6% on LCB vs 25.97% on OJBench (Section 4.1).", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Reasoning-oriented models significantly outperform non-reasoning-oriented models on competitive coding tasks.", 383 "evidence": "Table 2 shows clear separation: best non-reasoning model (DeepSeek-V3-0324) achieves 25.54% Pass@1 in Python, while multiple reasoning models exceed 25% (DeepSeek-R1 at 26.02%, Qwen3-235B at 25.97%, o4-mini at 33.30%, Gemini-2.5-pro at 38.91%).", 384 "supported": "strong" 385 }, 386 { 387 "claim": "For most reasoning-oriented models, CPP outperforms Python on competition-level problems.", 388 "evidence": "Table 2 and Figure 3 show CPP advantages for o4-mini (46.12% vs 33.30% Pass@1), o1 (33.24% vs 26.45%), Gemini-2.5-pro (44.26% vs 38.91%). However, DeepSeek-R1-Distill-Qwen models show the opposite pattern (Section 4.2).", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "Models can improve pass rates through iterative refinement with execution feedback, but struggle with time-limit-exceeded errors.", 393 "evidence": "Figure 4 shows CE errors decrease significantly through refinement rounds, but TLE errors persist. The paper explains TLE requires designing more efficient algorithms, which current models struggle with (Section 4.3).", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "Even the most powerful reasoning-oriented models achieve only single-digit pass rates on hard competition problems.", 398 "evidence": "Table 2 shows Hard pass rates: Gemini-2.5-pro 9.48%, o3-mini 6.84%, o4-mini 5.77%, Qwen3-235B 4.27%, DeepSeek-R1 3.53%. 
All non-reasoning models score below 4%.", 399 "supported": "strong" 400 }, 401 { 402 "claim": "Using a limited number of test cases for evaluation produces false-positive results.", 403 "evidence": "Figure 2 shows performance of all models decreases with increasing number of test cases, demonstrating that partial test suites inflate accuracy numbers. The paper uses the full test suite (avg 31.81 tests per problem) to avoid this (Section 2.3).", 404 "supported": "strong" 405 } 406 ], 407 "red_flags": [ 408 { 409 "flag": "No contamination analysis", 410 "detail": "NOI and ICPC problems are publicly available on competition platforms (Luogu, ICPC official website) with solutions widely shared online. Models trained on internet-scale data could have memorized these problems. No contamination detection, temporal analysis, or mitigation is performed despite this being a benchmark evaluation paper." 411 }, 412 { 413 "flag": "No statistical significance testing", 414 "detail": "All comparative claims across 37 models are based on raw number comparisons without any statistical tests, confidence intervals, or uncertainty quantification. Claims like 'significantly outperformed' are not backed by statistical significance." 415 }, 416 { 417 "flag": "No variance or uncertainty reporting", 418 "detail": "Results are reported as single point estimates from 8 samples per problem. No standard deviations, confidence intervals, or measures of result stability across runs are provided." 419 }, 420 { 421 "flag": "GPT-4o used for problem translation", 422 "detail": "NOI problems were translated from Chinese to English using GPT-4o (Section 2.1). This could introduce a subtle bias favoring GPT-family models, since they may find GPT-generated English text more natural to parse. The paper does not discuss this potential confound." 423 }, 424 { 425 "flag": "Benchmark and code not yet available", 426 "detail": "Footnote 4 states 'We are finalizing our code and website.' 
The benchmark cannot be independently verified or reproduced until release." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Evaluating large language models trained on code", 432 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 433 "year": 2021, 434 "arxiv_id": "2107.03374", 435 "relevance": "Introduces HumanEval and the Pass@k metric used as the primary evaluation metric in this paper." 436 }, 437 { 438 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 439 "authors": ["Naman Jain", "King Han", "Alex Gu"], 440 "year": 2024, 441 "arxiv_id": "2403.07974", 442 "relevance": "Key comparison benchmark; paper argues OJBench is substantially harder than LiveCodeBench for evaluating code reasoning." 443 }, 444 { 445 "title": "CodeELO: Benchmarking competition-level code generation of LLMs with human-comparable Elo ratings", 446 "authors": ["Shanghaoran Quan", "Jiaxi Yang", "Bowen Yu"], 447 "year": 2025, 448 "arxiv_id": "2501.01257", 449 "relevance": "Closely related competition-level code benchmark using CodeForces and Elo ratings." 450 }, 451 { 452 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 453 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 454 "year": 2025, 455 "arxiv_id": "2501.12948", 456 "relevance": "Key reasoning-oriented model evaluated in the benchmark; demonstrates RL-based reasoning training for code." 457 }, 458 { 459 "title": "Competition-level code generation with AlphaCode", 460 "authors": ["Yujia Li", "David Choi", "Junyoung Chung"], 461 "year": 2022, 462 "relevance": "Foundational work on LLM-based competition-level code generation using large-scale sampling." 
463 }, 464 { 465 "title": "Can language models solve olympiad programming?", 466 "authors": ["Quan Shi", "Michael Tang", "Karthik Narasimhan", "Shunyu Yao"], 467 "year": 2024, 468 "arxiv_id": "2404.10952", 469 "relevance": "USACO benchmark for olympiad-level programming evaluation of LLMs." 470 }, 471 { 472 "title": "Probench: Benchmarking large language models in competitive programming", 473 "authors": ["Lei Yang", "Renren Jin", "Ling Shi"], 474 "year": 2025, 475 "arxiv_id": "2502.20868", 476 "relevance": "Closely related competitive programming benchmark; discusses false-positive risk from limited test cases." 477 }, 478 { 479 "title": "Qwen2.5-Coder technical report", 480 "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"], 481 "year": 2024, 482 "arxiv_id": "2409.12186", 483 "relevance": "Technical report for the Qwen2.5-Coder model family evaluated in the benchmark." 484 }, 485 { 486 "title": "Measuring coding challenge competence with APPS", 487 "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"], 488 "year": 2021, 489 "arxiv_id": "2105.09938", 490 "relevance": "Early code benchmark for evaluating LLMs on programming challenges of varying difficulty." 491 }, 492 { 493 "title": "DeepSeek-Coder-V2: Breaking the barrier of closed-source models in code intelligence", 494 "authors": ["Qihao Zhu", "Daya Guo", "Zhihong Shao"], 495 "year": 2024, 496 "arxiv_id": "2406.11931", 497 "relevance": "Code-specialized LLM evaluated in the benchmark; represents strong open-source code models." 498 }, 499 { 500 "title": "Program synthesis with large language models", 501 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 502 "year": 2021, 503 "arxiv_id": "2108.07732", 504 "relevance": "Introduces MBPP benchmark for program synthesis evaluation." 
505 }, 506 { 507 "title": "TACO: Topics in algorithmic code generation dataset", 508 "authors": ["Rongao Li", "Jie Fu", "Bo-Wen Zhang"], 509 "year": 2023, 510 "arxiv_id": "2312.14852", 511 "relevance": "Algorithmic code generation benchmark related to competition-level evaluation." 512 } 513 ], 514 "engagement_factors": { 515 "practical_relevance": { 516 "score": 1, 517 "justification": "Useful as a benchmark for researchers evaluating code reasoning models, but not directly applicable for practitioners." 518 }, 519 "surprise_contrarian": { 520 "score": 1, 521 "justification": "Confirms the expected finding that current models struggle on hard competition problems; the CPP vs Python finding adds some novelty." 522 }, 523 "fear_safety": { 524 "score": 0, 525 "justification": "No safety or risk implications; purely evaluates code generation capability on competitive programming." 526 }, 527 "drama_conflict": { 528 "score": 1, 529 "justification": "Provides a leaderboard ranking 37 models, which generates some competitive interest, but no controversial claims." 530 }, 531 "demo_ability": { 532 "score": 0, 533 "justification": "Code and website are explicitly stated to be 'being finalized' and not yet available for use." 534 }, 535 "brand_recognition": { 536 "score": 2, 537 "justification": "Evaluates high-profile models from OpenAI (o4-mini), Google (Gemini), Anthropic (Claude), and DeepSeek, though the authoring institutions are less prominent internationally." 538 } 539 } 540 }