scan.json (26229B)
1 { 2 "paper": { 3 "title": "CodeContests+: High-Quality Test Case Generation for Competitive Programming", 4 "authors": ["Zihan Wang", "Siyao Liu", "Yang Sun", "Hongyan Li", "Kai Shen"], 5 "year": 2025, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2506.05817", 8 "doi": "10.48550/arXiv.2506.05817" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "The paper open-sources SandboxFusion (https://github.com/bytedance/SandboxFusion) and publishes the dataset on HuggingFace (https://huggingface.co/datasets/ByteDance-Seed/Code-Contests-Plus). Both URLs are provided in the paper." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The dataset CodeContests+ is released on HuggingFace at https://huggingface.co/datasets/ByteDance-Seed/Code-Contests-Plus, including pre-processed test cases in 1x-5x versions, generators, validators, and checkers." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions using C++ with testlib and SandboxFusion supporting 20+ languages but does not specify library versions or environment details needed for reproduction." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions are provided. While the dataset and SandboxFusion tool are released, the paper does not include a README with commands to replicate the main experiments (e.g., RL training, large-scale evaluation)." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "No confidence intervals or error bars are reported. Table 3 reports Pass@1 on LiveCodeBench as point estimates (e.g., 0.622 vs 0.637) without any uncertainty quantification." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "No statistical significance tests are used. The paper claims CodeContests+ achieves 'consistent gains' and 'significantly higher accuracy' over CodeContests based solely on comparing raw numbers in Table 3 and Figure 3 without any formal tests." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "Effect sizes are contextualized. For example, the paper reports that CodeContests+ 5x yields 'almost twice' the number of qualified problems compared to CodeContests, and Table 3 shows absolute Pass@1 values with baselines (e.g., 0.622 vs 0.637), providing magnitude context." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "No justification is given for the sample sizes used. The evaluation uses 100 positive and 100 negative samples per problem (1.72M total submissions), but no justification is provided for why 100 per class is sufficient for TPR/TNR estimation." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "No variance or standard deviation is reported across runs. The RL training results in Table 3 appear to be from a single run. The paper uses 'avg@15' (average of pass@1 from 15 independent responses) as a metric, but does not report variance across training runs or seeds." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper compares CodeContests+ against the original CodeContests dataset throughout Section 4.4 and 4.5, using identical problem sets. Table 1 also compares against MBPP, HumanEval, USACO, LiveCodeBench, APPS, and TACO." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "CodeContests (2022) is the direct predecessor and the most relevant baseline. 
LiveCodeBench (2024) is used as the evaluation benchmark for RL training. These are contemporary and appropriate baselines for the task." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper ablates test case quantity via the 1x-5x versions (Section 4.4, Figure 3c), showing that increasing test case count improves evaluation accuracy. The comparison of CodeContests+ vs CodeContests+HQ (TPR&TNR >= 0.9 filtered) in the RL experiment also serves as an ablation on test case quality filtering." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "Multiple metrics are used: True Positive Rate (TPR), True Negative Rate (TNR) for test case quality evaluation, and Pass@1 across Easy/Medium/Hard difficulty levels for RL training evaluation." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": false, 84 "justification": "No human evaluation of the generated test cases, validators, or checkers is reported. The paper relies entirely on automated metrics (TPR/TNR via submission labels, Pass@1 on LiveCodeBench). Given claims about test case 'quality' and 'correctness,' human expert review of a sample would strengthen the evidence." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "LiveCodeBench (Aug 2024 - Feb 2025) serves as a held-out evaluation benchmark for the RL training experiment. The test case quality verification uses independently labeled submissions from CodeForces." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Table 3 provides per-difficulty breakdowns (Easy, Medium, Hard) for LiveCodeBench results. Figure 3 shows TPR/TNR distributions across all problems. The 1x-5x versions provide a breakdown by test case quantity." 
95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 3.2 discusses validator limitations including two error categories. Section 4.4 notes that only 67.1% of CodeContests test cases pass validation. Case studies in Appendix D.3 show specific failure examples. The paper acknowledges lacking 'an automatic supervision method' for the second type of validator error." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports that the validator agent still has unresolved error modes (Section 3.2: 'we still lack an automatic supervision method to address the second type of error'), and that 1,920 problems were removed during data cleaning (Section 4.1), acknowledging limitations of the approach." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims that CodeContests+ achieves 'significantly higher accuracy than CodeContests, particularly with a notably higher True Positive Rate' and that 'improvements in test case quality yield considerable advantages for RL.' Both are supported by Figure 3 and Table 3 respectively." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The main causal claim is that higher-quality test cases improve RL training outcomes. The controlled experiment in Section 4.5 uses the same model (Qwen2.5-32B), same training procedure (DAPO), and same problem set, changing only the test cases. This single-variable manipulation provides adequate causal evidence for the claim, though only one training run is shown." 
117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The paper's title and conclusions speak broadly about 'competitive programming' and 'further enhancing the reasoning and coding capabilities of LLMs,' but results are only on one dataset (CodeContests/CodeForces), one model (Qwen2.5-32B), and one RL algorithm (DAPO). No explicit scoping of generalization boundaries is provided." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "No alternative explanations are discussed for the observed improvements. For example, the RL improvement could be partly due to the data cleaning (removal of 1,920 problems) rather than test case quality alone. The paper does not consider confounds between data cleaning and test case improvement." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper does not specify which LLM is used for the Generator, Validator, or Checker agents. Section 3 describes the agent system architecture but never names the model or version powering these agents. The RL training uses 'Qwen2.5-32B' which is a model name but not a specific version/checkpoint." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": false, 138 "justification": "The prompts used for the Generator, Validator, and Checker agents are not provided. The paper describes the agent workflows in natural language (e.g., 'the Generator Agent is given a problem statement. It is then instructed to read the statement carefully') but does not provide the actual prompt text." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 4.5 reports key hyperparameters: clipping ratios epsilon_low=0.2 and epsilon_high=0.28 for DAPO, the use of avg@15 metric, and the LiveCodeBench time window (Aug 2024 - Feb 2025). 
However, LLM agent temperature/sampling settings are not reported." 144 }, 145 "scaffolding_described": { 146 "applies": true, 147 "answer": true, 148 "justification": "The Generator-Validator agent system is described in detail in Section 3. The Generator agent workflow (problem analysis, constraint identification, corner case design, program generation), the Validator supervision loop (feedback on validation failures, compilation errors, timeouts), and the Checker agent workflow are all described. Figure 2 provides a visual overview." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 4.1 documents the data cleaning procedure with specific filtering criteria: problems without statements, interactive problems, problems without correct submissions, file I/O problems, special problems, image-dependent problems, crawling errors, and low-quality problems. The count is reduced from 13,610 to 11,690." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "No dedicated limitations or threats-to-validity section is present. The paper has a 'Conclusion and Future Work' section (Section 5) that mentions future directions but does not substantively discuss limitations of the current work." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No specific threats to validity are discussed. While Section 3.2 acknowledges the validator cannot catch all errors, there is no discussion of broader threats such as the generalizability of results beyond CodeForces problems, the single RL run, or potential biases in the submission labels used for evaluation." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "No explicit scope boundaries are stated. 
The paper does not clarify what types of competitive programming problems the approach may not work for, or which LLMs/RL algorithms the benefits might not transfer to." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "The dataset is released on HuggingFace with generators, validators, checkers, and pre-processed test cases. TPR/TNR values per problem are provided. The original CodeContests data is also publicly available." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 4 describes the data collection process: starting from CodeContests (which collects problems from CodeForces), applying data cleaning (Section 4.1), generating test cases via the G-V agent system (Section 4.2), and sampling 100 positive/100 negative submissions per problem for evaluation (Section 4.4)." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants are involved. The data comes from publicly available competitive programming problems and automated contestant submissions on CodeForces." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The pipeline from raw CodeContests data to CodeContests+ is documented: cleaning (13,610 to 11,690 problems), G-V agent system generation, validation (100% pass rate vs 67.1% for original), and filtering for high-quality subset (TPR&TNR >= 0.9 for CodeContests+HQ). Counts at each stage are provided." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding statement is present. The Acknowledgements section thanks ByteDance colleagues but does not disclose specific funding sources or grants." 
200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly stated: ByteDance Seed and Peking University. The first author has dual affiliation." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "Three of five authors are affiliated with ByteDance Seed, and the dataset is released under the ByteDance brand. ByteDance has a commercial interest in LLM training data and infrastructure. The funder is not independent of the outcome." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests statement is present. ByteDance employees evaluating and releasing a ByteDance-branded dataset represents a potential conflict that is not explicitly acknowledged." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The training cutoff for Qwen2.5-32B (used for RL) is not stated. The LLM used for the Generator/Validator/Checker agents is not even named, let alone its training cutoff. LiveCodeBench is used with problems from Aug 2024-Feb 2025, but whether the base model was trained before this window is not explicitly stated." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "No discussion of whether Qwen2.5-32B or the agent LLM may have seen CodeForces problems or LiveCodeBench problems during pre-training. Since CodeForces problems are publicly available and widely used in training corpora, this is a relevant concern." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "LiveCodeBench is designed to mitigate contamination via temporal windowing, and the paper uses problems from Aug 2024-Feb 2025, but this contamination mitigation is not discussed by the authors. 
No explicit mention of contamination risk for any benchmark used." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants involved in the study." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants involved in the study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants involved in the study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants involved in the study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants involved in the study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants involved in the study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants involved in the study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "No inference costs are reported for the LLM agent system. The paper mentions a cluster with 25,000 CPU cores and 70 TB memory for evaluation (Section 4.4), but does not report the cost of running the Generator-Validator agents or the total API/compute spend." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": true, 280 "justification": "Section 4.4 describes the evaluation infrastructure: 'running on a cluster with 25,000 CPU cores and 70 TB memory' for 300+ million program executions. Appendix C provides further details about the cloud architecture (8,000 cores for judging pods, 17,000 for execution pods)." 
281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "CodeContests+ achieves significantly higher accuracy than CodeContests in classifying correct/incorrect solutions, particularly with a notably higher True Positive Rate.", 287 "evidence": "Figure 3 shows that CodeContests has over 4000 problems with TPR <= 0.1, while CodeContests+ 5x has far fewer such problems. Figure 3c shows CodeContests+ 5x has almost twice the number of qualified problems (TPR&TNR >= threshold) compared to CodeContests at various thresholds.", 288 "supported": "strong" 289 }, 290 { 291 "claim": "Only 67.1% of test cases in the original CodeContests dataset pass validation.", 292 "evidence": "Section 4.2: 'We used validators to check the correctness of all 1.18 million generated test cases in CodeContests. Only 0.79 million of these passed validation, which is 67.1%.'", 293 "supported": "strong" 294 }, 295 { 296 "claim": "Even with one-quarter of the test cases, CodeContests+ 1x has significantly higher evaluation accuracy and yields over 80% more qualified problems than CodeContests.", 297 "evidence": "Section 4.4 and Figure 3c show that CodeContests+ 1x (avg 25 tests per problem) outperforms CodeContests (avg 101 tests per problem) in number of qualified problems at various thresholds.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "Higher-quality test cases yield consistent improvements in RL training across all difficulty levels.", 302 "evidence": "Table 3 shows CodeContests+HQ improves over CodeContests: Easy 0.965 vs 0.958, Medium 0.812 vs 0.786, Hard 0.340 vs 0.329, All 0.637 vs 0.622 on LiveCodeBench Pass@1.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "The Generator-Validator agent system is the first LLM agent system designed for constructing high-quality test cases for competitive programming problems.", 307 "evidence": "Section 1, contribution 1. 
Prior work described in Section 2 covers manual, mutation-based, and direct LLM output methods, none of which use a generator-validator agent architecture.", 308 "supported": "moderate" 309 } 310 ], 311 "methodology_tags": ["benchmark-eval"], 312 "key_findings": "The paper introduces a Generator-Validator (G-V) LLM agent system that generates validated test cases for competitive programming problems, creating the CodeContests+ dataset. The system dramatically improves test case quality: 100% of generated test cases pass validation compared to only 67.1% in the original CodeContests. Large-scale evaluation with 1.72 million submissions shows CodeContests+ yields nearly twice the number of high-quality problems (TPR&TNR >= 0.9) compared to CodeContests. RL training with CodeContests+HQ on Qwen2.5-32B shows consistent improvements on LiveCodeBench, with overall Pass@1 increasing from 0.622 to 0.637.", 313 "red_flags": [ 314 { 315 "flag": "Company evaluating its own product", 316 "detail": "Three of five authors are from ByteDance Seed, and the released dataset carries the ByteDance brand. While the dataset improvements over CodeContests appear genuine (backed by 1.72M submission evaluation), no external validation or independent evaluation is reported." 317 }, 318 { 319 "flag": "Single RL training run", 320 "detail": "The RL training comparison (Table 3) appears to be from a single training run with no variance or confidence intervals reported. The improvements (e.g., 0.622 to 0.637 overall) are small enough that they could plausibly be within run-to-run variance." 321 }, 322 { 323 "flag": "No statistical significance testing", 324 "detail": "Claims of 'significantly higher accuracy' and 'consistent gains' are made based on raw number comparisons without any formal significance tests, especially concerning for the modest RL training improvements." 
325 }, 326 { 327 "flag": "Agent model not disclosed", 328 "detail": "The LLM used for the Generator, Validator, and Checker agents is never identified. This is a critical reproducibility gap since the quality of the generated test cases depends entirely on the agent model's capabilities." 329 }, 330 { 331 "flag": "Confound between data cleaning and test case quality", 332 "detail": "CodeContests+ uses 11,690 problems (after cleaning) while CodeContests has 13,610. Although the paper states it uses the same subset for comparison experiments, the RL training with CodeContests+HQ further filters problems by TPR/TNR thresholds, potentially confounding test case quality with problem selection." 333 } 334 ], 335 "cited_papers": [ 336 { 337 "title": "Program synthesis with large language models", 338 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"], 339 "year": 2021, 340 "arxiv_id": "2108.07732", 341 "relevance": "Introduces the MBPP benchmark for code generation, which CodeContests+ compares against in Table 1 for test case construction methodology." 342 }, 343 { 344 "title": "Evaluating large language models trained on code", 345 "authors": ["Mark Chen", "Jerry Tworek"], 346 "year": 2021, 347 "arxiv_id": "2107.03374", 348 "relevance": "Introduces HumanEval benchmark for code generation; foundational work for evaluating LLM coding capabilities." 349 }, 350 { 351 "title": "Competitive programming with large reasoning models", 352 "authors": ["Ahmed El-Kishky", "Alexander Wei"], 353 "year": 2025, 354 "arxiv_id": "2502.06807", 355 "relevance": "Demonstrates reasoning models on competitive programming tasks, directly relevant to evaluating coding capabilities of LLMs." 
356 }, 357 { 358 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 359 "authors": ["Daya Guo", "Dejian Yang"], 360 "year": 2025, 361 "arxiv_id": "2501.12948", 362 "relevance": "Major RL-based reasoning model trained on code data; CodeContests+ is designed to improve the data foundation for such training." 363 }, 364 { 365 "title": "Competition-level code generation with AlphaCode", 366 "authors": ["Yujia Li", "David Choi"], 367 "year": 2022, 368 "doi": "10.1126/science.abq1158", 369 "relevance": "Introduces the original CodeContests dataset which this paper improves upon; landmark work in competitive programming code generation." 370 }, 371 { 372 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 373 "authors": ["Jiawei Liu", "Chunqiu Steven Xia"], 374 "year": 2023, 375 "relevance": "Identifies high False Positive Rates in MBPP and HumanEval due to insufficient test cases; directly motivates the test case quality problem addressed by CodeContests+." 376 }, 377 { 378 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 379 "authors": ["Naman Jain", "King Han"], 380 "year": 2025, 381 "relevance": "Used as the held-out evaluation benchmark for the RL training experiments; designed to mitigate contamination in code evaluation." 382 }, 383 { 384 "title": "TACO: topics in algorithmic code generation dataset", 385 "authors": ["Rongao Li", "Jie Fu"], 386 "year": 2023, 387 "arxiv_id": "2312.14852", 388 "relevance": "Competitive programming dataset that uses LLMs to directly output test cases; compared against in Table 1 as a baseline approach." 
389 }, 390 { 391 "title": "Measuring coding challenge competence with APPS", 392 "authors": ["Dan Hendrycks", "Steven Basart"], 393 "year": 2021, 394 "relevance": "Large-scale coding challenge dataset compared in Table 1; relevant to benchmarking code generation capabilities." 395 }, 396 { 397 "title": "DAPO: An open-source LLM reinforcement learning system at scale", 398 "authors": ["Qiying Yu", "Zheng Zhang"], 399 "year": 2025, 400 "arxiv_id": "2503.14476", 401 "relevance": "The RL training algorithm used in the paper's experiments; an open-source RL system for training code LLMs." 402 }, 403 { 404 "title": "Qwen2.5 technical report", 405 "authors": ["An Yang", "Baosong Yang"], 406 "year": 2025, 407 "arxiv_id": "2412.15115", 408 "relevance": "The base model (Qwen2.5-32B) used for RL training experiments in the paper." 409 } 410 ] 411 }