scan.json (35047B)
1 { 2 "paper": { 3 "title": "HardTests: Synthesizing High-Quality Test Cases for LLM Coding", 4 "authors": [ 5 "Zhongmou He", 6 "Yee Man Choi", 7 "Kexun Zhang", 8 "Jiabao Ji", 9 "Junting Zhou", 10 "Dejia Xu", 11 "Ivan Bercovich", 12 "Aidan Zhang", 13 "Lei Li" 14 ], 15 "year": 2025, 16 "venue": "arXiv (Preprint, Under review)", 17 "arxiv_id": "2505.24098", 18 "doi": "10.48550/arXiv.2505.24098" 19 }, 20 "scan_version": 3, 21 "active_modules": ["experimental_rigor", "data_leakage"], 22 "methodology_tags": ["benchmark-eval"], 23 "key_findings": "HardTestGen, an LLM-based test synthesis pipeline using three input types (direct, regular, hacking) and oracle program verification, produces test cases with 11.3pp higher precision and 17.5pp higher recall than existing datasets (TACO, CodeContests) for evaluating LLM-generated code on competitive programming. The precision advantage reaches ~40pp on harder problems. Post-training experiments show test quality matters substantially for reinforcement learning and self-distillation (RL with HardTests improves pass@1 while TACO RL hurts it), but teacher-distillation benefits more from question scaling than test quality.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper states 'We will open-source our dataset and synthesis pipeline at https://leililab.github.io/HardTests/' — this is a promise of future release, not an actual released codebase. No repository URL is provided." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": false, 34 "justification": "Same as code: 'We will open-source our dataset' is a promise, not a current release. The 47k problem dataset is not yet publicly available." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. 
Appendix A.6 mentions 'Llama-factory' and 'veRL' for training and 'firejail' for sandboxing but does not provide a reproducible environment setup." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions are provided. The paper describes the pipeline conceptually but does not include scripts or commands to replicate experiments." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "Tables 1–5 report only point estimates for precision, recall, and pass@k. No confidence intervals, error bars, or ± notation appears anywhere in the results." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper claims HardTests 'significantly outperforms' baselines (Section 4.4) but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests). All comparisons are raw number differences." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "The paper reports percentage point differences with baseline context throughout: '11.3 percentage points higher precision and 17.5 percentage points higher recall' (Abstract), and tables show both baseline and HardTests numbers side-by-side, allowing direct comparison of magnitudes." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "No justification given for sample sizes. 653 AtCoder problems and 600 Codeforces problems are used for direct evaluation (Section 4.3), 105 problems for LiveCodeBench evaluation, and ~5k problems for RL/distillation. No power analysis or sample size rationale provided." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "No variance, standard deviation, or spread measures reported across experimental runs. 
Tables show single-run point estimates. Figure 3 shows single RL training curves per condition without confidence bands." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "TACO (Li et al., 2023) and CodeContests (Li et al., 2022) are used as baselines for direct test quality evaluation (Tables 1–2). Ablative baselines (HT-TYPE1, HT-TYPE1+2) are included in Table 1. OlympicCoder-7B serves as a baseline for teacher-distillation (Table 3)." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "TACO (2023) and CodeContests (2022) are the most relevant existing test synthesis datasets. The paper also compares with concurrent work (rStar-Coder, HF-Codeforces) in the related work section. OlympicCoder-7B (2025) is a contemporary training baseline." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 1 shows ablation results: HT-TYPE1 (only directly generated inputs), HT-TYPE1+2 (adding regular inputs), and full HardTests (all three types). This demonstrates the contribution of each input type, showing precision improvements of 2–48pp from Type 2 and Type 3." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Direct evaluation uses both precision and recall (Section 4.1). Downstream evaluation uses pass@1, pass@5, and pass@10 (Tables 3–5). RL training also reports validation reward curves (Figure 3)." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "No human evaluation of test case quality is included. All evaluation is automated: precision/recall computed against oracle test suites and Codeforces submission verdicts." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "LiveCodeBench-105 is used as a held-out evaluation set for all post-training experiments (Section 5.1). 
The paper explicitly decontaminates training data against LiveCodeBench: 'We conduct decontamination by removing the problems that are in LiveCodeBench from our dataset' (Section 3.4)." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Tables 1–2 break down results by difficulty level (difficulty 1–4+). Table 3 breaks down by LiveCodeBench difficulty (Easy/Medium/Hard). Results per model (Qwen-7B, Qwen-14B, GPT-4o, human) are also shown." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Appendix A.5 provides three detailed qualitative examples of false positives and false negatives, showing how specific test types catch or miss specific types of incorrect programs. Figure 5 documents pipeline failure rates and reasons (6.62% no valid oracle, 5.85% output verification failed, 3.72% input generation failed)." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Table 3 shows that filtering to 13k high-quality trajectories underperforms using all 46.6k trajectories for teacher-distillation, contradicting the hypothesis that test quality always helps. Table 5 shows RL with TACO tests actually hurts performance (38.48→36.95 pass@1)." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims of '11.3 percentage points higher precision and 17.5 percentage points higher recall' are supported by Tables 1–2 showing consistent improvements across settings. The '40 points' precision claim is supported by Table 1 (AtCoder difficulty 4+, Qwen-7B: 21.67→60.00, ~38pp) and Table 2 (Codeforces difficulty 4, Qwen-7B: 9.82→50.00, ~40pp). Post-training effectiveness claims are supported by Tables 3–5." 
126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "Causal claims ('test quality matters significantly for RL') are based on controlled single-variable manipulation: same model (Qwen3-4B), same training setup, same problem set (~5k), different test cases (HardTests vs TACO). The ablation study (Table 1) is also controlled single-variable manipulation of test types. These designs are adequate for the causal claims made." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The title claims 'High-Quality Test Cases for LLM Coding' broadly, but the paper only tests competitive programming problems with standard I/O. The Limitation section acknowledges the I/O constraint but the title and abstract do not bound claims to competitive programming. SWE-bench-style coding (file I/O, web I/O, multi-file) is explicitly noted as out of scope but only in the Limitation section." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper does not discuss alternative explanations for why HardTestGen works better. For instance, the improvement could partly stem from having more test cases rather than specifically harder ones, or from the oracle program quality rather than the generation strategy. The teacher-distillation finding (quantity > quality) is noted but not explored for alternative explanations." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper clearly defines its proxy (precision and recall of test suites as binary classifiers of program correctness, Section 4.1) and maps it to the claimed outcome (test quality for verification). The downstream evaluation directly measures code generation performance (pass@k on LiveCodeBench). No unacknowledged proxy gap exists." 
146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper uses 'GPT-4o' for all test generation (Section 3.3) without a version or snapshot date. 'Qwen2.5-Coder-7B-Instruct' and 'Qwen2.5-Coder-14B-Instruct' are reasonably specific, but 'DeepSeek-R1', 'Qwen3-4B', and 'GPT-4o' lack version/snapshot identifiers. Model behavior changes across versions." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "Appendix A.2.1 provides the complete prompt text for both the input validator generation and the input generator (all three types), including multiple detailed examples for in-context learning. These are the actual prompts used, not just descriptions." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Sampling parameters are stated: temperature=0.7, top_p=0.95 for candidate program generation (Section 4.3). Appendix A.6 provides detailed training hyperparameters for all three post-training scenarios: epochs, learning rate, batch size, max length, and evaluation sampling parameters." 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used. HardTestGen is a multi-step pipeline with separate LLM calls for validation, generation, and output checking, but has no agent loop, tools, retry logic, or memory. The pipeline architecture is well-described in Section 3 and Figure 2." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 3.4 describes cleaning, deduplication (using problem IDs and n-gram overlaps), and decontamination (removing LiveCodeBench problems via URL comparison). Appendix A.2.1 documents the problem filtering pipeline: removing problems without oracle programs, excluding 'core logic' (non-I/O) problems, leaving 32.5k problems." 
173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": true, 179 "justification": "A dedicated 'Limitation' section follows the Conclusion, discussing three specific limitations: tests still not as good as human-written, reliance on oracle solutions, and constraint to single-file standard I/O." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "The limitations are specific to this study: (1) synthetic tests underperform human-written ones, (2) oracle solutions may not exist for all coding domains, (3) code must be single-file with standard I/O. These are concrete, not boilerplate." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": true, 189 "justification": "The paper explicitly states: 'the code being tested is constrained to a single file that uses Standard I/O for input and output. However, many real-world coding problems are more complicated, e.g. coding problems in SWE-bench that may involve file I/O or web I/O' (Limitation section). This clearly bounds what was NOT tested." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": false, 196 "justification": "The dataset is not yet available. 'We will open-source our dataset' is a promise, not a current release. No supplementary data files are provided." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Appendix A.3 describes data collection in detail: 5 direct data sources (Codeforces, AtCoder, Luogu, CodeContests, TACO) covering 13 OJs. Table 6 lists all OJ URLs and sources. Collection methods include scraping problem specifications and user submissions from each platform with specific time boundaries (up to September 2024)." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants. 
Data comes from public competitive programming platforms (online judges). Human submissions used for evaluation (MatrixStudio/Codeforces-Python-Submissions) are pre-existing public data." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "The full pipeline is documented: problem collection from 13 OJs → filtering (removing non-I/O problems, those without oracle programs) → deduplication (IDs + n-gram overlap) → decontamination (removing LiveCodeBench problems) → test generation → output verification. Figure 5 shows the status distribution of test generation (81.9% success rate with specific failure reasons and percentages)." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Acknowledgments section states: 'The OpenAI API credits used in this paper were partially supported by the OpenAI Research Access Program. The training compute used was partially supported by National Center for Supercomputing Applications and ScOp Venture Capital. KZ was partially supported by ChipAgents.ai.'" 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Author affiliations are listed: Carnegie Mellon University, UC Santa Barbara, UT Austin. No authors are affiliated with OpenAI or the companies whose models are evaluated." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "OpenAI provided API credits and the paper uses GPT-4o exclusively for all test synthesis. A positive result for HardTestGen implicitly demonstrates GPT-4o's utility for test generation. OpenAI has a commercial interest in showing GPT-4o is useful. ScOp Venture Capital's interests are unclear." 
229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is present in the paper. The relationship with ScOp Venture Capital and ChipAgents.ai is mentioned in the funding acknowledgments, but there is no declaration of whether authors hold equity or other interests." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": false, 240 "justification": "No training data cutoff dates are stated for any of the models used (GPT-4o, Qwen2.5-Coder, Qwen3-4B, DeepSeek-R1). Without cutoff dates, it is impossible to assess whether test problems were in the models' pre-training data." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": true, 245 "justification": "The paper explicitly addresses train/test overlap for post-training: 'We conduct decontamination by removing the problems that are in LiveCodeBench from our dataset. Since most of its problems are from Codeforces and AtCoder, we directly compare the URLs to the problems' (Section 3.4)." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": true, 250 "justification": "LiveCodeBench is specifically designed for contamination-free evaluation (Jain et al., 2025b, described as 'Holistic and contamination free evaluation'). The paper decontaminates training data against it via URL comparison. However, contamination risk for the direct evaluation on Codeforces/AtCoder (Section 4) is not discussed." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study. All evaluations are automated or use pre-existing public submission data." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. The study uses public competitive programming problems and automated code evaluation." 
263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants. Human submissions are from public Codeforces data but participants are not subjects of study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in this study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No API costs or per-example costs are reported for generating test cases with GPT-4o across 32.5k problems. No cost comparison between HardTestGen and baselines is provided." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "Appendix A.6 states 'All our experiments are run on 8 NVIDIA H100 GPUs' but does not report total GPU hours, training time, or API spend. Hardware is identified but total compute budget is not quantified." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "No multi-seed results reported. Tables show single-run point estimates. Figure 3 shows single RL training curves per condition without confidence bands or seed variation." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "For candidate program generation, '10 candidate programs from each LLM' is stated (Section 4.3). 
However, for the RL and distillation training experiments (the key downstream claims), the number of independent training runs is not stated. Figure 3 implies single runs per condition." 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "No hyperparameter search is described. Training hyperparameters appear chosen without systematic search, and no search budget (configurations tried, compute spent) is reported." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": true, 321 "justification": "For RL, 'we run the best checkpoints (according to valid reward) of both training jobs within 100 steps' (Section 5.2). Checkpoint selection is based on validation reward, not test performance, which is appropriate." 322 }, 323 "multiple_comparison_correction": { 324 "applies": false, 325 "answer": false, 326 "justification": "No statistical significance tests are performed, so multiple comparison correction is moot." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors compare their own system (HardTestGen) against baselines (TACO, CodeContests) without acknowledging potential bias from evaluating their own system. No independent evaluation or discussion of author-evaluation bias." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "HardTestGen generates three types of tests using multiple LLM calls per problem (input validators, generators, oracle output computation), likely requiring significantly more compute than TACO's simpler approach. This compute difference is never discussed or controlled for." 
337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": false, 341 "justification": "LiveCodeBench is used for downstream evaluation without discussing its construct validity — whether performance on 105 competitive programming problems measures the claimed 'code generation performance' broadly. The direct evaluation metric (precision/recall) is well-defined, but there is no discussion of whether binary pass/fail captures true code quality." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "No scaffolding is used in model comparisons. All comparisons are between the same model trained with different data, so scaffold confound does not arise." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No explicit discussion of temporal leakage. The models (Qwen, GPT-4o) may have been trained on Codeforces/AtCoder solutions used in Section 4 evaluation. While LiveCodeBench uses recent problems, the temporal relationship between model training and benchmark creation is not explicitly discussed." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of feature leakage. The evaluation setup provides problem descriptions to models for code generation — whether these descriptions could leak answer information (e.g., through editorial-style hints) is not addressed." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "No discussion of non-independence. Problems from the same OJs (Codeforces, AtCoder) may share structural similarities or be from the same contest series. Training and evaluation problems could share authors or problem templates." 
364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": true, 368 "justification": "A concrete decontamination method is applied: 'we directly compare the URLs to the problems' (Section 3.4) to remove LiveCodeBench problems from training data. For deduplication, n-gram overlap analysis and problem ID matching are used." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "HardTestGen tests demonstrate 11.3 percentage points higher precision and 17.5 percentage points higher recall on average compared to existing tests when evaluating LLM-generated code.", 375 "evidence": "Tables 1 and 2 (Section 4.4) show precision and recall across 4 candidate program sources (3 LLMs + human) on AtCoder (653 problems) and Codeforces (600 problems), with consistent improvements across most settings.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "For harder problems, the precision improvement can be as large as 40 percentage points.", 380 "evidence": "Table 1: For Qwen2.5-Coder-7B on AtCoder difficulty 4+, TACO precision is 21.67 vs HardTests 60.00 (~38pp). Table 2: For Qwen-7B on Codeforces difficulty 4, TACO 9.82 vs HardTests 50.00 (~40pp).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Test quality matters significantly for reinforcement learning: RL with HardTests improves downstream performance while RL with TACO hurts it.", 385 "evidence": "Table 5 and Figure 3 (Section 5.2): Qwen3-4B base pass@1=38.48; RL with TACO=36.95 (decrease); RL with HardTests=39.42 (increase). pass@10 improves from 56.19 to 64.76 with HardTests. 
Validation reward curve (Figure 3) is consistently higher for HardTests.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Teacher-distillation benefits more from question scaling than test quality or sample scaling.", 390 "evidence": "Table 3: Full 46.6k trajectories (unfiltered) achieve pass@1=32.86, outperforming both the 13k filtered subset (25.24) and OlympicCoder-7B trained with 100k trajectories (25.81).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Self-distillation performance is highly dependent on sample quality, requiring a good verifier to select correct trajectories.", 395 "evidence": "Table 4: Qwen3-4B self-distilled with 'good 5k' (verified correct) achieves pass@1=36.00, pass@10=60.00, vs 'bad 5k' (verified incorrect) at 34.00/54.92 and 'random 5k' at 32.75/57.14.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Type 2 (Regular) and Type 3 (Hacking) test inputs are necessary — precision improves 2–48pp over Type 1 alone with at most 2.5pp recall decrease.", 400 "evidence": "Table 1 ablation: HT-TYPE1 precision ranges from 10.40 to 99.42 across settings, while full HardTests ranges from 60.00 to 100.0. HT-TYPE1 recall ranges from 89.02 to 100.0, while full HardTests ranges from 87.80 to 99.18.", 401 "supported": "strong" 402 } 403 ], 404 "red_flags": [ 405 { 406 "flag": "No error bars or statistical tests on any result", 407 "detail": "All tables (1–5) report point estimates without confidence intervals, standard deviations, or significance tests. The word 'significantly' is used (Section 4.4) without any statistical significance test. RL training curves in Figure 3 appear to be single runs, which are known to have high variance (Henderson et al., 2018, which the paper cites for related work)." 408 }, 409 { 410 "flag": "Single-run RL experiments for a key claim", 411 "detail": "The claim that 'test quality matters significantly for reinforcement learning' appears based on a single RL training run per condition. 
RL training is notoriously noisy, and the pass@1 difference (36.95 vs 39.42) could plausibly be within single-run variance. Without multiple seeds, this claim is fragile." 412 }, 413 { 414 "flag": "Small evaluation set for downstream claims", 415 "detail": "LiveCodeBench-105 contains only 105 problems (selected as the subset with 'stdin' type test cases). This is small for evaluating post-training effects and limits statistical power for detecting real differences, especially when broken down by difficulty." 416 }, 417 { 418 "flag": "OpenAI funding with exclusive GPT-4o usage", 419 "detail": "The paper received OpenAI API credits and exclusively uses GPT-4o for all test synthesis. A positive result for HardTestGen implicitly demonstrates GPT-4o's utility. No alternative LLMs were tested as the test generator, so the pipeline's generalizability beyond GPT-4o is unknown." 420 }, 421 { 422 "flag": "Compute cost of pipeline not reported or compared", 423 "detail": "HardTestGen uses multiple LLM calls per problem (input validator, 3 types of generators, oracle execution, special judge generation for 25.4% of problems). The total API cost for processing 32.5k problems with GPT-4o is likely substantial but never reported or compared to baselines, making cost-effectiveness impossible to assess." 424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "Competition-level code generation with AlphaCode", 429 "authors": ["Yujia Li", "David Choi", "Junyoung Chung"], 430 "year": 2022, 431 "arxiv_id": "2203.07814", 432 "relevance": "Introduced the CodeContests dataset for competitive programming, a key baseline in this paper." 433 }, 434 { 435 "title": "TACO: Topics in Algorithmic Code Generation Dataset", 436 "authors": ["Rongao Li", "Jie Fu", "Bo-Wen Zhang"], 437 "year": 2023, 438 "arxiv_id": "2312.14852", 439 "relevance": "Major baseline dataset integrating multiple OJs with LLM-generated test cases; primary comparison target." 
440 }, 441 { 442 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus)", 443 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 444 "year": 2023, 445 "arxiv_id": "2305.01210", 446 "relevance": "Extended HumanEval with additional tests using LLM synthesis, a closely related test augmentation approach." 447 }, 448 { 449 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 450 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 451 "year": 2025, 452 "arxiv_id": "2501.12948", 453 "relevance": "Major RL-trained reasoning model; used as teacher for distillation experiments and exemplifies RLVR approach." 454 }, 455 { 456 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 457 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 458 "year": 2021, 459 "arxiv_id": "2107.03374", 460 "relevance": "Foundational LLM code generation benchmark with hand-written test cases." 461 }, 462 { 463 "title": "Measuring Coding Challenge Competence with APPS", 464 "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"], 465 "year": 2021, 466 "arxiv_id": "2105.09938", 467 "relevance": "Early coding challenge benchmark; paper notes 60% of programs passing APPS tests are actually wrong." 468 }, 469 { 470 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 471 "authors": ["Naman Jain", "King Han", "Alex Gu"], 472 "year": 2025, 473 "relevance": "Contamination-free code evaluation benchmark used as the held-out evaluation set for all post-training experiments." 
474 }, 475 { 476 "title": "OpenCodeReasoning: Advancing Data Distillation for Competitive Coding", 477 "authors": ["Wasi Uddin Ahmad", "Sean Narenthiran", "Somshubra Majumdar"], 478 "year": 2025, 479 "arxiv_id": "2504.01943", 480 "relevance": "Concurrent dataset of 28k problems with 735k reasoning traces; paper confirms that data scaling dominates trajectory correctness in distillation." 481 }, 482 { 483 "title": "ALGO: Synthesizing Algorithmic Programs with LLM-Generated Oracle Verifiers", 484 "authors": ["Kexun Zhang", "Danqing Wang", "Jingtao Xia"], 485 "year": 2023, 486 "arxiv_id": "2305.14591", 487 "relevance": "Prior work by same group on LLM-based test synthesis with oracle verifiers; basis for the oracle-free variant in Appendix A.7." 488 }, 489 { 490 "title": "AceCoder: Acing Coder RL via Automated Test-Case Synthesis", 491 "authors": ["Huaye Zeng", "Dongfu Jiang", "Haozhe Wang"], 492 "year": 2025, 493 "arxiv_id": "2502.01718", 494 "relevance": "Concurrent work on automated test synthesis for RL training of code models; used as baseline in oracle-free experiments." 495 }, 496 { 497 "title": "KodCode: A Diverse, Challenging, and Verifiable Synthetic Dataset for Coding", 498 "authors": ["Zhangchen Xu", "Yang Liu", "Yueqin Yin"], 499 "year": 2025, 500 "arxiv_id": "2503.02951", 501 "relevance": "Synthetic coding dataset that generates questions, solutions, and tests with LLMs; represents the Type-1-only approach." 502 }, 503 { 504 "title": "rStar-Coder: Scaling Competitive Code Reasoning with a Large-Scale Verified Dataset", 505 "authors": ["Yifei Liu", "Li Lyna Zhang", "Yi Zhu"], 506 "year": 2025, 507 "arxiv_id": "2505.21297", 508 "relevance": "Concurrent work on reliable test synthesis for competitive programming; compared in related work." 
509 }, 510 { 511 "title": "Code-R1: Reproducing R1 for Code with Reliable Rewards", 512 "authors": ["Jiawei Liu", "Lingming Zhang"], 513 "year": 2025, 514 "relevance": "RL training approach for code that emphasizes reliable rewards; the paper's RL setup is inspired by this work." 515 }, 516 { 517 "title": "Scattered Forest Search: Smarter Code Space Exploration with LLMs", 518 "authors": ["Jonathan Light", "Yue Wu", "Yiyou Sun"], 519 "year": 2025, 520 "arxiv_id": "2411.05010", 521 "relevance": "Demonstrates that weak verifiers can harm downstream code generation and search performance." 522 } 523 ], 524 "engagement_factors": { 525 "practical_relevance": { 526 "score": 2, 527 "justification": "The pipeline and 47k-problem dataset are directly useful for researchers doing LLM post-training for coding, but code/data are not yet released and the approach is limited to competitive programming." 528 }, 529 "surprise_contrarian": { 530 "score": 1, 531 "justification": "The finding that teacher-distillation doesn't benefit from test quality is mildly surprising, but the main finding (better tests help) is expected." 532 }, 533 "fear_safety": { 534 "score": 0, 535 "justification": "No AI risk or security concerns raised; the paper focuses on improving test quality for training." 536 }, 537 "drama_conflict": { 538 "score": 1, 539 "justification": "Implicitly criticizes existing datasets (TACO has '90% false positive rate for difficult problems') but without dramatic framing." 540 }, 541 "demo_ability": { 542 "score": 1, 543 "justification": "A project page exists at leililab.github.io/HardTests/ but dataset and code are not yet released." 544 }, 545 "brand_recognition": { 546 "score": 1, 547 "justification": "CMU authors and uses well-known models (GPT-4o, DeepSeek-R1), but not from a major AI lab." 548 } 549 } 550 }