scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19438B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "HardTests: Synthesizing High-Quality Test Cases for LLM Coding",
      6     "authors": [
      7       "Zhongmou He",
      8       "Yee Man Choi",
      9       "Kexun Zhang",
     10       "Jiabao Ji",
     11       "Junting Zhou"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2505.24098",
     16     "doi": "10.48550/arXiv.2505.24098"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract states HARDTESTS 'proves to be more effective for model training' without qualification, but Table 3 shows that filtering to quality-verified trajectories (13k) underperforms unfiltered full data (46.6k) in teacher distillation—a contradiction the abstract does not acknowledge.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The RL experiment controls problem set and training setup, supporting causal inference there. However, the teacher-distillation comparison conflates data quantity (13k vs 46.6k) with quality, making the causal attribution to test quality invalid for that scenario.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title claims 'LLM Coding' broadly, but all experiments are restricted to competitive programming with standard I/O. The limitations section acknowledges this constraint but the abstract presents no such scoping.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "RL improvements could reflect differences in problem selection or difficulty distribution rather than test quality alone. The paper does not discuss these alternatives; the teacher-distillation finding (volume dominates) is noted but not explored as an alternative explanation for the RL results.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Section 4.1 explicitly defines precision/recall of test suites against oracle tests as the measurement, clearly distinguishing it from the ultimate goal of better-trained LLMs. The connection is then tested empirically through post-training experiments.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitation' section follows the conclusion, listing three specific constraints: tests remain below human-written quality, oracle solutions are required, and the pipeline is restricted to single-file standard I/O problems.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The limitations name specific threats: oracle solution requirement, single-file I/O constraint, and residual false positive rate vs. human-written tests. Appendix A.2.2 further quantifies failure modes by specific reason (6.62% no valid oracle, 5.85% output verification failure, 3.72% input generation failure).",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states HARDTESTGEN 'is constrained to a single file that uses Standard I/O' and notes SWE-bench-style problems with file or web I/O are out of scope, flagging this as explicit future work.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "An 'Acknowledgments and Disclosure of Funding' section discloses OpenAI Research Access Program (API credits), National Center for Supercomputing Applications and ScOp Venture Capital (compute), and ChipAgents.ai (partial author support).",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are listed on the title page: Carnegie Mellon University, UC Santa Barbara, and UT Austin.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "OpenAI provides API credits while GPT-4o is the sole model driving the entire HARDTESTGEN pipeline (test generation, validation, special judges). Positive results directly validate and promote the funder's product, creating a structural conflict.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Only funding sources are listed. There is no competing interests statement, no disclosure of equity or consulting relationships with OpenAI or ChipAgents.ai, and no standard 'no competing interests' declaration.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Footnote 2 explicitly defines 'verifier' as rule-based correctness checking systems. Section 3.1 formally defines test suite, oracle tests, oracle programs, precision, and recall with mathematical notation.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 explicitly enumerates three contributions: the HARDTESTGEN pipeline, the HARDTESTS 47k-problem dataset, and empirical analyses of test quality effects on LLM post-training.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 compares against RLVR literature, LLM-based test synthesis methods (CodeT, TACO, EvalPlus, ALGO, STGen), and competition code datasets, explicitly positioning HARDTESTS against each with specific differentiators rather than just listing references.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Section 4.1 argues explicitly: false positives in RL reward signals give incorrect rewards to wrong programs, directly harming training. Precision and recall against oracle tests are justified as measuring exactly this failure mode.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Difficulty is characterized using Luogu's 7-level labels (Section 3.4), and experimental results are broken down by difficulty levels 1 through 4+ in Tables 1 and 2, with explicit analysis of how performance gaps change with difficulty.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Tables 1 and 2 reveal near-ceiling precision at difficulty 1 (HARDTESTS ~98-99% across models), which compresses observable differences. The paper emphasizes improvement on harder problems but does not analyze ceiling effects as a methodological limitation affecting interpretation of easy-problem comparisons.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Human programmer submissions are used as one test population in Table 2 (evaluating how well tests detect wrong human programs), but there is no baseline showing how human-written test cases themselves perform under the same precision/recall framework for direct comparison.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Precision (avoiding false reward for wrong programs) and recall (not penalizing correct programs) are justified by their direct relevance to RL training. Edge cases in output comparison are handled via LLM-generated special judge functions (Section 3.3, covering 25.4% of problems).",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Section 3.4 explicitly describes decontamination by removing all problems present in LiveCodeBench using URL-based matching against Codeforces and AtCoder problem IDs.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "Problems are collected up to September 2024 with no discussion of how to update the benchmark as models improve or how to maintain discrimination as frontier models approach ceiling performance on current competition problems.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Appendix A.2.2 quantifies failure modes with specific rates (18.1% total, with breakdown by cause). Appendix A.5 provides three worked qualitative examples showing how different test types fail (false positives from small-scale tests, false negatives from missing special judge functions).",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Complete prompts for HARDTESTGEN are provided verbatim in Appendix A.2.1. All training hyperparameters for teacher distillation, self-distillation, and RL are detailed in Appendix A.6. The pipeline will be open-sourced.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Appendix A.3 documents all 13 OJ sources, collection dates (up to September 2024), oracle program sources with reliability tiers (Table 7), deduplication and decontamination methodology, and statistics including distribution of test case counts per problem type (Figures 4-7).",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The paper states the dataset will be open-sourced at a listed URL but specifies no license terms. The IP status of problems scraped from commercial platforms (Codeforces, AtCoder, Luogu) is not addressed, creating unresolved copyright uncertainty for downstream users.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "Intended use for post-training (RL, self-distillation, teacher distillation) is implied throughout but never explicitly stated. The paper does not specify what should NOT be concluded from HARDTESTS results (e.g., inapplicability to non-competitive real-world coding domains).",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "HARDTESTGEN improves test precision by 11.3 percentage points and recall by 17.5 percentage points on average over existing synthesizers.",
    203       "evidence": "Tables 1 and 2 show averaged results across AtCoder (653 problems) and Codeforces (600 problems) for three LLMs and human submissions.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "For harder competitive programming problems (difficulty 4+), precision improvement can reach roughly 40 percentage points.",
    208       "evidence": "Table 1: Qwen2.5-Coder-7B on AtCoder difficulty 4+, TACO precision 21.67 vs HARDTESTS precision 60.00.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "High-quality tests significantly improve reinforcement learning post-training outcomes.",
    213       "evidence": "Table 5: RL with HARDTESTS improves Qwen3-4B pass@10 from 56.19 to 64.76; RL with TACO degrades pass@1 from 38.48 to 36.95. Limited to 100 steps and one base model.",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Test quality does not matter for teacher distillation, where data volume dominates.",
    218       "evidence": "Table 3: 13k quality-filtered trajectories (pass@1=25.24) underperform 46.6k unfiltered (pass@1=32.86), but this conflates quantity with quality—not a clean causal test.",
    219       "supported": "weak"
    220     },
    221     {
    222       "claim": "80% of competitive programming problems have proprietary human-written test cases that cannot be scraped.",
    223       "evidence": "Attributed to 'our study' in the introduction with no methodology described for this statistic.",
    224       "supported": "weak"
    225     },
    226     {
    227       "claim": "Oracle-free HARDTESTGEN variant halves the false positive rate compared to AceCoder baseline (17.67% vs 32.49%).",
    228       "evidence": "Table 8 in appendix, verified on 165 AtCoder problems with 50 sample solutions each.",
    229       "supported": "moderate"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-creation",
    234     "benchmark-eval"
    235   ],
    236   "key_findings": "HARDTESTGEN improves test case quality for competitive programming by combining three input types—directly generated, randomly generated, and adversarially constructed 'hacking' inputs—using oracle program outputs as ground truth. The resulting HARDTESTS dataset achieves +11.3pp precision and +17.5pp recall over baselines, with the largest gains on harder problems where existing synthesizers have >90% false positive rates. High-quality tests substantially improve RL training (pass@10 +8.6pp) and self-distillation, but test quality is irrelevant for teacher distillation where data volume dominates—a finding that partially contradicts the abstract's claim that HARDTESTS is 'more effective for model training.' An 18.1% test generation failure rate and restriction to standard I/O competitive programming are significant scope limitations.",
    237   "red_flags": [
    238     {
    239       "flag": "Teacher distillation quantity/quality confound",
    240       "detail": "The claim that test quality doesn't help teacher distillation compares 13k filtered vs 46.6k unfiltered trajectories, making it impossible to separate the effect of data quality from data quantity."
    241     },
    242     {
    243       "flag": "OpenAI funder-tool conflict",
    244       "detail": "GPT-4o drives every step of HARDTESTGEN while the OpenAI Research Access Program partially funds the work through API credits. Positive results for the pipeline implicitly validate the funder's product, so the funder is not independent of the outcome."
    245     },
    246     {
    247       "flag": "Narrow downstream evaluation",
    248       "detail": "Post-training experiments use only 100 RL steps, a 105-problem LiveCodeBench subset, and a single base model (Qwen3-4B), severely limiting generalizability of the training quality claims."
    249     },
    250     {
    251       "flag": "Title scope mismatch",
    252       "detail": "The paper claims 'LLM Coding' broadly in its title but is entirely restricted to competitive programming with single-file standard I/O. SWE-bench-style real-world coding is explicitly excluded."
    253     },
    254     {
    255       "flag": "No dataset license specified",
    256       "detail": "Problems scraped from Codeforces, AtCoder, and Luogu are bundled into an open-source dataset with no license terms. IP status of the scraped content is unaddressed, creating redistribution uncertainty."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "TACO: Topics in Algorithmic Code Generation Dataset",
    262       "relevance": "Primary baseline for test synthesis quality comparison; also a source of problems in HARDTESTS"
    263     },
    264     {
    265       "title": "Competition-Level Code Generation with AlphaCode (CodeContests)",
    266       "relevance": "Primary baseline for Codeforces test synthesis; provides comparison oracle test suite"
    267     },
    268     {
    269       "title": "Measuring Coding Challenge Competence with APPS",
    270       "relevance": "Motivating paper demonstrating 60% false positive rate in existing test suites"
    271     },
    272     {
    273       "title": "Is Your Code Generated by ChatGPT Really Correct? (EvalPlus)",
    274       "relevance": "Related test synthesis work extending HumanEval with reference-implementation-guided tests"
    275     },
    276     {
    277       "title": "CodeT: Code Generation with Generated Tests",
    278       "relevance": "Prior LLM-based test synthesis baseline using direct input generation"
    279     },
    280     {
    281       "title": "LiveCodeBench: Holistic and Contamination-Free Evaluation of LLMs for Code",
    282       "relevance": "Downstream evaluation benchmark used to measure post-training quality across all experiments"
    283     },
    284     {
    285       "title": "ALGO: Synthesizing Algorithmic Programs with LLM-Generated Oracle Verifiers",
    286       "relevance": "Related oracle-based synthesis work; serves as basis for the oracle-free appendix method"
    287     },
    288     {
    289       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    290       "relevance": "Key motivation demonstrating verifier-based RL for LLM reasoning; teacher distillation source"
    291     },
    292     {
    293       "title": "AceCoder: Acing Coder RL via Automated Test-Case Synthesis",
    294       "relevance": "Concurrent work on test synthesis for coding RL; directly compared in oracle-free experiment"
    295     }
    296   ],
    297   "engagement_factors": {
    298     "practical_relevance": {
    299       "score": 3,
    300       "justification": "Directly addresses the training data quality bottleneck for code RL with an open-sourced pipeline and 47k-problem dataset immediately usable by any team doing code post-training."
    301     },
    302     "surprise_contrarian": {
    303       "score": 2,
    304       "justification": "The finding that test quality is irrelevant for teacher distillation—where data volume dominates—challenges the intuition that higher-quality data always outperforms more data."
    305     },
    306     "fear_safety": {
    307       "score": 0,
    308       "justification": "No AI safety or risk concerns; the paper is purely technical work on improving verifier quality for competitive programming."
    309     },
    310     "drama_conflict": {
    311       "score": 0,
    312       "justification": "No controversy or institutional conflict; presents incremental technical improvements over prior work without challenging prominent researchers."
    313     },
    314     "demo_ability": {
    315       "score": 2,
    316       "justification": "Pipeline will be open-sourced with complete prompts provided in appendix, allowing practitioners to synthesize tests for their own competitive programming datasets."
    317     },
    318     "brand_recognition": {
    319       "score": 1,
    320       "justification": "CMU affiliation provides moderate recognition; GPT-4o is prominently used but no major commercial AI lab (Google, OpenAI, Meta) appears as an authoring institution."
    321     }
    322   },
    323   "hn_data": {
    324     "threads": [
    325       {
    326         "hn_id": "44718352",
    327         "title": "SmallThinker: A Family of Efficient LLMs Natively Trained for Local Deployment",
    328         "points": 2,
    329         "comments": 0,
    330         "url": "https://news.ycombinator.com/item?id=44718352"
    331       }
    332     ],
    333     "top_points": 2,
    334     "total_points": 2,
    335     "total_comments": 0
    336   }
    337 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs