scan.json (33477B)
1 { 2 "paper": { 3 "title": "Rethinking Verification for LLM Code Generation: From Generation to Testing", 4 "authors": [ 5 "Zihan Ma", 6 "Taolin Zhang", 7 "Maosong Cao", 8 "Junnan Liu", 9 "Wenwei Zhang", 10 "Minnan Luo", 11 "Songyang Zhang", 12 "Kai Chen" 13 ], 14 "year": 2025, 15 "venue": "arXiv", 16 "arxiv_id": "2507.06920", 17 "doi": "10.48550/arXiv.2507.06920" 18 }, 19 "scan_version": 3, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "Current LLM code evaluation benchmarks suffer from sparse, homogeneous test suites that miss 20-40% of errors caught by more rigorous online judges. The paper proves theoretically and empirically that simply increasing random test case quantity hits a detection rate ceiling due to inter-test correlation. SAGA, a human-LLM collaborative test case generation framework that leverages both correct and incorrect human solutions, achieves 90.62% detection rate and 32.58% verifier accuracy on TCGBench-Lite, substantially outperforming existing methods. The resulting CodeCompass benchmark causes a ~9.56% relative drop in model Pass@1 compared to LiveCodeBench-v6, revealing previously hidden quality differences between models.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "Footnote 1 states: 'The data demo and prompts can be accessed via https://github.com/open-compass/SAGA'. A working GitHub URL is provided." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The GitHub repository provides a 'data demo' and the underlying problems come from publicly accessible competitive programming platforms (AtCoder, Codeforces, Nowcoder). However, only a demo of the curated TCGBench dataset appears to be released, not the full collection of 1840 problems with incorrect submissions." 
34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper mentions model names and training configurations (Appendix J) but does not specify the software environment needed to reproduce experiments." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are provided in the paper or referenced in supplementary materials. The GitHub link provides prompts and data demo but no documented reproduction workflow." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All results in Tables 2, 3, 6 and Figures 6-11 report point estimates only (e.g., 'DR@50: 90.62%', 'VAcc@50: 32.58%'). No confidence intervals, error bars, or uncertainty measures are provided for any result." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims SAGA outperforms baselines based solely on comparing point estimates (e.g., Table 2). No statistical significance tests (t-tests, bootstrap tests, permutation tests) are reported for any comparison." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper consistently reports improvements with baseline context: 'Detection Rate increasing by 9.55% and Verifier Accuracy by 12.14%' (Section 1), 'AUC@50 of 0.2228, surpassing the Input-Interpreter (0.1234)' (Section 3.2.1), 'average Pass@1 for various models drops by a relative 9.56%' (Section 4)." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "TCGBench has 1840 problems and TCGBench-Lite has 270 problems, but no justification is provided for why these specific sizes were chosen. 
No power analysis or sample size rationale is given." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. The paper states 'greedy decoding strategy' (Section 2.2) which is deterministic for a single input, but no multi-run or cross-seed variance is reported for any experiment." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Table 2 compares SAGA against TestChain, Input-Interpreter (LiveCodeBench-style), EvalPlus, and TCGCoder-7B. Multiple baseline TCG methods are included." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Baselines include TestChain (2024), LiveCodeBench (2024), and EvalPlus (2023), all recent and representative of current TCG approaches. The LLM backbone used (DeepSeek-V3-0324) is also contemporary." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Table 2 includes comprehensive ablation studies: analytical component ablation (Multidimensional only vs. Differential only), prompt design ablation (SimpleCOT, Random Input w/ GT, EvalPlus w/ GT), and base LLM ablation across four different backbones." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper uses four evaluation metrics: Detection Rate (DR@k), Verifier Accuracy (VAcc@k), AUC@N (normalized area under accuracy-number curve), and Diversity Ratio (DivRatio@k). These are reported consistently in Table 2 and Figures 6-11." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation of SAGA's generated test cases is reported. The paper mentions 'human-verified adversarial examples' for CodeCompass curation, but no systematic human evaluation of test case quality or relevance is conducted." 
98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "TCGBench-Lite (270 problems from June 2024+) is used for evaluation. Appendix J confirms TCGCoder-7B's training data (15,000 early-stage problems) was 'distinct from our evaluation sets to prevent data leakage.'" 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Figure 11 (Appendix G) provides per-difficulty breakdowns (Easy/Medium/Hard) for all methods. Results are also broken down by platform (AtCoder, Codeforces, Nowcoder) in Figures 9-10." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": false, 112 "justification": "The paper discusses failures of baseline methods (Direct Generation quality issues in Figure 3, saturation of Input-Interpreter in Figure 4) but does not discuss specific failure cases where SAGA itself fails to generate adequate test cases or where its verifiers miss errors." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper reports several negative findings: Direct Generation has very low retention rates and <10% VAcc (Figure 3), increasing random test quantity shows diminishing returns with saturation (Figure 4, Corollary 1), and the ablation in Table 5 (Appendix H.2) shows that doubling correct solutions cannot compensate for removing Differential Analysis." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims match results: 'detection rate of 90.62%' matches Table 2 DR@50, 'verifier accuracy of 32.58%' matches Table 2 VAcc@50, 'Verifier Accuracy...is 10.78% higher than that of LiveCodeBench-v6' matches Table 3 (30.39% vs 19.61% = +10.78%)." 
125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper's causal claims ('SAGA improves test quality') are supported by controlled ablation studies (Table 2) with single-variable manipulations: removing analytical components, changing prompts, and swapping backbone LLMs while holding other variables constant." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title claims to address 'LLM Code Generation' broadly, and the conclusion speaks of 'reliable LLM code evaluation' generally, but all experiments are limited to competitive programming problems from AtCoder, Codeforces, and Nowcoder. Real-world software engineering tasks, multi-file code generation, and non-algorithmic programming are not tested." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not discuss alternative explanations for SAGA's superior performance. For example, SAGA has strictly more information access than baselines (both correct and incorrect human solutions) — it is not discussed whether the improvement is due to the analytical framework vs. simply having more input data." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper carefully defines its metrics (DR, VAcc) in Section 2.1 and measures exactly those quantities. Claims about test suite quality are stated in terms of the defined metrics rather than broader unmeasured constructs. The paper does note the gap between benchmark performance and real-world reliability in Section 1." 
145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "Specific model versions are provided: 'DeepSeek-V3-0324' (with date), 'Qwen2.5-72B-Instruct', 'Qwen2.5-Coder-32B-Instruct', 'Qwen2.5-Coder-7B-Instruct', 'GPT-4o (2024-11-20)' (Table 6). All model names include version identifiers." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Footnote 1 states 'The data demo and prompts can be accessed via https://github.com/open-compass/SAGA'. Prompts are available via the linked repository." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 2.2 states 'greedy decoding strategy' for all experiments. Appendix J reports TCGCoder-7B training hyperparameters: 3 epochs, batch size 16, learning rate 5e-6 (min 3e-7), max sequence length 61,335 tokens." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "SAGA is a structured prompting pipeline, not agentic scaffolding with tool use, retry logic, or memory. No agentic scaffolding is used." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 2.2 describes TCGBench curation: 1840 problems from three platforms, average 36.66 incorrect submissions per problem (WA/TLE verdicts). Appendix E documents TCGBench-Lite: 270 problems from June 2024+, difficulty classification methodology, and 41.41 avg incorrect submissions per problem." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": false, 178 "justification": "No dedicated limitations section exists in the paper. The conclusion (Section 5) discusses future directions but does not address limitations of the current work." 
179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": false, 183 "justification": "No threats to validity are discussed. The paper does not acknowledge specific threats such as the narrow scope of competitive programming, the reliance on greedy decoding, or the potential non-representativeness of the selected platforms." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "No explicit scope boundaries are stated. The paper does not clarify that results apply only to competitive programming tasks, not general software engineering. It does not state what was NOT tested or what claims the authors are NOT making." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "Only a 'data demo' is provided via the GitHub link. The full TCGBench dataset (1840 problems with 36.66 avg incorrect submissions each) and TCGBench-Lite (270 problems) do not appear to be fully released for independent verification." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 2.2 and Appendix D describe data collection: problems from AtCoder, Codeforces, and Nowcoder; incorrect submissions with WA/TLE verdicts; average 36.66 incorrect submissions per problem; recent contests to minimize contamination." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants in this study. Data is sourced from public competitive programming platforms." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": false, 210 "justification": "The paper provides high-level descriptions of data collection (platforms, submission types) but does not document filtering counts at each stage. 
Appendix D states 'Further details on data collection, filtering criteria, and specific contest sources are available in supplementary materials' but these are not in the paper itself." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding disclosure or acknowledgments section is present in the paper. Authors are from Shanghai AI Laboratory and Xi'an Jiaotong University but no funding sources are mentioned." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: Shanghai AI Laboratory, School of Computer Science and Technology at Xi'an Jiaotong University, and MOE KLINNS Lab. The paper evaluates third-party models (DeepSeek, Qwen, GPT-4o), not the authors' own products." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "No funding information is disclosed, making it impossible to assess funder independence." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests statement or financial disclosure is included in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "The paper does not state the training data cutoff dates for any of the evaluated models (DeepSeek-V3-0324, Qwen2.5 series, GPT-4o). Model version identifiers are given but not their training cutoffs." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": true, 244 "justification": "Section 3.2.1 states TCGBench-Lite uses problems 'from AtCoder, Codeforces, and Nowcoder contests since June 2024' to 'ensure contemporary relevance and minimize potential data leakage.' 
Appendix J confirms TCGCoder-7B training data is 'distinct from our evaluation sets to prevent data leakage.'" 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": true, 249 "justification": "The paper addresses contamination through temporal selection: TCGBench-Lite and CodeCompass use problems from June 2024 onwards, and the paper explicitly notes this 'minimiz[es] potential data leakage for evaluating newer models' (Appendix E)." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study. The paper evaluates TCG methods on competitive programming problems." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference cost, API cost, tokens consumed, or wall-clock time is reported for running SAGA or any of the baseline TCG methods. The cost of generating 50 test cases per problem with DeepSeek-V3 is not quantified." 
294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "For TCGCoder-7B training, the paper mentions '2 nodes, each with 8 GPUs' and FSDP (Appendix J) but does not state total GPU hours, training time, or total compute cost. No compute budget is stated for the main SAGA experiments." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No multi-seed results are reported. The paper uses 'greedy decoding strategy' which is deterministic for a given input, but no analysis of sensitivity to other sources of randomness (e.g., SAGA's random test input generation) is provided." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "The number of experimental runs is not explicitly stated for any experiment. It appears results are from single runs with greedy decoding." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search budget is described for SAGA's configuration. The number of test cases (50) and other design choices appear fixed without justifying the search process." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "The selection of SAGA's final configuration (number of correct solutions analyzed, prompt structure, number of test cases) is not justified as a principled selection from among alternatives." 321 }, 322 "multiple_comparison_correction": { 323 "applies": false, 324 "answer": false, 325 "justification": "No statistical significance tests are performed in the paper, so multiple comparison correction is not applicable." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors evaluate their own SAGA system and its distilled TCGCoder-7B model against baselines they implemented. 
No acknowledgment of author-evaluation bias is provided, despite Lucic et al. (2018) showing systematic underperformance of author-implemented baselines." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "SAGA requires access to both correct and incorrect human solutions and uses more complex multi-step prompting than baselines, yet no performance-per-compute comparison is provided. The computational overhead of SAGA relative to simpler methods is not discussed." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": true, 340 "justification": "The entire paper is a construct validity investigation of code generation benchmarks. Section 2 formally defines Detection Rate and Verifier Accuracy, derives theoretical bounds (Corollary 1, Appendix C), and empirically validates them. The paper directly questions whether existing benchmarks measure what they claim." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No agentic scaffolding is involved in the code generation evaluation. Models are evaluated directly on coding problems with test suites." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": true, 352 "justification": "TCGBench-Lite uses problems from 'June 2024 - Present' (Table 4, Appendix E) to 'ensure contemporary relevance and minimize potential data leakage.' This temporal split addresses whether models could have seen the problems during training." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "The paper does not discuss whether the evaluation setup could leak answer information to code generation models. For the RLVR application discussed, there is no analysis of whether test case patterns could leak correctness signals beyond the intended reward." 
358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of whether problems within TCGBench share structural similarities (e.g., similar algorithms, same problem templates across platforms) that could violate independence assumptions." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": true, 367 "justification": "Temporal splits (problems from June 2024 onwards) serve as a concrete leakage prevention method. For TCGCoder-7B, the paper explicitly separates training data (15,000 early-stage problems) from evaluation sets (Appendix J)." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "LiveCodeBench's verifiers miss errors: 20% of LLM-generated solutions to medium problems and 40% of those to hard problems that pass LiveCodeBench fail on LeetCode's online judge.", 374 "evidence": "Reported in Section 1 based on re-evaluation of LLM-generated solutions that passed LiveCodeBench's private tests on LeetCode's online judge.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "LLM errors cluster tightly while human errors are widely distributed, indicating LLM-generated test suites have systematic blind spots.", 379 "evidence": "Figure 1(b) shows PCA analysis of error patterns where LLM errors cluster and human errors are dispersed.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Detection rate saturates with increasing random test case count due to inter-test correlation, and cannot reach 100%.", 384 "evidence": "Theoretically derived in Corollary 1 (Appendix C) and empirically validated across three platforms in Figure 4, showing detection rate plateau consistent with 1-(1-p̄)^(1/ρ̄_eff).", 385 "supported": "strong" 386 }, 387 { 388 "claim": "SAGA achieves a detection rate of 90.62% and verifier accuracy of 32.58% on TCGBench-Lite, outperforming all baselines.", 389 "evidence": "Table 2 shows DR@50=90.62% and VAcc@50=32.58% for SAGA vs. 
next best Input-Interpreter at DR@50=81.07% and VAcc@50=16.72%.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "SAGA's AUC@50 (0.2228) is roughly 1.8 times the Input-Interpreter baseline's (0.1234).", 394 "evidence": "Table 2 directly reports these values for TCGBench-Lite.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "Both SAGA analytical components (Multidimensional and Differential Analysis) individually outperform baselines, but their combination in SAGA is optimal.", 399 "evidence": "Table 2 ablation: Multidim-only AUC@50=0.1923, Differ-only AUC@50=0.1926, full SAGA AUC@50=0.2228, all above baseline 0.1234.", 400 "supported": "strong" 401 }, 402 { 403 "claim": "TCGCoder-7B (SAGA-distilled) outperforms all established baselines using much larger backbone models.", 404 "evidence": "Table 2: TCGCoder-7B AUC@50=0.1890 vs. Input-Interpreter 0.1234, EvalPlus 0.1139, TestChain 0.0841 (all using DeepSeek-V3-0324 backbone).", 405 "supported": "strong" 406 }, 407 { 408 "claim": "CodeCompass causes an average relative 9.56% drop in model Pass@1 compared to LiveCodeBench-v6, revealing hidden quality differences between models.", 409 "evidence": "Figure 8 shows Pass@1 drops across 7 models ranging from -1.6% to -16.2%, with model re-ranking (e.g., Qwen2.5-72B and Qwen2.5-Coder-32B switch positions).", 410 "supported": "strong" 411 }, 412 { 413 "claim": "Doubling correct-solution input for Multidimensional Analysis cannot compensate for removing Differential Analysis, proving the latter provides unique value.", 414 "evidence": "Table 5 (Appendix H.2): SAGA VAcc@50=47.08% vs. Multidim-Enhanced (double S_human, no Differential) VAcc@50=38.73%.", 415 "supported": "moderate" 416 } 417 ], 418 "red_flags": [ 419 { 420 "flag": "No statistical significance testing", 421 "detail": "All comparisons between SAGA and baselines rely on point estimates without statistical tests. 
Given that the experiments use greedy decoding (single deterministic output), the results may not reflect variability from other sources such as problem selection and human submission quality." 422 }, 423 { 424 "flag": "No limitations section", 425 "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries. This is a notable omission for a paper proposing a new evaluation framework." 426 }, 427 { 428 "flag": "Overgeneralized scope claims", 429 "detail": "The paper claims relevance to 'LLM Code Generation' broadly but evaluates exclusively on competitive programming problems from three platforms. Generalization to real-world software engineering, multi-file projects, or non-algorithmic tasks is unaddressed." 430 }, 431 { 432 "flag": "Information advantage not controlled", 433 "detail": "SAGA has access to both correct and incorrect human solutions, giving it strictly more information than baselines (Input-Interpreter, TestChain) that lack incorrect submissions. The improvement may partly stem from this information advantage rather than the analytical framework itself. The Appendix H.2 ablation partially addresses this but does not fully disentangle information quantity from analytical method." 434 }, 435 { 436 "flag": "No cost analysis", 437 "detail": "The paper does not report computational cost, API spend, or wall-clock time for any method. SAGA's multi-step prompting with analysis of 10 correct solutions and paired incorrect submissions per problem is likely substantially more expensive than baselines, but this trade-off is never quantified." 438 }, 439 { 440 "flag": "Self-evaluation bias", 441 "detail": "Authors evaluate their own SAGA system and distilled TCGCoder-7B against their own implementations of baselines without acknowledging potential author-evaluation bias (Lucic et al. 2018)." 
442 } 443 ], 444 "cited_papers": [ 445 { 446 "title": "Evaluating large language models trained on code", 447 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 448 "year": 2021, 449 "arxiv_id": "2107.03374", 450 "relevance": "Introduces HumanEval, a foundational LLM code generation benchmark whose sparse test suites (avg 7.7 tests/problem) are critiqued in this paper." 451 }, 452 { 453 "title": "Program synthesis with large language models", 454 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"], 455 "year": 2021, 456 "arxiv_id": "2108.07732", 457 "relevance": "Introduces MBPP benchmark with only 3 tests/problem, exemplifying the sparse test coverage problem analyzed in this work." 458 }, 459 { 460 "title": "Is your code generated by chatGPT really correct? Rigorous evaluation of large language models for code generation", 461 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 462 "year": 2023, 463 "relevance": "EvalPlus showed 15% pass rate drop with 80x more tests, directly motivating this work's investigation of test case quality." 464 }, 465 { 466 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 467 "authors": ["Naman Jain", "King Han", "Alex Gu"], 468 "year": 2024, 469 "arxiv_id": "2403.07974", 470 "relevance": "Major LLM code evaluation benchmark whose Input-Interpreter TCG approach and verifier weaknesses are central targets of this paper's analysis." 471 }, 472 { 473 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 474 "authors": ["DeepSeek-AI"], 475 "year": 2025, 476 "arxiv_id": "2501.12948", 477 "relevance": "Exemplifies RLVR frameworks for code generation that depend on verifier quality for accurate reward estimation." 
478 }, 479 { 480 "title": "Competitive programming with large reasoning models", 481 "authors": ["Ahmed El-Kishky", "Alexander Wei", "Andre Saraiva"], 482 "year": 2025, 483 "arxiv_id": "2502.06807", 484 "relevance": "O1-IOI demonstrates competitive programming performance with RL-based training, directly affected by verifier quality issues studied here." 485 }, 486 { 487 "title": "Large language models as test case generators: Performance evaluation and enhancement", 488 "authors": ["Kefan Li", "Yuan Yuan"], 489 "year": 2024, 490 "arxiv_id": "2404.13340", 491 "relevance": "TestChain baseline for direct LLM-based test case generation, evaluated against SAGA in Table 2." 492 }, 493 { 494 "title": "AceCoder: Acing coder RL via automated test-case synthesis", 495 "authors": ["Huaye Zeng", "Dongfu Jiang", "Haozhe Wang"], 496 "year": 2025, 497 "arxiv_id": "2502.01718", 498 "relevance": "Proposes automated test synthesis for RL-based code generation training, representing the direct generation TCG paradigm." 499 }, 500 { 501 "title": "TESTEVAL: Benchmarking large language models for test case generation", 502 "authors": ["Wenhan Wang", "Chenyuan Yang", "Zhijie Wang"], 503 "year": 2024, 504 "arxiv_id": "2406.04531", 505 "relevance": "Dedicated TCG benchmark that tailors tests to specific solutions, representing an alternative evaluation approach for LLM test generation." 506 }, 507 { 508 "title": "Competition-level code generation with AlphaCode", 509 "authors": ["Yujia Li", "David H. Choi", "Junyoung Chung"], 510 "year": 2022, 511 "arxiv_id": "2203.07814", 512 "relevance": "Demonstrates competition-level code generation with large-scale sampling and filtering, relevant to understanding code evaluation methodology." 
513 }, 514 { 515 "title": "Dynamic scaling of unit tests for code reward modeling", 516 "authors": ["Zeyao Ma", "Xiaokang Zhang", "Jing Zhang"], 517 "year": 2025, 518 "arxiv_id": "2501.01054", 519 "relevance": "CodeRM proposes dynamic test quantity scaling by difficulty for reward modeling, representing direct generation TCG for RL training." 520 }, 521 { 522 "title": "LEVER: Learning to verify language-to-code generation with execution", 523 "authors": ["Ansong Ni", "Srini Iyer", "Dragomir Radev"], 524 "year": 2023, 525 "relevance": "Proposes learned verifiers for code generation, representing an alternative approach to test-suite-based verification." 526 } 527 ], 528 "engagement_factors": { 529 "practical_relevance": { 530 "score": 2, 531 "justification": "SAGA is a concrete TCG framework with released prompts that could be adopted by benchmark builders and RLVR practitioners, though not a one-click tool." 532 }, 533 "surprise_contrarian": { 534 "score": 1, 535 "justification": "The finding that benchmark test suites are weak is somewhat known; the systematic quantification and theoretical bound on detection rate saturation adds rigor but is not paradigm-shifting." 536 }, 537 "fear_safety": { 538 "score": 0, 539 "justification": "No AI risk, security, or safety concerns are raised. The paper focuses on evaluation methodology." 540 }, 541 "drama_conflict": { 542 "score": 1, 543 "justification": "The paper calls out LiveCodeBench's verifiers as flawed (20-40% missed errors) which has a mild 'benchmarks are broken' angle, but the framing is constructive rather than adversarial." 544 }, 545 "demo_ability": { 546 "score": 1, 547 "justification": "GitHub repo with data demo and prompts is provided, but no pip-installable tool or live demo for trying SAGA directly." 
548 }, 549 "brand_recognition": { 550 "score": 1, 551 "justification": "Shanghai AI Laboratory (OpenCompass team) is recognized in the Chinese AI research community but not a top-tier global brand like OpenAI or Google DeepMind." 552 } 553 } 554 }