scan.json (25697B)
1 { 2 "paper": { 3 "title": "Assessing Correctness in LLM-Based Code Generation via Uncertainty Estimation", 4 "authors": ["Arindam Sharma", "Cristina David"], 5 "year": 2025, 6 "venue": "arXiv preprint (under review)", 7 "arxiv_id": "2502.11620" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. There is no mention of code availability." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses LiveCodeBench, a publicly available benchmark (cited as [20], with arXiv link https://arxiv.org/abs/2403.07974). The dataset is publicly accessible and was not modified." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions 'Ubuntu 20.04.5 LTS (Focal Fossa) with one NVIDIA A100 GPU (80GB)' and the Crosshair tool [7], but does not provide a requirements.txt, Dockerfile, or detailed dependency/library version listing sufficient to recreate the environment." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The experimental setup section describes the methodology but does not give executable instructions." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results in Tables 2, 3, and 4 report point estimates (Pearson correlation coefficients, accuracy percentages) without confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper reports p-values alongside Pearson correlation coefficients in Table 2 and Table 4, and highlights statistically significant results. 
This constitutes significance testing for their comparative claims." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Pearson correlation coefficients are reported (e.g., -0.56, -0.51) alongside p-values, providing the magnitude of the effect. Abstention metrics in Table 3 report accuracy percentages, false positive rates (e.g., 0.01%), and false negative rates (e.g., 11.9%-20.3%), giving context for the magnitude of improvement." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper uses 831 problems from LiveCodeBench split by difficulty (Easy: 239, Medium: 332, Hard: 260) but does not justify why this sample size is adequate for the correlation analyses, nor is any power analysis discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. The 2-fold cross-validation for abstention policies is mentioned but results are reported as single numbers without variance across folds." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares symbolic clustering against multiple baselines: NLG embeddings (DeBERTa), code embeddings (GPT text-embedding-ada-002), CodeBLEU, and raw LLM probability. SE-CODEEMBED and LLM-PROBABILITY serve as explicit baselines in Table 3." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The baselines are drawn from recent state-of-the-art NLG uncertainty estimation methods: Kuhn et al. (ICLR 2023), Abbasi et al. (NeurIPS 2024), and OpenAI's text-embedding-ada-002 embeddings. These are contemporary and competitive." 
69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 2 functions as a systematic ablation: it tests the same uncertainty estimation frameworks (SE, MI, CC) with different clustering methods (NLG, CODEEMBED, CODEBLEU, SYMB) and distribution assumptions (norm vs. uniform), isolating the contribution of symbolic clustering and token probabilities." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper uses Pearson correlation (for uncertainty-correctness relationship) and abstention policy metrics (accuracy, false positive rate, false negative rate) as distinct evaluation dimensions." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "The paper evaluates code correctness via automated test suites from the LiveCodeBench benchmark. Human evaluation is not relevant to the claims, which concern statistical correlation between uncertainty estimates and automated correctness scores." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "For abstention policies, the paper uses a 50/50 train/validation split with 2-fold cross-validation, alternating between training and validation sets. The correlation analysis in Table 2 uses the full dataset without tuning." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by difficulty level (Easy, Medium, Hard) for all three models in Tables 2, 3, and 4. Table 1 provides model-specific statistics per difficulty level." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses failure cases: NLG, CODEEMBED, and CODEBLEU clustering methods all fail to achieve significant correlation. The motivating example in Section 3 illustrates a specific failure where embedding-based and CodeBLEU-based clustering incorrectly separates equivalent programs. 
The Limitations section discusses false positives from bounded symbolic execution." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Negative results are prominently reported: SE-NLG, SE-CODEEMBED, SE-CODEBLEU, MI-NLG, MI-CODEEMBED, MI-CODEBLEU, CC-CODEEMBED, and LLM-PROBABILITY all fail to achieve statistically significant correlation with correctness (Table 2). The paper centers on explaining and overcoming these negative results." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims: (1) existing NLG uncertainty techniques fail for code — supported by Table 2 (non-significant p-values for NLG/CODEEMBED/CODEBLEU variants); (2) symbolic clustering restores predictive power — supported by significant correlations for SYMB variants; (3) uniform distribution performs comparably — supported by comparing SYMBNORM vs. SYMBUNIFORM; (4) false positive rate below 0.02% — supported by Table 3." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper's main causal claim is that semantic clustering is the key factor enabling correlation between uncertainty and correctness. This is justified through controlled ablations: the same uncertainty framework (SE, MI) is tested with different clustering methods while holding other variables constant (Table 2). The ablation design supports the causal claim." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title claims to address 'LLM-Based Code Generation' broadly, but results are only on three models and one benchmark (LiveCodeBench, which consists of Python-focused competitive programming problems). The paper does not bound its claims to this specific setting. Section 7 (Conclusion) refers to 'code generation' generally without qualification." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for why symbolic clustering works (e.g., could the bounded equivalence checking be introducing systematic biases? Could the timeout-based assumption of equivalence inflate cluster merging?). The Limitations section discusses bounded symbolic execution but does not consider alternative explanations for the observed correlations." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper specifies 'gpt-3.5-turbo-instruct', 'DeepSeek-R1', and 'claude-3-7-sonnet' but does not provide snapshot dates or API version identifiers for any of these. 'gpt-3.5-turbo-instruct' has no version suffix. 'DeepSeek-R1' and 'claude-3-7-sonnet' are marketing names without version pinning." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper states 'We use the natural language description in the query provided to the LLM' (Section 5) but does not provide the actual prompt text, system instructions, or any wrapping around the problem description. The iterative prompting approach (Appendix B) is described conceptually but the actual prompts are not shown." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper reports: M=5 sampled responses, 2 iterations for iterative prompting, Crosshair per-condition timeout of 5s, per-path timeout of 5s, overall equivalence timeout of 10s, correctness test case timeout of 5s, and 90% correctness score threshold. However, LLM sampling parameters (temperature, top-p) are not explicitly stated." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "The paper does not use agentic scaffolding. 
The LLMs are queried directly for code generation; the symbolic execution pipeline is a post-processing analysis step, not an agentic scaffold." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "The paper documents the data pipeline: LiveCodeBench problems are used as prompts, 5 responses are sampled per problem, the top-ranked response is tested against test cases, correctness scores are computed as percentage of passing tests. For abstention, downsampling of incorrect responses and 2-fold cross-validation are described (Appendix C.2)." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "A dedicated 'Limitations' subsection appears at the end of Section 5.1, discussing bounded symbolic execution and the limited set of NLG techniques evaluated." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The limitations are specific to this study: (1) bounded symbolic execution may cause false positives in equivalence checking because behavioral differences may only manifest beyond the unrolling limit; (2) only two NLG-based techniques (Kuhn et al. and Abbasi et al.) are evaluated, mitigated by their prominence in the literature." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to Python, competitive programming, or the specific models tested. The limitations discuss technical constraints of symbolic execution but do not state, for instance, that results may not generalize to other programming languages, real-world codebases, or non-competitive-programming tasks." 
170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw data (generated code samples, uncertainty scores, correctness scores per problem) is released. Only aggregated statistics in tables are shown." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 5 describes data collection: 831 problems from LiveCodeBench, categorized by difficulty, with natural language descriptions used as prompts and I/O test cases for evaluation. Table 1 provides detailed statistics per model and difficulty level." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. The data source is a standard public benchmark (LiveCodeBench)." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline is documented: (1) sample 5 responses per problem from each LLM, (2) compute token-level log-probabilities or apply uniform distribution, (3) perform symbolic execution for clustering, (4) compute uncertainty scores, (5) compute correctness scores via test cases. Timeouts and thresholds are specified. The downsampling procedure for GPT-3.5-turbo-instruct is documented in Appendix C.2." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding sources, grants, or sponsors are mentioned anywhere in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Both authors are affiliated with the University of Bristol, clearly stated on the first page. No evaluated product is from Bristol, so there is no product-affiliation conflict." 
204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is disclosed, so independence cannot be assessed. The absence of any funding disclosure makes this NO rather than NA — funding may exist but is undisclosed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement or financial interests declaration is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper does not state the training data cutoff dates for any of the three models (gpt-3.5-turbo-instruct, DeepSeek-R1, claude-3-7-sonnet). This is relevant because they evaluate model performance on LiveCodeBench." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "The paper addresses contamination by choosing LiveCodeBench specifically because it is described as a 'contamination-free benchmark' (Section 5). LiveCodeBench is designed with temporally-gated problems to avoid train/test overlap." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "LiveCodeBench is explicitly chosen as a 'contamination-free benchmark for code-related tasks' (Section 5). The benchmark is designed to mitigate contamination by sourcing problems that postdate model training cutoffs." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 
248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The approach requires multiple LLM calls (5 samples per problem, plus iterative prompting for MI variants) and symbolic execution per pair of snippets. No API costs, token counts, or wall-clock times per problem are reported." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "The paper mentions using an NVIDIA A100 GPU (80GB) but does not state total GPU hours, API costs, or overall computational budget. Only per-pair symbolic execution timeouts (10s) are mentioned." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Existing NLG-based uncertainty estimation techniques (semantic entropy and mutual information) fail to exhibit statistically significant correlation with correctness when applied to code generation, even with code-specific embeddings or CodeBLEU.", 286 "evidence": "Table 2 shows non-significant p-values (all >0.05) for SE-NLG, SE-CODEEMBED, SE-CODEBLEU, MI-NLG, MI-CODEEMBED, MI-CODEBLEU, and CC-CODEEMBED across all difficulty levels and both models.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Symbolic clustering via symbolic execution enables NLG-based uncertainty techniques to achieve statistically significant correlation with correctness in code generation.", 291 "evidence": "Table 2 shows all SYMB variants (SE-SYMBNORM, SE-SYMBUNIFORM, MI-SYMBNORM, MI-SYMBUNIFORM, CC-SYMB) achieve p-values well below 0.05 with Pearson coefficients ranging from -0.19 to -0.56.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Token-level log-probabilities are not essential; uncertainty estimates assuming uniform distributions perform comparably to those using actual log-probabilities.", 296 "evidence": "Table 2 shows SE-SYMBUNIFORM and MI-SYMBUNIFORM achieve comparable or near-comparable correlation coefficients to SE-SYMBNORM and MI-SYMBNORM. For example, on Easy problems with gpt-3.5-turbo-instruct, SE-SYMBNORM achieves -0.56 vs. 
SE-SYMBUNIFORM at -0.51.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Abstention policies based on symbolic clustering achieve false positive rates below 0.02%.", 301 "evidence": "Table 3 reports false positive rates of 0.01% for SE-SYMBNORM, 0.01% for SE-SYMBUNIFORM, and 0.02% and 0.01% for CC-SYMB, for gpt-3.5-turbo-instruct and DeepSeek-R1; the 0.02% reported for CC-SYMB sits at, rather than strictly below, the 0.02% bound.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "The symbolic cluster count (CC-SYMB), despite its simplicity, correlates with correctness and performs on par with more complex information-theoretic approaches.", 306 "evidence": "Table 2 shows CC-SYMB achieving correlation coefficients comparable to SE-SYMBNORM (e.g., -0.51 vs. -0.56 on Easy, -0.44 vs. -0.52 on Medium for gpt-3.5-turbo-instruct). Table 3 shows comparable abstention metrics.", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "The paper demonstrates that existing NLG uncertainty estimation techniques fail to correlate with correctness in code generation because learned embeddings and heuristic metrics cannot capture functional equivalence of programs. Symbolic clustering via symbolic execution restores this correlation, and a simple symbolic cluster count metric performs comparably to complex information-theoretic approaches. Abstention policies based on symbolic clustering achieve near-zero false positive rates (below 0.02%) with false negative rates of 11.9-20.3%, tested across three LLMs and 831 LiveCodeBench problems.", 312 "red_flags": [ 313 { 314 "flag": "No code or raw data released", 315 "detail": "Despite proposing a novel technique (symbolic clustering for uncertainty estimation), no implementation code, generated samples, or raw experimental data are released. This prevents independent verification of results." 316 }, 317 { 318 "flag": "Missing LLM sampling parameters", 319 "detail": "Temperature, top-p, and other sampling parameters for the three LLMs are not reported. 
These significantly affect the diversity of generated code and thus the clustering results. The paper mentions 'sampling M code snippets' but does not specify the sampling configuration." 320 }, 321 { 322 "flag": "No variance or reproducibility measures", 323 "detail": "All results are reported as single numbers without standard deviations or confidence intervals. The 2-fold cross-validation for abstention is mentioned but fold-level results are not shown. There is no indication of whether results are stable across random seeds or sampling runs." 324 }, 325 { 326 "flag": "Unbounded generalization claims", 327 "detail": "The paper claims to address 'LLM-Based Code Generation' in the title but only evaluates on Python competitive programming problems from LiveCodeBench. There is no discussion of whether symbolic clustering generalizes to other programming languages, real-world codebases, or non-algorithmic tasks." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Semantic uncertainty: Linguistic invariances for uncertainty estimation in natural language generation", 333 "authors": ["Lorenz Kuhn", "Yarin Gal", "Sebastian Farquhar"], 334 "year": 2023, 335 "relevance": "Foundational work on semantic entropy for NLG uncertainty estimation, which this paper adapts to code generation." 336 }, 337 { 338 "title": "To believe or not to believe your LLM: Iterative prompting for estimating epistemic uncertainty", 339 "authors": ["Yasin Abbasi-Yadkori", "Ilja Kuzborskij", "András György", "Csaba Szepesvari"], 340 "year": 2024, 341 "relevance": "State-of-the-art mutual information approach for epistemic uncertainty estimation in LLMs, adapted in this paper for code." 
342 }, 343 { 344 "title": "Detecting hallucinations in large language models using semantic entropy", 345 "authors": ["Sebastian Farquhar", "Jannik Kossen", "Lorenz Kuhn", "Yarin Gal"], 346 "year": 2024, 347 "relevance": "Extension of semantic entropy to hallucination detection in LLMs, closely related to correctness estimation." 348 }, 349 { 350 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 351 "authors": ["Naman Jain", "King Han", "Alex Gu"], 352 "year": 2024, 353 "arxiv_id": "2403.07974", 354 "relevance": "Contamination-free code generation benchmark used for evaluation in this paper." 355 }, 356 { 357 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 358 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 359 "year": 2025, 360 "arxiv_id": "2501.12948", 361 "relevance": "State-of-the-art open-source reasoning model evaluated in this paper for code generation." 362 }, 363 { 364 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 365 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 366 "year": 2024, 367 "relevance": "Large-scale evaluation of LLM-generated code correctness using external oracles, relevant to the broader code quality assessment space." 368 }, 369 { 370 "title": "Look before you leap: An exploratory study of uncertainty analysis for large language models", 371 "authors": ["Yuheng Huang", "Jiayang Song", "Zhijie Wang"], 372 "year": 2025, 373 "relevance": "Explores lightweight uncertainty proxies (e.g., CodeBLEU) for code generation and finds them ineffective, directly motivating this paper's approach." 
374 }, 375 { 376 "title": "AutoCodeRover: Autonomous program improvement", 377 "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"], 378 "year": 2024, 379 "doi": "10.1145/3650212.3680384", 380 "relevance": "Autonomous program repair system relevant to the broader agentic AI coding landscape." 381 }, 382 { 383 "title": "Automated repair of programs from large language models", 384 "authors": ["Zhiyu Fan", "Xiang Gao", "Martin Mirchev", "Abhik Roychoudhury", "Shin Hwei Tan"], 385 "year": 2023, 386 "doi": "10.1109/ICSE48619.2023.00128", 387 "relevance": "LLM-based automated program repair, relevant to code quality and correctness assessment." 388 }, 389 { 390 "title": "Exploring and evaluating hallucinations in LLM-powered code generation", 391 "authors": ["Fang Liu", "Yang Liu", "Lin Shi"], 392 "year": 2024, 393 "relevance": "Establishes taxonomy of hallucinations in LLM-generated code, directly related to correctness estimation." 394 }, 395 { 396 "title": "De-Hallucinator: Iterative grounding for LLM-based code completion", 397 "authors": ["Aryaz Eghbali", "Michael Pradel"], 398 "year": 2024, 399 "relevance": "Technique for improving LLM code reliability through iterative grounding, related to oracle-free quality estimation." 400 } 401 ] 402 }