scan.json (26292B)
1 { 2 "paper": { 3 "title": "Inference Scaling fLaws: The Limits of LLM Resampling with Imperfect Verifiers", 4 "authors": ["Benedikt Stroebl", "Sayash Kapoor", "Arvind Narayanan"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2411.17501", 8 "doi": "10.48550/arXiv.2411.17501" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "Appendix B states 'We release code to reproduce all experimental results of this paper in a GitHub repository' with a link to https://github.com/benediktstroebl/inference-scaling-limits. A Colab notebook is also provided." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The paper states the GitHub repository 'also contains all code samples for all models used in our experiments' (Appendix B). Benchmarks used (HumanEval+, MBPP+) are publicly available." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "No mention of environment specifications, requirements.txt, or dependency versions in the paper." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "While code is released and a Colab notebook is provided, the paper itself does not contain step-by-step reproduction instructions. It references the repository and notebook but does not detail how to run the experiments." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": true, 39 "justification": "Figure 13 shows upper and lower bounds for the conditional accuracy analysis. The paper also reports error bars for models with insufficient samples (e.g., Command-Light)." 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper makes comparative claims about weaker vs. 
stronger models but does not use any statistical significance tests. Comparisons are based on visual inspection of trends and point estimates." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper reports specific effect sizes throughout, e.g., 'the optimal number of samples is K ≤5' at cost-benefit ratio of 4, and quantifies the generalization gap with specific accuracy values for each model (Figs. 1, 3, 4)." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper generates 50-1000 samples per model per task but does not justify why these numbers are sufficient. No power analysis or sample size justification is provided." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Section 4 describes repeating the sampling process 1,000 times with random permutations and computing mean reward for each K, providing a measure of variance across runs." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper compares multiple models of varying capability (GPT-4o, Llama 3.1 family, Code Llama family, Command family, etc.) and uses GPT-4o as a reference baseline for the 'cutoff line' analysis." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "Models include GPT-4o, Llama 3.1 (2024), and Phi-3, which were contemporary at time of publication. Some older models (CodeGen, Code Llama) are included to show the capability spectrum." 72 }, 73 "ablation_study": { 74 "applies": false, 75 "answer": false, 76 "justification": "This paper is an analytical study of inference scaling limits, not a system with components to ablate." 
77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "The paper uses multiple metrics: Pass@1, conditional accuracy (pass extended tests given pass base tests), false positive rate, reward under cost-benefit ratios, and four code quality metrics (camelCase, snake_case, line length, commenting)." 82 }, 83 "human_evaluation": { 84 "applies": true, 85 "answer": true, 86 "justification": "Section 5 includes a qualitative manual analysis: 'we randomly sample 10 implementations across all models... Through manual analysis, we identified several recurring error types' with specific examples discussed." 87 }, 88 "held_out_test_set": { 89 "applies": true, 90 "answer": true, 91 "justification": "The core design uses HumanEval+ and MBPP+ extended test suites as held-out evaluation, with standard HumanEval/MBPP tests used as the 'verifier'. This is the paper's key methodological design." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results are broken down per model, per benchmark (HumanEval+ and MBPP+), per task difficulty category (easy/hard, Fig. 6), and per subset of tasks with poor unit tests (Appendix A.2)." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 5 and Appendix A.5.3 provide detailed qualitative analysis of false positive failure cases with specific code examples (Figs. 8, 19-30)." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "The entire paper reports a negative result about inference scaling. The paper also notes 'We have not been able to identify any intuitive reason for this difference' regarding varying false positive rate patterns across models (Section 4)." 
107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "All abstract claims are supported: the correlation between single-sample accuracy and false positive rate (Fig. 3), the finite optimal K (Fig. 4), and the lower quality of false positives (Fig. 7) are all empirically demonstrated." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper is careful with causal language, using hedging like 'We speculate that this is because weaker models' true understanding of the programming tasks is worse.' The core claims are about correlations and mathematical bounds, not causal mechanisms." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": true, 123 "justification": "The Limitations section explicitly bounds scope: 'Our experiments focus solely on repeated sampling in the context of coding tasks' and suggests 'other domains might exhibit different behavior.' The paper acknowledges prompt sensitivity and contamination as unexamined factors." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": true, 128 "justification": "The Discussion and Limitations sections consider alternatives: prompt sensitivity, benchmark contamination, model-generated vs human unit tests, and the possibility of mitigation strategies like PlanSearch or solution refinement." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper explicitly distinguishes between passing unit tests (proxy) and actual correctness (outcome), which is the central thesis. Section 2 discusses limitations of imperfect verifiers and the generalization gap between benchmark performance and real-world performance." 
134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": false, 140 "justification": "Models are referred to by marketing names (GPT-4o, Llama 3.1 8B/70B, Code Llama 7B/13B, etc.) without specific API versions or snapshot dates." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": true, 145 "justification": "Appendix A.5.2 (Fig. 18) provides the exact prompt templates used for each code readability metric. The main experiment uses prompts from the EvalPlus implementation, which is referenced." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Temperature is explicitly stated as 0.8 (Appendix A.2.1). The paper notes 'other than the temperature use their default settings' from the EvalPlus implementation." 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding is used. The paper evaluates direct model sampling with unit test verification." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Appendix A.2.1-A.2.3 document excluded tasks in detail: 21 tasks excluded by EvalPlus authors, 28 for oracle issues, 29 for test inconsistencies in MBPP+, and 14 in HumanEval+, with criteria stated for each." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 6 (Discussion) contains a dedicated 'Limitations' subsection discussing scope restrictions and unexamined factors." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "The Limitations section identifies specific threats: focus on coding tasks only, prompt sensitivity ('prompt engineering could influence false positive generation'), benchmark contamination not investigated, and mitigation strategies not tested." 
173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "The paper explicitly states what was not tested: 'we did not investigate how benchmark contamination contributes to our findings', 'we did not explore mitigation strategies', and lists specific domains that could differ (reasoning, web agents, agent-user interaction)." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "Appendix B states the GitHub repository 'contains all code samples for all models used in our experiments', making the raw data available for verification." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Appendix A.2.1 describes sample collection in detail: temperature 0.8, EvalPlus implementation, minimum 50 samples per model and task, with specific counts per model (200 for most, 1000 for Command-Light)." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants; data sources are standard benchmarks (HumanEval+, MBPP+)." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "The pipeline is documented: generate samples → filter through base unit tests → evaluate against extended test suites → classify as true/false positives. Task exclusion criteria and counts are detailed in Appendix A.2.1-A.2.3." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding sources are disclosed in the paper. There is no acknowledgments section mentioning grants or sponsors." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "All three authors are from Princeton University, clearly stated in the paper header. They are evaluating third-party models, not their own products." 
212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding information is disclosed, so independence cannot be assessed." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is included in the paper." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "The paper does not state the training data cutoff dates for any of the models evaluated, despite evaluating them on benchmarks that may have been in their training data." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "The Limitations section acknowledges 'we did not investigate how benchmark contamination contributes to our findings' but does not actually analyze train/test overlap." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "HumanEval (2021) and MBPP (2021) were published well before most models' training cutoffs. The paper acknowledges this as a limitation but does not address it: 'models could be overly optimized for passing the standard test cases.'" 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this study." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 
261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "Despite the paper's central argument about compute costs of resampling, no actual inference costs (API costs, wall-clock time, tokens consumed) are reported for the experiments." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "No total computational budget is stated. The paper generated at minimum 50 samples per model per task across multiple models and benchmarks but does not quantify the compute used." 288 } 289 }, 290 "experimental_rigor": { 291 "seed_sensitivity_reported": { 292 "applies": true, 293 "answer": true, 294 "justification": "Section 4 describes repeating the process 1,000 times with random permutations of sample order and computing mean reward, which tests sensitivity to ordering (a form of seed sensitivity for the resampling analysis)." 295 }, 296 "number_of_runs_stated": { 297 "applies": true, 298 "answer": true, 299 "justification": "The paper explicitly states '1,000 times' for the permutation experiment (Section 4) and specific sample counts per model (50-1000 samples, detailed in Appendix A.2.1)." 300 }, 301 "hyperparameter_search_budget": { 302 "applies": true, 303 "answer": false, 304 "justification": "No hyperparameter search was conducted or reported. The paper uses a fixed temperature of 0.8, leaving all other settings at the EvalPlus defaults, without justifying why this setting is appropriate or testing alternatives."
305 }, 306 "best_config_selection_justified": { 307 "applies": false, 308 "answer": false, 309 "justification": "The paper does not select a 'best' configuration; it uses standard benchmark settings throughout." 310 }, 311 "multiple_comparison_correction": { 312 "applies": false, 313 "answer": false, 314 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 315 }, 316 "self_comparison_bias_addressed": { 317 "applies": false, 318 "answer": false, 319 "justification": "The paper does not propose a new system; it analyzes existing models on established benchmarks. No self-comparison bias applies." 320 }, 321 "compute_budget_vs_performance": { 322 "applies": true, 323 "answer": true, 324 "justification": "The entire paper is about performance as a function of compute budget (number of samples K). Figures 4 and 14-16 show reward curves as a function of K, which is the core contribution." 325 }, 326 "benchmark_construct_validity": { 327 "applies": true, 328 "answer": true, 329 "justification": "The paper's central contribution IS a discussion of construct validity: it demonstrates that HumanEval/MBPP unit tests do not adequately measure correctness, and discusses how this gap misleads inference scaling claims. Section 6 further discusses this." 330 }, 331 "scaffold_confound_addressed": { 332 "applies": false, 333 "answer": false, 334 "justification": "No scaffolding is involved; the paper evaluates direct model sampling." 335 } 336 }, 337 "data_leakage": { 338 "temporal_leakage_addressed": { 339 "applies": true, 340 "answer": false, 341 "justification": "The paper acknowledges contamination as a limitation but does not address temporal leakage: HumanEval (2021) and MBPP (2021) predate all evaluated models' training, and this is not analyzed." 
342 }, 343 "feature_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether evaluation setup leaks information. The use of function signatures and docstrings as prompts could constitute feature leakage if models were trained on solutions, but this is not discussed." 347 }, 348 "non_independence_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of train/test independence. Some models may have been trained on code containing HumanEval/MBPP solutions." 352 }, 353 "leakage_detection_method": { 354 "applies": true, 355 "answer": false, 356 "justification": "No leakage detection or prevention method is applied. The Limitations section states 'we did not investigate how benchmark contamination contributes to our findings.'" 357 } 358 } 359 }, 360 "claims": [ 361 { 362 "claim": "Weaker models produce false positives at higher rates than stronger models, with a strong linear correlation between single-sample accuracy and false positive rate.", 363 "evidence": "Figure 3 shows a linear relationship on both HumanEval+ and MBPP+ across multiple model families (Cohere Command, GPT-4o, Llama 3.1, Code Llama). Section 3.", 364 "supported": "strong" 365 }, 366 { 367 "claim": "No amount of inference scaling of weaker models can enable them to match the single-sample accuracy of a sufficiently strong model when using imperfect verifiers.", 368 "evidence": "Figure 1a/3 shows models below the GPT-4o cutoff line cannot match GPT-4o's Pass@1 even with infinite compute. Mathematical argument in Section 3.", 369 "supported": "strong" 370 }, 371 { 372 "claim": "The optimal number of resampling attempts is finite and very low (K ≤ 5) under realistic cost-benefit assumptions.", 373 "evidence": "Figure 4 shows optimal K across Llama 3.1 and Code Llama families at various cost-benefit ratios. At ratio 4, K ≤ 5 for all models. 
The theoretical model in Appendix A.3 confirms this finding.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "False positive solutions have lower code quality (readability, naming conventions, commenting) compared to true positive solutions.", 378 "evidence": "Figure 7 and Figure 17 show consistent quality differences across all models and all four readability metrics tested. Section 5.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "The false positive rate increases with the number of sampling attempts K.", 383 "evidence": "Figure 5 shows FP rate as a function of K for Llama 3.1 70B and Code Llama 7B. Explained by bimodal task difficulty distribution (Figure 6). Section 4.", 384 "supported": "strong" 385 } 386 ], 387 "methodology_tags": ["benchmark-eval", "theoretical"], 388 "key_findings": "Resampling-based inference scaling with imperfect verifiers has fundamental limits: weaker models produce false positives at higher rates, creating an accuracy ceiling that no amount of compute can overcome. Under realistic cost-benefit assumptions where false positives have negative utility, the optimal number of samples is finite and very low (K ≤ 5). False positive solutions also exhibit lower code quality across readability metrics. The findings challenge the narrative that inference scaling can substitute for model capability.", 389 "red_flags": [ 390 { 391 "flag": "Contamination not addressed", 392 "detail": "HumanEval and MBPP were published in 2021, well before the training cutoffs of all evaluated models. Contamination could differentially affect stronger models (more training data = more exposure to solutions), potentially confounding the relationship between capability and false positive rates. The authors acknowledge this limitation but do not investigate it."
393 }, 394 { 395 "flag": "No cost/compute reporting despite cost-focused argument", 396 "detail": "Despite the paper arguing that inference scaling has diminishing returns, it reports no actual inference costs, API spend, or wall-clock times for the experiments. The cost-benefit analysis uses abstract ratios rather than empirical cost data." 397 } 398 ], 399 "cited_papers": [ 400 { 401 "title": "AI Agents That Matter", 402 "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S. Siegel", "Nitya Nadgir", "Arvind Narayanan"], 403 "year": 2024, 404 "arxiv_id": "2407.01502", 405 "relevance": "Prior work by same authors on cost-controlled evaluation of AI agents, directly relevant to inference scaling and benchmarking methodology." 406 }, 407 { 408 "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling", 409 "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Ehrlich"], 410 "year": 2024, 411 "arxiv_id": "2407.21787", 412 "relevance": "Key prior work on inference scaling laws that this paper challenges, showing empirical scaling of pass rates with sample count." 413 }, 414 { 415 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 416 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 417 "year": 2023, 418 "relevance": "Introduces HumanEval+ and MBPP+ benchmarks with extended test suites, the core evaluation infrastructure used in this paper." 419 }, 420 { 421 "title": "Evaluating Large Language Models Trained on Code", 422 "authors": ["Mark Chen"], 423 "year": 2021, 424 "arxiv_id": "2107.03374", 425 "relevance": "Original HumanEval benchmark paper, foundational to code generation evaluation." 
426 }, 427 { 428 "title": "Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters", 429 "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"], 430 "year": 2024, 431 "arxiv_id": "2408.03314", 432 "relevance": "Key paper on test-time compute scaling that this work's findings temper." 433 }, 434 { 435 "title": "Networks of Networks: Complexity Class Principles Applied to Compound AI Systems Design", 436 "authors": ["Jared Quincy Davis", "Boris Hanin"], 437 "year": 2024, 438 "arxiv_id": "2407.16831", 439 "relevance": "Theoretical framework for verifier-based compound AI systems that this paper builds upon." 440 }, 441 { 442 "title": "Are More LLM Calls All You Need? Towards Scaling Laws of Compound Inference Systems", 443 "authors": ["Lingjiao Chen"], 444 "year": 2024, 445 "arxiv_id": "2403.02419", 446 "relevance": "Studies scaling laws for compound inference systems including majority voting limitations." 447 }, 448 { 449 "title": "Archon: An Architecture Search Framework for Inference-Time Techniques", 450 "authors": ["Jon Saad-Falcon"], 451 "year": 2024, 452 "arxiv_id": "2409.15254", 453 "relevance": "Framework combining multiple inference scaling techniques including fusion and ranking." 454 }, 455 { 456 "title": "Beyond Correctness: Benchmarking Multi-dimensional Code Generation for Large Language Models", 457 "authors": ["Jiasheng Zheng"], 458 "year": 2024, 459 "arxiv_id": "2407.11470", 460 "relevance": "Provides code quality evaluation framework (readability metrics) used in Section 5 of this paper." 461 }, 462 { 463 "title": "Generative Verifiers: Reward Modeling as Next-Token Prediction", 464 "authors": ["Lunjun Zhang", "Arian Hosseini"], 465 "year": 2024, 466 "arxiv_id": "2408.15240", 467 "relevance": "Proposes generative reward models as verifiers for inference scaling, one of the imperfect verification approaches analyzed." 
468 }, 469 { 470 "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models", 471 "authors": ["Xuezhi Wang"], 472 "year": 2023, 473 "arxiv_id": "2203.11171", 474 "relevance": "Foundational work on majority voting as inference scaling technique, compared in Table 1." 475 }, 476 { 477 "title": "Reflexion: language agents with verbal reinforcement learning", 478 "authors": ["Noah Shinn"], 479 "year": 2023, 480 "relevance": "Key paper on LLM self-reflection and critique as inference scaling technique, categorized in Table 1." 481 } 482 ] 483 }