scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22219B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Inference Scaling fLaws: The Limits of LLM Resampling with Imperfect Verifiers",
      6     "authors": [
      7       "Benedikt Stroebl",
      8       "Sayash Kapoor",
      9       "Arvind Narayanan"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2411.17501",
     14     "doi": "10.48550/arXiv.2411.17501"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims are substantiated: the correlation between single-sample accuracy and false positive rate is shown in Figures 3a/3b across multiple model families, the finite optimal K claim is supported by Figure 4, and the no-free-lunch argument is backed by both empirical results and the formal model in Appendix A.3.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The core causal claim — that resampling 'cannot decrease' false positive probability — is mathematically justified by the memoryless nature of i.i.d. sampling. Observational claims about weaker models having higher false positive rates are presented as correlations with supporting figures, not asserted as mechanistic causation.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper explicitly scopes its empirical claims to coding benchmarks (HumanEval+, MBPP+) with unit test verifiers, and the limitations section explicitly names reasoning and web agents as out-of-scope extensions requiring future work.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Alternative explanations including benchmark contamination, prompt sensitivity, and task ambiguity in edge cases are explicitly discussed in the Limitations paragraph of Section 6, even though they are not empirically investigated.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly distinguishes between what is measured (passing extended HumanEval+/MBPP+ test suites) and what is claimed (correctness), noting 'we assume that solutions that pass the full set of tests are correct. If this assumption is not true, the generalization gaps that we reveal only grow bigger.'",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "A dedicated Limitations paragraph appears in Section 6 (Discussion) covering restriction to coding tasks, prompt sensitivity, benchmark contamination, and absence of mitigation strategy exploration.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are named: experiments cover only repeated sampling for coding tasks with human-written unit tests; prompt sensitivity is cited with specific prior work; benchmark contamination could cause models to overfit to standard test cases; diversity-inducing alternatives like PlanSearch were not tested.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Explicit scope boundaries are stated: 'Our experiments focus solely on repeated sampling in the context of coding tasks.' Section 6 names specific out-of-scope domains (reasoning, web agents, agent-user interaction) for future work.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding sources are disclosed anywhere in the paper. The Acknowledgments section thanks only individuals who gave feedback on drafts, with no mention of grants, institutional support, or funding agencies.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are clearly affiliated with Princeton University, as stated on the title page and in the contact footnote.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed; this criterion is not applicable.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear anywhere in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined in Section 2: 'verifier,' 'false positive,' 'oracle verifier,' 'imperfect verifier,' 'completeness,' 'soundness,' and 'inference scaling' (explicitly distinguished from training-time improvements) are all given precise definitions with examples.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Four explicit contributions are enumerated in the introduction: a review of inference scaling techniques, demonstration of the generalization gap, empirical analysis of optimal sample count with cost modeling, and evidence that false positives degrade code quality beyond correctness.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper provides structured surveys of inference scaling techniques (Table 1) and verification methods (Table 2), explicitly positions against Brown et al.'s 'Large Language Monkeys,' builds on Davis et al.'s verifier framework, and directly states findings that weaken its own prior work (Kapoor et al.).",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "theoretical": {
    118       "formal_quality": {
    119         "assumptions_stated_explicitly": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Appendix A.3 explicitly states all model assumptions: bimodal task difficulty (easy T1, hard T2), prior probabilities p1/p2, correctness probabilities r1/r2, verifier completeness c, and soundness s, with all parameter values given numerically in Table 3.",
    123           "source": "haiku"
    124         },
    125         "proofs_complete_or_sketched": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The theoretical model provides complete derivations for rejection probability (Eq. 1), Bayesian belief updates after rejections (Eq. 2), true/false positive probabilities (Eqs. 3-4), expected value per attempt (Eq. 5), and total reward (Eq. 6), with a Python notebook implementation released for verification.",
    129           "source": "haiku"
    130         },
    131         "bounds_tight_or_discussed": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "The key bound — that Pweak(Correct|Pass Verifier) caps the accuracy achievable through resampling — is mathematically exact, not an approximation. The paper discusses when this bound is binding via the condition Pstrong(Correct) > Pweak(Correct|Pass Verifier) and shows empirically where models fall relative to this threshold.",
    135           "source": "haiku"
    136         },
    137         "counterexamples_explored": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Edge cases are explored: cost-benefit ratio of 10 where K_opt = 0 for almost all models (Figure 15); dramatically different false positive rate growth for Llama 3.1 70B versus Code Llama families (Section 4); and tasks with unit test precision below 90% (Appendix A.2) showing more pronounced generalization gaps.",
    141           "source": "haiku"
    142         },
    143         "notation_consistent": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Notation in Appendix A.3 is introduced systematically and used consistently throughout: β_i for rejection probabilities, p^(k)_Ti for posterior beliefs, P^(k)_TP/FP for solution probabilities, EV_k for expected value per step, without symbol overloading.",
    147           "source": "haiku"
    148         },
    149         "constructive_vs_existence_noted": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "All results are fully constructive: Equations 1-6 allow computing K_opt for any given parameter set, and the authors release a Python Colab notebook that implements the model, making the computations reproducible and applicable to other benchmarks.",
    153           "source": "haiku"
    154         }
    155       },
    156       "connections": {
    157         "connection_to_practice_discussed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The cost-benefit framework is explicitly designed for real deployment decisions, with discussion of security-sensitive applications where false positives represent vulnerabilities; code quality findings (naming conventions, line length) are directly actionable; and Section 6 addresses implications for training methodology and verifier development.",
    161           "source": "haiku"
    162         },
    163         "relationship_to_prior_work_clear": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "The paper explicitly builds on Davis et al.'s verifier framework, challenges Brown et al.'s 'Large Language Monkeys' scaling claims, and states in Section 6 that 'Our findings weaken the support for our previous paper's claim [Kapoor et al.]' — an unusual and clear positioning relative to prior work including the authors' own.",
    167           "source": "haiku"
    168         },
    169         "computational_complexity_discussed": {
    170           "applies": false,
    171           "answer": false,
    172           "justification": "The theoretical model involves only simple expected value computations with no non-trivial algorithmic complexity; formal complexity analysis is not relevant to this work's contribution.",
    173           "source": "haiku"
    174         },
    175         "limitations_of_formal_model_stated": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Limitations of the formal model are stated: the bimodal task difficulty assumption is an idealization calibrated only on HumanEval+ (Table 3); the paper acknowledges human-written unit tests may not match LLM-generated test scenarios in real deployments; and the model's applicability to other benchmarks and domains is left as future work.",
    179           "source": "haiku"
    180         }
    181       }
    182     }
    183   },
    184   "claims": [
    185     {
    186       "claim": "Weaker models produce false positives (solutions that pass standard unit tests but fail extended tests) at higher rates than stronger models, and single-sample accuracy is linearly related to the accuracy achievable with an infinite compute budget",
    187       "evidence": "Figures 3a and 3b show this relationship empirically across Cohere Command, GPT-4o, and Llama 3.1 families on both HumanEval+ and MBPP+",
    188       "supported": "strong"
    189     },
    190     {
    191       "claim": "For models below GPT-4o's single-sample accuracy, no amount of resampling can match GPT-4o's Pass@1, because GPT-4o's accuracy exceeds the weaker model's accuracy conditioned on passing unit tests",
    192       "evidence": "Horizontal cutoff line in Figure 3a shows all models below the line cannot match GPT-4o through resampling; result replicated on MBPP+ in Figure 3b",
    193       "supported": "strong"
    194     },
    195     {
    196       "claim": "The optimal number of resampling attempts is finite and very low (K≤5 for cost-benefit ratio of 4) even with zero computational cost, because false positive rate increases with K",
    197       "evidence": "Figure 4 shows inference scaling curves with cost modeling across Llama 3.1 and Code Llama families; Figure 5 confirms that false positive rate rises monotonically with K",
    198       "supported": "strong"
    199     },
    200     {
    201       "claim": "The increasing false positive rate with K is explained by bimodal task difficulty: easy tasks are solved quickly, leaving only hard tasks that disproportionately generate false positives",
    202       "evidence": "Figure 6 shows strongly bimodal task difficulty distributions for Llama 3.1 8B and GPT-4o on both benchmarks; Appendix A.3 formalizes this mechanism in the theoretical model",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "False positive solutions are lower quality across all four code readability metrics (camelCase, snake_case, line length, commenting) compared to true positive solutions, consistently across all tested models",
    207       "evidence": "Figure 7 and Figure 17 show this pattern consistently across all models and metrics on HumanEval+",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "Using imperfect verifiers as evaluation ground truth creates misleading model comparisons that are invisible when using only standard benchmarks",
    212       "evidence": "Section 2 and the empirical results show that gaps between models visible in HumanEval+/MBPP+ would not appear if HumanEval/MBPP were used for both verification and evaluation",
    213       "supported": "moderate"
    214     }
    215   ],
    216   "methodology_tags": [
    217     "benchmark-eval",
    218     "theoretical"
    219   ],
    220   "key_findings": "Inference scaling via resampling with imperfect verifiers (unit tests with limited coverage) has a fundamental, mathematically unavoidable ceiling: weaker models produce false positives at higher rates than stronger models, meaning resampling cannot close the capability gap. The optimal number of resampling attempts is empirically very low — often under 5, sometimes zero — because false positive rates increase rather than decrease as sampling continues, driven by the bimodal distribution of task difficulty (easy tasks are solved quickly; harder remaining tasks are disproportionately susceptible to false positives). False positive solutions are also degraded on code quality metrics beyond functional correctness, and this degradation is more severe for weaker models.",
    221   "red_flags": [
    222     {
    223       "flag": "Single-domain scope overclaimed",
    224       "detail": "All experiments are on Python coding benchmarks (HumanEval, MBPP) with unit tests. The abstract and introduction suggest broad applicability to 'domains such as reasoning or coding,' but no reasoning experiments are run; generalization is asserted, not demonstrated."
    225     },
    226     {
    227       "flag": "Extended test suite assumed perfect oracle",
    228       "detail": "The paper explicitly assumes 'solutions that pass the full set of tests are correct' (Section 3). If extended tests have their own false positives, measured generalization gaps are lower bounds, not exact values — and the paper identifies real oracle issues (e.g., 57 tasks excluded from MBPP+ due to harness bugs)."
    229     },
    230     {
    231       "flag": "Benchmark contamination uninvestigated",
    232       "detail": "The authors acknowledge models 'could be overly optimized for passing the standard test cases' due to contamination, but this alternative explanation for the observed higher false positive rates in weaker models is left uninvestigated."
    233     },
    234     {
    235       "flag": "Cost-benefit ratios hypothetical",
    236       "detail": "The cost-benefit ratios (0, 1, 2, 4, 8) used in Figure 4 are hypothetical; the paper does not measure actual deployment costs of false-positive code for any real application, making the 'optimal K' guidance approximate rather than actionable."
    237     },
    238     {
    239       "flag": "Paper misclassified as theoretical",
    240       "detail": "Sections 3, 4, and 5 are empirical studies (50-1000 samples per model per task); the formal theoretical model is confined to Appendix A.3 and is secondary. This paper would more accurately be classified as empirical with a theoretical supplement."
    241     }
    242   ],
    243   "cited_papers": [
    244     {
    245       "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    246       "relevance": "Primary target of this paper's critique; directly challenged by the finding that resampling with imperfect verifiers cannot indefinitely improve accuracy"
    247     },
    248     {
    249       "title": "AI Agents That Matter",
    250       "relevance": "Prior work by the same authors on cost-controlled evaluation; this paper explicitly weakens some of its claims"
    251     },
    252     {
    253       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    254       "relevance": "Source of HumanEval+ and MBPP+ benchmarks and evaluation implementation used throughout all experiments"
    255     },
    256     {
    257       "title": "Are More LLM Calls All You Need? Towards Scaling Laws of Compound Inference Systems",
    258       "relevance": "Related work demonstrating limitations of majority voting in inference scaling; cited alongside this paper as evidence that known limitations exist across techniques"
    259     },
    260     {
    261       "title": "Networks of Networks: Complexity Class Principles Applied to Compound AI Systems Design",
    262       "relevance": "The verifier-based judge setup that this paper's formal theoretical model explicitly builds upon"
    263     },
    264     {
    265       "title": "Beyond Correctness: Benchmarking Multi-dimensional Code Generation for Large Language Models",
    266       "relevance": "Source of code readability metrics and prompt templates used in Section 5's code quality analysis of false positive solutions"
    267     },
    268     {
    269       "title": "Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters",
    270       "relevance": "Related work on optimal test-time compute allocation; cited in Table 1 as complementary perspective on inference scaling techniques"
    271     },
    272     {
    273       "title": "Reflexion: language agents with verbal reinforcement learning",
    274       "relevance": "Key self-refinement inference scaling technique surveyed in Table 1; its known limitations (it hurts performance under uncertainty) are contrasted with resampling"
    275     }
    276   ],
    277   "engagement_factors": {
    278     "practical_relevance": {
    279       "score": 3,
    280       "justification": "Directly tells practitioners whether and how much repeated sampling is worth using, with concrete K thresholds for different cost-benefit scenarios."
    281     },
    282     "surprise_contrarian": {
    283       "score": 3,
    284       "justification": "Challenges the popular 'infinite monkey' inference scaling narrative; the finding that optimal K is often under 5 and sometimes zero directly contradicts widespread optimism about compute scaling."
    285     },
    286     "fear_safety": {
    287       "score": 1,
    288       "justification": "Briefly notes that models trained on imperfect verifiers may learn to exploit verifier weaknesses rather than solve tasks robustly, creating potential safety concerns, but this is a minor point."
    289     },
    290     "drama_conflict": {
    291       "score": 2,
    292       "justification": "The paper explicitly states it weakens its own prior work's claims and directly contradicts several cited papers; the authors' self-correction and challenge to widely-cited scaling results create genuine tension."
    293     },
    294     "demo_ability": {
    295       "score": 2,
    296       "justification": "Full code released on GitHub and a Colab notebook reproduces the theoretical model; practitioners can apply the analysis to their own models with moderate effort."
    297     },
    298     "brand_recognition": {
    299       "score": 1,
    300       "justification": "Princeton University affiliation and Arvind Narayanan's reputation in AI/security research provide modest recognition, but no famous industry lab or widely-known product is involved."
    301     }
    302   },
    303   "hn_data": {
    304     "threads": [
    305       {
    306         "hn_id": "39229755",
    307         "title": "Arrows of Time for Large Language Models",
    308         "points": 6,
    309         "comments": 3,
    310         "url": "https://news.ycombinator.com/item?id=39229755",
    311         "created_at": "2024-02-02T15:33:39Z"
    312       },
    313       {
    314         "hn_id": "42258272",
    315         "title": "Inference Scaling FLaws: The Limits of LLM Resampling with Imperfect Verifiers",
    316         "points": 3,
    317         "comments": 0,
    318         "url": "https://news.ycombinator.com/item?id=42258272",
    319         "created_at": "2024-11-27T18:15:21Z"
    320       },
    321       {
    322         "hn_id": "38482212",
    323         "title": "Language Models: A Guide for the Perplexed",
    324         "points": 2,
    325         "comments": 1,
    326         "url": "https://news.ycombinator.com/item?id=38482212",
    327         "created_at": "2023-12-01T02:18:03Z"
    328       },
    329       {
    330         "hn_id": "46384504",
    331         "title": "Dark Patterns and Deceptive Designs in Chinese and Japanese F2P Mobile Games",
    332         "points": 2,
    333         "comments": 0,
    334         "url": "https://news.ycombinator.com/item?id=46384504",
    335         "created_at": "2025-12-25T14:10:50Z"
    336       },
    337       {
    338         "hn_id": "38475261",
    339         "title": "Language Models: A Guide for the Perplexed",
    340         "points": 2,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=38475261",
    343         "created_at": "2023-11-30T16:13:10Z"
    344       },
    345       {
    346         "hn_id": "42740309",
    347         "title": "Language Models: A Guide for the Perplexed (2023)",
    348         "points": 1,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=42740309",
    351         "created_at": "2025-01-17T17:00:34Z"
    352       },
    353       {
    354         "hn_id": "42709928",
    355         "title": "Language Models: A Guide for the Perplexed",
    356         "points": 1,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=42709928",
    359         "created_at": "2025-01-15T11:58:08Z"
    360       },
    361       {
    362         "hn_id": "39745066",
    363         "title": "Arrows of Time for Large Language Models",
    364         "points": 1,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=39745066",
    367         "created_at": "2024-03-18T14:29:48Z"
    368       }
    369     ],
    370     "top_points": 6,
    371     "total_points": 18,
    372     "total_comments": 4
    373   }
    374 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs