scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26071B)
      1 {
      2   "paper": {
      3     "title": "More Bang for the Buck: Improving the Inference of Large Language Models at a Fixed Budget using Reset and Discard (ReD)",
      4     "authors": [
      5       "Sagi Meir",
      6       "Tommer D. Keidar",
      7       "Noam Levi",
      8       "Shlomi Reuveni",
      9       "Barak Hirshberg"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2601.21522",
     14     "doi": "10.48550/arXiv.2601.21522"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["theoretical", "benchmark-eval"],
     19   "key_findings": "The paper connects pass@k to coverage@cost via renewal theory and shows that power-law pass@k with exponent α<1 leads to sublinear coverage growth under solve-to-completion. Reset-and-Discard (ReD) provably improves coverage@cost for any budget and any difficulty distribution, with τ=1 (reset every attempt) proven optimal. Experiments on three LLMs with HumanEval confirm ReD yields large savings in attempts, tokens, and USD, and enables efficient estimation of inference power-law exponents.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "Sec. 6 states 'All the code used to generate the results of this paper will be posted on GitHub' — a promise of future release, not an actual release. No URL is provided."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper uses HumanEval (Chen et al., 2021), a publicly available benchmark with 164 problems."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions the Groq API and model names but provides no requirements.txt, Dockerfile, or library version details."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided. The experimental procedure is described narratively in Sec. 6 but there are no scripts, README, or commands to replicate results."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Figures 1 and 2 show shaded regions representing standard deviation over 100 random realizations. Fig. 3 reports the power-law exponent as 0.34 ± 0.01."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No statistical significance tests are used. Comparisons between ReD and solve-to-completion rely on visual inspection of curves and theoretical proofs, not formal tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Results are presented graphically in Figures 1-2 without explicit numerical effect sizes. The paper does not report, e.g., 'ReD achieves 80% coverage with X% fewer attempts than solve-to-completion.' Specific savings are not quantified in text."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "HumanEval has 164 problems. No justification is provided for why this benchmark size is sufficient for the empirical claims. The choice of 100 random realizations is also not justified."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Standard deviation across 100 random realizations is shown as shaded regions in Figures 1 and 2. The power-law fit reports ±0.01 uncertainty."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper compares ReD against the standard solve-to-completion protocol throughout Sec. 6 and Figures 1-2."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Solve-to-completion is the standard, prevailing practice for LLM inference allocation. It is the natural and appropriate baseline. The paper also cites related budget-aware work (Wang et al. 2024, FrugalGPT)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Theorem 4.1 proves τ=1 is optimal, effectively ablating the reset interval parameter. The paper also compares across three models and three cost metrics (attempts, tokens, USD), showing how different factors affect the comparison."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Three cost metrics are used: cumulative attempts, cumulative tokens, and USD cost (Fig. 2). Coverage is the primary outcome metric."
     90       },
     91       "human_evaluation": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "Human evaluation is not relevant — the paper evaluates a budget allocation policy on automatically verifiable coding tasks. Correctness is determined by test suite execution."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "ReD is a parameter-free allocation policy (τ=1 is provably optimal, not tuned on data). The entire HumanEval benchmark serves as the evaluation set with no tuning phase."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No per-category or per-difficulty breakdown of HumanEval results is provided. Only aggregate coverage curves are shown."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "No specific failure cases are discussed. The paper does not analyze which types of problems ReD helps most or least, nor show examples where the method is less effective."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that power-law exponent inference does not work for the two larger models because they solve nearly all questions in very few rounds (Sec. 6). It also notes gpt-oss-20b's verbosity hurts its token efficiency (Sec. 6)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "All abstract claims are supported: the power-law → sublinear growth connection (Sec. 3), ReD improving coverage@cost (Theorem 4.1, Sec. 4), budget savings prediction (Sec. 4.3), power-law exponent inference (Sec. 5), and empirical validation (Sec. 6)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The central causal claim — 'ReD improves coverage@cost' — is justified by rigorous mathematical proof (Theorem 4.1, Appendix B) showing that resetting every attempt minimizes the mean attempts per question for any difficulty distribution. Empirical results confirm the theory."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The theoretical results are general (any pass@k with power-law behavior). Empirical claims are bounded: 'Experiments on three LLMs using HumanEval demonstrate...' (abstract). The limitations section explicitly bounds scope to 'perfectly verifiable tasks' and 'parallel sampling.'"
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Sec. 7 discusses that sequential/correlated sampling may outperform i.i.d. sampling (an alternative to ReD's approach), that different verification methods (Best-of-N, majority vote, reward models) could change the dynamics, and that model verbosity affects token-based comparisons."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper's claims match the granularity of its measurements. It measures coverage@cost (unique problems solved per budget) and frames results in exactly those terms, not broader claims about 'model capability' or 'intelligence.'"
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Specific model identifiers are provided: 'llama-3.1-8b-instant', 'llama-3.3-70b-versatile', and 'openai/gpt-oss-20b' with a citation to the model card (OpenAI, 2025)."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The full system prompt is provided verbatim in Sec. 6: 'You are an intelligent coding assistant. You will be provided with a function signature and docstring...' The paper also states the standard HumanEval prompt (function signature + docstring) was used."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Temperature 0.8 is stated (Sec. 6). k=100 attempts per question per model is stated."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The approach is simple repeated sampling with regex extraction of output code."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Sec. 6 describes the full experimental procedure: evaluating pass@k up to k=100, saving results in a questions×attempts matrix, recording input/output tokens, shuffling rows and columns to generate realizations, and analyzing under both protocols."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Sec. 7 contains a 'Limitations and future work' subsection with substantive discussion of scope boundaries."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The paper identifies specific limitations: only perfectly verifiable tasks are considered, only i.i.d. parallel sampling (not sequential/backtracking), and a perfect verifier is assumed. These are specific to this study."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Sec. 7 explicitly states: 'we considered only perfectly verifiable tasks,' identifies verification under imperfect verifiers as open, and notes that sequential sampling (correlated attempts) is not addressed but may be more beneficial."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The results matrix (per-question, per-attempt outcomes) is not released. Code and data are promised for future release but not yet available."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Sec. 6 describes using the Groq API to evaluate each model on HumanEval for k=100 attempts, recording per-question per-attempt success and token counts in a results matrix."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. The data source is the standard HumanEval benchmark."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Sec. 6 documents the pipeline: query models via Groq API → save results matrix (questions×attempts) with token counts → shuffle rows/columns → analyze under both solve-to-completion and ReD protocols."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The Acknowledgments section lists funding: Israel Science Foundation (grants 1037/22, 1312/22), Pazy Foundation (grant 415-2023), ERC Horizon 2020 (grant 947731), TAD, VATAT, and EPFL AI4science."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are listed: Tel Aviv University (School of Chemistry, CPLS, CCMMS) and EPFL. No conflicts — they evaluate third-party models, not their own products."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "All funding sources are academic (ISF, ERC, Pazy Foundation, TAD, VATAT, EPFL). None have a financial interest in the outcome of inference allocation policy research."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is included in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the three models (llama-3.1-8b-instant, llama-3.3-70b-versatile, gpt-oss-20b)."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No discussion of potential train/test overlap. HumanEval (2021) is very likely in the training data of all three models."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "HumanEval was published in 2021. All models used were trained after 2021 and likely saw HumanEval solutions. No contamination discussion is provided."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Table 1 provides per-million-token pricing for all three models. Figure 2 (right panel) shows cumulative USD cost. Cost is a central metric of the paper."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "The paper states k=100 attempts per question per model, 164 questions (HumanEval), and provides token pricing (Table 1). Cumulative token counts and USD costs are shown in Figure 2."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Results are reported across 100 random realizations (row/column shuffles of the results matrix), with standard deviations shown as shaded regions in Figures 1 and 2."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Sec. 6 states 'we plot the mean coverage over 100 random realizations' with standard deviation. k=100 attempts per question is also stated."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "The method's only parameter (τ) is provably optimal at τ=1 (Theorem 4.1), so no hyperparameter search is needed. Temperature 0.8 is the only tunable parameter for the LLM calls, stated without search."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "τ=1 is mathematically proven optimal for any difficulty distribution (Theorem 4.1, Appendix B). No selection from candidate configurations is needed."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "The paper does not perform multiple statistical tests — comparisons are shown graphically and the main results are proved theoretically."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors implement both ReD and the solve-to-completion baseline. No discussion of potential author-implementation bias, though the theoretical proofs mitigate this concern substantially."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "This is the paper's core contribution: coverage@cost plots show performance as a function of compute budget measured in attempts, tokens, and USD (Figures 1 and 2)."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper uses HumanEval without discussing its construct validity or limitations as a benchmark. No analysis of whether HumanEval adequately represents the diversity of real coding tasks."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is used. All models receive the same prompt and use simple repeated sampling with regex extraction."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "HumanEval was published in 2021. All three models were trained after 2021 and may have seen solutions. No temporal leakage discussion is provided."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup leaks information. The standard HumanEval prompt includes function signatures and docstrings, but no analysis of feature leakage."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The paper does not discuss whether HumanEval problems are independent of the models' training data or of one another."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection or prevention method is used or discussed."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Power-law pass@k with exponent 0 < α < 1 leads to sublinear growth of coverage@cost under solve-to-completion",
    371       "evidence": "Derived analytically via renewal theory and Z-transforms (Eq. 7, Sec. 3.3, Appendix A.2)",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "ReD improves coverage@cost for any budget and any difficulty distribution, and τ=1 is the optimal resetting strategy",
    376       "evidence": "Proven in Theorem 4.1 and Appendix B using Chebyshev's sum inequality. Corollary B.1 shows E[T] ≥ E[T_τ] for all τ.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "For α < 1, ReD changes coverage@cost growth from sublinear to linear",
    381       "evidence": "Shown analytically in Eq. 12 (Sec. 4.2). E[T_τ] is always finite for finite τ, making coverage@cost grow linearly.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "ReD substantially reduces attempts, tokens, and USD cost to reach a desired coverage on HumanEval",
    386       "evidence": "Figures 1 and 2 show ReD outperforms solve-to-completion across three models and three cost metrics. Mean ± std over 100 realizations shown.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "The power-law exponent can be inferred from ReD runs via linear regression of ⟨R_n⟩/⟨R_{n+1} − R_n⟩ against n",
    391       "evidence": "Derived analytically (Eq. 22, Sec. 5.3). Demonstrated for llama-3.1-8b-instant: α = 0.34 ± 0.01 (R² = 0.96), matching direct estimation from pass@k (Fig. 3, Fig. S1).",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "ReD on a small model (llama-3.1-8b) can be more cost-effective than using larger models with solve-to-completion",
    396       "evidence": "Fig. 2 shows ReD on llama-3.1-8b outperforms llama-3.3-70b (solve-to-completion) in attempts up to ~90% coverage, and outperforms gpt-oss-20b in USD for nearly all coverage levels.",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "No contamination discussion",
    403       "detail": "HumanEval was published in 2021; all three models were likely trained on it. Contamination could distort the pass@k curves that underpin the theoretical framework's empirical validation. The paper does not mention this."
    404     },
    405     {
    406       "flag": "Single benchmark validation",
    407       "detail": "All empirical results are on HumanEval (164 problems). While the theoretical results are general, the empirical demonstration is limited to one small benchmark. Results may not transfer to harder or more diverse benchmarks."
    408     },
    409     {
    410       "flag": "Code not released",
    411       "detail": "The paper promises 'All the code used to generate the results of this paper will be posted on GitHub' but provides no URL. Results cannot currently be independently verified."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Evaluating large language models trained on code",
    417       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    418       "year": 2021,
    419       "arxiv_id": "2107.03374",
    420       "relevance": "Introduced HumanEval and pass@k metric, which this paper builds upon for coverage@cost analysis."
    421     },
    422     {
    423       "title": "Large language monkeys: Scaling inference compute with repeated sampling",
    424       "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Ehrlich"],
    425       "year": 2024,
    426       "arxiv_id": "2407.21787",
    427       "relevance": "Documents inference-time scaling of pass@k over large sample budgets, a key empirical foundation for ReD."
    428     },
    429     {
    430       "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters",
    431       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    432       "year": 2024,
    433       "arxiv_id": "2408.03314",
    434       "relevance": "Studies compute-optimal allocation of test-time compute, directly related to ReD's budget allocation problem."
    435     },
    436     {
    437       "title": "Inference scaling laws: An empirical analysis of compute-optimal inference for problem-solving with language models",
    438       "authors": ["Yangzhen Wu", "Zhiqing Sun", "Shanda Li"],
    439       "year": 2024,
    440       "arxiv_id": "2408.00724",
    441       "relevance": "Empirical analysis of inference scaling laws that ReD builds upon for budget-aware inference."
    442     },
    443     {
    444       "title": "Codemonkeys: Scaling test-time compute for software engineering",
    445       "authors": ["Ryan Ehrlich", "Bradley Brown", "Jordan Juravsky"],
    446       "year": 2025,
    447       "arxiv_id": "2501.14723",
    448       "relevance": "Scales test-time compute for SE tasks via repeated sampling, demonstrating the problem ReD addresses."
    449     },
    450     {
    451       "title": "How do large language monkeys get their power (laws)?",
    452       "authors": ["Rylan Schaeffer", "Joshua Kazdan", "John Hughes"],
    453       "year": 2025,
    454       "arxiv_id": "2502.17578",
    455       "relevance": "Proposes statistical explanations for power-law pass@k behavior, a key assumption underlying ReD's theory."
    456     },
    457     {
    458       "title": "Efficient prediction of pass@k scaling in large language models",
    459       "authors": ["Joshua Kazdan", "Rylan Schaeffer", "Aya Allouah"],
    460       "year": 2025,
    461       "arxiv_id": "2510.05197",
    462       "relevance": "Develops sample-efficient estimators for pass@k scaling, complementary to ReD's exponent inference method."
    463     },
    464     {
    465       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    466       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    467       "year": 2024,
    468       "relevance": "Proposes cost-optimized LLM routing strategies; ReD cites it as a related hierarchical routing approach."
    469     },
    470     {
    471       "title": "A simple model of inference scaling laws",
    472       "authors": ["Noam Levi"],
    473       "year": 2025,
    474       "relevance": "Proposes a difficulty-mixture model for pass@k power-law behavior that ReD's framework directly builds on."
    475     },
    476     {
    477       "title": "Reasoning in token economies: Budget-aware evaluation of LLM reasoning strategies",
    478       "authors": ["Junlin Wang", "Siddhartha Jain", "Dejiao Zhang"],
    479       "year": 2024,
    480       "relevance": "Argues for budget-aware evaluation of reasoning strategies, directly motivating coverage@cost over pass@k."
    481     },
    482     {
    483       "title": "Smaller, weaker, yet better: Training LLM reasoners via compute-optimal sampling",
    484       "authors": ["Hritik Bansal", "Arian Hosseini", "Rishabh Agarwal"],
    485       "year": 2025,
    486       "relevance": "Studies compute-optimal sampling for training data generation, related to ReD's coverage optimization."
    487     },
    488     {
    489       "title": "From decoding to meta-generation: Inference-time algorithms for large language models",
    490       "authors": ["Sean Welleck", "Amanda Bertsch", "Matthew Finlayson"],
    491       "year": 2024,
    492       "arxiv_id": "2406.16838",
    493       "relevance": "Comprehensive survey of inference-time algorithms for LLMs, providing context for ReD's contribution."
    494     }
    495   ]
    496 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs