scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34431B)
      1 {
      2   "paper": {
      3     "title": "Planning In Natural Language Improves LLM Search For Code Generation",
      4     "authors": [
      5       "Evan Wang",
      6       "Federico Cassano",
      7       "Catherine Wu",
      8       "Yunfeng Bai",
      9       "Will Song",
     10       "Vaskar Nath",
     11       "Ziwen Han",
     12       "Sean Hendryx",
     13       "Summer Yue",
     14       "Hugh Zhang"
     15     ],
     16     "year": 2024,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2409.03733",
     19     "doi": "10.48550/arXiv.2409.03733"
     20   },
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The abstract states 'Code can be found at https://github.com/scaleapi/plansearch' — a specific GitHub repository URL is provided."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper uses publicly available benchmarks: HumanEval+, MBPP+, and LiveCodeBench, all of which are publicly accessible. No proprietary data was collected."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed dependency list is provided in the paper. Only model names and API parameters are mentioned."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are included in the paper. While code is released, the paper itself does not contain a 'Reproducing Results' section or specific commands to replicate experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results in Table 1 and figures are reported as point estimates (e.g., 'pass@200 of 77.0%') with no confidence intervals or error bars. The pass@k curves also lack uncertainty bounds."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims PLANSEARCH outperforms baselines across all models and benchmarks but never uses statistical significance tests. Comparisons are based solely on comparing raw pass@k numbers."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Effect sizes are reported with baseline context throughout: 'pass@200 of 77.0% on LiveCodeBench, outperforming both the best pass-rate achieved without any search (pass@1 = 41.4%) and using standard repeated sampling (pass@200 = 60.6%).' Appendix C reports relative gains explicitly."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is given for the number of problems used from each benchmark or the choice of k=200 as the maximum number of completions. No power analysis is discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported for any results. The paper uses a single run per model/method configuration. Appendix O acknowledges PLANSEARCH generations may not be independent but provides no variance data."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper compares against Repeated Sampling, IdeaSearch (their intermediate method), Chain-of-Thought (Appendix E), and o1-mini (a search-augmented model). Multiple baseline comparisons are provided."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include contemporary models: Claude 3.5 Sonnet, GPT-4o, GPT-4o-mini, DeepSeek-Coder-V2, and o1-mini — all state-of-the-art at time of writing (2024)."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Appendix H provides ablations on maximum subset size S (Figure 28), observation tree depth L (Figure 29), and the observation-to-code translation pipeline (Figure 30), all using GPT-4o-mini on LiveCodeBench."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics are used: pass@1, pass@k curves for various k, pass@200, diversity scores (Section 6.1), and compute-normalized comparisons (Appendix D). Results are also reported across three benchmarks."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "All evaluation is entirely automated through test-suite execution (pass/fail on test cases). No human evaluation of code quality, readability, or correctness of plans is conducted."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "LiveCodeBench uses a temporal split — only problems from May–September 2024 are used, after Claude 3.5 Sonnet's April 2024 training cutoff. HumanEval+ and MBPP+ are standard held-out benchmarks with hidden test cases."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down per model (4 models + o1-mini), per method (Repeated Sampling, IdeaSearch, PLANSEARCH), and per benchmark (HumanEval+, MBPP+, LiveCodeBench) in Table 1 and Appendix A."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 6 discusses that 'PLANSEARCH often hurts pass@1 for several models, including most notably Sonnet 3.5 on LiveCodeBench.' Section 7 discusses limitations of the approach and when it underperforms."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Several negative results are reported: PLANSEARCH hurts pass@1 (Section 6), PLANSEARCH does not significantly outperform until k≥4 (Section 7), and gains over o1-mini are marginal (Appendix K). Appendix H shows ablations where removing components had minimal effect."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims about pass@200 of 77.0% on LiveCodeBench are confirmed in Table 1. Claims about outperforming baselines and diversity correlation are supported by Figure 6 and the experimental results."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper claims PLANSEARCH improves performance through increased diversity. Ablation studies (Appendix H) provide controlled single-variable manipulations. The backtranslation experiment (Section 3.2) and conditioning experiment (Section 3.3) provide additional causal support for the role of idea diversity."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'LLM Search For Code Generation' broadly, but the paper only evaluates on competitive programming benchmarks (HumanEval+, MBPP+, LiveCodeBench) involving self-contained function-level problems. Real-world software engineering tasks, multi-file projects, and non-Python languages are untested. The paper does not bound its claims to competitive programming."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section 6 investigates whether diversity explains the performance gains (rather than just generating more samples). Appendix D provides compute-normalized comparisons to control for the additional compute used by PLANSEARCH. Appendix O discusses the non-independence issue with the estimator."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper's claims match the granularity of its measurements — it measures pass@k on specific benchmarks and claims improvement in pass@k. The diversity metric is explicitly labeled as a proxy and its correlation with performance is measured, not assumed."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper uses marketing names without specific API versions or snapshot dates: 'Claude 3.5 Sonnet', 'GPT-4o-mini', 'GPT-4o', 'DeepSeek-Coder-V2', 'o1-mini'. Per schema criteria, marketing names like 'GPT-4o' without a snapshot date do not count as specified versions. Model behavior changes across API updates."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompt text is provided in Appendix M (Sections M.1–M.4), including backtranslation prompts, repeated sampling prompts, IdeaSearch prompts, and all PLANSEARCH prompts (observation generation, combining observations, etc.)."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5.2: 'All models are run with temperature 0.9 and top-p of 0.95. (o1-mini was run with temperature 1.0 and top-p of 1.0 because of API constraints.)' Appendix F shows the temperature sweep used to select this value."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The PLANSEARCH pipeline is described in detail in Section 4.3: observation generation (4.3.1), deriving new observations via combinatorial subsets (4.3.2), translation to ideas then pseudocode then code (4.3.3). Figure 2 provides a visual diagram of the full pipeline."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.2 documents formatting requirements, code extraction, and test execution. Section 5.1 describes the temporal filtering of LiveCodeBench (May–September 2024). Section 3.3 describes the filtering criteria for the conditioning experiment (removing 0% and 100% solve-rate problems)."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 7, titled 'Limitations and Future Work,' provides substantive discussion of limitations including pass@1 degradation, compute costs, need for verifiers, and domains outside code generation."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 7 discusses specific threats: PLANSEARCH hurts pass@1, doesn't outperform until k≥4, requires a verifier for filtering, and may have diminishing returns when stacked on search models like o1. Appendix O discusses the biased estimator concern for non-independent samples."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly enumerate what the results do NOT show. While Section 7 mentions 'domains outside of code generation that are out of scope,' it does not state that results are limited to competitive programming (not general software engineering), Python only, or API-accessible models only. The title and abstract frame claims broadly as 'LLM Search For Code Generation.'"
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw data (individual model outputs, per-problem pass/fail results, generated codes) is made available. Only aggregated pass@k statistics are reported in the paper."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 5.2 describes the data collection procedure: models are sampled at specified temperatures, code is extracted from outputs, and run against test suites. The number of samples and benchmarks used are specified."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants are involved. All data comes from standard public benchmarks (HumanEval+, MBPP+, LiveCodeBench) and LLM API outputs."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline is documented: generate n completions per model/method → extract code from model output → run against all test cases → compute pass@k using unbiased estimator (Equation 4). For filtering experiments, the additional public-test-filtering step is described in Section 5.4."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding sources are disclosed. The Acknowledgements section (Section 8) thanks individuals for comments and discussion but does not mention any grants, sponsors, or funding agencies."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Scale AI, California Institute of Technology, Northeastern University, and Anysphere. Two authors are noted as having conducted work while at Scale AI."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "Most authors are affiliated with Scale AI, a company that sells AI data and evaluation services. Scale AI has a commercial interest in demonstrating that inference-time compute scaling (which could drive demand for their services) is effective. No explicit funding disclosure allows independence assessment."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is included. Several authors are from Scale AI (a well-funded AI company) and Anysphere (an AI startup), but no patent, equity, or financial interest disclosures are provided."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "Section 5.1: 'Claude 3.5 Sonnet has a knowledge cutoff of April 2024.' The paper uses only LiveCodeBench problems dated May 2024 or later (May–September 2024) to avoid contamination."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": true,
    242         "justification": "Section 5.1 explicitly addresses this: 'LiveCodeBench differentiates itself from other benchmarks by taking care to segregate problems by date to avoid data contamination concerns. For this paper, we use only the subset of problems between May 2024 and September 2024.'"
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Section 5.1 addresses contamination by using only post-cutoff LiveCodeBench problems and noting that 'coding data is often highly upsampled during pre-training.' The temporal split specifically mitigates contamination risk for LiveCodeBench. HumanEval+ and MBPP+ contamination is not directly addressed but the paper relies on LiveCodeBench as its primary contamination-free evaluation."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants are involved in this study. All experiments are automated benchmark evaluations of LLM code generation."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants are involved. The study evaluates LLMs on code benchmarks."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants are involved in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants are involved in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants are involved in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants are involved in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants are involved in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Appendix D reports '244 generated tokens per completion per problem for Repeated Sampling, and 1,428 generated tokens per completion per problem for PLANSEARCH.' Figure 18 provides a compute-normalized comparison. However, dollar costs are not reported."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total computational budget is stated. The paper mentions 'prohibitively high costs' for o1-mini evaluation but does not quantify total API spend, GPU hours, or total tokens consumed across all experiments."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No results are reported across multiple random seeds. Each model/method configuration appears to be run once, with the stochasticity coming from sampling temperature rather than seed variation."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Section 5.2 states that Repeated Sampling and IdeaSearch generate exactly n codes, while PLANSEARCH generates 300–400 codes per problem. The pass@k estimator uses these sample counts explicitly."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "Appendix F describes a temperature sweep 'over temperature increments of 0.1 from 0.0 to 1.2, inclusive, with top-p of 0.95, on Repeated Sampling and IdeaSearch.' This gives 13 configurations per method. Appendix H provides ablation budgets for S and L parameters."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "Temperature selection is justified via the sweep in Appendix F. Ablation results in Appendix H justify the choices of S=2 and L=2 by showing performance across alternatives."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Scale AI authors implement and evaluate their own PLANSEARCH method against their own implementations of baselines (Repeated Sampling, IdeaSearch). No acknowledgment of potential author-evaluation bias, no independent evaluation, and no comparison against third-party implementations."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "Appendix D / Figure 18 provides an explicit compute-normalized comparison showing 'average tokens used per problem' on the x-axis and solve-rate on the y-axis for Repeated Sampling vs. PLANSEARCH across all models."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "Section 3 discusses why coding is a powerful domain for search (verifiable via test execution) and Section 5.1 discusses the properties of each benchmark. The paper acknowledges saturation effects on HumanEval+ and MBPP+ and uses LiveCodeBench as the primary evaluation for this reason."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "The comparison is structured so that the same model is used across all search methods (Repeated Sampling, IdeaSearch, PLANSEARCH). The search method IS the variable being tested, and the model is controlled. Results in Table 1 show per-model comparisons within the same scaffold."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": true,
    350         "justification": "Section 5.1: LiveCodeBench problems are restricted to May–September 2024, after Claude 3.5 Sonnet's April 2024 training cutoff, explicitly to prevent temporal leakage."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The paper uses public test filtering (Section 5.4) as a feature to improve performance but does not discuss whether public tests provide hints that leak information about the hidden test structure or expected solution approach."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether benchmark problems share structural similarities, come from similar sources, or have near-duplicate relationships. HumanEval+ and MBPP+ problem independence is not analyzed."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": true,
    365         "justification": "Temporal splits are used as a concrete leakage prevention method: LiveCodeBench problems are filtered to only include those from May 2024 through September 2024, and the paper explicitly states this is to avoid contamination."
    366       }
    367     }
    368   },
    369   "scan_version": 3,
    370   "active_modules": ["experimental_rigor", "data_leakage"],
    371   "claims": [
    372     {
    373       "claim": "PLANSEARCH achieves pass@200 of 77.0% on LiveCodeBench with Claude 3.5 Sonnet, nearly double the best pass@1 (41.4%) and outperforming standard repeated sampling pass@200 (60.6%).",
    374       "evidence": "Table 1 and Figure 1 show pass@200 = 77.0% for Claude 3.5 Sonnet with PLANSEARCH on LiveCodeBench. Repeated Sampling pass@200 is 60.6% for the same model, and the best non-search pass@1 across all models is 41.4%.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Searching over plans in natural language produces significantly more diverse code outputs than baseline search methods.",
    379       "evidence": "Section 6.1 and Figure 6 show a strong positive correlation between idea diversity scores (measured via LLM-as-judge pairwise comparison) and relative performance gains from pass@1 to pass@200 across all models and methods.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Most variance in solving a problem is explained by whether the solution sketch is correct, not implementation details.",
    384       "evidence": "Figure 4b shows that when conditioning on a specific sketch, solve rates polarize toward 0% or 100%, suggesting that the sketch quality dominates implementation variance. The backtranslation experiment (Figure 4a) shows even 10 tokens of a correct sketch significantly boosts accuracy.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "PLANSEARCH outperforms all baselines across all models and benchmarks considered.",
    389       "evidence": "Table 1 shows PLANSEARCH achieves the highest pass@200 for every model-benchmark combination: HumanEval+ (up to 99.5%), MBPP+ (up to 93.7%), LiveCodeBench (up to 77.0%). Figures 7–9 show full pass@k curves.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Instruction-tuned models can have less diversity than their base model counterparts, with base models sometimes exceeding instruct models at high k.",
    394       "evidence": "Figure 3 shows DeepSeek-Coder-V2-Lite-Base surpassing its instruct counterpart at high k on MBPP+. Appendix I (Figure 31) provides extended comparisons across multiple model families. However, the authors caution that this trend does not hold for all models and benchmarks.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Running PLANSEARCH on a small model (GPT-4o-mini) outperforms larger models without search after merely 4 attempts.",
    399       "evidence": "This claim is stated in the introduction. Supporting evidence in Appendix A figures shows GPT-4o-mini with PLANSEARCH crossing larger models' pass@1 at low k, though 'merely 4' is not precisely demonstrated in the main text.",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "methodology_tags": ["benchmark-eval"],
    404   "key_findings": "PLANSEARCH, a search algorithm that explores plans in natural language rather than directly searching over code, achieves pass@200 of 77.0% on LiveCodeBench with Claude 3.5 Sonnet — nearly double the best single-sample performance (41.4%). The paper demonstrates that idea diversity, measured via LLM-as-judge pairwise comparisons, is highly predictive of search performance gains across all models and methods. Backtranslation experiments show that even a 10-token correct solution sketch significantly boosts accuracy, and conditioning on specific sketches polarizes solve rates toward 0% or 100%, suggesting solution planning is the critical bottleneck rather than code implementation.",
    405   "red_flags": [
    406     {
    407       "flag": "No error bars or uncertainty quantification",
    408       "detail": "All main results (Table 1, Figures 1, 5) are reported as point estimates without confidence intervals, error bars, or variance across runs. For a paper making strong comparative claims across methods, the absence of any uncertainty quantification makes it impossible to assess whether observed differences are statistically meaningful."
    409     },
    410     {
    411       "flag": "Company evaluating its own method",
    412       "detail": "Most authors are affiliated with Scale AI, which has commercial interests in AI evaluation and inference-time compute scaling. The paper evaluates their own PLANSEARCH method against their own implementations of baselines. No independent evaluation or third-party replication is provided."
    413     },
    414     {
    415       "flag": "Variable sample counts across methods",
    416       "detail": "PLANSEARCH generates a variable number of codes (300–400) per problem, while Repeated Sampling and IdeaSearch generate exactly n codes. When k > n, the authors assume the remaining generations did not pass before applying the unbiased pass@k estimator, and they acknowledge in Appendix O that PLANSEARCH's non-independent samples may bias the estimator."
    417     },
    418     {
    419       "flag": "LLM-as-judge circular dependency for diversity measurement",
    420       "detail": "The diversity metric (Section 6.1) uses GPT-4o-mini to judge pairwise similarity of code solutions. Using an LLM to measure diversity of LLM outputs introduces a circular dependency — the judge model's own biases may systematically mischaracterize diversity in ways that align with the paper's hypothesis."
    421     },
    422     {
    423       "flag": "Contamination risk unaddressed for HumanEval+ and MBPP+",
    424       "detail": "While LiveCodeBench uses temporal splits, the paper's results on HumanEval+ and MBPP+ (published 2021/2023) are reported without addressing contamination risk from models trained after these benchmarks were released. The paper notes saturation but does not discuss whether inflated scores result from data leakage."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Large language monkeys: Scaling inference compute with repeated sampling",
    430       "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Ehrlich", "Ronald Clark", "Quoc V. Le", "Christopher Ré", "Azalia Mirhoseini"],
    431       "year": 2024,
    432       "arxiv_id": "2407.21787",
    433       "relevance": "Directly relevant as a study on scaling inference-time compute for LLM code generation through repeated sampling."
    434     },
    435     {
    436       "title": "Scaling llm test-time compute optimally can be more effective than scaling model parameters",
    437       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    438       "year": 2024,
    439       "arxiv_id": "2408.03314",
    440       "relevance": "Foundational work on test-time compute scaling laws for LLMs, directly motivating the inference-time search paradigm evaluated in this paper."
    441     },
    442     {
    443       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    444       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao", "Izhak Shafran", "Thomas L. Griffiths", "Yuan Cao", "Karthik Narasimhan"],
    445       "year": 2023,
    446       "arxiv_id": "2305.10601",
    447       "relevance": "A key related approach to search-based LLM reasoning that PLANSEARCH builds upon and differentiates from."
    448     },
    449     {
    450       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    451       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Brian Ichter", "Fei Xia", "Ed Chi", "Quoc Le", "Denny Zhou"],
    452       "year": 2022,
    453       "arxiv_id": "2201.11903",
    454       "relevance": "Foundational prompting technique that PLANSEARCH extends by explicitly searching over reasoning plans rather than using a single chain-of-thought."
    455     },
    456     {
    457       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    458       "authors": ["Naman Jain", "King Han", "Alex Gu", "Wen-Ding Li", "Fanjia Yan", "Tianjun Zhang", "Sida Wang", "Armando Solar-Lezama", "Koushik Sen", "Ion Stoica"],
    459       "year": 2024,
    460       "arxiv_id": "2403.07974",
    461       "relevance": "Primary contamination-free benchmark used to evaluate PLANSEARCH, designed with temporal splits to avoid data leakage."
    462     },
    463     {
    464       "title": "Evaluating Large Language Models Trained on Code",
    465       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    466       "year": 2021,
    467       "arxiv_id": "2107.03374",
    468       "relevance": "Introduced HumanEval benchmark and the pass@k metric used throughout this paper's evaluation."
    469     },
    470     {
    471       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    472       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    473       "year": 2023,
    474       "relevance": "Created HumanEval+ and MBPP+ with additional test cases for more rigorous evaluation, both used as benchmarks in this paper."
    475     },
    476     {
    477       "title": "Competition-level code generation with AlphaCode",
    478       "authors": ["Yujia Li", "David Choi", "Junyoung Chung", "Nate Kushman"],
    479       "year": 2022,
    480       "relevance": "Pioneered large-scale sampling for competitive programming code generation, a direct precursor to the search approaches evaluated here."
    481     },
    482     {
    483       "title": "Reflexion: Language agents with verbal reinforcement learning",
    484       "authors": ["Noah Shinn", "Federico Cassano", "Edward Berman", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"],
    485       "year": 2023,
    486       "arxiv_id": "2303.11366",
    487       "relevance": "Related agentic approach using verbal feedback to improve LLM performance, part of the broader inference-time compute scaling landscape."
    488     },
    489     {
    490       "title": "STaR: Bootstrapping Reasoning With Reasoning",
    491       "authors": ["Eric Zelikman", "Yuhuai Wu", "Jesse Mu", "Noah D. Goodman"],
    492       "year": 2022,
    493       "arxiv_id": "2203.14465",
    494       "relevance": "Training-time approach to improving reasoning that contrasts with PLANSEARCH's inference-time approach to diversity."
    495     },
    496     {
    497       "title": "Smaller, weaker, yet better: Training LLM reasoners via compute-optimal sampling",
    498       "authors": ["Hritik Bansal", "Arian Hosseini", "Rishabh Agarwal", "Vinh Q. Tran", "Mehran Kazemi"],
    499       "year": 2024,
    500       "arxiv_id": "2408.16737",
    501       "relevance": "Studies compute-optimal sampling strategies for LLM reasoning, showing small models with more sampling can outperform larger models."
    502     },
    503     {
    504       "title": "ReST-MCTS*: LLM self-training via process reward guided tree search",
    505       "authors": ["Dan Zhang", "Sining Zhoubian", "Ziniu Hu", "Yisong Yue", "Yuxiao Dong", "Jie Tang"],
    506       "year": 2024,
    507       "arxiv_id": "2406.03816",
    508       "relevance": "Uses MCTS for LLM search over individual tokens, contrasting with PLANSEARCH's approach of searching over natural language plans."
    509     },
    510     {
    511       "title": "Quiet-STaR: Language models can teach themselves to think before speaking",
    512       "authors": ["Eric Zelikman", "Georges Harik", "Yijia Shao", "Varuna Jayasiri", "Nick Haber", "Noah D. Goodman"],
    513       "year": 2024,
    514       "arxiv_id": "2403.09629",
    515       "relevance": "Trains models to generate internal reasoning tokens, a training-time alternative to PLANSEARCH's inference-time diversity approach."
    516     }
    517   ],
    518   "engagement_factors": {
    519     "practical_relevance": {
    520       "score": 2,
    521       "justification": "Code is released and method can be applied on top of existing LLM APIs for code generation, though it requires significant inference compute (300-400 samples per problem)."
    522     },
    523     "surprise_contrarian": {
    524       "score": 2,
    525       "justification": "Challenges the assumption that scaling model parameters is sufficient — demonstrates that searching over natural language plans can nearly double performance, and that instruction-tuned models may be less diverse than base models."
    526     },
    527     "fear_safety": {
    528       "score": 0,
    529       "justification": "No safety or security concerns raised; the paper focuses on improving code generation accuracy."
    530     },
    531     "drama_conflict": {
    532       "score": 0,
    533       "justification": "No controversy or conflict with existing work; positions itself as a complementary contribution to existing search methods."
    534     },
    535     "demo_ability": {
    536       "score": 2,
    537       "justification": "Code released on GitHub (scaleapi/plansearch), usable with LLM APIs, though running experiments requires API credits and compute."
    538     },
    539     "brand_recognition": {
    540       "score": 2,
    541       "justification": "Scale AI is a well-known AI company; the paper evaluates on prominent models including Claude 3.5 Sonnet and GPT-4o, attracting attention from those ecosystems."
    542     }
    543   }
    544 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs