scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23147B)
      1 {
      2   "paper": {
      3     "title": "Are More LM Calls All You Need? Towards the Scaling Properties of Compound AI Systems",
      4     "authors": ["Lingjiao Chen", "Jared Quincy Davis", "Boris Hanin", "Peter Bailis", "Ion Stoica", "Matei Zaharia", "James Zou"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2403.02419",
      8     "doi": "10.48550/arXiv.2403.02419"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["theoretical", "benchmark-eval"],
     13   "key_findings": "The performance of Vote and Filter-Vote compound AI systems is often non-monotonic in the number of LM calls: performance first increases then decreases (or vice versa). This is explained by the diversity of query difficulties — more calls help on easy queries but hurt on hard ones. The authors derive an analytical scaling model that accurately predicts performance and the optimal number of LM calls from a small number of samples.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper states 'We will release the code and datasets used in this paper' (Section 1) — this is a promise of future release, not an actual release."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The real-world datasets used (MMLU Physics, TruthfulQA, GPQA, AVERITEC) are all publicly available standard benchmarks. The synthetic datasets are parameterized and fully described."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, dependency lists, or setup instructions are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The experimental setup is described at a high level but not with reproducible commands."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results are reported as point estimates. While experiments are averaged over 1,000 runs, no confidence intervals or error bars are shown in figures or tables."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are reported. Claims about non-monotonic behavior and model fit quality are based on visual inspection and MSE values without formal tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No effect sizes are reported. Performance differences are shown visually in figures but not quantified with standardized effect measures."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper gives no justification for why 1,000 simulation runs were chosen, or why the particular dataset sizes are sufficient."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Although the paper averages over 1,000 runs, no standard deviation, IQR, or other spread measure is reported for any result."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares Vote and Filter-Vote against each other and against single LM calls (K=1). The analytical scaling model is compared against empirical results."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The systems studied (majority voting, filter-vote) are contemporary techniques used in state-of-the-art systems like Gemini CoT@32 and AlphaCode 2."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper systematically varies parameters (K, α, p1, p2) and breaks down performance by easy/difficult queries, which serves as an ablation of the query difficulty distribution's effect."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only accuracy is used as the evaluation metric across all experiments."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is irrelevant — the paper studies scaling properties of voting systems on objective multiple-choice benchmarks."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The scaling model is fit on a training set of K values (2, 5, 10, 20, 50, 100) and used to predict performance for 100 randomly drawn K values, constituting a train/test separation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by easy vs. difficult queries (Figures 2, 4b) and across four different datasets. Synthetic experiments vary α, p1, p2 systematically (Figure 5)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper explicitly discusses cases where more LM calls degrade performance (the core finding). Figure 4d gives concrete examples of easy and difficult queries."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The central finding is a negative result: more LM calls can hurt performance. The non-monotonic behavior is itself a cautionary finding."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims about non-monotonic scaling, query difficulty explanation, and analytical model accuracy are all supported by the theoretical results and experiments in Sections 4-5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The causal claim that query difficulty diversity causes non-monotonic behavior is supported both by mathematical proof (Theorem 2) and controlled synthetic experiments where difficulty parameters are varied directly."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper explicitly limits scope to 'tasks with a fairly small number of possible responses (e.g., multiple-choice questions)' and notes 'tasks with many valid outputs, such as chat, remain under-explored' (Section 1). The Limitations section (Appendix B) further bounds generalization."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations for the non-monotonic behavior are discussed beyond the query difficulty framework. Other potential factors (prompt sensitivity, temperature effects, model stochasticity patterns) are not considered."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures accuracy on multiple-choice benchmarks and claims results about accuracy on multiple-choice benchmarks — the measurement matches the claims without proxy gap."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper specifies 'GPT-3.5-turbo-0125' (Section 5), which is an exact model version with snapshot date."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text for both the generator and filter are provided in Appendix D.2."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Temperature is reported as 0.1 (Appendix D.3). The number of LM responses collected per query (400) and the number of simulation runs (1,000) are stated."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The systems are simple vote/filter-vote aggregation without agents, tools, or iterative workflows."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The paper describes how datasets are used (Appendix D.1), which partitions are selected, and how queries are formatted as multiple-choice questions. The simulation procedure (400 samples, random sampling with replacement for each K) is described in Appendix D.3."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Appendix B is a dedicated 'Limitations' section discussing scope boundaries."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The Limitations section identifies specific threats: analysis is limited to two compound system types, experiments use only objective tasks, and predicting query difficulty without querying LMs remains open."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states boundaries: focuses on tasks with small answer spaces, only Vote and Filter-Vote designs, objective tasks only. Section 1 notes 'tasks with many valid outputs, such as chat, remain under-explored.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The raw LM responses (400 per query) are not released. Only aggregated performance curves are shown."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data collection procedure is described: each query is sent to GPT-3.5-turbo-0125 400 times at temperature 0.1, and K responses are sampled with replacement, averaged over 1,000 runs (Appendix D.3)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from query → 400 LM responses → random K-sample → vote/filter-vote → accuracy averaged over 1,000 runs is fully documented in Appendix D.3."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations (Stanford, UC Berkeley, Princeton) are clearly listed on the first page."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information is provided, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present. Several authors have known affiliations with AI companies (Databricks/Anyscale) which are not disclosed in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The training data cutoff for GPT-3.5-turbo-0125 is not stated."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether GPT-3.5 may have seen MMLU, TruthfulQA, or GPQA questions during training."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "MMLU was published in 2020 and TruthfulQA in 2021, both well before GPT-3.5's training. GPQA was published in late 2023. No contamination analysis is provided for any benchmark."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper notes in the Conclusion that 'we do not discuss the cost of LM calls; this is an important dimension' but does not report any cost figures despite calling GPT-3.5 hundreds of times per query."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total compute budget, API costs, or wall-clock times are reported, despite extensive API usage (400 calls × number of queries × 4 datasets)."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "While experiments are averaged over 1,000 simulation runs (random sampling from 400 pre-collected responses), no seed sensitivity analysis or variance across seeds is reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "The paper states 'All experiments are averaged over 1,000 runs' (Section 5) and that each query is sent to the LM '400 times' during initial response collection (Appendix D.3)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The temperature was set to 0.1 with no justification or search over alternatives. The scaling model fitting procedure uses specific K values with no discussion of sensitivity to this choice."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The choice of temperature=0.1 and the specific K values used for fitting (2, 5, 10, 20, 50, 100) are not justified."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors propose the analytical scaling model and evaluate it themselves without acknowledging potential author-evaluation bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "The entire paper is about performance as a function of compute (number of LM calls). Figures 1, 2, 4, 5 all show performance curves across varying compute levels."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether MMLU Physics, TruthfulQA, GPQA, or AVERITEC actually measure what they claim to measure, or whether voting behavior on multiple-choice questions generalizes to real-world compound AI system performance."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved — the systems are simple vote/filter-vote aggregation."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether GPT-3.5 may have been trained on benchmark data that predates its training cutoff."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the multiple-choice format or prompt structure leaks information about correct answers."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between training data and benchmark questions."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "The performance of Vote and Filter-Vote is often non-monotonic in the number of LM calls — first increasing then decreasing (or vice versa).",
    365       "evidence": "Empirically demonstrated across MMLU Physics, TruthfulQA, GPQA, and AVERITEC datasets with GPT-3.5 (Figure 1, Section 5).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "The non-monotonic behavior is caused by the diversity of query difficulties: more LM calls improve performance on easy queries but degrade performance on hard queries.",
    370       "evidence": "Theorem 2 provides mathematical conditions for when non-monotonicity emerges based on query difficulty distribution. Empirically validated with performance breakdowns (Figure 2, 4b) and systematic synthetic experiments (Figure 5).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "The analytical scaling model can accurately predict the performance of Vote and Filter-Vote and identify the optimal number of LM calls.",
    375       "evidence": "Scaling model predictions match empirical observations with MSE ranging from 1e-6 to 1e-4 (Figure 6). Predicted optimal K matches empirical optimal K exactly across all evaluated configurations (Table 2).",
    376       "supported": "moderate"
    377     }
    378   ],
    379   "red_flags": [
    380     {
    381       "flag": "No cost analysis despite cost being central to the practical question",
    382       "detail": "The paper asks 'are more LM calls all you need?' but does not report any costs, despite the Conclusion acknowledging 'we do not discuss the cost of LM calls; this is an important dimension.' The practical implication is incomplete without cost data."
    383     },
    384     {
    385       "flag": "No uncertainty quantification",
    386       "detail": "Despite averaging over 1,000 runs, no error bars, confidence intervals, or standard deviations are reported for any empirical result. The reader cannot assess the statistical reliability of the performance curves."
    387     },
    388     {
    389       "flag": "Single model evaluation",
    390       "detail": "All experiments use only GPT-3.5-turbo-0125. The generality of the non-monotonic scaling behavior to other models is unknown."
    391     },
    392     {
    393       "flag": "No contamination analysis",
    394       "detail": "Two of the benchmarks used (MMLU, TruthfulQA) predate GPT-3.5's training and may therefore appear in its training data; such contamination would affect the easy/hard query distribution and potentially the scaling curves."
    395     }
    396   ],
    397   "cited_papers": [
    398     {
    399       "title": "The shift from models to compound ai systems",
    400       "authors": ["Matei Zaharia", "Omar Khattab", "Lingjiao Chen", "Jared Quincy Davis"],
    401       "year": 2024,
    402       "relevance": "Foundational blog post motivating compound AI systems as the new paradigm over monolithic models."
    403     },
    404     {
    405       "title": "Self-consistency improves chain of thought reasoning in language models",
    406       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    407       "year": 2022,
    408       "arxiv_id": "2203.11171",
    409       "relevance": "Key prior work on majority voting over multiple LLM reasoning paths to improve performance."
    410     },
    411     {
    412       "title": "Scaling laws for neural language models",
    413       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    414       "year": 2020,
    415       "arxiv_id": "2001.08361",
    416       "relevance": "Foundational scaling laws work that this paper extends from training-time to inference-time scaling."
    417     },
    418     {
    419       "title": "Inverse scaling: When bigger isn't better",
    420       "authors": ["Ian R McKenzie", "Alexander Lyzhov", "Michael Pieler"],
    421       "year": 2023,
    422       "arxiv_id": "2306.09479",
    423       "relevance": "Demonstrates non-monotonic scaling in model size, parallel to this paper's findings on inference-time scaling."
    424     },
    425     {
    426       "title": "Improving factuality and reasoning in language models through multiagent debate",
    427       "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba"],
    428       "year": 2023,
    429       "arxiv_id": "2305.14325",
    430       "relevance": "Multi-agent debate as a compound inference strategy using multiple LM calls."
    431     },
    432     {
    433       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    434       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    435       "year": 2023,
    436       "arxiv_id": "2305.05176",
    437       "relevance": "Cost-efficient LLM inference strategies, directly relevant to compound system cost-performance tradeoffs."
    438     },
    439     {
    440       "title": "SWE-bench: Can language models resolve real-world github issues?",
    441       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    442       "year": 2024,
    443       "relevance": "Major benchmark for evaluating LLM agents on real-world software engineering tasks."
    444     },
    445     {
    446       "title": "A survey on evaluation of large language models",
    447       "authors": ["Yupeng Chang", "Xu Wang", "Jindong Wang"],
    448       "year": 2024,
    449       "relevance": "Comprehensive survey on LLM evaluation methodology and benchmarks."
    450     },
    451     {
    452       "title": "The rise and potential of large language model based agents: A survey",
    453       "authors": ["Zhiheng Xi", "Wenxiang Chen", "Xin Guo"],
    454       "year": 2023,
    455       "arxiv_id": "2309.07864",
    456       "relevance": "Survey of LLM-based agent architectures, relevant to understanding compound AI systems."
    457     },
    458     {
    459       "title": "GPQA: A graduate-level google-proof q&a benchmark",
    460       "authors": ["David Rein", "Betty Li Hou", "Asa Cooper Stickland"],
    461       "year": 2023,
    462       "arxiv_id": "2311.12022",
    463       "relevance": "Expert-level benchmark used in this paper's experiments, important for AI capability evaluation."
    464     }
    465   ]
    466 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs