ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25533B)


      1 {
      2   "paper": {
      3     "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
      4     "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc Le", "Ed H. Chi", "Sharan Narang", "Aakanksha Chowdhery", "Denny Zhou"],
      5     "year": 2022,
      6     "venue": "International Conference on Learning Representations (ICLR 2023)",
      7     "arxiv_id": "2203.11171"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Self-consistency, a decoding strategy that samples diverse reasoning paths and takes majority vote, significantly improves chain-of-thought prompting across arithmetic and commonsense reasoning benchmarks. Gains are striking: +17.9% on GSM8K, +11.0% on SVAMP, +12.2% on AQuA with PaLM-540B or GPT-3. The method requires no training, fine-tuning, or additional annotation — it is purely a test-time technique. Self-consistency also helps when chain-of-thought hurts performance compared to standard prompting, and the consistency level correlates with accuracy, providing an uncertainty estimate.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code repository or link is provided in the paper. The reproducibility statement mentions model checkpoints and APIs but no code for the self-consistency method itself."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All benchmarks used are publicly available (GSM8K, SVAMP, AQuA, CommonsenseQA, ARC, etc.). The paper also provides full prompts in Appendix A.3."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions TPU configurations (e.g., TPU v3 2x2 for UL2, TPU v4 4x4x12 for PaLM) in Appendix A.2 but does not provide software environment details, library versions, or dependency specifications."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The reproducibility statement points to model checkpoints and APIs but does not provide scripts or a README for replication."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Standard deviations are reported across 10 runs. Figure 2 and Figures 7-8 show error bars. The paper notes 'standard deviation of self-consistency is ≤0.5 for all tasks' in Table 2 footnote."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are performed. Claims of 'X outperforms Y' are based on comparing accuracy numbers without any hypothesis testing."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Absolute accuracy improvements are reported with baseline context throughout, e.g., '+17.9%' on GSM8K (from 56.5% to 74.4%). Tables 2 and 3 show both baseline and improved numbers."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is given for using 40 sampled paths or 10 runs; the choices appear arbitrary, and no power analysis is provided."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Results are averaged over 10 runs with standard deviations reported. The paper states this explicitly in Section 3.2 and shows variance in figures and tables (e.g., Tables 6, 7)."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Multiple baselines are compared: greedy CoT decoding (Wei et al., 2022), sample-and-rank, beam search, prompt-order ensembles, multi-prompt ensembles, model ensembles, and previous SoTA methods."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include the then-contemporary CoT prompting (Wei et al., 2022) and prior SoTA results. The comparison methods (sample-and-rank, beam search, ensembles) are standard approaches."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table 1 ablates different aggregation strategies (weighted avg, weighted sum, majority vote). Section 3.5 ablates sampling strategies, scaling, imperfect prompts, equation prompts, and zero-shot CoT."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "Accuracy is the sole evaluation metric for every task except HotpotQA, which is reported with EM/F1 in Table 5. No complementary metrics such as calibration or per-example analysis are reported."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is not relevant — the tasks have fixed ground-truth answers and automated evaluation is the appropriate method."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper states in footnote 3: 'By default we use the test split for all datasets if the labels are available for evaluation. For CommonsenseQA we use the dev split; for StrategyQA we use the question-only set from BIG-bench.'"
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per task (Tables 2, 3), per model (4 models), per aggregation strategy (Table 1), and per number of sampled paths (Figure 2)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Tables 4, 12, and 13 show specific examples including incorrect reasoning paths. The paper discusses that 'language models can sometimes generate incorrect or nonsensical reasoning paths' (Section 5)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 1 shows that weighted average aggregation performs much worse. Table 6 shows beam search underperforms sampling. The paper notes self-consistency gains are 'relatively lower for smaller models' (Section 3.5). Symbolic reasoning shows minimal gains for small models."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims specific accuracy gains (+17.9% GSM8K, +11.0% SVAMP, etc.) which are all supported by Table 2 and Table 3 results."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper's causal claims are primarily about self-consistency improving performance. The ablation design (controlled comparisons changing only the decoding strategy while keeping prompts/models fixed) adequately supports these claims."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper tests across 4 models, 12+ benchmarks, and multiple reasoning types. Section 5 notes the limitation that 'self-consistency can be applied only to problems where the final answer is from a fixed answer set.' The scope is reasonably bounded."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper compares against alternative explanations for the improvement: sample-and-rank (Section 3.4), beam search diversity, model ensembles, and prompt ensembles. Table 1 analyzes whether the gain comes from probability weighting vs. majority vote."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper measures accuracy on specific benchmarks and claims improvements on those specific benchmarks. It does not overclaim — it says 'reasoning accuracy' in the context of the tested benchmarks, not 'general reasoning ability.'"
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific models are named: UL2-20B, GPT-3 code-davinci-001 and code-davinci-002, LaMDA-137B, PaLM-540B. The GPT-3 engines are specific enough to reproduce via the API."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompts are provided in Appendix A.3 (Tables 14-21) for all tasks, including the exact chain-of-thought exemplars used."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 3.1 reports sampling parameters: T=0.5/k=40 for UL2 and LaMDA, T=0.7/k=40 for PaLM, T=0.7 without top-k for GPT-3. Appendix A.2 reports max tokens (128) and no frequency/presence penalty for GPT-3."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. Self-consistency is a pure decoding strategy applied to language model outputs."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The paper describes how answers are parsed: 'we parse the first numerical part as the final answer after the model generates \"The answer is\"' (footnote 1). Dataset splits are documented in footnote 3."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5 (Conclusion and Discussion) contains substantive discussion of limitations including computational cost, applicability to fixed-answer-set problems only, and nonsensical reasoning paths."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The paper discusses specific limitations: higher computation cost, restriction to fixed answer sets, models generating incorrect/nonsensical reasoning paths (with specific examples in Table 4). These are specific to this work."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5 explicitly states: 'self-consistency can be applied only to problems where the final answer is from a fixed answer set' and notes it cannot be directly applied to open-text generation without a consistency metric."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Raw model outputs (the sampled reasoning paths and answers) are not released. Only aggregate accuracy numbers are reported."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Data collection is well-described: all benchmarks are public with specific splits documented (footnote 3). The sampling procedure is detailed in Section 3.1."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. All data comes from standard public benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: prompt model with CoT exemplars → sample 40 outputs → parse answers → aggregate via majority vote. Answer parsing is described in footnote 1."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper. All authors are from Google Research, Brain Team, but no funding disclosure is provided."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All authors are listed as Google Research, Brain Team, with email addresses at google.com clearly shown on the first page."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Google employees evaluate Google's own models (LaMDA-137B, PaLM-540B) alongside others. Google has a financial interest in demonstrating the capabilities of its language models."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the four models used. This is relevant as some benchmarks (e.g., AddSub from 2014, MultiArith from 2015) likely appear in training data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of potential train/test overlap despite using public benchmarks with models that could have seen them during training."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "Many benchmarks used (AddSub 2014, MultiArith 2015, CommonsenseQA 2019) were published well before these models' training. No contamination analysis is performed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The paper acknowledges the computational cost limitation ('self-consistency incurs more computation cost') but does not quantify the cost per example or total API spend."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Appendix A.2 states hardware used (TPU v3/v4 configurations) and approximate inference times: '1 to 4 hours' per task on UL2/LaMDA, '2 to 12 hours' on PaLM, with some tasks up to 2 days."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Results are averaged over 10 runs with standard deviations reported. Figures 2, 7, 8 show error bars across runs. The paper explicitly states 'averaged over 10 runs' (Section 3.2)."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Section 3.2: 'We report the results of self-consistency averaged over 10 runs, where we sampled 40 outputs independently from the decoder in each run.'"
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The sampling parameters (temperature, top-k) appear chosen without reporting how they were selected. Section 3.5 ablates these choices but doesn't report the search budget that led to the default settings."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section 3.5 and Figure 4 show that self-consistency is robust across sampling strategies and parameters. Table 1 justifies the choice of majority vote over other aggregation strategies by comparing all options."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite many comparisons across tasks, models, and configurations."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors do not acknowledge any bias from evaluating their own proposed method. However, the method is simple enough (majority vote over samples) that implementation bias is less of a concern."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Figure 2 plots accuracy as a function of the number of sampled reasoning paths (1, 5, 10, 20, 40), directly showing the compute-performance tradeoff. Section 5 discusses this tradeoff."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether the reasoning benchmarks actually measure reasoning ability vs. pattern matching or memorization. The paper takes benchmark validity at face value."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is used. Self-consistency is a decoding strategy, not a scaffolded system."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether the models' training data includes the benchmark problems. Many benchmarks were published before the models were trained, so they could have been included in the training corpora."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information. The few-shot exemplars could prime the model differently for different problems, but this is not discussed."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether training data and test benchmarks share structural similarities or overlap."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Self-consistency improves chain-of-thought prompting by +17.9% on GSM8K, +11.0% on SVAMP, +12.2% on AQuA, +6.4% on StrategyQA, and +3.9% on ARC-challenge.",
    364       "evidence": "Tables 2 and 3 show these gains with PaLM-540B or GPT-3 code-davinci-002. Results averaged over 10 runs with std dev ≤0.5.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Self-consistency achieves new state-of-the-art results on arithmetic reasoning benchmarks without any task-specific training.",
    369       "evidence": "Table 2 compares against previous SoTA including fine-tuned models (GPT-3 175B finetuned + verifier). Self-consistency with PaLM-540B or GPT-3 surpasses these on most tasks.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Self-consistency significantly outperforms sample-and-rank, beam search, and ensemble-based approaches.",
    374       "evidence": "Figure 3 compares to sample-and-rank, Table 6 to beam search, Table 7 to prompt-order and multi-prompt ensembles, Table 10 to model ensembles. Self-consistency dominates in all comparisons.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Self-consistency helps recover performance when chain-of-thought hurts compared to standard prompting.",
    379       "evidence": "Table 5 shows that on ANLI-R1, e-SNLI, and RTE, CoT hurts but self-consistency recovers and outperforms standard prompting.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Majority vote (unweighted) performs comparably to normalized weighted sum for answer aggregation.",
    384       "evidence": "Table 1 shows majority vote at 74.4% vs. normalized weighted sum at 74.1% on GSM8K, with similar results across tasks. The paper explains this is because normalized probabilities are close to each other.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Self-consistency is robust to sampling strategies and parameters.",
    389       "evidence": "Figure 4 (left) and Figure 6 show results across various temperature and top-k settings, with consistent improvements regardless of sampling parameters.",
    390       "supported": "strong"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Company evaluating own models",
    396       "detail": "All authors are Google Research employees evaluating Google's LaMDA-137B and PaLM-540B models. While the method is also tested on public models (UL2, GPT-3), the largest gains are showcased on Google's proprietary models."
    397     },
    398     {
    399       "flag": "No contamination analysis",
    400       "detail": "Many benchmarks used (AddSub 2014, MultiArith 2015, ASDiv 2020, CommonsenseQA 2019) were publicly available well before the models' training. No training cutoff dates or contamination analysis is provided for any model."
    401     },
    402     {
    403       "flag": "No statistical significance tests",
    404       "detail": "Despite many comparative claims across 12+ benchmarks and 4 models, no statistical tests are performed. Claims of superiority rely solely on comparing accuracy numbers."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Chain of thought prompting elicits reasoning in large language models",
    410       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Brian Ichter", "Fei Xia", "Ed Chi", "Quoc Le", "Denny Zhou"],
    411       "year": 2022,
    412       "relevance": "The foundational work that self-consistency builds upon; introduces chain-of-thought prompting for LLM reasoning."
    413     },
    414     {
    415       "title": "Training verifiers to solve math word problems",
    416       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian", "Mark Chen", "Heewoo Jun", "Lukasz Kaiser", "Matthias Plappert", "Jerry Tworek", "Jacob Hilton", "Reiichiro Nakano", "Christopher Hesse", "John Schulman"],
    417       "year": 2021,
    418       "relevance": "Introduces GSM8K benchmark and the verifier approach that self-consistency outperforms without requiring training."
    419     },
    420     {
    421       "title": "Language models are few-shot learners",
    422       "authors": ["Tom Brown"],
    423       "year": 2020,
    424       "relevance": "GPT-3 paper establishing few-shot prompting paradigm; one of the four models evaluated."
    425     },
    426     {
    427       "title": "PaLM: Scaling language modeling with pathways",
    428       "authors": ["Aakanksha Chowdhery"],
    429       "year": 2022,
    430       "arxiv_id": "2204.02311",
    431       "relevance": "Introduces PaLM-540B, the largest model evaluated; key to understanding scaling effects of self-consistency."
    432     },
    433     {
    434       "title": "Large language models are zero-shot reasoners",
    435       "authors": ["Takeshi Kojima", "Shixiang Shane Gu", "Machel Reid", "Yutaka Matsuo", "Yusuke Iwasawa"],
    436       "year": 2022,
    437       "relevance": "Introduces zero-shot chain-of-thought prompting; self-consistency is shown to work with this approach as well."
    438     },
    439     {
    440       "title": "Evaluating large language models trained on code",
    441       "authors": ["Mark Chen"],
    442       "year": 2021,
    443       "relevance": "Codex paper; the code-davinci-001 and code-davinci-002 engines evaluated here belong to a single model family (GPT-3/Codex), one of the four model families evaluated."
    444     },
    445     {
    446       "title": "The unreliability of explanations in few-shot prompting for textual reasoning",
    447       "authors": ["Xi Ye", "Greg Durrett"],
    448       "year": 2022,
    449       "relevance": "Shows chain-of-thought can hurt performance on some tasks; self-consistency addresses this limitation."
    450     },
    451     {
    452       "title": "Scaling language models: Methods, analysis & insights from training Gopher",
    453       "authors": ["Jack W Rae"],
    454       "year": 2021,
    455       "relevance": "Discusses limitations of scaling for reasoning; provides context for why prompting strategies like self-consistency are needed."
    456     },
    457     {
    458       "title": "Beyond the imitation game: Measuring and extrapolating the capabilities of language models",
    459       "authors": ["BIG-bench collaboration"],
    460       "year": 2021,
    461       "relevance": "Major LLM benchmark suite; StrategyQA evaluation set sourced from BIG-bench."
    462     },
    463     {
    464       "title": "LaMDA: Language models for dialog applications",
    465       "authors": ["Romal Thoppilan"],
    466       "year": 2022,
    467       "arxiv_id": "2201.08239",
    468       "relevance": "Introduces LaMDA-137B, one of the four models evaluated; also describes the re-ranker approach that self-consistency outperforms."
    469     }
    470   ]
    471 }

Impressum · Datenschutz