scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33274B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
      6     "authors": [
      7       "Wei, J.",
      8       "Wang, X.",
      9       "Schuurmans, D.",
     10       "Bosma, M.",
     11       "Ichter, B.",
     12       "Xia, F.",
     13       "Chi, E.",
     14       "Le, Q.",
     15       "Zhou, D."
     16     ],
     17     "year": 2022,
     18     "venue": "NeurIPS 2022",
     19     "arxiv_id": "2201.11903",
     20     "doi": null
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The abstract claims CoT improves reasoning, emerges in sufficiently large models, and achieves SOTA on GSM8K. All are supported: Figure 4 shows emergence, and Table 1 shows GSM8K SOTA (PaLM 540B with CoT reaches 56.9% vs. the prior best of 55%).",
     28         "source": "opus"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper makes causal claims ('chain-of-thought prompting improves performance') and supports them with controlled ablation studies (Section 3.3): equation only, variable compute only, and reasoning after answer. These ablations isolate specific factors, supporting the causal claim that the content of the chain of thought matters.",
     34         "source": "opus"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper clearly scopes claims to specific tasks (arithmetic, commonsense, symbolic reasoning), specific models (GPT-3, LaMDA, PaLM), and specific model scales (≥100B). Section 6 explicitly discusses when CoT helps (Appendix A.3 gives three conditions) and states it only helps 'sufficiently large' models.",
     40         "source": "opus"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The ablation study (Section 3.3) tests three alternative explanations: (1) equations alone suffice, (2) more computation is the key factor, (3) CoT just activates knowledge. All are refuted with evidence. Section 6 also acknowledges models may not be 'actually reasoning.'",
     46         "source": "opus"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper consistently measures solve rate/accuracy on specific benchmarks and does not overclaim beyond these measurements. Claims are tied to specific datasets (e.g., 'on the GSM8K benchmark') rather than broad ungrounded claims about 'reasoning ability' in general.",
     52         "source": "opus"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 6 (Discussion) contains a dedicated paragraph on limitations, covering: (1) CoT doesn't prove reasoning, (2) annotation costs for finetuning, (3) no guarantee of correct reasoning paths, (4) emergence only at large model scales, which makes it costly to serve.",
     60         "source": "opus"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper discusses specific threats: generated chains of thought are not always factual (Section 6, Appendix D.1), incorrect reasoning can accidentally lead to correct answers especially for classification tasks (Appendix D.1), and prompt sensitivity affects results (Section 3.4, Appendix A.2).",
     66         "source": "opus"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Appendix A.3 explicitly states when CoT helps and when it doesn't: three conditions must be met (challenging task, large model, flat scaling curve). Section 6 states CoT emergence 'only at large model scales makes it costly to serve.' The paper scopes to arithmetic, commonsense, and symbolic reasoning tasks.",
     72         "source": "opus"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No funding sources are disclosed. The paper is from Google Research, Brain Team, but no specific grant numbers or funding acknowledgments are provided beyond naming individual colleagues who gave feedback.",
     80         "source": "opus"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "All authors are identified as Google Research, Brain Team with email addresses {jasonwei,dennyzhou}@google.com on the first page.",
     86         "source": "opus"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "The authors work at Google, which develops and markets the LaMDA and PaLM models being evaluated. Google has a commercial interest in demonstrating that large language models (especially their own) can perform better with prompting techniques.",
     92         "source": "opus"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests statement is present in the paper. Given all authors work at Google and evaluate Google's proprietary models (LaMDA, PaLM), financial interests are plausible but undisclosed.",
     98         "source": "opus"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Chain of thought is precisely defined in Section 2 as 'a series of intermediate natural language reasoning steps that lead to the final output,' and the (input, chain of thought, output) prompting format is explicitly described.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The contribution is clearly stated: chain-of-thought prompting as a simple prompting method for eliciting multi-step reasoning, showing it is an emergent property of model scale.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 7 and extended Appendix C situate the work relative to five prior directions (prompting, NLEs, program synthesis, numeric/logical reasoning, intermediate steps), explicitly contrasting with and building on specific prior contributions.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "No code repository URL is provided in the paper. The authors provide inputs/outputs/targets for LaMDA and GPT-3 in supplementary material but do not release evaluation code or scripts.",
    129           "source": "opus"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "The paper uses publicly available benchmarks (GSM8K, SVAMP, ASDiv, AQuA, MAWPS, CSQA, StrategyQA, BIG-bench tasks). The coinflip and last letter concatenation datasets are new and are provided in supplementary material (Appendix E.3). LaMDA inputs/outputs/targets are also provided.",
    135           "source": "opus"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Appendix E.2 states TPU v3 (8x8) for LaMDA 137B and TPU v4 (4x4x12) for PaLM 540B inference, and GPT-3 via public API. However, no software environment details (library versions, dependencies) are provided.",
    141           "source": "opus"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No step-by-step reproduction instructions are provided. While full prompts are given in Appendix G and model outputs are in supplementary material, there are no scripts or instructions to replicate the experiments.",
    147           "source": "opus"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No confidence intervals or error bars are reported on main results. Standard deviations are only provided for LaMDA 137B ablation/robustness experiments in Tables 6 and 7, but not for the primary results across models and benchmarks in Tables 1-5.",
    155           "source": "opus"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "No statistical significance tests are reported. Claims of improvement are based solely on comparing point estimates (e.g., '56.9% vs 17.9%') without any formal testing.",
    161           "source": "opus"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Performance improvements are reported with baseline context throughout (e.g., Table 1 shows '+39.0' for PaLM 540B on GSM8K going from 17.9% to 56.9%). Absolute and relative gains are consistently provided.",
    167           "source": "opus"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No justification is given for the number of evaluation examples or the choice of 50 examples for error analysis (Section 3.2, Appendix D). The evaluation set sizes are inherited from existing benchmarks without discussion of statistical power.",
    173           "source": "opus"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Standard deviations across five random seeds (different exemplar orderings) are reported for LaMDA 137B in Tables 6 and 7. For other models, single exemplar orders are used to save compute, which is explicitly stated in Section 3.1.",
    179           "source": "opus"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Standard few-shot prompting is used as the primary baseline throughout. Prior supervised state-of-the-art results are also compared against (e.g., finetuned GPT-3 with verifier for GSM8K from Cobbe et al. 2021).",
    187           "source": "opus"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Baselines include contemporary prior best results from Cobbe et al. (2021), Jie et al. (2022), Pi et al. (2022), and others. These were recent at the time of publication.",
    193           "source": "opus"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Section 3.3 presents a detailed ablation study with three variations: equation only, variable compute only (dots), and chain of thought after answer. Results shown in Figure 5 and Tables 6-7.",
    199           "source": "opus"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "Only solve rate (accuracy) is used as the evaluation metric across all benchmarks. No secondary metrics are reported.",
    205           "source": "opus"
    206         },
    207         "human_evaluation": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Manual error analysis of 50 correct and 50 incorrect chain-of-thought outputs from LaMDA 137B on GSM8K (Section 3.2, Appendices D.1-D.2). Also manual analysis of 45 PaLM 62B errors (Appendix A.1).",
    211           "source": "opus"
    212         },
    213         "held_out_test_set": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are reported on standard evaluation splits of each benchmark. The few-shot exemplars were manually composed or drawn from training sets, while results are on evaluation/test splits. Section 3.1 notes 'most of the datasets only have an evaluation split.'",
    217           "source": "opus"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Results are broken down by individual benchmark (Tables 1-5), by model size (Figures 4, 7, 8), and by MAWPS subsets (Table 3: SingleOp, SingleEq, AddSub, MultiArith). In-domain vs OOD results provided for symbolic reasoning.",
    223           "source": "opus"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Appendix D.2 provides detailed error analysis categorizing 50 incorrect outputs into calculator errors (8%), symbol mapping errors (16%), one-step missing (22%), and semantic understanding/incoherence errors (54%). Specific examples are shown in Tables 10-11.",
    229           "source": "opus"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "The paper reports that chain-of-thought prompting hurts performance for small models (<10B parameters), shown in Figure 4 and Table 2. Also reports minimal gains on CSQA (Section 4) and easy MAWPS subsets (Section 3.2, Table 3).",
    235           "source": "opus"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Section 3.1 specifies exact model versions: GPT-3 text-ada-001, text-babbage-001, text-curie-001, text-davinci-002 with presumed parameter counts. LaMDA model sizes (422M to 137B), PaLM sizes (8B, 62B, 540B), UL2 20B, and Codex code-davinci-002 are specified.",
    243           "source": "opus"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Full prompt text for all tasks is provided in Appendix G (Tables 20-28). This includes all few-shot exemplars with chain-of-thought annotations. Alternate annotator prompts are in Appendix H (Tables 29-30).",
    249           "source": "opus"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Section 3.1 states 'We sample from the models via greedy decoding.' For LaMDA, 'averaged results over five random seeds' with different exemplar orderings. Input context window limited to 1024 tokens (Appendix D.3).",
    255           "source": "opus"
    256         },
    257         "scaffolding_described": {
    258           "applies": false,
    259           "answer": false,
    260           "justification": "No agentic scaffolding is used. The approach is simple few-shot prompting with no tool use, retry logic, or agent workflows.",
    261           "source": "opus"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "The exemplar selection process is documented: 8 manually composed exemplars for most benchmarks (Section 3.1), 4 from AQuA training set, and specific criteria for GSM8K exemplars (≤60 tokens, ≤2 steps, Section 3.4). Symbolic reasoning data generation is described in Section 5.",
    267           "source": "opus"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Appendix E.1 states: 'we make exact inputs, targets, and predictions for LaMDA 137B for each task available as a zip file in the supplementary material.' GPT-3 results are reproducible via the public API.",
    275           "source": "opus"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "All benchmarks are publicly available with citations. The synthetic symbolic reasoning datasets are described in Section 5 with generation procedures (random concatenation from top-1000 names from namecensus.com). Chain-of-thought annotations are fully provided.",
    281           "source": "opus"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": false,
    285           "answer": false,
    286           "justification": "No human participants. The study evaluates language models on standard benchmarks.",
    287           "source": "opus"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "The pipeline is straightforward: benchmark datasets → prompt construction with exemplars → model inference via greedy decoding → answer extraction → accuracy comparison. External calculator variant is also described (Python eval function, Table 1). The process for creating chain-of-thought annotations is documented.",
    293           "source": "opus"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No training data cutoff dates are stated for any of the five model families evaluated (GPT-3, LaMDA, PaLM, UL2, Codex). The reader cannot assess whether benchmark data appeared in training.",
    301           "source": "opus"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No discussion of whether evaluation benchmark examples could have appeared in the training data of any model. This is a significant omission given the use of publicly available benchmarks with large-scale models.",
    307           "source": "opus"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "Many benchmarks used (GSM8K 2021, CSQA 2019, SVAMP 2021, MAWPS 2016) were publicly available before model training. No contamination analysis is performed or discussed.",
    313           "source": "opus"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants. This is a benchmark evaluation study of language models.",
    321           "source": "opus"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants. This is a benchmark evaluation study.",
    327           "source": "opus"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "opus"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "opus"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "opus"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "opus"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "No human participants.",
    357           "source": "opus"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No inference costs, latency, or token consumption are reported for any experiment despite using multiple large models (up to PaLM 540B). Section 6 mentions large scale 'makes it costly to serve' but provides no quantification.",
    365           "source": "opus"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "Appendix E.2 describes hardware used (TPU v3 for LaMDA, TPU v4 for PaLM) but explicitly states 'we did not estimate the total amount of compute.' No GPU hours, total API spend, or wall-clock time are provided.",
    371           "source": "opus"
    372         }
    373       },
    374       "experimental_rigor": {
    375         "seed_sensitivity_reported": {
    376           "applies": true,
    377           "answer": true,
    378           "justification": "For LaMDA, results are averaged over five random seeds (different exemplar orderings) with standard deviations reported in Tables 6-7. However, for GPT-3 and PaLM, single exemplar orderings are used, justified by LaMDA's low variance.",
    379           "source": "opus"
    380         },
    381         "number_of_runs_stated": {
    382           "applies": true,
    383           "answer": true,
    384           "justification": "Section 3.1 states: 'For LaMDA, we report averaged results over five random seeds.' For other models, 'to save compute we report results for a single exemplar order.'",
    385           "source": "opus"
    386         },
    387         "hyperparameter_search_budget": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "No hyperparameter search is reported. The paper uses greedy decoding and manually composed prompts. Section 3.1 notes 'These particular exemplars did not undergo prompt engineering' but no search budget is documented.",
    391           "source": "opus"
    392         },
    393         "best_config_selection_justified": {
    394           "applies": true,
    395           "answer": true,
    396           "justification": "The paper uses a single set of 8 exemplars across most benchmarks (Section 3.1), and Section 3.4 demonstrates robustness across different annotators, exemplar sets, and orderings. No cherry-picking of configurations is apparent.",
    397           "source": "opus"
    398         },
    399         "multiple_comparison_correction": {
    400           "applies": false,
    401           "answer": false,
    402           "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable. The paper reports only point estimates and standard deviations.",
    403           "source": "opus"
    404         },
    405         "self_comparison_bias_addressed": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "The authors do not discuss the bias of evaluating their own prompting technique. While they compare against prior work, they do not acknowledge that their implementations of baselines could be disadvantaged.",
    409           "source": "opus"
    410         },
    411         "compute_budget_vs_performance": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "No analysis of performance as a function of compute. CoT prompting generates more tokens (longer outputs) than standard prompting, increasing compute cost, but this trade-off is not quantified.",
    415           "source": "opus"
    416         },
    417         "benchmark_construct_validity": {
    418           "applies": true,
    419           "answer": false,
    420           "justification": "The paper does not discuss whether the benchmarks actually measure 'reasoning ability' as claimed, or whether solve rate on these tasks is a valid proxy for reasoning. The connection between benchmark performance and the claimed cognitive ability is assumed rather than argued.",
    421           "source": "opus"
    422         },
    423         "scaffold_confound_addressed": {
    424           "applies": false,
    425           "answer": false,
    426           "justification": "No scaffolding is involved. The approach is direct few-shot prompting without any agentic framework.",
    427           "source": "opus"
    428         }
    429       },
    430       "data_leakage": {
    431         "temporal_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of temporal leakage. Several benchmarks (MAWPS 2016, CSQA 2019) were published years before model training and could be in training data. No temporal analysis is provided.",
    435           "source": "opus"
    436         },
    437         "feature_leakage_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of feature leakage. The few-shot exemplars provide the reasoning structure that the model then imitates, but whether this constitutes a form of answer leakage for the test examples is not discussed.",
    441           "source": "opus"
    442         },
    443         "non_independence_addressed": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No discussion of whether training and test data are independent. Given the models were trained on large web corpora that could include these benchmarks, independence is not verified.",
    447           "source": "opus"
    448         },
    449         "leakage_detection_method": {
    450           "applies": true,
    451           "answer": false,
    452           "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference, decontamination, or temporal splits are applied.",
    453           "source": "opus"
    454         }
    455       }
    456     }
    457   },
    458   "claims": [
    459     {
    460       "claim": "Chain-of-thought prompting significantly improves LLM performance on arithmetic, commonsense, and symbolic reasoning benchmarks",
    461       "evidence": "Table 1 shows PaLM 540B improving from 17.9% to 56.9% on GSM8K; Tables 2–5 show consistent improvements across 3 model families and 10 benchmarks",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Chain-of-thought reasoning is an emergent ability requiring ~100B+ parameters; it hurts performance in smaller models",
    466       "evidence": "Figure 4 and Tables 2–5 show CoT consistently hurts or provides negligible gains below ~10B parameters for LaMDA, GPT, and PaLM across all task types",
    467       "supported": "strong"
    468     },
    469     {
    470       "claim": "PaLM 540B with CoT achieves new state-of-the-art on GSM8K without finetuning, surpassing finetuned GPT-3 with verifier",
    471       "evidence": "Figure 2 shows PaLM 540B + CoT at 57% vs prior finetuned best at 55%; confirmed in Table 1",
    472       "supported": "strong"
    473     },
    474     {
    475       "claim": "Performance gains are due to sequential natural language reasoning, not variable compute or knowledge activation alone",
    476       "evidence": "Section 3.3 ablations show variable compute only (dot sequences) and reasoning-after-answer both perform at baseline level, while CoT substantially exceeds both",
    477       "supported": "strong"
    478     },
    479     {
    480       "claim": "CoT prompting is robust to different annotators, exemplar sets, and linguistic styles",
    481       "evidence": "Section 3.4 and Figure 6 show all three annotators and three independent GSM8K exemplar sets outperform standard prompting; tested only on LaMDA 137B",
    482       "supported": "moderate"
    483     },
    484     {
    485       "claim": "CoT enables out-of-distribution generalization to longer symbolic sequences not seen in few-shot exemplars",
    486       "evidence": "Figure 8 shows CoT generalizes to 3–4 word names and 3–4 coin flips when exemplars only show 2-word/2-flip cases; standard prompting fails completely on OOD",
    487       "supported": "strong"
    488     }
    489   ],
    490   "methodology_tags": [
    491     "benchmark-eval"
    492   ],
    493   "key_findings": "Chain-of-thought prompting—augmenting few-shot exemplars with step-by-step reasoning demonstrations—dramatically improves large language model performance on arithmetic, commonsense, and symbolic reasoning benchmarks, with PaLM 540B achieving state-of-the-art on GSM8K without finetuning. The effect is emergent: CoT provides no benefit (and sometimes hurts) models below ~100B parameters, suggesting this capability arises from scale. Ablation studies demonstrate that sequential natural language reasoning—not extra computation or improved knowledge retrieval—accounts for the gains. The method is robust to annotator differences, prompt variations, and exemplar selection, and enables out-of-distribution length generalization for symbolic tasks.",
    494   "red_flags": [
    495     {
    496       "flag": "Proprietary model lock-in",
    497       "detail": "Headline results rely on Google-internal PaLM 540B and LaMDA 137B models inaccessible to outside researchers; reproducibility is limited to the GPT-3 results."
    498     },
    499     {
    500       "flag": "Self-evaluation without independent replication",
    501       "detail": "Google employees evaluate Google's own proprietary models as the primary experimental vehicles, with no third-party replication and no conflict-of-interest disclosure."
    502     },
    503     {
    504       "flag": "No contamination analysis",
    505       "detail": "Training data cutoffs for PaLM, LaMDA, and GPT-3 are not stated, and no analysis of potential benchmark contamination is conducted despite evaluation benchmarks being publicly available before model training."
    506     },
    507     {
    508       "flag": "Overclaiming generalization",
    509       "detail": "The paper claims CoT is 'potentially applicable to any task that humans can solve via language' based on three task families; this generalization substantially exceeds the empirical scope."
    510     },
    511     {
    512       "flag": "Missing variance for headline results",
    513       "detail": "Standard deviations are only reported for LaMDA ablation conditions; PaLM 540B and GPT-3 headline accuracy figures have no variance estimates or significance tests."
    514     }
    515   ],
    516   "cited_papers": [
    517     {
    518       "title": "Language Models are Few-Shot Learners",
    519       "relevance": "Introduces GPT-3 and the few-shot prompting baseline that CoT directly builds on and extends"
    520     },
    521     {
    522       "title": "Training Verifiers to Solve Math Word Problems",
    523       "relevance": "Introduces GSM8K benchmark and establishes the finetuned GPT-3 baseline that CoT surpasses without finetuning"
    524     },
    525     {
    526       "title": "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
    527       "relevance": "Pioneering work on natural language rationales for math reasoning that directly motivates chain-of-thought"
    528     },
    529     {
    530       "title": "Show Your Work: Scratchpads for Intermediate Computation with Language Models",
    531       "relevance": "Closely related work on intermediate reasoning steps via language models, contrasted with CoT's prompting-only approach"
    532     },
    533     {
    534       "title": "Emergent Abilities of Large Language Models",
    535       "relevance": "Companion paper establishing the theoretical framework for emergent capabilities, cited to explain scale-dependent CoT"
    536     },
    537     {
    538       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    539       "relevance": "Direct follow-up work extending CoT with majority voting over multiple sampled reasoning paths"
    540     },
    541     {
    542       "title": "Scaling Laws for Neural Language Models",
    543       "relevance": "Foundational scaling work establishing the relationship between model size and capability, used to motivate emergent CoT findings"
    544     },
    545     {
    546       "title": "Do As I Can, Not As I Say: Grounding Language in Robotic Affordances",
    547       "relevance": "SayCan robot planning dataset used to demonstrate CoT on embodied commonsense reasoning"
    548     }
    549   ],
    550   "engagement_factors": {
    551     "practical_relevance": {
    552       "score": 3,
    553       "justification": "Chain-of-thought prompting is a directly usable technique that any developer working with LLMs can apply immediately to improve reasoning outputs."
    554     },
    555     "surprise_contrarian": {
    556       "score": 2,
    557       "justification": "The emergent scaling finding—that CoT hurts small models but dramatically helps 100B+ models—was genuinely surprising and reshaped how the field thinks about prompting."
    558     },
    559     "fear_safety": {
    560       "score": 0,
    561       "justification": "The paper focuses on improving reasoning accuracy with no safety, security, or risk angle."
    562     },
    563     "drama_conflict": {
    564       "score": 0,
    565       "justification": "No controversy or conflict; the paper presents a new technique without challenging specific claims or companies."
    566     },
    567     "demo_ability": {
    568       "score": 2,
    569       "justification": "The technique is immediately reproducible via the GPT-3 API with the exact prompts provided in the appendix, though it requires access to large-scale models."
    570     },
    571     "brand_recognition": {
    572       "score": 3,
    573       "justification": "From Google Brain with prominent authors (Jason Wei, Quoc Le, Denny Zhou), evaluating GPT-3 and PaLM—all household names in the AI community."
    574     }
    575   },
    576   "hn_data": {
    577     "threads": [
    578       {
    579         "hn_id": "42711991",
    580         "title": "Show HN: QwQ-32B APIs – o1 like reasoning at 1% the cost",
    581         "points": 17,
    582         "comments": 3,
    583         "url": "https://news.ycombinator.com/item?id=42711991",
    584         "created_at": "2025-01-15T15:29:12Z"
    585       },
    586       {
    587         "hn_id": "30988904",
    588         "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    589         "points": 2,
    590         "comments": 1,
    591         "url": "https://news.ycombinator.com/item?id=30988904",
    592         "created_at": "2022-04-11T14:08:01Z"
    593       },
    594       {
    595         "hn_id": "30112147",
    596         "title": "Plume: Differential Privacy at Scale",
    597         "points": 2,
    598         "comments": 0,
    599         "url": "https://news.ycombinator.com/item?id=30112147",
    600         "created_at": "2022-01-28T08:34:48Z"
    601       },
    602       {
    603         "hn_id": "34053182",
    604         "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    605         "points": 1,
    606         "comments": 0,
    607         "url": "https://news.ycombinator.com/item?id=34053182",
    608         "created_at": "2022-12-19T15:29:09Z"
    609       }
    610     ],
    611     "top_points": 17,
    612     "total_points": 22,
    613     "total_comments": 4
    614   }
    615 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs