scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31245B)
      1 {
      2   "paper": {
      3     "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
      4     "authors": [
      5       "Jason Wei",
      6       "Xuezhi Wang",
      7       "Dale Schuurmans",
      8       "Maarten Bosma",
      9       "Brian Ichter",
     10       "Fei Xia",
     11       "Ed H. Chi",
     12       "Quoc V. Le",
     13       "Denny Zhou"
     14     ],
     15     "year": 2022,
     16     "venue": "NeurIPS 2022",
     17     "arxiv_id": "2201.11903"
     18   },
     19   "scan_version": 3,
     20   "active_modules": [
     21     "experimental_rigor",
     22     "data_leakage"
     23   ],
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No code repository URL is provided in the paper. The authors provide inputs/outputs/targets for LaMDA and GPT-3 in supplementary material but do not release evaluation code or scripts."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper uses publicly available benchmarks (GSM8K, SVAMP, ASDiv, AQuA, MAWPS, CSQA, StrategyQA, BIG-bench tasks). The coinflip and last letter concatenation datasets are new and are provided in supplementary material (Appendix E.3). LaMDA inputs/outputs/targets are also provided."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Appendix E.2 states TPU v3 (8x8) for LaMDA 137B and TPU v4 (4x4x12) for PaLM 540B inference, and GPT-3 via public API. However, no software environment details (library versions, dependencies) are provided."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided. While full prompts are given in Appendix G and model outputs are in supplementary material, there are no scripts or instructions to replicate the experiments."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No confidence intervals or error bars are reported on main results. Standard deviations are only provided for LaMDA 137B ablation/robustness experiments in Tables 6 and 7, but not for the primary results across models and benchmarks in Tables 1-5."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No statistical significance tests are reported. Claims of improvement are based solely on comparing point estimates (e.g., '56.9% vs 17.9%') without any formal testing."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Performance improvements are reported with baseline context throughout (e.g., Table 1 shows '+39.0' for PaLM 540B on GSM8K going from 17.9% to 56.9%). Absolute and relative gains are consistently provided."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No justification is given for the number of evaluation examples or the choice of 50 examples for error analysis (Sections 3.2, Appendix D). The evaluation set sizes are inherited from existing benchmarks without discussion of statistical power."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Standard deviations across five random seeds (different exemplar orderings) are reported for LaMDA 137B in Tables 6 and 7. For other models, single exemplar orders are used to save compute, which is explicitly stated in Section 3.1."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Standard few-shot prompting is used as the primary baseline throughout. Prior supervised state-of-the-art results are also compared against (e.g., finetuned GPT-3 with verifier for GSM8K from Cobbe et al. 2021)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include contemporary prior best results from Cobbe et al. (2021), Jie et al. (2022), Pi et al. (2022), and others. These were recent at the time of publication."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 3.3 presents a detailed ablation study with three variations: equation only, variable compute only (dots), and chain of thought after answer. Results shown in Figure 5 and Tables 6-7."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Only solve rate (accuracy) is used as the evaluation metric across all benchmarks. No secondary metrics are reported."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Manual error analysis of 50 correct and 50 incorrect chain-of-thought outputs from LaMDA 137B on GSM8K (Section 3.2, Appendices D.1-D.2). Also manual analysis of 45 PaLM 62B errors (Appendix A.1)."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are reported on standard evaluation splits of each benchmark. The few-shot exemplars were manually composed or drawn from training sets, while results are on evaluation/test splits. Section 3.1 notes 'most of the datasets only have an evaluation split.'"
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down by individual benchmark (Tables 1-5), by model size (Figures 4, 7, 8), and by MAWPS subsets (Table 3: SingleOp, SingleEq, AddSub, MultiArith). In-domain vs OOD results provided for symbolic reasoning."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Appendix D.2 provides detailed error analysis categorizing 50 incorrect outputs into calculator errors (8%), symbol mapping errors (16%), one-step missing (22%), and semantic understanding/incoherence errors (54%). Specific examples are shown in Tables 10-11."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports that chain-of-thought prompting hurts performance for small models (<10B parameters), shown in Figure 4 and Table 2. Also reports minimal gains on CSQA (Section 4) and easy MAWPS subsets (Section 3.2, Table 3)."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims CoT improves reasoning, emerges in sufficiently large models, and achieves SOTA on GSM8K. All are supported: Figure 4 shows emergence, and Table 1 shows GSM8K SOTA (PaLM 540B with CoT reaches 56.9% vs. the prior best of 55%)."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper makes causal claims ('chain-of-thought prompting improves performance') and supports them with controlled ablation studies (Section 3.3): equation only, variable compute only, and reasoning after answer. These ablations isolate specific factors, supporting the causal claim that the content of the chain of thought matters."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper clearly scopes claims to specific tasks (arithmetic, commonsense, symbolic reasoning), specific models (GPT-3, LaMDA, PaLM), and specific model scales (≥100B). Section 6 explicitly discusses when CoT helps (Appendix A.3 gives three conditions) and states it only helps 'sufficiently large' models."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The ablation study (Section 3.3) tests three alternative explanations: (1) equations alone suffice, (2) more computation is the key factor, (3) CoT just activates knowledge. All are refuted with evidence. Section 6 also acknowledges models may not be 'actually reasoning.'"
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper consistently measures solve rate/accuracy on specific benchmarks and does not overclaim beyond these measurements. Claims are tied to specific datasets (e.g., 'on the GSM8K benchmark') rather than broad ungrounded claims about 'reasoning ability' in general."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.1 specifies exact model versions: GPT-3 text-ada-001, text-babbage-001, text-curie-001, text-davinci-002 with presumed parameter counts. LaMDA model sizes (422M to 137B), PaLM sizes (8B, 62B, 540B), UL2 20B, and Codex code-davinci-002 are specified."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Full prompt text for all tasks is provided in Appendix G (Tables 20-28). This includes all few-shot exemplars with chain-of-thought annotations. Alternate annotator prompts are in Appendix H (Tables 29-30)."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.1 states 'We sample from the models via greedy decoding.' For LaMDA, 'averaged results over five random seeds' with different exemplar orderings. Input context window limited to 1024 tokens (Appendix D.3)."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The approach is simple few-shot prompting with no tool use, retry logic, or agent workflows."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The exemplar selection process is documented: 8 manually composed exemplars for most benchmarks (Section 3.1), 4 from AQuA training set, and specific criteria for GSM8K exemplars (≤60 tokens, ≤2 steps, Section 3.4). Symbolic reasoning data generation is described in Section 5."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6 (Discussion) contains a dedicated paragraph on limitations, covering: (1) CoT doesn't prove reasoning, (2) annotation costs for finetuning, (3) no guarantee of correct reasoning paths, (4) only works at large scale making it costly to serve."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper discusses specific threats: generated chains of thought are not always factual (Section 6, Appendix D.1), incorrect reasoning can accidentally lead to correct answers especially for classification tasks (Appendix D.1), and prompt sensitivity affects results (Section 3.4, Appendix A.2)."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Appendix A.3 explicitly states when CoT helps and when it doesn't: three conditions must be met (challenging task, large model, flat scaling curve). Section 6 states CoT emergence 'only at large model scales makes it costly to serve.' The paper scopes to arithmetic, commonsense, and symbolic reasoning tasks."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Appendix E.1 states: 'we make exact inputs, targets, and predictions for LaMDA 137B for each task available as a zip file in the supplementary material.' GPT-3 results are reproducible via the public API."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "All benchmarks are publicly available with citations. The synthetic symbolic reasoning datasets are described in Section 5 with generation procedures (random concatenation from top-1000 names from namecensus.com). Chain-of-thought annotations are fully provided."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. The study evaluates language models on standard benchmarks."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The pipeline is straightforward: benchmark datasets → prompt construction with exemplars → model inference via greedy decoding → answer extraction → accuracy comparison. External calculator variant is also described (Python eval function, Table 1). The process for creating chain-of-thought annotations is documented."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding sources are disclosed. The paper is from Google Research, Brain Team, but no specific grant numbers or funding acknowledgments are provided beyond naming individual colleagues who gave feedback."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "All authors are identified as Google Research, Brain Team with email addresses {jasonwei,dennyzhou}@google.com on the first page."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The authors work at Google, which develops and markets the LaMDA and PaLM models being evaluated. Google has a commercial interest in demonstrating that large language models (especially their own) can perform better with prompting techniques."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement is present in the paper. Given all authors work at Google and evaluate Google's proprietary models (LaMDA, PaLM), financial interests are plausible but undisclosed."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No training data cutoff dates are stated for any of the five model families evaluated (GPT-3, LaMDA, PaLM, UL2, Codex). The reader cannot assess whether benchmark data appeared in training."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether evaluation benchmark examples could have appeared in the training data of any model. This is a significant omission given the use of publicly available benchmarks with large-scale models."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "Many benchmarks used (GSM8K 2021, CSQA 2019, SVAMP 2021, MAWPS 2016) were publicly available before model training. No contamination analysis is performed or discussed."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. This is a benchmark evaluation study of language models."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. This is a benchmark evaluation study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference costs, latency, or token consumption are reported for any experiment despite using multiple large models (up to PaLM 540B). Section 6 mentions large scale 'makes it costly to serve' but provides no quantification."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Appendix E.2 describes hardware used (TPU v3 for LaMDA, TPU v4 for PaLM) but explicitly states 'we did not estimate the total amount of compute.' No accelerator (TPU) hours, total API spend, or wall-clock time are provided."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "For LaMDA, results are averaged over five random seeds (different exemplar orderings) with standard deviations reported in Tables 6-7. However, for GPT-3 and PaLM, single exemplar orderings are used, justified by LaMDA's low variance."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Section 3.1 states: 'For LaMDA, we report averaged results over five random seeds.' For other models, 'to save compute we report results for a single exemplar order.'"
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No hyperparameter search is reported. The paper uses greedy decoding and manually composed prompts. Section 3.1 notes 'These particular exemplars did not undergo prompt engineering' but no search budget is documented."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "The paper uses a single set of 8 exemplars across most benchmarks (Section 3.1), and Section 3.4 demonstrates robustness across different annotators, exemplar sets, and orderings. No cherry-picking of configurations is apparent."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable. The paper reports only point estimates and standard deviations."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors do not discuss the bias of evaluating their own prompting technique. While they compare against prior work, they do not acknowledge that their implementations of baselines could be disadvantaged."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No analysis of performance as a function of compute. CoT prompting generates more tokens (longer outputs) than standard prompting, increasing compute cost, but this trade-off is not quantified."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper does not discuss whether the benchmarks actually measure 'reasoning ability' as claimed, or whether solve rate on these tasks is a valid proxy for reasoning. The connection between benchmark performance and the claimed cognitive ability is assumed rather than argued."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding is involved. The approach is direct few-shot prompting without any agentic framework."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of temporal leakage. Several benchmarks (MAWPS 2016, CSQA 2019) were published years before model training and could be in training data. No temporal analysis is provided."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of feature leakage. The few-shot exemplars provide the reasoning structure that the model then imitates, but whether this constitutes a form of answer leakage for the test examples is not discussed."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of whether training and test data are independent. Given the models were trained on large web corpora that could include these benchmarks, independence is not verified."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference, decontamination, or temporal splits are applied."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Chain-of-thought prompting significantly improves the ability of large language models to perform complex reasoning tasks.",
    375       "evidence": "Demonstrated across arithmetic (5 benchmarks), commonsense (5 benchmarks), and symbolic reasoning (2 tasks) with three model families. PaLM 540B on GSM8K: 17.9% → 56.9% (Section 3.2, Table 1, Figure 4).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Chain-of-thought reasoning is an emergent ability of model scale, not positively impacting small models.",
    380       "evidence": "Performance gains only appear at ~100B parameters across LaMDA, GPT-3, and PaLM (Figure 4, Tables 2-5). Small models produce 'fluent but illogical chains of thought' leading to lower performance than standard prompting (Section 3.2).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "PaLM 540B with chain-of-thought prompting achieves state-of-the-art on GSM8K, surpassing finetuned GPT-3 with a verifier.",
    385       "evidence": "PaLM 540B CoT achieves 56.9% on GSM8K vs prior best of 55% from Cobbe et al. (2021) finetuned GPT-3 with verifier (Figure 2, Table 1).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "The benefit of chain-of-thought prompting comes from the semantic reasoning content, not just additional computation or knowledge activation.",
    390       "evidence": "Ablation study (Section 3.3, Figure 5): variable compute only (dots) performs same as baseline; reasoning after answer performs same as baseline; equation only helps partially but not as much as full CoT.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Chain-of-thought prompting is robust across different annotators, exemplar sets, and exemplar orderings.",
    395       "evidence": "Three annotators, three GSM8K-sampled exemplar sets, varying numbers of exemplars, and multiple orderings all outperform standard prompting (Section 3.4, Figure 6, Tables 6-7, Figure 11).",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Chain-of-thought prompting facilitates length generalization to longer sequences in symbolic reasoning.",
    400       "evidence": "Models prompted with 2-step few-shot exemplars generalize to 3- and 4-step problems for last letter concatenation and coin flip tasks (Section 5, Figure 8, Table 5). Performance is lower than in-domain but scales upward with model size.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "Chain-of-thought prompting—providing intermediate reasoning steps in few-shot exemplars—dramatically improves large language model performance on arithmetic, commonsense, and symbolic reasoning tasks, but only at model scales of ~100B+ parameters. PaLM 540B with CoT achieved state-of-the-art on GSM8K (56.9%) surpassing finetuned approaches. Ablation studies show the benefit comes from the semantic content of reasoning steps, not merely from additional computation or knowledge activation. The approach is robust across different annotators, exemplar sets, and model families.",
    408   "red_flags": [
    409     {
    410       "flag": "Company evaluating own models",
    411       "detail": "All authors are from Google Research, Brain Team. Two of the five model families evaluated (LaMDA and PaLM) are Google proprietary models. The paper demonstrates that Google's largest models benefit most from the technique."
    412     },
    413     {
    414       "flag": "No contamination analysis",
    415       "detail": "Multiple benchmarks used (MAWPS 2016, CSQA 2019, GSM8K 2021) were publicly available before model training, yet no contamination analysis is performed. Training data cutoffs are not stated for any model."
    416     },
    417     {
    418       "flag": "Single metric evaluation",
    419       "detail": "Only solve rate (accuracy) is reported across all experiments. No secondary metrics (e.g., reasoning chain quality, calibration, partial credit) are used to evaluate the approach."
    420     },
    421     {
    422       "flag": "Incomplete variance reporting",
    423       "detail": "Standard deviations across seeds are only reported for LaMDA 137B. For GPT-3 and PaLM (including the SOTA-claiming PaLM 540B results), only single-run numbers are reported, justified by LaMDA's low variance—but variance may differ across model families."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Language Models are Few-Shot Learners",
    429       "authors": [
    430         "Tom Brown",
    431         "Benjamin Mann",
    432         "Nick Ryder"
    433       ],
    434       "year": 2020,
    435       "relevance": "Foundational work on few-shot prompting with GPT-3, the baseline approach that CoT prompting extends."
    436     },
    437     {
    438       "title": "Training Verifiers to Solve Math Word Problems",
    439       "authors": [
    440         "Karl Cobbe",
    441         "Vineet Kosaraju",
    442         "Mohammad Bavarian"
    443       ],
    444       "year": 2021,
    445       "arxiv_id": "2110.14168",
    446       "relevance": "Introduced GSM8K benchmark and the finetuned GPT-3 verifier approach that CoT prompting surpasses."
    447     },
    448     {
    449       "title": "Evaluating Large Language Models Trained on Code",
    450       "authors": [
    451         "Mark Chen",
    452         "Jerry Tworek",
    453         "Heewoo Jun"
    454       ],
    455       "year": 2021,
    456       "arxiv_id": "2107.03374",
    457       "relevance": "Introduces Codex and code evaluation methodology; one of the five model families evaluated with CoT prompting."
    458     },
    459     {
    460       "title": "Emergent Abilities of Large Language Models",
    461       "authors": [
    462         "Jason Wei",
    463         "Yi Tay",
    464         "Rishi Bommasani"
    465       ],
    466       "year": 2022,
    467       "relevance": "Provides theoretical framing for the emergence of CoT reasoning at scale, directly tied to this paper's core finding."
    468     },
    469     {
    470       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    471       "authors": [
    472         "Xuezhi Wang",
    473         "Jason Wei",
    474         "Dale Schuurmans"
    475       ],
    476       "year": 2022,
    477       "arxiv_id": "2203.11171",
    478       "relevance": "Follow-up work showing majority voting over sampled CoT generations further improves performance."
    479     },
    480     {
    481       "title": "Show Your Work: Scratchpads for Intermediate Computation with Language Models",
    482       "authors": [
    483         "Maxwell Nye",
    484         "Anders Johan Andreassen",
    485         "Guy Gur-Ari"
    486       ],
    487       "year": 2021,
    488       "arxiv_id": "2112.00114",
    489       "relevance": "Closest prior work using intermediate computation steps for program execution; CoT generalizes this to natural language."
    490     },
    491     {
    492       "title": "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
    493       "authors": [
    494         "Wang Ling",
    495         "Dani Yogatama",
    496         "Chris Dyer"
    497       ],
    498       "year": 2017,
    499       "relevance": "Pioneered natural language rationales for math problem solving, the training-based predecessor to CoT prompting."
    500     },
    501     {
    502       "title": "Do As I Can, Not As I Say: Grounding Language in Robotic Affordances",
    503       "authors": [
    504         "Michael Ahn",
    505         "Anthony Brohan",
    506         "Noah Brown"
    507       ],
    508       "year": 2022,
    509       "arxiv_id": "2204.01691",
    510       "relevance": "SayCan robot planning benchmark used to evaluate CoT prompting for commonsense reasoning in robotic instruction following."
    511     },
    512     {
    513       "title": "Scaling Language Models: Methods, Analysis & Insights from Training Gopher",
    514       "authors": [
    515         "Jack W. Rae",
    516         "Sebastian Borgeaud",
    517         "Trevor Cai"
    518       ],
    519       "year": 2021,
    520       "arxiv_id": "2112.11446",
    521       "relevance": "Documented that scaling alone is insufficient for reasoning tasks, motivating CoT prompting as an alternative approach."
    522     },
    523     {
    524       "title": "STaR: Bootstrapping Reasoning with Reasoning",
    525       "authors": [
    526         "Eric Zelikman",
    527         "Yuhuai Wu",
    528         "Noah D. Goodman"
    529       ],
    530       "year": 2022,
    531       "arxiv_id": "2203.14465",
    532       "relevance": "Extends CoT idea to self-training: models generate rationales, filter correct ones, and finetune on them."
    533     },
    534     {
    535       "title": "Finetuned Language Models Are Zero-Shot Learners",
    536       "authors": [
    537         "Jason Wei",
    538         "Maarten Bosma",
    539         "Vincent Y. Zhao"
    540       ],
    541       "year": 2022,
    542       "relevance": "Instruction tuning work (FLAN) that augments inputs with task instructions; CoT takes the orthogonal approach of augmenting outputs."
    543     },
    544     {
    545       "title": "Program Synthesis with Large Language Models",
    546       "authors": [
    547         "Jacob Austin",
    548         "Augustus Odena",
    549         "Maxwell Nye"
    550       ],
    551       "year": 2021,
    552       "arxiv_id": "2108.07732",
    553       "relevance": "Evaluates LLMs for code generation, related to using intermediate steps in program synthesis."
    554     }
    555   ],
    556   "engagement_factors": {
    557     "practical_relevance": {
    558       "score": 3,
    559       "justification": "Chain-of-thought prompting is a directly usable technique that any developer working with LLMs can apply immediately to improve reasoning outputs."
    560     },
    561     "surprise_contrarian": {
    562       "score": 2,
    563       "justification": "The emergent scaling finding—that CoT hurts small models but dramatically helps 100B+ models—was genuinely surprising and reshaped how the field thinks about prompting."
    564     },
    565     "fear_safety": {
    566       "score": 0,
    567       "justification": "The paper focuses on improving reasoning accuracy with no safety, security, or risk angle."
    568     },
    569     "drama_conflict": {
    570       "score": 0,
    571       "justification": "No controversy or conflict; the paper presents a new technique without challenging specific claims or companies."
    572     },
    573     "demo_ability": {
    574       "score": 2,
    575       "justification": "The technique is immediately reproducible via the GPT-3 API with the exact prompts provided in the appendix, though it requires access to large-scale models."
    576     },
    577     "brand_recognition": {
    578       "score": 3,
    579       "justification": "From Google Brain, with prominent authors (Jason Wei, Quoc V. Le, Denny Zhou) and evaluations of GPT-3 and PaLM—all household names in the AI community."
    580     }
    581   }
    582 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs