ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (28607B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
      6     "authors": [
      7       "Wei, J.",
      8       "Wang, X.",
      9       "Schuurmans, D.",
     10       "Bosma, M.",
     11       "Ichter, B.",
     12       "Xia, F.",
     13       "Chi, E.",
     14       "Le, Q.",
     15       "Zhou, D."
     16     ],
     17     "year": 2022,
     18     "venue": "NeurIPS 2022",
     19     "arxiv_id": "2201.11903",
     20     "doi": null
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "All abstract claims are directly supported: CoT improves arithmetic/commonsense/symbolic reasoning (Sections 3–5), and PaLM 540B achieves SOTA on GSM8K (Table 1, Figure 2) with exact numbers provided.",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 3.3 ablation study isolates causal role of sequential natural language reasoning by testing equation-only, variable-compute-only (dots), and reasoning-after-answer variants, ruling out competing explanations for gains.",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 6 and Appendix A.3 explicitly bound generalization: CoT is an emergent ability requiring ~100B+ parameter models, gains are minimal for easy single-step tasks, and conditions for benefit are stated.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Section 3.3 explicitly tests and rules out: variable computation alone, equation-only intermediate steps, and chain-of-thought provided only after the answer as alternative explanations for performance gains.",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 6 explicitly states 'chain of thought emulates the thought processes of human reasoners' but 'does not answer whether the neural network is actually reasoning,' clearly distinguishing benchmark accuracy from genuine reasoning.",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 6 (Discussion) contains a dedicated limitations paragraph listing four specific limitations: the open question of actual reasoning, annotation costs for finetuning, no guarantee of correct reasoning paths, and the large-scale requirement.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Specific threats are addressed: CoT hurts models below ~10B parameters (Table 2), prompt engineering sensitivity is quantified with variance across annotators (Tables 6-7), and incorrect reasoning paths leading to correct answers are identified as a validity concern (Appendix D.1).",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Appendix A.3 explicitly states conditions where CoT helps vs. does not: requires challenging multi-step reasoning, large model, and flat scaling curve; gains are minimal for easy problems where models already score >90%.",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No explicit funding disclosure statement is present; all authors are Google Research employees but no formal funding acknowledgment appears in the paper.",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "All nine authors are explicitly identified as 'Google Research, Brain Team' in the paper header.",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "All authors are Google employees evaluating Google's proprietary PaLM model, which achieves the headline SOTA result; Google benefits directly from positive outcomes.",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests statement or declaration of financial interests is included anywhere in the paper.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 2 explicitly defines 'chain of thought' as 'a series of intermediate natural language reasoning steps that lead to the final output' and distinguishes it from standard prompting with a concrete example.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper clearly states it explores chain-of-thought prompting—providing CoT demonstrations as few-shot exemplars—to unlock reasoning abilities in LLMs without finetuning, with the contribution positioned as combining rationale-augmented training and few-shot prompting.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 7 and Appendix C (Extended Related Work) engage substantively with five prior directions, explicitly situating CoT as orthogonal to instruction-following approaches (augments outputs vs. inputs) and distinct from finetuning-based rationale methods.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "No source code is released; the prompts (Appendix G) and supplementary LaMDA predictions (Appendix E.1) are provided, but there is no code for reproducing the experimental pipeline. The GPT-3 experiments can be re-run through the API with the provided prompts, but that is replication by the reader, not released code.",
    129           "source": "haiku"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "All benchmarks used (GSM8K, SVAMP, ASDiv, AQuA, MAWPS, CSQA, StrategyQA, BIG-bench tasks, SayCan) are publicly available; synthetic datasets (last letter concatenation, coin flip) are provided in supplementary materials.",
    135           "source": "haiku"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Appendix E.2 mentions TPU v3 (8x8) for LaMDA and TPU v4 (4x4x12) for PaLM inference but provides no software environment specs, dependency versions, or configuration files.",
    141           "source": "haiku"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "LaMDA and PaLM are proprietary and inaccessible; while prompts are in Appendix G, Appendix E.1 acknowledges reproducibility is limited, and no step-by-step instructions for running experiments are provided even for GPT-3.",
    147           "source": "haiku"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Standard deviations across 5 random seed orderings are reported for LaMDA 137B in Tables 6 and 7; for GPT-3 and PaLM, single runs are used due to API cost, explicitly acknowledged.",
    155           "source": "haiku"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "No statistical significance tests (t-tests, ANOVA, p-values) are applied to any comparative claims; all comparisons are presented as raw accuracy numbers.",
    161           "source": "haiku"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Absolute percentage point improvements are consistently reported throughout (e.g., PaLM 540B improves +39pp on GSM8K from 17.9% to 56.9%), providing meaningful effect size context.",
    167           "source": "haiku"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Benchmark evaluation set sizes are not justified statistically; the manual error analysis of 50 correct and 50 incorrect examples (Appendix D) has no power analysis or justification for the sample size.",
    173           "source": "haiku"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Standard deviation across random seed orderings is reported for LaMDA 137B (Tables 6-7); for other models, single exemplar orders are used with explicit acknowledgment of this limitation.",
    179           "source": "haiku"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Standard few-shot prompting (without chain of thought) is used as the primary baseline throughout all experiments, and prior supervised SOTA numbers from published work are included.",
    187           "source": "haiku"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Baselines include contemporaneous finetuned models (Cobbe et al. 2021 for GSM8K, Jie et al. 2022 for SVAMP, Lan et al. 2021 for MAWPS) and the same underlying LLMs with standard prompting.",
    193           "source": "haiku"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Section 3.3 presents three ablations with LaMDA and PaLM: equation-only prompting, variable-compute-only (dots equal to equation length), and chain-of-thought provided only after the answer.",
    199           "source": "haiku"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "All quantitative evaluation uses accuracy (solve rate) as the sole metric; no efficiency, calibration, partial credit, or diversity metrics are reported.",
    205           "source": "haiku"
    206         },
    207         "human_evaluation": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "No formal human evaluation with external raters is conducted; the manual error analysis in Appendix D (50 correct, 50 incorrect outputs) is author inspection without inter-rater reliability measurement or formal evaluation protocol.",
    211           "source": "haiku"
    212         },
    213         "held_out_test_set": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Standard benchmark evaluation splits are used; for BIG-bench tasks without training sets, the first 10 examples serve as exemplars and remaining examples form the evaluation set.",
    217           "source": "haiku"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Results are broken down by model family, scale, and dataset; MAWPS is stratified into SingleOp/SingleEq/AddSub/MultiArith subsets (Table 3) showing differential benefits by difficulty level.",
    223           "source": "haiku"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Appendix D.2 provides detailed categorized failure analysis of 50 incorrect LaMDA 137B outputs: 8% calculator errors, 16% symbol mapping errors, 22% one-step-missing errors, 54% semantic understanding failures, with concrete examples for each.",
    229           "source": "haiku"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Table 2 shows CoT hurts performance for models below ~10B parameters across all model families; Table 3 shows minimal or negative gains for easy single-step MAWPS tasks; CSQA shows minimal gains for GPT-3.",
    235           "source": "haiku"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "GPT-3 API names (text-ada-001, text-babbage-001, text-curie-001, text-davinci-002) and Codex (code-davinci-002) are specified; PaLM and LaMDA parameter counts (8B/62B/540B and 420M/2B/8B/68B/137B) are stated.",
    243           "source": "haiku"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Appendix G (Tables 20-28) provides complete few-shot prompts for all nine task types including math word problems (three annotator variants), AQuA, last letter concatenation, coin flip, CSQA, StrategyQA, date understanding, sports understanding, and SayCan.",
    249           "source": "haiku"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Greedy decoding is specified for all models; number of few-shot exemplars per task is stated (8 for most, 4 for AQuA, 6 for SayCan); token constraints on exemplar sampling are documented (≤60 tokens, ≤2 steps).",
    255           "source": "haiku"
    256         },
    257         "scaffolding_described": {
    258           "applies": false,
    259           "answer": false,
    260           "justification": "No agentic scaffolding is used; the method is standard few-shot prompting with CoT exemplars, which is the core method being studied rather than a scaffold around another system.",
    261           "source": "haiku"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Exemplar selection criteria are documented (random sampling from training sets with length constraints); symbolic dataset generation is described using top-1000 names from namecensus.com; benchmark splits are specified.",
    267           "source": "haiku"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "Only LaMDA 137B inputs/targets/predictions are provided as supplementary zip; PaLM and GPT-3 raw outputs are not released, making the headline SOTA PaLM results independently unverifiable.",
    275           "source": "haiku"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Prompt construction process is described (manual composition with no special instructions to annotators, random sampling from GSM8K training for robustness experiments); symbolic dataset generation procedure documented with source.",
    281           "source": "haiku"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": false,
    285           "answer": false,
    286           "justification": "No human participants were recruited; standard public benchmarks are used and annotators are paper co-authors.",
    287           "source": "haiku"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "The pipeline from prompt construction to model inference to accuracy evaluation is clearly described; calculator augmentation post-processing is detailed; LaMDA exact inputs/outputs provided in supplementary.",
    293           "source": "haiku"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "Training data cutoffs for PaLM, LaMDA, GPT-3, and Codex are not stated anywhere in the paper.",
    301           "source": "haiku"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No discussion of potential overlap between model pretraining corpora and benchmark test sets; this is a significant omission given that proprietary model training data is undisclosed.",
    307           "source": "haiku"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "The possibility that GSM8K, SVAMP, CSQA, or other benchmark examples appeared in pretraining data of any evaluated model is not addressed despite this being a known issue with large pretrained models.",
    313           "source": "haiku"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants involved; NeurIPS checklist confirms N/A.",
    321           "source": "haiku"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants involved; NeurIPS checklist confirms N/A.",
    327           "source": "haiku"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants involved.",
    333           "source": "haiku"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants involved.",
    339           "source": "haiku"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants involved.",
    345           "source": "haiku"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants involved.",
    351           "source": "haiku"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "No human participants involved.",
    357           "source": "haiku"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Appendix E.2 describes hardware used (TPU v3 8x8 for LaMDA, TPU v4 4x4x12 for PaLM) but explicitly states 'we did not estimate the total amount of compute'; no inference cost, latency, or per-query cost is reported.",
    365           "source": "haiku"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "Hardware configurations are described (TPU v3 and v4 chip counts) but total compute budget in FLOPs, GPU-hours, or API cost is not provided.",
    371           "source": "haiku"
    372         }
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "Chain-of-thought prompting significantly improves LLM performance on arithmetic reasoning benchmarks",
    379       "evidence": "PaLM 540B improves from 17.9% to 56.9% on GSM8K (+39pp), 69.4% to 79.0% on SVAMP; GPT-3 175B from 15.6% to 46.9% on GSM8K (Table 1)",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Chain-of-thought reasoning is an emergent ability that only appears in models with approximately 100B+ parameters",
    384       "evidence": "Table 2 shows CoT hurts or shows no benefit for models below ~10B parameters across LaMDA, GPT, and PaLM families; large gains only emerge at 100B+",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "PaLM 540B with CoT achieves state-of-the-art on GSM8K, surpassing finetuned GPT-3 with a verifier",
    389       "evidence": "Figure 2 and Table 1: PaLM 540B CoT achieves 56.9% vs. prior best of 55% (finetuned GPT-3 + verifier, Cobbe et al. 2021)",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Sequential natural language reasoning steps drive CoT gains, not variable computation or equation generation alone",
    394       "evidence": "Section 3.3 ablation: variable-compute-only (dots) matches baseline; equation-only improves less than full CoT; reasoning-after-answer matches baseline",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "CoT prompting enables out-of-distribution length generalization in symbolic reasoning tasks",
    399       "evidence": "Figure 8: PaLM 540B CoT achieves 94.8% on 3-word and 63.0% on 4-word last letter concatenation (OOD), vs. near 0% for standard prompting",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "CoT improvements are robust to different annotators, exemplar sets, and exemplar orderings",
    404       "evidence": "Section 3.4 and Tables 6-7: all annotator variants and GSM8K-sampled exemplars outperform standard prompting baseline, though coin flip variance across annotators is high (71.4%–99.6%)",
    405       "supported": "strong"
    406     },
    407     {
    408       "claim": "CoT generalizes to commonsense reasoning, surpassing prior SOTA on StrategyQA and sports understanding",
    409       "evidence": "Figure 7: PaLM 540B CoT achieves 77.8% on StrategyQA (vs. 69.4% prior best) and 95.4% on sports understanding (vs. 84% unaided human enthusiast)",
    410       "supported": "moderate"
    411     }
    412   ],
    413   "methodology_tags": [
    414     "benchmark-eval"
    415   ],
    416   "key_findings": "Chain-of-thought prompting—augmenting few-shot exemplars with intermediate natural language reasoning steps—dramatically improves LLM performance on arithmetic, commonsense, and symbolic reasoning, with PaLM 540B achieving SOTA on GSM8K by surpassing finetuned models. The effect is an emergent property of model scale: CoT actually hurts models below ~10B parameters and only yields gains at ~100B+. Ablations rule out variable computation and simple equation generation as explanations, attributing gains to sequential natural language reasoning. Error analysis reveals 54% of incorrect outputs involve fundamental semantic understanding failures, and the paper frankly acknowledges that benchmark accuracy does not prove genuine reasoning capability.",
    417   "red_flags": [
    418     {
    419       "flag": "Proprietary model lock-in",
    420       "detail": "The headline SOTA result (PaLM 540B) relies on a proprietary model inaccessible to outside researchers, making independent replication of the most important claim impossible."
    421     },
    422     {
    423       "flag": "No statistical significance tests",
    424       "detail": "All comparative claims are made without p-values or confidence intervals; only LaMDA results include error bars (5 random seed orderings), and no significance test is applied to any comparison."
    425     },
    426     {
    427       "flag": "Training contamination unaddressed",
    428       "detail": "No discussion of whether GSM8K, SVAMP, CSQA, or other benchmark test examples appeared in pretraining data; particularly concerning for proprietary models with undisclosed training corpora."
    429     },
    430     {
    431       "flag": "Self-evaluation of own model",
    432       "detail": "All authors are Google Research employees, and the headline result showcases Google's PaLM model achieving SOTA; no independent verification of PaLM results is possible."
    433     },
    434     {
    435       "flag": "Single accuracy metric throughout",
    436       "detail": "All quantitative results rely solely on accuracy (solve rate); no efficiency, calibration, robustness, or partial-credit metrics are reported across any of the 10+ benchmarks evaluated."
    437     },
    438     {
    439       "flag": "Causal mechanism largely speculative",
    440       "detail": "Despite ablations ruling out proximate alternatives, Appendix A.1 acknowledges that the reason model scale improves CoT remains 'certainly multi-faceted', and the preliminary error analysis is done on only 45 examples."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "Language Models are Few-Shot Learners",
    446       "relevance": "Foundational few-shot prompting paper (GPT-3); establishes the standard prompting paradigm CoT extends and provides primary baseline; introduces GPT-3 models used in evaluation"
    447     },
    448     {
    449       "title": "Training Verifiers to Solve Math Word Problems",
    450       "relevance": "Introduces GSM8K benchmark and finetuned verifier approach; primary prior SOTA that CoT surpasses; establishes the evaluation setting for the headline result"
    451     },
    452     {
    453       "title": "Emergent Abilities of Large Language Models",
    454       "relevance": "Companion paper providing theoretical framework for understanding CoT as an emergent ability of scale; CoT prompting contributes a key case study to this work"
    455     },
    456     {
    457       "title": "Show Your Work: Scratchpads for Intermediate Computation with Language Models",
    458       "relevance": "Closest prior work using intermediate steps via finetuning; CoT demonstrates similar gains with prompting alone, without gradient updates"
    459     },
    460     {
    461       "title": "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
    462       "relevance": "Pioneering natural language rationale approach for math via training from scratch; CoT achieves comparable results with few-shot prompting only"
    463     },
    464     {
    465       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    466       "relevance": "Direct follow-up work improving CoT via majority voting over sampled generations; cited by this paper as showing CoT can be further enhanced"
    467     },
    468     {
    469       "title": "Scaling Laws for Neural Language Models",
    470       "relevance": "Establishes scaling law context; CoT findings on emergent ability complicate simple scaling predictions by showing prompting technique matters beyond raw model size"
    471     },
    472     {
    473       "title": "Scaling Language Models: Methods, Analysis & Insights from Training Gopher",
    474       "relevance": "Shows scaling alone is insufficient for arithmetic and reasoning tasks, directly motivating the need for CoT prompting as an additional intervention"
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 3,
    480       "justification": "CoT requires only prompt modification with no finetuning or model access; immediately applicable by any practitioner with API access to a large LLM using the exact prompts in Appendix G."
    481     },
    482     "surprise_contrarian": {
    483       "score": 2,
    484       "justification": "The emergent ability finding—that CoT actually hurts small models—challenged the assumption that prompting improvements scale smoothly, surprising the field at publication."
    485     },
    486     "fear_safety": {
    487       "score": 0,
    488       "justification": "No AI safety or risk concerns are raised; the paper acknowledges factual incorrectness in generated chains but frames this as a limitation rather than a safety concern."
    489     },
    490     "drama_conflict": {
    491       "score": 1,
    492       "justification": "Mild competitive angle between Google PaLM and OpenAI GPT-3/Codex with PaLM claiming SOTA, but no explicit controversy or disagreement with prior work."
    493     },
    494     "demo_ability": {
    495       "score": 3,
    496       "justification": "Practitioners can immediately reproduce CoT with GPT-3 API using the exact 8-exemplar prompts provided in Appendix G; no additional resources or infrastructure required."
    497     },
    498     "brand_recognition": {
    499       "score": 3,
    500       "justification": "Google Brain team at NeurIPS 2022; became one of the most cited papers in the LLM prompting literature and underpins most subsequent work on reasoning with LLMs."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [
    505       {
    506         "hn_id": "42711991",
    507         "title": "Show HN: QwQ-32B APIs – o1 like reasoning at 1% the cost",
    508         "points": 17,
    509         "comments": 3,
    510         "url": "https://news.ycombinator.com/item?id=42711991",
    511         "created_at": "2025-01-15T15:29:12Z"
    512       },
    513       {
    514         "hn_id": "30988904",
    515         "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    516         "points": 2,
    517         "comments": 1,
    518         "url": "https://news.ycombinator.com/item?id=30988904",
    519         "created_at": "2022-04-11T14:08:01Z"
    520       },
    521       {
    522         "hn_id": "30112147",
    523         "title": "Plume: Differential Privacy at Scale",
    524         "points": 2,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=30112147",
    527         "created_at": "2022-01-28T08:34:48Z"
    528       },
    529       {
    530         "hn_id": "34053182",
    531         "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    532         "points": 1,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=34053182",
    535         "created_at": "2022-12-19T15:29:09Z"
    536       }
    537     ],
    538     "top_points": 17,
    539     "total_points": 22,
    540     "total_comments": 4
    541   }
    542 }

Impressum · Datenschutz