scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28716B)
      1 {
      2   "paper": {
      3     "title": "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models",
      4     "authors": [
      5       "Denny Zhou",
      6       "Nathanael Schärli",
      7       "Le Hou",
      8       "Jason Wei",
      9       "Nathan Scales",
     10       "Xuezhi Wang",
     11       "Dale Schuurmans",
     12       "Claire Cui",
     13       "Olivier Bousquet",
     14       "Quoc Le",
     15       "Ed Chi"
     16     ],
     17     "year": 2022,
     18     "venue": "International Conference on Learning Representations (ICLR 2023)",
     19     "arxiv_id": "2205.10625",
     20     "doi": "10.48550/arXiv.2205.10625"
     21   },
     22   "scan_version": 2,
     23   "active_modules": ["experimental_rigor", "data_leakage"],
     24   "methodology_tags": ["benchmark-eval"],
     25   "key_findings": "Least-to-most prompting, a two-stage approach that decomposes problems into subproblems then solves them sequentially, substantially outperforms chain-of-thought prompting on easy-to-hard generalization. On SCAN length split, code-davinci-002 with least-to-most prompting achieves 99.7% accuracy using only 14 exemplars versus 16.2% for chain-of-thought. On last-letter-concatenation, least-to-most maintains 74.0% at length 12 versus chain-of-thought's 31.8%. On GSM8K, improvement is marginal overall (62.4% vs 60.9%) but meaningful for problems requiring 5+ steps (45.2% vs 39.1%).",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No code repository or GitHub link is provided. The paper includes all prompts in the appendix but no implementation code."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The main evaluation benchmarks (SCAN, GSM8K, DROP) are publicly available standard datasets. The custom last-letter-concatenation dataset is constructed from a publicly available Wiktionary frequency list with the procedure fully described in Appendix 7.3."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No environment specification, requirements file, or dependency list is provided. The paper names the GPT-3 model variants used but provides no computational environment details."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "While all prompts are provided in the appendices (a significant aid), there are no step-by-step instructions, scripts, or README describing how to run the experiments end-to-end. A researcher would need to reverse-engineer the evaluation pipeline."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "All results in Tables 4, 8, 11, 12, 13 are single point estimates with no confidence intervals, error bars, or ± notation."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper repeatedly claims one method 'significantly outperforms' another (e.g., abstract, Section 6) based solely on comparing raw accuracy numbers. No statistical significance tests (p-values, bootstrap tests, etc.) are reported for any comparison."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Results tables provide both baseline and proposed method accuracies side by side (e.g., Table 4: chain-of-thought 31.8% vs least-to-most 74.0% at L=12; Table 8: 16.2% vs 99.7% on SCAN), giving clear context for the magnitude of improvement."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The choice of 500 test lists per length for last-letter-concatenation is stated but not justified. No power analysis or sample size rationale is provided for any experiment."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be single-run numbers. There is no mention of running experiments multiple times."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Standard few-shot prompting and chain-of-thought prompting are included as baselines across all experiments. For SCAN, comparisons with specialized neural-symbolic models from the literature are also discussed (Section 4)."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Chain-of-thought prompting (Wei et al., 2022) was the state-of-the-art prompting method at the time. For SCAN, the paper compares against the best known neural-symbolic approaches (Chen et al., 2020; Liu et al., 2020; Nye et al., 2020; Shaw et al., 2021)."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The comparison between least-to-most and chain-of-thought prompting effectively ablates the decomposition stage (both use the same exemplars for subproblem solving). Table 13 further varies the number of exemplars (2, 4, 8-shot) and prompt styles, testing component contributions."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "Only accuracy (exact match) is used as the evaluation metric across all tasks. No secondary metrics such as decomposition quality, partial credit, or efficiency measures are reported."
     96       },
     97       "human_evaluation": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "All tasks (SCAN, last-letter-concatenation, GSM8K, DROP) have well-defined correct answers amenable to exact-match automated evaluation. Human evaluation is not relevant to the accuracy claims."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "SCAN, GSM8K, and DROP have established test splits. The last-letter-concatenation test data is freshly generated from a word list, completely independent of the prompt exemplars."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Table 4 breaks down by list length (4-12), Table 8 by model, Table 11 by DROP subset (football/non-football), Table 12 by number of reasoning steps required, and Table 13 by prompt configuration and model variant."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Detailed error analysis is provided for all three task types: Section 3.1 and Appendix 7.4 for last-letter-concatenation (concatenation errors, wrong template), Section 3.2 and Appendix 8.2 for SCAN (13 errors catalogued by type), and Appendix 9.5 for DROP."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Table 12 shows chain-of-thought actually outperforms least-to-most on 2-step GSM8K problems (76.68% vs 74.53%). Overall GSM8K improvement is marginal (62.39% vs 60.87%). Section 5 honestly discusses domains where decomposition fails. Table 19 shows Least-to-Most (best) at 68.01% vs Chain-of-Thought (best) at 68.61%."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
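    127         "justification": "The abstract claims 'at least 99%' on SCAN (actual: 99.7%, Table 8), '16% accuracy with chain-of-thought' (actual: 16.2%, Table 8), and '14 exemplars' (confirmed in Appendix 8.1.2). All specific numeric claims are supported by the results; the broader claim that the 99% figure holds on every SCAN split rests on a statement in Section 3.2, with no per-split results shown."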
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper claims decomposition 'enables' complex reasoning. The controlled comparison between least-to-most and chain-of-thought uses identical exemplars for subproblem solving, isolating the decomposition variable. This single-variable manipulation is adequate for the causal claim about decomposition's contribution."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Despite the broad title, Section 5 explicitly bounds the claims: 'Decomposition prompts typically don't generalize well across different domains' and 'Generalizing decomposition can even be difficult within the same domain.' The abstract scopes to specific tasks. Results are model-specific (GPT-3 variants)."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 3.1 and Table 13 address the main confound that least-to-most prompts contain more tokens by testing chain-of-thought with 4 and 8 exemplars, showing the advantage persists. They also compare using identical vs independent exemplars (Appendix 7.2.4), isolating the recursive structure as the key factor."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper measures exact-match accuracy on specific benchmarks and claims performance on those specific tasks (compositional generalization on SCAN, length generalization on last-letter-concatenation, multi-step math on GSM8K). Claims closely match measurement granularity."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Exact GPT-3 model versions are specified throughout: code-davinci-002, text-davinci-002, code-davinci-001. These are specific API model identifiers."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "All prompts are provided in full in the appendices: Appendix 7 (last-letter-concatenation), Appendix 8 (SCAN), Appendix 9 (DROP), Appendix 10 (GSM8K). The paper explicitly states: 'We have included prompts for all the tasks in the Appendix.'"
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No API hyperparameters are reported: temperature, top-p, max tokens, and other generation settings are not mentioned despite their significant impact on LLM output."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. This is a pure prompting paper with sequential API calls."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Appendix 7.3 details the last-letter-concatenation data construction: Wiktionary top 10k words, profane word removal yielding 9694 words, random list generation of 500 examples per length. SCAN, GSM8K, and DROP use established public benchmark splits. Python expression postprocessing for SCAN is described in Section 3.2."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 5 is a dedicated 'Limitations' section with substantive discussion of when the approach fails."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 5 discusses specific limitations: decomposition prompts don't transfer across domains (math → common sense), decomposition within a domain can fail even with correct sub-solutions, and exceptional SCAN/last-letter results occur because decomposition is 'relatively straightforward' in those tasks."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5 explicitly states what the method does NOT do: 'a prompt that demonstrates decomposing math word problems isn't effective for teaching large language models to break down common sense reasoning problems, such as \"Did Aristotle use a laptop?\"'. The boundary between easy and hard decomposition is acknowledged."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No raw model outputs, individual predictions, or per-example results are available. Only aggregate accuracy numbers are reported in the tables."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The last-letter-concatenation dataset construction is described in detail in Appendix 7.3 (source word list, filtering steps, sampling procedure). The other benchmarks (SCAN, GSM8K, DROP) are well-known public datasets with published documentation."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. All data comes from standard benchmarks or algorithmically generated test cases."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "For last-letter-concatenation: Wiktionary list → profanity filter (10000→9694 words) → random list sampling → evaluation. For SCAN: established length split. For DROP: numerical reasoning subset. The Python expression postprocessing step for SCAN is described in Section 3.2 and Appendix 8.4."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding source is disclosed. The Acknowledgement section thanks individuals but does not mention grants, funding agencies, or corporate funding. All authors are from Google Research, Brain Team, implying Google funding, but this is not explicitly stated."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "All authors are clearly identified as 'Google Research, Brain Team' in the author list. Since they evaluate OpenAI's GPT-3 (not a Google product), the standard company-evaluating-own-product conflict does not apply."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Google (the implicit funder via author employment) does not have a direct financial stake in GPT-3's performance with this prompting technique. The researchers are evaluating a competitor's product, making the funding source reasonably independent of the outcome."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interests statement is included in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper uses GPT-3 models (code-davinci-002, text-davinci-002, code-davinci-001) without stating any training data cutoff dates."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No discussion of whether SCAN (2018), GSM8K (2021), or DROP (2019) examples appeared in GPT-3's pre-training data. All three benchmarks predate the models used."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "All benchmarks (SCAN published 2018, DROP published 2019, GSM8K published 2021) were publicly available before GPT-3's training. No discussion of contamination risk for any benchmark."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Least-to-most prompting requires multiple sequential API calls (decomposition + one call per subproblem) versus a single call for chain-of-thought. No inference cost, latency, or token consumption is reported."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No total computational budget, API costs, or hardware resources are mentioned despite running thousands of API calls across multiple GPT-3 models."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "All results appear to be single-run numbers. No seed sensitivity analysis or results across multiple random seeds are reported."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The paper does not state whether results are from single runs or averaged across multiple runs. This is particularly concerning for LLM API calls where temperature settings affect variability."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The prompts appear manually designed. No reporting of how many prompt designs were tried, what alternatives were considered, or what search process led to the final prompts."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "While Table 13 shows multiple configurations, the main results use specific configurations (e.g., 2-shot for Table 4) without justifying why these were selected as the primary comparison. Table 19 shows engineered prompts outperform the 1-shot versions but the main paper highlights the simpler versions."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": false,
    327         "answer": false,
    328         "justification": "No statistical tests are performed at all, so correction for multiple comparisons does not arise. The absence of statistical testing is captured by the significance_tests item."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors design and evaluate their own prompting method against baselines. No acknowledgment of author-evaluation bias, and no independent evaluation is reported."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "Least-to-most prompting requires O(n) API calls for a problem decomposed into n subproblems, while chain-of-thought uses a single call. This substantial compute difference is never discussed or controlled for. The performance advantage may partly reflect the additional compute."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The paper uses SCAN, GSM8K, and DROP without discussing whether these benchmarks adequately measure 'complex reasoning' as claimed. SCAN tests a narrow form of compositional generalization; whether success on it implies general complex reasoning capability is not discussed."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No agentic scaffolding is used. This is a pure prompting approach."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "SCAN (2018), DROP (2019), and GSM8K (2021) were all publicly available before GPT-3 model training. The paper does not discuss whether the models may have seen benchmark solutions during pre-training."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the prompt design or evaluation setup leaks information about correct answers."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether GPT-3's training data includes SCAN grammar rules, GSM8K problem patterns, or DROP reading comprehension passages that overlap with test examples."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No leakage detection or prevention methods (canary strings, membership inference, decontamination) are employed."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "Least-to-most prompting achieves 99.7% accuracy on SCAN length split using only 14 exemplars, compared to 16.2% for chain-of-thought prompting.",
    377       "evidence": "Table 8 (Section 3.2) shows code-davinci-002 results: least-to-most 99.7%, chain-of-thought 16.2%, standard prompting 16.7%. The 14 exemplars are the command-mapping prompt (Appendix 8.1.2).",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Least-to-most prompting significantly outperforms chain-of-thought on length generalization for last-letter-concatenation, with the gap widening as length increases.",
    382       "evidence": "Table 4 (Section 3.1): at L=4, least-to-most 94.0% vs chain-of-thought 84.2%; at L=12, 74.0% vs 31.8%. Table 13 extends this across multiple prompt sizes and models, showing consistent advantage.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Least-to-most prompting improves over chain-of-thought prompting on GSM8K, particularly for problems requiring 5+ steps (45.23% vs 39.07%).",
    387       "evidence": "Table 12 (Section 3.3) breaks down by step count. Overall improvement is marginal: 62.39% vs 60.87% (Table 11). The step-count breakdown shows the advantage concentrates on harder problems. However, no statistical tests confirm the 5+ step difference is significant.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Least-to-most prompting outperforms chain-of-thought prompting on DROP by a large margin.",
    392       "evidence": "Table 11: non-football subset 82.45% vs 74.77%, football subset 73.42% vs 59.56%. These are substantial absolute differences but no statistical testing is provided.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Least-to-most prompting can solve SCAN in any split with at least 99% accuracy.",
    397       "evidence": "Section 3.2 states: 'We also test least-to-most prompting on all other splits and even the full SCAN dataset. We find that its solving rate remains the same.' However, detailed per-split results are not shown in the paper.",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "The advantage of least-to-most over chain-of-thought is not simply due to more information in the prompt.",
    402       "evidence": "Table 13 (Appendix 7.3) shows 2-shot least-to-most (123 tokens) outperforms 8-shot chain-of-thought (573 tokens) at L=12: 74.0% vs 38.4%. This controls for prompt information content.",
    403       "supported": "strong"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "No uncertainty quantification",
    409       "detail": "All results are single point estimates with no error bars, confidence intervals, standard deviations, or statistical tests. The word 'significantly' is used throughout without statistical basis. For a method involving stochastic LLM API calls, result variability is unknown."
    410     },
    411     {
    412       "flag": "Compute cost asymmetry not addressed",
    413       "detail": "Least-to-most prompting requires multiple sequential API calls (1 decomposition + N subproblem calls) versus 1 call for chain-of-thought. For SCAN with decomposition into 3-4 steps, this is 4-5x the compute. This cost difference is never acknowledged, making the comparison unfair from a practical standpoint."
    414     },
    415     {
    416       "flag": "No contamination discussion",
    417       "detail": "All benchmarks (SCAN 2018, DROP 2019, GSM8K 2021) were publicly available before GPT-3's training data collection. GPT-3 code-davinci-002 could have seen SCAN grammar rules, GSM8K problems, or DROP passages during pre-training. This is especially concerning for SCAN where 99.7% accuracy is achieved with only 14 exemplars."
    418     },
    419     {
    420       "flag": "Missing API hyperparameters",
    421       "detail": "Temperature, top-p, and other generation settings are not reported for any experiment. These can significantly affect LLM output variability and quality, making exact reproduction impossible."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Chain of thought prompting elicits reasoning in large language models",
    427       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Ed Chi", "Brian Ichter", "Fei Xia", "Quoc Le", "Denny Zhou"],
    428       "year": 2022,
    429       "relevance": "Core baseline method; established chain-of-thought prompting as a paradigm for LLM reasoning that least-to-most extends."
    430     },
    431     {
    432       "title": "Language models are few-shot learners",
    433       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    434       "year": 2020,
    435       "relevance": "Introduced GPT-3 and few-shot prompting, the foundation for all prompting methods evaluated in this paper."
    436     },
    437     {
    438       "title": "Self-consistency improves chain of thought reasoning in language models",
    439       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc Le", "Ed Chi", "Denny Zhou"],
    440       "year": 2022,
    441       "arxiv_id": "2203.11171",
    442       "relevance": "Proposed self-consistency decoding for chain-of-thought prompting; complementary technique that can be combined with least-to-most."
    443     },
    444     {
    445       "title": "PaLM: Scaling language modeling with pathways",
    446       "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
    447       "year": 2022,
    448       "arxiv_id": "2204.02311",
    449       "relevance": "Large language model that also uses chain-of-thought prompting; relevant to understanding scale effects on reasoning capabilities."
    450     },
    451     {
    452       "title": "Training verifiers to solve math word problems",
    453       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
    454       "year": 2021,
    455       "arxiv_id": "2110.14168",
    456       "relevance": "Introduced the GSM8K benchmark used for math reasoning evaluation in this paper."
    457     },
    458     {
    459       "title": "Generalization without systematicity: On the compositional skills of sequence-to-sequence recurrent networks",
    460       "authors": ["Brenden Lake", "Marco Baroni"],
    461       "year": 2018,
    462       "relevance": "Introduced the SCAN benchmark for compositional generalization, the primary evaluation benchmark where least-to-most achieves near-perfect accuracy."
    463     },
    464     {
    465       "title": "AI Chains: Transparent and controllable human-AI interaction by chaining large language model prompts",
    466       "authors": ["Tongshuang Wu", "Michael Terry", "Carrie Jun Cai"],
    467       "year": 2022,
    468       "relevance": "Proposed chaining LLM steps for complex tasks; conceptually related to least-to-most's sequential subproblem solving approach."
    469     },
    470     {
    471       "title": "Unsupervised question decomposition for question answering",
    472       "authors": ["Ethan Perez", "Patrick Lewis", "Wen-tau Yih", "Kyunghyun Cho", "Douwe Kiela"],
    473       "year": 2020,
    474       "relevance": "Prior work on task decomposition for multi-hop QA; least-to-most extends decomposition to few-shot prompting without training."
    475     },
    476     {
    477       "title": "Compositional generalization via neural-symbolic stack machines",
    478       "authors": ["Xinyun Chen", "Chen Liang", "Adams Wei Yu", "Dawn Song", "Denny Zhou"],
    479       "year": 2020,
    480       "relevance": "Neural-symbolic approach achieving 100% on SCAN but requiring full training set and specialized architecture, contrasting with least-to-most's 14-exemplar prompting approach."
    481     }
    482   ]
    483 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs