scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27135B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models",
      6     "authors": [
      7       "Denny Zhou",
      8       "Nathanael Schärli",
      9       "Le Hou",
     10       "Jason Wei",
     11       "Nathan Scales",
     12       "Xuezhi Wang",
     13       "Dale Schuurmans",
     14       "Claire Cui",
     15       "Olivier Bousquet",
     16       "Quoc Le",
     17       "Ed Chi"
     18     ],
     19     "year": 2022,
     20     "venue": "International Conference on Learning Representations",
     21     "arxiv_id": "2205.10625",
     22     "doi": "10.48550/arXiv.2205.10625"
     23   },
     24   "checklist": {
     25     "claims_and_evidence": {
     26       "abstract_claims_supported": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All major abstract claims are verified by the results: 99.7% SCAN accuracy (Table 8), comparisons to 16.2% chain-of-thought, and outperformance on symbolic/compositional/math tasks.",
     30         "source": "haiku"
     31       },
     32       "causal_claims_justified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Controlled comparisons hold model, benchmark, and evaluation methodology constant while varying prompting strategy; ablations vary number of exemplars and model versions, providing reasonable support for causal attribution.",
     36         "source": "haiku"
     37       },
     38       "generalization_bounded": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The title claims 'enables complex reasoning' broadly, but experiments are limited to three specific task types; the limitations section acknowledges cross-domain failures but the main framing still overstates generality.",
     42         "source": "haiku"
     43       },
     44       "alternative_explanations_discussed": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper does not discuss alternative explanations for why decomposition helps (e.g., longer effective context, different token distribution, prompt length differences), beyond noting that L2M prompts contain more words.",
     48         "source": "haiku"
     49       },
     50       "proxy_outcome_distinction": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper measures exact-match accuracy on well-defined compositional tasks; claims about 'complex reasoning' are directly supported by accuracy on those specific tasks without inappropriate abstraction.",
     54         "source": "haiku"
     55       }
     56     },
     57     "limitations_and_scope": {
     58       "limitations_section_present": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 5 is a dedicated Limitations section discussing cross-domain and within-domain generalization failures with concrete examples.",
     62         "source": "haiku"
     63       },
     64       "threats_to_validity_specific": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Specific limitations are cited: decomposition prompts for math do not transfer to commonsense ('Did Aristotle use a laptop?'), and within GSM8K the method requires correct decomposition to succeed.",
     68         "source": "haiku"
     69       },
     70       "scope_boundaries_stated": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 5 explicitly states that the technique does not generalize well across different domains and that strong results on SCAN/last-letter-concatenation depend on the relative simplicity of decomposition for those tasks.",
     74         "source": "haiku"
     75       }
     76     },
     77     "conflicts_of_interest": {
     78       "funding_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No funding source is disclosed; the acknowledgment section only thanks colleagues and reviewers, with no mention of grants or external sponsors.",
     82         "source": "haiku"
     83       },
     84       "affiliations_disclosed": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "All authors are listed as affiliated with Google Research, Brain Team in the paper header.",
     88         "source": "haiku"
     89       },
     90       "funder_independent_of_outcome": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Google Research employees are evaluating GPT-3 (an OpenAI model), not a Google product; the prompting technique is model-agnostic, so there is no direct financial stake in specific results.",
     94         "source": "haiku"
     95       },
     96       "financial_interests_declared": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "There is no competing interests or financial disclosure statement anywhere in the paper.",
    100         "source": "haiku"
    101       }
    102     },
    103     "scope_and_framing": {
    104       "key_terms_defined": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Least-to-most prompting is explicitly defined (Section 2); chain-of-thought is referenced with citation; compositional generalization and easy-to-hard generalization are explained with concrete examples.",
    108         "source": "haiku"
    109       },
    110       "intended_contribution_clear": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper clearly states in the introduction that it proposes least-to-most prompting to overcome the easy-to-hard generalization limitation of chain-of-thought prompting, with no training required.",
    114         "source": "haiku"
    115       },
    116       "engagement_with_prior_work": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section 4 engages substantively with prior work on compositional generalization (SCAN), neural-symbolic models, and task decomposition, comparing approaches and explaining how L2M differs from prior decomposition methods.",
    120         "source": "haiku"
    121       }
    122     }
    123   },
    124   "type_checklist": {
    125     "empirical": {
    126       "artifacts": {
    127         "code_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No code repository is linked or mentioned; prompts are provided in the appendix but no runnable code is released.",
    131           "source": "haiku"
    132         },
    133         "data_released": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "SCAN, GSM8K, and DROP are standard publicly available benchmarks; the paper says 'We will release the full dataset upon publication' for last-letter-concatenation, but the standard benchmarks are already public.",
    137           "source": "haiku"
    138         },
    139         "environment_specified": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No requirements file, Docker image, or dependency specification is provided; experiments use GPT-3 API access with no version pinning beyond model names.",
    143           "source": "haiku"
    144         },
    145         "reproduction_instructions": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "While prompts are provided in the appendix, no step-by-step instructions for running experiments (API setup, evaluation scripts, postprocessing) are included.",
    149           "source": "haiku"
    150         }
    151       },
    152       "statistical_methodology": {
    153         "confidence_intervals_or_error_bars": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "All results are reported as point-estimate accuracy percentages with no confidence intervals or error bars.",
    157           "source": "haiku"
    158         },
    159         "significance_tests": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No statistical significance tests are reported for any of the comparative claims (e.g., L2M vs. chain-of-thought accuracy differences).",
    163           "source": "haiku"
    164         },
    165         "effect_sizes_reported": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Absolute accuracy differences with baseline context are reported throughout (e.g., 99.7% vs 16.2% on SCAN, 74.0% vs 31.8% at L=12), which convey meaningful effect sizes.",
    169           "source": "haiku"
    170         },
    171         "sample_size_justified": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "500 examples per list length for last-letter-concatenation is stated but not formally justified; no power analysis is conducted for any experiment.",
    175           "source": "haiku"
    176         },
    177         "variance_reported": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "No variance, standard deviation, or multiple-run statistics are reported; all results appear to be single-run point estimates.",
    181           "source": "haiku"
    182         }
    183       },
    184       "evaluation_design": {
    185         "baselines_included": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Standard (few-shot) prompting and chain-of-thought prompting are used as baselines across all tasks; zero-shot is also included for math reasoning.",
    189           "source": "haiku"
    190         },
    191         "baselines_contemporary": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Chain-of-thought (Wei et al. 2022) and self-consistency (Wang et al. 2022) are the state-of-the-art contemporaneous methods; neural-symbolic SCAN baselines are cited from current literature.",
    195           "source": "haiku"
    196         },
    197         "ablation_study": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The appendix includes ablations over the number of exemplars (2-shot vs 4-shot vs 8-shot), different GPT-3 model variants (code-davinci-002, text-davinci-002, code-davinci-001), and dependent vs independent example ordering.",
    201           "source": "haiku"
    202         },
    203         "multiple_metrics": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Only exact-match accuracy is reported across all tasks; no efficiency, token count, latency, or other metrics are included.",
    207           "source": "haiku"
    208         },
    209         "human_evaluation": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "Tasks have deterministic ground-truth outputs (action sequences, letter concatenations, arithmetic answers); human evaluation is not applicable.",
    213           "source": "haiku"
    214         },
    215         "held_out_test_set": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are reported on standard held-out test sets for SCAN (length split), GSM8K, and DROP; for last-letter-concatenation, separate randomly generated test examples are used.",
    219           "source": "haiku"
    220         },
    221         "per_category_breakdown": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Table 12 breaks GSM8K results by number of reasoning steps (2, 3, 4, ≥5); Table 4 breaks last-letter-concatenation by list length (4–12); Table 8 covers all SCAN splits.",
    225           "source": "haiku"
    226         },
    227         "failure_cases_discussed": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Detailed error analyses are provided for both last-letter-concatenation (Section 7.4, Table 14, Appendix 7.5.5) and SCAN (Section 8.2, Table 15), with specific failure examples shown.",
    231           "source": "haiku"
    232         },
    233         "negative_results_reported": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "The paper explicitly reports that L2M only marginally improves overall GSM8K performance (62.39% vs 60.87%) and that 'Least-to-Most (best)' is slightly worse than 'Chain-of-Thought (best)' on GSM8K (Table 19).",
    237           "source": "haiku"
    238         }
    239       },
    240       "setup_transparency": {
    241         "model_versions_specified": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Specific model names are consistently reported: the GPT-3 variants code-davinci-002, text-davinci-002, and code-davinci-001, plus LM-540B.",
    245           "source": "haiku"
    246         },
    247         "prompts_provided": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Complete prompt contexts for all tasks and methods are provided in the appendix (Sections 7–10), including decomposition and solution prompts with all exemplars.",
    251           "source": "haiku"
    252         },
    253         "hyperparameters_reported": {
    254           "applies": true,
    255           "answer": false,
    256           "justification": "No generation hyperparameters (temperature, top-p, max tokens, etc.) are reported anywhere in the paper.",
    257           "source": "haiku"
    258         },
    259         "scaffolding_described": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 2 describes the two-stage pipeline in detail: decomposition prompt construction, sequential subproblem solving with answer chaining, and postprocessing of Python expression output.",
    263           "source": "haiku"
    264         },
    265         "data_preprocessing_documented": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Section 7.3 describes last-letter-concatenation dataset construction (Wiktionary top-10k word list, removal of profanity, yielding 9,694 words, 500 examples per length level).",
    269           "source": "haiku"
    270         }
    271       },
    272       "data_integrity": {
    273         "raw_data_available": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "SCAN, GSM8K, and DROP are publicly available benchmarks; sample data rows for last-letter-concatenation are shown in Section 7.3 and release is promised.",
    277           "source": "haiku"
    278         },
    279         "data_collection_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Data generation for last-letter-concatenation is described in detail (Section 7.3); for standard benchmarks, original dataset papers are cited.",
    283           "source": "haiku"
    284         },
    285         "recruitment_methods_described": {
    286           "applies": false,
    287           "answer": false,
    288           "justification": "No human participants; all evaluation is automated on benchmark datasets.",
    289           "source": "haiku"
    290         },
    291         "data_pipeline_documented": {
    292           "applies": true,
    293           "answer": true,
    294           "justification": "The full pipeline from prompt construction → API call → postprocessing (Python expression expansion) → accuracy evaluation is described in the paper and appendix.",
    295           "source": "haiku"
    296         }
    297       },
    298       "contamination": {
    299         "training_cutoff_stated": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "GPT-3 training data cutoffs are never mentioned in the paper, despite evaluating on benchmarks that predate GPT-3 training.",
    303           "source": "haiku"
    304         },
    305         "train_test_overlap_discussed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "No discussion of whether SCAN, GSM8K, or DROP appeared in GPT-3's training data; contamination is not addressed.",
    309           "source": "haiku"
    310         },
    311         "benchmark_contamination_addressed": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "SCAN and GSM8K are public benchmarks published before GPT-3's training cutoff; the paper does not address whether the model may have memorized solutions.",
    315           "source": "haiku"
    316         }
    317       },
    318       "human_studies": {
    319         "pre_registered": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "irb_or_ethics_approval": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "demographics_reported": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "inclusion_exclusion_criteria": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "randomization_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "blinding_described": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         },
    355         "attrition_reported": {
    356           "applies": false,
    357           "answer": false,
    358           "justification": "No human participants.",
    359           "source": "haiku"
    360         }
    361       },
    362       "cost_and_practicality": {
    363         "inference_cost_reported": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No API cost, token count, or latency figures are reported despite the method requiring multiple API calls per problem (one for decomposition, one or more for solving).",
    367           "source": "haiku"
    368         },
    369         "compute_budget_stated": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "Total number of API calls or overall compute budget is not reported.",
    373           "source": "haiku"
    374         }
    375       }
    376     }
    377   },
    378   "claims": [
    379     {
    380       "claim": "Least-to-most prompting with code-davinci-002 achieves 99.7% accuracy on the SCAN length split using only 14 exemplars.",
    381       "evidence": "Table 8 reports 99.7% for least-to-most vs. 16.2% for chain-of-thought and 16.7% for standard prompting.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Least-to-most prompting substantially outperforms chain-of-thought on the last-letter-concatenation task, especially at longer list lengths.",
    386       "evidence": "Table 4: at L=12, least-to-most achieves 74.0% vs. 31.8% for chain-of-thought with code-davinci-002.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Least-to-most prompting provides more benefit over chain-of-thought on harder (more-step) problems than on easier problems.",
    391       "evidence": "Table 12: for ≥5-step GSM8K problems, L2M achieves 45.23% vs. 39.07% (+6.2pp), while at 2 steps chain-of-thought is slightly better (76.68% vs. 74.53%).",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Least-to-most prompting offers only marginal overall improvement on GSM8K math word problems.",
    396       "evidence": "Table 11 shows 62.39% vs. 60.87% for the one-shot prompts; Table 19 shows 'Least-to-Most (best)' at 68.01% is slightly below 'Chain-of-Thought (best)' at 68.61%.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Decomposition prompts for one domain do not transfer to other domains.",
    401       "evidence": "Section 5 (Limitations) notes that a math decomposition prompt is ineffective for commonsense reasoning ('Did Aristotle use a laptop?') and gives a concrete example.",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "Least-to-most prompting achieves the SCAN result using far fewer training examples than specialized neural-symbolic models.",
    406       "evidence": "The paper notes neural-symbolic models achieving 100% require training on the full SCAN set (15,000+ examples), while L2M uses 14 exemplars.",
    407       "supported": "strong"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval"
    412   ],
    413   "key_findings": "Least-to-most prompting—decomposing problems into simpler subproblems and solving them sequentially via few-shot prompting—dramatically outperforms chain-of-thought on compositional generalization tasks, achieving 99.7% on the SCAN length split with 14 exemplars versus 16.2% for chain-of-thought. The advantage is largest on tasks requiring generalization to harder instances than the demonstrated examples (easy-to-hard generalization). The improvement is modest overall on GSM8K math word problems (+1.5pp) but more pronounced on multi-step instances (≥5 steps: +6.2pp). The technique requires no training or fine-tuning and uses only the same model with restructured prompts.",
    414   "red_flags": [
    415     {
    416       "flag": "No statistical significance tests",
    417       "detail": "All comparative claims are based on point-estimate accuracy differences with no confidence intervals, standard errors, or significance tests, making it impossible to assess whether observed differences are reliable."
    418     },
    419     {
    420       "flag": "No generation hyperparameters",
    421       "detail": "Temperature, top-p, max tokens, and other sampling parameters are never reported, preventing exact reproduction and making it unclear whether results are sensitive to these settings."
    422     },
    423     {
    424       "flag": "Contamination not addressed",
    425       "detail": "GPT-3 training data cutoffs are not stated and potential overlap between training data and SCAN/GSM8K/DROP benchmark examples is never discussed."
    426     },
    427     {
    428       "flag": "Single-run results",
    429       "detail": "No variance across runs is reported; because decoding settings are not reported and may be stochastic, single-run results may not be reliable, especially for smaller differences on GSM8K."
    430     },
    431     {
    432       "flag": "Multi-call cost unreported",
    433       "detail": "Least-to-most requires multiple LLM calls per problem (decomposition + sequential subproblem solving), but no token count, latency, or cost comparison with chain-of-thought is provided."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    439       "relevance": "Primary baseline and motivation; L2M is designed to overcome chain-of-thought's easy-to-hard generalization limitation."
    440     },
    441     {
    442       "title": "Generalization without Systematicity: On the Compositional Skills of Sequence-to-Sequence Recurrent Networks (SCAN)",
    443       "relevance": "Primary benchmark for compositional generalization used to demonstrate L2M's most dramatic result."
    444     },
    445     {
    446       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    447       "relevance": "Math reasoning benchmark used to evaluate L2M; provides test of step-count generalization."
    448     },
    449     {
    450       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    451       "relevance": "Complementary prompting technique mentioned as combinable with L2M; state-of-the-art context."
    452     },
    453     {
    454       "title": "Language Models are Few-Shot Learners (GPT-3)",
    455       "relevance": "Foundation for the few-shot prompting paradigm that L2M extends."
    456     },
    457     {
    458       "title": "DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning over Paragraphs",
    459       "relevance": "Second math reasoning benchmark used in experiments; L2M shows large gains (+7.7pp non-football subset)."
    460     },
    461     {
    462       "title": "Compositional Generalization via Neural-Symbolic Stack Machines",
    463       "relevance": "Represents the class of specialized neural-symbolic models that L2M outperforms on SCAN without training."
    464     },
    465     {
    466       "title": "A Comparison of Most-to-Least and Least-to-Most Prompting on the Acquisition of Solitary Play Skills (Libby et al.)",
    467       "relevance": "Original source of the 'least-to-most' pedagogical technique that inspired the LLM prompting strategy."
    468     }
    469   ],
    470   "engagement_factors": {
    471     "practical_relevance": {
    472       "score": 3,
    473       "justification": "Immediately applicable to any LLM API user; requires no training, just prompt restructuring, and prompts are fully provided in the appendix."
    474     },
    475     "surprise_contrarian": {
    476       "score": 2,
    477       "justification": "The 99.7% vs 16.2% gap on SCAN from a purely prompting change—no architecture or training change—is genuinely surprising and challenges assumptions about model limitations."
    478     },
    479     "fear_safety": {
    480       "score": 0,
    481       "justification": "No safety or risk content; purely a prompting methodology paper."
    482     },
    483     "drama_conflict": {
    484       "score": 1,
    485       "justification": "Positions itself against chain-of-thought prompting's limitations, creating mild tension, but no controversy or external conflict."
    486     },
    487     "demo_ability": {
    488       "score": 3,
    489       "justification": "Can be demonstrated immediately with any LLM API using the provided prompts; the SCAN and last-letter-concatenation tasks are easy to run interactively."
    490     },
    491     "brand_recognition": {
    492       "score": 2,
    493       "justification": "Google Brain team with prominent authors (Quoc Le, Ed Chi) and ICLR venue; well-cited in the prompting literature."
    494     }
    495   },
    496   "hn_data": {
    497     "threads": [
    498       {
    499         "hn_id": "40389576",
    500         "title": "GDPR: Is It Worth It?",
    501         "points": 72,
    502         "comments": 205,
    503         "url": "https://news.ycombinator.com/item?id=40389576",
    504         "created_at": "2024-05-17T13:22:06Z"
    505       },
    506       {
    507         "hn_id": "21538090",
    508         "title": "Oral History of Ken Thompson (2005) [pdf]",
    509         "points": 56,
    510         "comments": 1,
    511         "url": "https://news.ycombinator.com/item?id=21538090",
    512         "created_at": "2019-11-14T18:43:15Z"
    513       },
    514       {
    515         "hn_id": "46384992",
    516         "title": "Oral History of Richard Greenblatt (2005) [pdf]",
    517         "points": 18,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=46384992",
    520         "created_at": "2025-12-25T15:34:44Z"
    521       },
    522       {
    523         "hn_id": "44002385",
    524         "title": "Community Fact-Checks Do Not Break Follower Loyalty",
    525         "points": 4,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=44002385",
    528         "created_at": "2025-05-16T06:35:29Z"
    529       },
    530       {
    531         "hn_id": "32632312",
    532         "title": "Exploring the Role of the Cybercrime Underground in the Russia-Ukraine Conflict",
    533         "points": 4,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=32632312",
    536         "created_at": "2022-08-28T21:36:55Z"
    537       },
    538       {
    539         "hn_id": "32250208",
    540         "title": "Satoshi Nakamoto and the origins of Bitcoin – the profile of a genius",
    541         "points": 3,
    542         "comments": 2,
    543         "url": "https://news.ycombinator.com/item?id=32250208",
    544         "created_at": "2022-07-27T13:35:45Z"
    545       },
    546       {
    547         "hn_id": "37866902",
    548         "title": "Getting Bored of Cyberwar",
    549         "points": 3,
    550         "comments": 1,
    551         "url": "https://news.ycombinator.com/item?id=37866902",
    552         "created_at": "2023-10-13T05:03:06Z"
    553       },
    554       {
    555         "hn_id": "44173062",
    556         "title": "The Evaluation of Engineering Artificial General Intelligence",
    557         "points": 3,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=44173062",
    560         "created_at": "2025-06-03T18:26:02Z"
    561       },
    562       {
    563         "hn_id": "39491738",
    564         "title": "Satoshi Nakamoto and the Origins of Bitcoin",
    565         "points": 3,
    566         "comments": 0,
    567         "url": "https://news.ycombinator.com/item?id=39491738",
    568         "created_at": "2024-02-24T14:34:33Z"
    569       },
    570       {
    571         "hn_id": "35617015",
    572         "title": "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models",
    573         "points": 2,
    574         "comments": 0,
    575         "url": "https://news.ycombinator.com/item?id=35617015",
    576         "created_at": "2023-04-18T17:27:21Z"
    577       }
    578     ],
    579     "top_points": 72,
    580     "total_points": 168,
    581     "total_comments": 209
    582   }
    583 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs