scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29163B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Are LLMs Prescient? A Continuous Evaluation using Daily News as the Oracle",
      6     "authors": [
      7       "Hui Dai",
      8       "Ryan Teehan",
      9       "Mengye Ren"
     10     ],
     11     "year": 2024,
     12     "venue": "International Conference on Machine Learning",
     13     "arxiv_id": "2411.08324",
     14     "doi": "10.48550/arXiv.2411.08324"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims are backed by results: the 21.55% TF and 11.33% MC degradation figures appear in Table 3, the RAG finding (improvement but persistent decline) is shown in Figure 4, and the daily generation mechanism is described in Section 3.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper makes causal claims that outdated pre-training data causes performance degradation, but the study is purely observational—no intervention or controlled experiment isolates training data recency from other confounds such as question difficulty distribution over time.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper draws broad conclusions about LLM temporal generalization from data collected exclusively from five major US English news sources (CBS, CNBC, CNN, Forbes, NPR); this geographic and linguistic scope is not prominently bounded in the conclusions.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 4.3 discusses alternative explanations including over-representation of pre-2021 data in training corpora, LLM-generated distractor choices being less realistic than true answers, and alignment training causing refusal behavior rather than pure knowledge gaps.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper clearly distinguishes between accuracy on forecasting QA (what is measured) and temporal generalization / forecasting ability (what is claimed), and it explicitly differentiates the forecasting setting from reading comprehension via the gold article control.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "A dedicated 'Limitations' paragraph appears at the end of Section 4.3, discussing both data generation and evaluation limitations.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are named: (1) question generation bias from an outdated LLM may degrade benchmark reliability over time; (2) the dataset only captures events that occurred, missing counterfactual events; (3) insufficient post-cutoff time horizon limits analysis of knowledge-cutoff vs. RAG-cutoff interactions.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The limitations section states that conclusions about the RAG-cutoff vs. knowledge-cutoff relationship are limited by the available time horizon, and the data collection scope (five US outlets, English only, Jan 2020–Dec 2024) is described in Section 3.1.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Funding is disclosed in the Acknowledgment section: IITP/MSIT South Korea grants (RS-2024-00469482 & RS-2024-00509279), Microsoft Accelerating Foundation Models Research program for Azure compute, and NYU HPC.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are listed as affiliated with New York University on the title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Primary funders are Korean government agencies and Microsoft (compute credits only); the paper evaluates multiple vendors' models including OpenAI and Anthropic, and the findings are not favorable to any single funder's products.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present anywhere in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined precisely: 'temporal generalization' (following Lazaridou et al.), 'knowledge cutoff,' 'RAG cutoff (R-Cutoff),' 'closed-book,' 'constrained open-book,' and 'gold article' settings are all explicitly defined in Section 4.1.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 closes with a bulleted two-contribution summary: (1) Daily Oracle as a continuous forecasting benchmark and (2) empirical findings on performance degradation patterns.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The Related Work section and Table 1 actively compare Daily Oracle to ForecastQA, AutoCast, ForecastBench, FreshBench, StreamingQA, etc., explicitly identifying what each prior work lacks and how this work addresses those gaps.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The abstract states 'Code and data are available at https://agenticlearning.ai/daily-oracle,' indicating both are publicly released.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The dataset (31,510 QA pairs) is released at the same URL mentioned in the abstract.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned; Table 5 lists model API versions but no software environment for running evaluation is described.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Beyond the URL pointing to an external site, no step-by-step reproduction instructions are included in the paper or appendix.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Table 3 reports yearly averages and YoY changes; Figure 3 shows moving average curves. No confidence intervals or error bars appear anywhere in the results.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are applied to any comparative claims; all comparisons between models or settings are made on raw accuracy values.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Effect sizes are reported as relative percentage declines (21.55% average drop for TF, 11.33% for MC, i.e. roughly 13.9 and 6.6 percentage points) with clear baselines (starting accuracy) provided in the text and Table 3.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The 31,510 QA pairs arise from daily generation across 5 years; no power analysis or justification for whether this sample size is adequate for the claimed conclusions is provided.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Table 3 reports only yearly averages with no variance, standard deviation, or spread measures; Figure 3 applies 5-month smoothing which suppresses variance.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Random chance baselines (50% for TF, 25% for MC) are explicitly referenced, and 8 diverse LLMs including smaller models serve as comparative baselines.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Evaluated models include Claude-3.5-Sonnet, GPT-4o, Llama-3-8B, Qwen-2-7B, and Gemma-2-2B—all contemporary as of the 2024 evaluation period.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The three experimental settings (closed-book, constrained open-book, gold article) function as ablations isolating the contribution of retrieved context and gold-standard context to performance.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "The paper reports accuracy on TF questions, accuracy on MC questions, year-over-year accuracy change, pre-cutoff vs. post-cutoff YoY change, and refusal rates (Appendix B.2).",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "Human evaluation is conducted for dataset quality validation (Section 3.3), not for evaluating LLM system outputs; LLM outputs are evaluated solely by automated accuracy.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Questions are generated from news articles published after the models' training data, making them a genuine held-out test set by construction; the gold article setting confirms answerability.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table 3 provides pre-cutoff and post-cutoff breakdowns per model; Figure 3 shows temporal performance per model; Figure 8 shows question category distribution over time.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 4.3 analyzes failure modes including Mistral/Mixtral models falling below random baseline due to refusals; Figure 20 provides a concrete case study of closed-book failure with correct open-book answer.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Negative results are explicitly reported: RAG with cutoffs prior to knowledge cutoff can make Llama-3-8B perform worse than closed-book; Claude-3.5-Sonnet performs worse with constrained open-book than closed-book due to confounding retrieved articles.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Table 5 lists exact model version identifiers for all 8 models (e.g., claude-3-5-sonnet-20240620, gpt-4-1106-preview, Mistral-7B-Instruct-v0.3).",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Appendix D (Figures 23–37) provides all prompts used for article summarization, QA generation, misleading choices generation, QA filtering, and all evaluation settings.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "BM25 top-5 retrieval and 512-word article truncation are mentioned, but generation hyperparameters (temperature, top-p, max tokens) for any of the LLMs are not reported.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "The evaluation queries LLMs directly as black-box APIs without agentic scaffolding; this criterion is not applicable.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 3.1 and Appendices A.1–A.3 document all preprocessing: Common Crawl + Newspaper3k collection, 6-article daily selection (3 random + 3 hot-topic DBSCAN), 4-step QA construction pipeline, and 7-criterion filtering with scoring threshold.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The paper states 'Code and data are available at https://agenticlearning.ai/daily-oracle,' implying the QA pairs are publicly accessible.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3.1 describes data collection in detail: sources (Common Crawl + Newspaper3k), filtered to 5 mainstream outlets, 1,246,973 English articles from Jan 2019–Dec 2024.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "The evaluation has no participant recruitment; the 4 human annotators for dataset quality validation are not described in terms of recruitment or qualification.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline from article collection through 4-step QA construction (summary, generation, misleading choices, filtering) is documented with prompts in Appendix D and illustrated in Figure 7.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": true,
    294           "justification": "Table 3 lists knowledge cutoff dates for all models with known cutoffs (e.g., Claude-3.5-Sonnet Apr 2024, GPT-4 Apr 2023, GPT-3.5 Sept 2021, Llama-3-8B Mar 2023, Gemma-2-2B Jul 2024).",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": true,
    300           "justification": "Contamination/overlap is a central motivation of the paper; Section 1 explicitly frames benchmark contamination as the problem Daily Oracle solves, and the analysis separates pre-cutoff and post-cutoff performance throughout.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": true,
    306           "justification": "The benchmark is specifically designed to be contamination-resistant: questions are generated from news published after training cutoffs, and the pre-cutoff vs. post-cutoff distinction is the primary analytical lens of the paper.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human subjects study; the 4 annotators used for dataset quality evaluation do not constitute human subjects research.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human subjects involvement requiring IRB approval.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human subjects study.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human subjects study.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human subjects study.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human subjects study.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human subjects study.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "The paper mentions 'budget constraints' limiting some evaluations (e.g., proprietary models only through Sep 2024 in open-book setting) but no actual inference costs or latency figures are reported.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Azure cloud compute credits and NYU HPC are acknowledged but no total compute budget (GPU-hours, API call counts, or dollar amounts) is stated.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "LLM performance on future event prediction degrades by an average of 21.55% on TF questions and 11.33% on MC questions between January 2020 and December 2024.",
    373       "evidence": "Table 3 shows per-model yearly averages and YoY change; the average starting vs. ending accuracy (64.68%→50.74% TF; 58.30%→51.69% MC) is stated in Section 4.2.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Performance degradation accelerates markedly after a model's knowledge cutoff date, with post-cutoff YoY decline substantially exceeding pre-cutoff decline.",
    378       "evidence": "Table 3 shows GPT-4 pre-cutoff MC YoY change of -4.23% vs. post-cutoff -18.54%; similar acceleration observed for Claude-3.5-Sonnet (-6.26% vs. -11.78%).",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "RAG with updated news articles improves forecasting accuracy but does not eliminate the temporal degradation trend.",
    383       "evidence": "Figure 4 shows RAG-augmented models still exhibit a declining accuracy trend across all RAG cutoff dates; Section 4.2 states 'across all different RAG cutoffs, the overall performance decline pattern persists.'",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Even with direct access to the gold article containing the answer, most LLMs show declining performance over time, suggesting outdated internal representations contribute beyond just missing knowledge.",
    388       "evidence": "Figure 5 shows ~90% accuracy in gold article setting but with persistent downward trends for most models; Section 4.3 attributes this to 'outdated representations.'",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Daily Oracle is the only forecasting benchmark continuously updated on a daily basis.",
    393       "evidence": "Table 1 compares update intervals of all comparable benchmarks: FreshQA updates weekly, ForecastBench biweekly, others are static.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "All LLMs experience performance decline, though smaller models (Gemma-2-2B, GPT-3.5) show smaller YoY declines than larger proprietary models.",
    398       "evidence": "Table 3 shows Gemma-2-2B TF average YoY -1.04% (smallest) vs. Mixtral-8x7B -10.78% (largest); discussed in Section 4.2.",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "observational"
    405   ],
    406   "key_findings": "Daily Oracle demonstrates that all 8 tested LLMs exhibit consistent temporal performance degradation on future event prediction tasks, averaging a 21.55% relative accuracy drop on True/False questions and 11.33% on Multiple Choice questions across the 2020–2024 evaluation period. The degradation systematically accelerates after each model's training knowledge cutoff, with post-cutoff YoY decline roughly 3–4x steeper than pre-cutoff decline in most models. While Retrieval Augmented Generation mitigates some degradation, the declining trend persists even with contemporaneous retrieved articles—and notably, even when models are provided the exact gold article containing the answer, performance still trends downward over time, implicating outdated internal representations rather than simply missing facts as a root cause.",
    407   "red_flags": [
    408     {
    409       "flag": "Low inter-annotator agreement",
    410       "detail": "Fleiss' Kappa of 0.26 (fair agreement) for human dataset quality evaluation; 'Non-answerability Before Publication Date' reaches κ=0.02, nearly chance-level. This undermines the validation of the core dataset quality claim."
    411     },
    412     {
    413       "flag": "No statistical significance testing",
    414       "detail": "All comparative claims between models and settings rely on raw accuracy point differences with no confidence intervals, p-values, or variance measures, making it impossible to determine which observed differences are reliable."
    415     },
    416     {
    417       "flag": "Five-outlet source bias",
    418       "detail": "The news corpus is restricted to CBS, CNBC, CNN, Forbes, and NPR—all US-based English-language mainstream outlets. This geographic, linguistic, and ideological selection bias is not discussed in the limitations."
    419     },
    420     {
    421       "flag": "LLM-generated benchmark circularity",
    422       "detail": "Questions and distractors are generated by GPT-3.5/4/4o-mini/4o; the benchmark's quality depends on these models' limitations and may systematically favor or disadvantage models from the same family."
    423     },
    424     {
    425       "flag": "Incomplete open-book evaluation",
    426       "detail": "Due to 'budget constraints,' open-book evaluation for proprietary models (Claude, GPT-4) was cut off at September 2024 rather than December 2024, creating an inconsistent comparison baseline across models."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Mind the gap: Assessing temporal generalization in neural language models",
    432       "relevance": "Foundational paper defining temporal generalization that Daily Oracle builds upon; introduced the concept evaluated here."
    433     },
    434     {
    435       "title": "Forecasting future world events with neural networks (AutoCast)",
    436       "relevance": "Direct predecessor forecasting benchmark; paper explicitly compares to and argues improvements over AutoCast's static dataset."
    437     },
    438     {
    439       "title": "ForecastBench: A dynamic benchmark of AI forecasting capabilities",
    440       "relevance": "Most direct competitor benchmark; Table 1 comparison is central to Daily Oracle's novelty argument."
    441     },
    442     {
    443       "title": "Approaching human-level forecasting with language models",
    444       "relevance": "Key prior work on LLM forecasting performance; paper adopts their question categorization prompt and compares methodology."
    445     },
    446     {
    447       "title": "Is your LLM outdated? A deep look at temporal generalization (FreshBench)",
    448       "relevance": "Concurrent work studying temporal generalization with forecasting markets; paper compares methodology and findings in Appendix C."
    449     },
    450     {
    451       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    452       "relevance": "RAG methodology foundational reference; the constrained open-book setting directly implements and evaluates RAG."
    453     },
    454     {
    455       "title": "ForecastQA: A question answering challenge for event forecasting with temporal text data",
    456       "relevance": "Earlier forecasting QA benchmark that Daily Oracle significantly extends in size and temporal coverage."
    457     },
    458     {
    459       "title": "RealTimeQA: What's the answer right now?",
    460       "relevance": "Related dynamic benchmark; paper contrasts its focus on factual update testing vs. Daily Oracle's forecasting focus."
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 3,
    466       "justification": "Directly addresses the reliability of LLMs deployed in time-sensitive applications (finance, healthcare, policy) and provides a live benchmark practitioners can use today."
    467     },
    468     "surprise_contrarian": {
    469       "score": 2,
    470       "justification": "The finding that degradation persists even with gold article access (suggesting outdated representations, not just missing facts) challenges the assumption that RAG fully solves temporal staleness."
    471     },
    472     "fear_safety": {
    473       "score": 1,
    474       "justification": "Raises reliability concerns about deploying outdated LLMs for high-stakes forecasting, but frames this as a performance limitation rather than a safety risk."
    475     },
    476     "drama_conflict": {
    477       "score": 1,
    478       "justification": "Implicitly challenges vendors claiming their models are current; the benchmark continuously exposes performance gaps but without naming antagonists."
    479     },
    480     "demo_ability": {
    481       "score": 3,
    482       "justification": "The Daily Oracle benchmark is live and continuously generating new questions; anyone can evaluate a model against today's news questions."
    483     },
    484     "brand_recognition": {
    485       "score": 1,
    486       "justification": "NYU affiliation; paper evaluates well-known models (GPT-4, Claude, Llama) but the lab itself has low brand recognition compared to frontier AI labs."
    487     }
    488   },
    489   "hn_data": {
    490     "threads": [
    491       {
    492         "hn_id": "41883728",
    493         "title": "Agents Thinking Fast and Slow: A Talker-Reasoner Architecture",
    494         "points": 13,
    495         "comments": 0,
    496         "url": "https://news.ycombinator.com/item?id=41883728",
    497         "created_at": "2024-10-18T21:36:47Z"
    498       },
    499       {
    500         "hn_id": "41968116",
    501         "title": "Agents Thinking Fast and Slow: A Talker-Reasoner Architecture",
    502         "points": 9,
    503         "comments": 0,
    504         "url": "https://news.ycombinator.com/item?id=41968116",
    505         "created_at": "2024-10-28T05:37:38Z"
    506       },
    507       {
    508         "hn_id": "41963802",
    509         "title": "Solving Global Lyapunov functions: open problem in mathematics with transformers",
    510         "points": 3,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=41963802",
    513         "created_at": "2024-10-27T16:36:50Z"
    514       },
    515       {
    516         "hn_id": "41842065",
    517         "title": "Agents Thinking Fast and Slow: A Talker-Reasoner Architecture",
    518         "points": 3,
    519         "comments": 0,
    520         "url": "https://news.ycombinator.com/item?id=41842065",
    521         "created_at": "2024-10-14T21:11:10Z"
    522       },
    523       {
    524         "hn_id": "42258010",
    525         "title": "Gradient Boosting Trees and LLMs for Tabular Data Few-Shot Learning",
    526         "points": 2,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=42258010",
    529         "created_at": "2024-11-27T17:46:47Z"
    530       },
    531       {
    532         "hn_id": "45842485",
    533         "title": "Death by a Thousand Prompts: Open Model Vulnerability Analysis",
    534         "points": 2,
    535         "comments": 1,
    536         "url": "https://news.ycombinator.com/item?id=45842485",
    537         "created_at": "2025-11-07T00:59:31Z"
    538       },
    539       {
    540         "hn_id": "41872780",
    541         "title": "Discovering Global Lyapunov functions using symbolic transformers",
    542         "points": 2,
    543         "comments": 1,
    544         "url": "https://news.ycombinator.com/item?id=41872780",
    545         "created_at": "2024-10-17T19:09:00Z"
    546       },
    547       {
    548         "hn_id": "33454886",
    549         "title": "EDiffi: Text-to-Image Diffusion Models with an Ensemble of Expert Denoisers",
    550         "points": 2,
    551         "comments": 1,
    552         "url": "https://news.ycombinator.com/item?id=33454886",
    553         "created_at": "2022-11-03T18:00:20Z"
    554       },
    555       {
    556         "hn_id": "42646570",
    557         "title": "Protect Dark and Quiet Sky from Harmful Interference by Satellite Constellations",
    558         "points": 1,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=42646570",
    561         "created_at": "2025-01-09T15:28:14Z"
    562       }
    563     ],
    564     "top_points": 13,
    565     "total_points": 37,
    566     "total_comments": 3
    567   }
    568 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs