scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28576B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models",
      6     "authors": [
      7       "Yangzhen Wu",
      8       "Zhiqing Sun",
      9       "Shanda Li",
     10       "Sean Welleck",
     11       "Yiming Yang"
     12     ],
     13     "year": 2024,
     14     "venue": "ICLR 2025",
     15     "arxiv_id": "2408.00724",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Main abstract claims (smaller models with REBASE outperform larger models, inference scaling is more compute-efficient) are directly supported by Figs 4-5 and Table 1 across multiple model families and benchmarks.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about inference strategy performance are based on controlled comparisons varying only the strategy or model size while holding other variables fixed, which is adequate for algorithmic causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title and abstract claim general 'inference scaling laws' for 'problem-solving with language models' but experiments are predominantly on math benchmarks (MATH500, GSM8K); code generation appears only in the appendix, and no other task domains are tested.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper hypothesizes why REBASE outperforms MCTS (efficient reward use, no costly rollouts) but does not discuss whether the specific reward model quality, fine-tuning procedure, or dataset choice could confound the scaling law conclusions.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly chose math benchmarks because they 'allow us to accurately evaluate problem solving ability,' and accuracy on held-out test sets is a direct measure — the proxy relationship is acknowledged and justified.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion mentions saturation behavior and future work but does not constitute a formal limitations section.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity are discussed; benchmark contamination, reward model quality as a confound, and domain restriction to mathematics are not addressed.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show; claims about compute-optimal inference are presented without explicit boundary statements confining them to fine-tuned math models.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments disclose Google Fellowship (Zhiqing Sun) and NSF SCALE grant DMS 2134012 (Sean Welleck), constituting partial funding disclosure.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated on the title page: Tsinghua University and Carnegie Mellon University.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "NSF is independent of outcome; the Google Fellowship funds a researcher but the paper evaluates open-source models (not Google products), making direct financial interest in the specific results unlikely.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests or financial interests declaration anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "'Compute-optimal inference' is formally defined as an optimization problem in Section 3; inference FLOPs computation follows Kaplan et al.; all inference strategies (greedy, best-of-n, majority voting, REBASE) are explicitly defined.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are listed: empirical inference scaling laws, theoretical convergence analysis of voting methods, and the REBASE tree-search algorithm.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Related work engages substantively with training scaling laws (Chinchilla), inference strategies (MCTS, self-consistency), and concurrent work (Snell et al. 2025), explicitly contrasting how this work differs from each.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper lists a project webpage URL but does not explicitly state code is released; no GitHub or code repository link with an explicit release statement appears in the paper text.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All datasets used (MATH500, GSM8K, MetaMath, Math-Shepherd) are standard publicly available benchmarks; no original data collection was performed.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Table 2 provides training hyperparameters but no requirements file, Dockerfile, or software environment specifications are provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Hyperparameters and algorithm descriptions are provided but step-by-step instructions for running the full experimental pipeline from fine-tuning to evaluation are absent.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "The paper states each configuration is run multiple times to calculate variance, but all figures display only point estimates without error bars or confidence intervals.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used for any comparative claims between REBASE, sampling, and MCTS across the paper.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported in practical terms: '2x less FLOPs to achieve comparable accuracy,' '7 times less compute' (Table 1), and specific accuracy numbers with FLOPs for all configurations in Table 4.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The choice of MATH500 (500 examples) and full GSM8K test set is not justified; no power analysis or discussion of whether these benchmark sizes are sufficient for the measured effect sizes.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "The paper mentions calculating variance from multiple runs but reports only means in figures and tables; no standard deviations or spread measures are displayed.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Multiple baselines are included: greedy search, sampling with majority voting, sampling with best-of-n, sampling with weighted voting, and MCTS across all tested models.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include contemporary state-of-practice methods: MCTS variants, self-consistency (majority voting), and weighted majority voting, all from 2023-2024.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation of REBASE components is performed; the balance temperature Tb, reward model choice, and tree depth are not ablated, making it unclear which design choices drive improvements.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Results are reported along both accuracy/error rate and inference FLOPs dimensions, providing a two-dimensional cost-performance analysis throughout.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "No human evaluation is relevant; this is an algorithmic study on math benchmarks with automatic evaluation using exact-match accuracy.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Models are fine-tuned on MetaMath training data and evaluated on separate held-out MATH500 and GSM8K test sets that are not used during training.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Figure 8 provides per-difficulty breakdown (MATH-easy levels 1-2 vs. MATH-hard levels 3-5) comparing REBASE and sampling for both Llemma-7B and 34B.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "The paper discusses why MCTS produces unfinished solutions as a failure mode but does not show example failure cases or analyze specific problem types where methods fail.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "MCTS underperforming simple sampling in cost-performance tradeoff is prominently reported in Fig. 4 and is a key negative finding used to motivate REBASE.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model families are cited with corresponding papers (Pythia/Biderman et al. 2023, Llemma/Azerbayev et al. 2024, Mistral-7B/Jiang et al. 2023, Llama3-8B-Instruct/Dubey et al. 2024) and fine-tuning details are in Table 2.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper does not provide actual prompt templates or system instructions used for generating solutions; the inference pipeline format is described only abstractly.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 2 provides complete fine-tuning hyperparameters; Appendix C specifies inference hyperparameters including temperature=1.0, max tokens=1024, REBASE balance temperature Tb=0.1, and MCTS expansion configurations.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The REBASE algorithm is described in procedural detail (Figure 3, Section 3.1.2) with initialization, reward assignment, and expansion steps, plus Equation 1 for the expansion width calculation.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "The paper only briefly mentions preprocessing MetaMath 'to make the solutions in a stepwise format' without documenting the exact preprocessing procedure.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Raw model outputs and per-problem accuracy results are not released; only aggregated accuracy numbers in figures and tables are provided.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": false,
    275           "answer": false,
    276           "justification": "No original data collection; all datasets (MATH, GSM8K, MetaMath, Math-Shepherd) are existing public benchmarks referenced to their original papers.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants in this study.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "The path from benchmark data through fine-tuning to inference evaluation is described at a high level but lacks sufficient documentation for independent end-to-end verification.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Pre-training data cutoffs for Pythia, Llemma, Mistral, and Llama3 base models are not stated, which matters since MATH (2021) and GSM8K (2021) predate most of these models' training cutoffs.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Potential overlap between base model pre-training corpora and the MATH/GSM8K evaluation benchmarks is not discussed anywhere in the paper.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "MATH and GSM8K (both 2021) are old enough to appear in pre-training corpora of the tested models; this contamination risk is not acknowledged or mitigated.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Inference FLOPs per question is the primary x-axis throughout all experiments; exact FLOP counts are tabulated in Tables 1 and 4 for all configurations.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "The paper explicitly varies and reports total computational budget in FLOPs as the core independent variable, with the compute-optimal inference problem formally defined in Section 3.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Scaling inference compute can be more computationally efficient than scaling model parameters",
    375       "evidence": "Llemma-7B with REBASE achieves comparable accuracy to Llemma-34B while using 2x fewer FLOPs across both MATH500 and GSM8K (Figs 4-5, Table 1)",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "REBASE consistently achieves Pareto-optimal cost-performance tradeoffs, outperforming sampling and MCTS at all compute budgets",
    380       "evidence": "Figs 4-8 and Table 4 show REBASE dominating all baselines across Llemma-7B, 34B, Mistral-7B, and Llama3-8B on MATH500, GSM8K, and MBPP",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "MCTS underperforms simple sampling in the inference cost-performance tradeoff for LLM problem-solving",
    385       "evidence": "Fig 4 shows MCTS worse than sampling+weighted voting at equivalent FLOPs budgets; attributed to costly rollouts producing many unfinished solutions with fewer effective votes",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Sampling-based voting accuracy converges exponentially to a fixed limit determined by the model's output distribution, making infinite sampling insufficient for perfect accuracy",
    390       "evidence": "Theorems 1 and 2 provide formal convergence proofs; empirically shown by saturation curves in Figs 6-7",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "REBASE yields greater performance gains over sampling on harder problems than easier ones",
    395       "evidence": "Fig 8 shows REBASE and sampling comparable on MATH-easy (levels 1-2) but REBASE significantly better on MATH-hard (levels 3-5) for both Llemma-7B and 34B",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "The optimal model size for inference depends on the available compute budget, following the equation log10(C) = 1.19·log10(N) + 2.03",
    400       "evidence": "Fig 1 (right panel) shows optimal model size shifting from smaller to larger as compute budget increases; regression equation derived from Pythia model family experiments",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "theoretical"
    407   ],
    408   "key_findings": "Inference compute scaling follows predictable laws analogous to training scaling laws, with smaller models paired with advanced inference (REBASE) Pareto-dominating larger models at the same compute budget across math and code benchmarks. MCTS, despite widespread use, is outperformed by simple sampling at equivalent FLOPs due to costly rollouts that produce many unfinished solutions. Theoretical proofs show sampling-based voting inevitably saturates to a fixed accuracy ceiling, motivating the proposed REBASE algorithm, which reaches higher accuracy than sampling while using up to 7x less compute by using process reward model scores to guide tree expansion without explicit rollouts.",
    409   "red_flags": [
    410     {
    411       "flag": "No limitations section",
    412       "detail": "No dedicated limitations or threats-to-validity section exists; domain restriction to math benchmarks, reward model dependence, and contamination risks are not acknowledged."
    413     },
    414     {
    415       "flag": "Variance calculated but not displayed",
    416       "detail": "Paper claims each configuration runs multiple times to calculate variance, but all figures show only point estimates with no error bars, making reliability of fine-grained comparisons unverifiable."
    417     },
    418     {
    419       "flag": "Overclaiming generality from math-only benchmarks",
    420       "detail": "Title claims 'inference scaling laws' for problem-solving broadly; primary experiments are exclusively on MATH500 and GSM8K, with code generation appearing only in the appendix."
    421     },
    422     {
    423       "flag": "Benchmark contamination unaddressed",
    424       "detail": "MATH (2021) and GSM8K (2021) predate training cutoffs of Pythia, Llemma, and Mistral base models; pre-training contamination is not discussed or mitigated."
    425     },
    426     {
    427       "flag": "No REBASE ablation",
    428       "detail": "The balance temperature Tb and PRM-based expansion design are not ablated; it is unclear whether results are robust to these design choices or dependent on specific reward model quality."
    429     },
    430     {
    431       "flag": "Single reward model across all experiments",
    432       "detail": "All experiments use the same Llemma-34B PRM fine-tuned on Math-Shepherd; generalizability of REBASE's advantages to other reward models or domains is not tested."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "Scaling Laws for Neural Language Models (Kaplan et al., 2020)",
    438       "relevance": "Foundation training scaling law that this paper extends to inference-time compute"
    439     },
    440     {
    441       "title": "Training Compute-Optimal Large Language Models (Chinchilla, Hoffmann et al., 2022)",
    442       "relevance": "Training-side compute-optimal scaling law that directly parallels the inference-side analysis in this paper"
    443     },
    444     {
    445       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective than Scaling Model Parameters (Snell et al., 2025)",
    446       "relevance": "Concurrent complementary work on test-time compute scaling explicitly compared in related work"
    447     },
    448     {
    449       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models (Wang et al., 2023)",
    450       "relevance": "Establishes majority voting as the primary baseline inference strategy evaluated throughout"
    451     },
    452     {
    453       "title": "Let's Verify Step by Step (Lightman et al., 2024)",
    454       "relevance": "Process reward model methodology that REBASE relies on for step-level scoring and candidate ranking"
    455     },
    456     {
    457       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models (Yao et al., 2023)",
    458       "relevance": "Tree-search inference strategy for LLMs that motivated the MCTS baseline comparison"
    459     },
    460     {
    461       "title": "Training Verifiers to Solve Math Word Problems (Cobbe et al., 2021)",
    462       "relevance": "Provides the GSM8K benchmark, one of two primary evaluation datasets"
    463     },
    464     {
    465       "title": "Measuring Mathematical Problem Solving with the MATH Dataset (Hendrycks et al., 2021)",
    466       "relevance": "Provides the MATH dataset from which the MATH500 evaluation subset used throughout the paper is drawn"
    467     },
    468     {
    469       "title": "Llemma: An Open Language Model for Mathematics (Azerbayev et al., 2024)",
    470       "relevance": "Primary policy model (Llemma-7B and 34B) used in inference strategy experiments"
    471     },
    472     {
    473       "title": "Math-Shepherd: Verify and Reinforce LLMs Step-by-Step Without Human Annotations (Wang et al., 2024)",
    474       "relevance": "Training dataset for the process reward model used in all REBASE and weighted voting experiments"
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 3,
    480       "justification": "Directly actionable for ML engineers: smaller model + REBASE can match larger models at 2x lower inference cost, with quantitative FLOP budgets and regression equations for optimal model selection."
    481     },
    482     "surprise_contrarian": {
    483       "score": 2,
    484       "justification": "Challenges the assumption that bigger is always better, and shows MCTS — a widely celebrated technique — is actually worse than simple sampling in the cost-performance tradeoff."
    485     },
    486     "fear_safety": {
    487       "score": 0,
    488       "justification": "No safety or risk concerns raised; purely algorithmic efficiency work on mathematical reasoning."
    489     },
    490     "drama_conflict": {
    491       "score": 1,
    492       "justification": "The MCTS underperformance finding is mildly contrarian but framed constructively as motivation for REBASE rather than as an attack on prior work."
    493     },
    494     "demo_ability": {
    495       "score": 2,
    496       "justification": "Experiments use open-source models (Llemma, Mistral, Llama3) and public benchmarks, enabling others to reproduce and test the approach with disclosed hyperparameters."
    497     },
    498     "brand_recognition": {
    499       "score": 2,
    500       "justification": "Carnegie Mellon School of Computer Science affiliation and ICLR 2025 acceptance provide credibility; the paper is not a major industry-lab flagship release, but it comes from a well-regarded academic group and venue."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [
    505       {
    506         "hn_id": "44805436",
    507         "title": "Quantum machine learning via vector embeddings",
    508         "points": 11,
    509         "comments": 0,
    510         "url": "https://news.ycombinator.com/item?id=44805436",
    511         "created_at": "2025-08-05T22:46:47Z"
    512       },
    513       {
    514         "hn_id": "39606796",
    515         "title": "Dialect prejudice predicts AI decisions about people's character",
    516         "points": 4,
    517         "comments": 2,
    518         "url": "https://news.ycombinator.com/item?id=39606796",
    519         "created_at": "2024-03-05T17:46:09Z"
    520       },
    521       {
    522         "hn_id": "40308877",
    523         "title": "A Survey on the Real Power of ChatGPT",
    524         "points": 3,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=40308877",
    527         "created_at": "2024-05-09T14:55:57Z"
    528       },
    529       {
    530         "hn_id": "41196057",
    531         "title": "Deceptive AI is most convincing",
    532         "points": 2,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=41196057",
    535         "created_at": "2024-08-08T20:45:23Z"
    536       },
    537       {
    538         "hn_id": "37528261",
    539         "title": "An Exact Equivalence for Finite Classification Models",
    540         "points": 2,
    541         "comments": 1,
    542         "url": "https://news.ycombinator.com/item?id=37528261",
    543         "created_at": "2023-09-15T19:47:22Z"
    544       },
    545       {
    546         "hn_id": "39654130",
    547         "title": "Information Flow Routes: Automatically Interpreting Language Models at Scale",
    548         "points": 2,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=39654130",
    551         "created_at": "2024-03-09T19:31:41Z"
    552       },
    553       {
    554         "hn_id": "39653620",
    555         "title": "Bert for Information Retrieval: Survey, Applications, Resources, and Challenges",
    556         "points": 2,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=39653620",
    559         "created_at": "2024-03-09T18:14:39Z"
    560       },
    561       {
    562         "hn_id": "47111778",
    563         "title": "Deception Analysis with Artificial Intelligence an Interdisciplinary Perspective",
    564         "points": 1,
    565         "comments": 1,
    566         "url": "https://news.ycombinator.com/item?id=47111778",
    567         "created_at": "2026-02-22T15:30:25Z"
    568       },
    569       {
    570         "hn_id": "39671122",
    571         "title": "Dialect predicts AI decisions about character, employability, and criminality",
    572         "points": 1,
    573         "comments": 1,
    574         "url": "https://news.ycombinator.com/item?id=39671122",
    575         "created_at": "2024-03-11T17:39:22Z"
    576       },
    577       {
    578         "hn_id": "45265393",
    579         "title": "Coordinating \"7B Humans\" is hard",
    580         "points": 1,
    581         "comments": 0,
    582         "url": "https://news.ycombinator.com/item?id=45265393",
    583         "created_at": "2025-09-16T17:44:31Z"
    584       }
    585     ],
    586     "top_points": 11,
    587     "total_points": 29,
    588     "total_comments": 5
    589   }
    590 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs