scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25947B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Do We Truly Need So Many Samples? Multi-LLM Repeated Sampling Efficiently Scales Test-Time Compute",
      6     "authors": [
      7       "Jianhao Chen",
      8       "Zishuo Xun",
      9       "Bocheng Zhou",
     10       "Han Qi",
     11       "Qiaosheng Zhang"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2504.00762",
     16     "doi": "10.48550/arXiv.2504.00762"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims (outperforms self-consistency, beats multi-agent debate, reduces costs, requires few comparable LLMs, extends with verification) are backed by experiments in Figures 4-7 and Table 3.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Ablation studies (Table 4) isolate the contribution of weighted voting components, and controlled experiments fix sampling budget across conditions to attribute improvements to ModelSwitch's switching mechanism.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion claims 'a practical and generalizable solution for various reasoning and knowledge-based tasks,' but 5 of 7 evaluation datasets are math benchmarks; the evidence for general task applicability is weak.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 5 provides formal analysis showing why multi-LLM mixing outperforms single-LLM, including counterexamples and conditions under which errors from different models counteract each other.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Claims are accuracy on benchmark test sets, which is exactly what is measured; no proxy mismatch exists.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section anywhere in the paper.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed; the paper does not address benchmark contamination, model API non-determinism, or the manual tuning of external weights per dataset.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not state what the results do NOT show — e.g., no discussion of inapplicability to open-ended generation, tasks without definitive answers, or real-world latency constraints.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding statement or acknowledgments section appears in the provided paper text.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are clearly listed on the title page (Nanjing University, Shanghai AI Laboratory, University of Auckland, Penn State).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be verified; Shanghai AI Laboratory has institutional interests in LLM efficiency research.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement is present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are carefully defined: 'consistency' is explicitly defined as entropy of generated answers (Section 2), and 'ModelSwitch' is specified via Algorithm 1 with internal/external weights.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly lists four contributions in the introduction: empirical consistency-accuracy analysis, ModelSwitch method, experimental evaluation, and theoretical analysis.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6 discusses generation-verification paradigm and multi-agent collaboration, showing how ModelSwitch differs from self-consistency, debate methods, and model routing, with specific comparisons to MAD, MOA, ChatEval, AgentVerse.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Code is available at https://github.com/JianhaoChen-nju/ModelSwitch per the paper's abstract.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All seven evaluation datasets (GSM8K, MATH, MathBench, MGSM, DATE, MMLU-Pro, AIME24) are standard publicly available benchmarks used unmodified.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper describes compute hardware (Ubuntu 22.04, 8 A100 GPUs) but does not provide a requirements.txt, Dockerfile, or equivalent dependency specification.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Algorithm 1 describes the core method and hyperparameters are provided in Appendix A, but no step-by-step reproduction instructions are given in the paper itself.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in figures and tables are point estimates with no confidence intervals or error bars reported.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used for any of the comparative claims between ModelSwitch and baselines.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute accuracy improvements are consistently reported with baseline context (e.g., '10.2-point increase over best single LLM on MMLU-Pro,' '34% average sample reduction').",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Benchmark sizes are stated (e.g., MATH: 500 problems, MMLU-Pro: 500 random samples) but no justification or power analysis is provided for why these are adequate.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance or standard deviation across experimental runs is reported; all results are single-run point estimates.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Baselines include self-consistency for each individual LLM and five multi-agent methods (MAD, AgentVerse, ChatEval, MAD-MLD, MOA) compared under equal sampling budgets.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All baselines use current state-of-the-art models (GPT-4o mini, Gemini 1.5 Flash, GPT-4o, Gemini 1.5 Pro) and recent methods (MOA 2024, MAD 2024).",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Table 4 ablates the weighted voting algorithm by removing internal weights, external weights, and both, showing each component contributes positively.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The paper reports accuracy across 7 datasets plus efficiency metrics (actual sampling count, cost in dollars from Table 3).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "NA — paper evaluates LLM performance on closed-form benchmarks with definitive answers; human evaluation is not relevant.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "All evaluations use established held-out test splits of publicly available benchmarks.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down across all 7 datasets individually in Figure 5 and Table 4; the scaling analysis in Figure 6 is shown per dataset.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No failure cases or error analysis is provided; the paper does not discuss when or why ModelSwitch fails to improve over single-LLM sampling.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Section 4.4 reports that scaling beyond 2-3 models hurts or plateaus performance (e.g., DATE drops from 78.6% to 76.4% with 6 models), and AIME24 Appendix shows no improvement at budget=16.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model versions are named and cited (GPT-4o mini [26], Gemini 1.5 Flash [27], Claude 3 Haiku [28], Llama-3.1-8B-Instruct [29], Gemma-2-9B-It [30], Qwen2.5-7B-Instruct [9]).",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper states queries are in 'COT format by default' citing [25] but does not provide actual prompt templates or system instructions.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature and top_p for GPT-4o mini are set to 1; external weights Wβ are given in Tables 1 and 2 for each model-dataset combination; other models use stated defaults.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Algorithm 1 fully describes the ModelSwitch procedure including sequential model querying, consistency check, early stopping, and weighted voting aggregation.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix A.1 specifies exactly which subsets were used: 300-question Arith subset of MathBench, 500 random questions from MMLU-Pro, 1000 sampled MGSM questions.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The paper states code and data are on GitHub but does not explicitly confirm that raw model outputs/responses are included for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Benchmark datasets are all publicly sourced and their provenance is cited; sampling procedure (K samples per model via API/inference) is described in the algorithm.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "NA — no human participants; evaluation uses standard automated benchmarks.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline from benchmark query to answer extraction, entropy calculation, switching decision, and final voting is documented in Algorithm 1 and surrounding text.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for any of the evaluated models (GPT-4o mini, Gemini 1.5 Flash, etc.) are not stated in the paper.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of potential train/test overlap; AIME24 (2024 competition problems) is particularly high-risk for contamination given the models' training windows.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Contamination is not addressed; AIME24 2024 problems were publicly available before the training cutoffs of most evaluated models.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "NA — no human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "NA — no human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "NA — no human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "NA — no human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "NA — no human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "NA — no human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "NA — no human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Table 3 explicitly reports API costs in dollars for self-consistency vs. ModelSwitch across 6 datasets, showing 15-48% cost reductions.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Appendix A.1 states compute resources (Ubuntu 22.04, 1600GB RAM, 8 NVIDIA A100 GPUs) and notes minimum requirements for smaller deployments.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "ModelSwitch outperforms single-LLM self-consistency in accuracy across all 7 benchmarks tested",
    375       "evidence": "Figure 4 shows accuracy improvements over best individual LLM self-consistency on GSM8K, MATH, MathBench, MGSM, DATE, MMLU-Pro under equal sampling budgets",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Consistency (entropy) universally correlates with accuracy across diverse LLMs and datasets",
    380       "evidence": "Figure 2 reports r=0.61-0.96 (all p<0.001) across 6 LLMs on MATH and MathBench, extending the self-consistency finding to multiple models",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "ModelSwitch reduces average actual sampling count by 34% while improving accuracy",
    385       "evidence": "Section 4.2 reports average sampling counts of 9.2-13.4 from a budget of 16 across 6 datasets; Table 3 shows 15-48% cost reductions",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "ModelSwitch achieves state-of-the-art performance against multi-agent debate methods on 4/7 datasets",
    390       "evidence": "Figure 5 compares against MAD, AgentVerse, ChatEval, MAD-MLD, MOA under equal 15-sample budget; MMLU-Pro shows 63.2% vs best competitor 52.6% (MOA)",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Only 2-3 comparable LLMs are needed; adding more models beyond this does not improve and may hurt performance",
    395       "evidence": "Figure 6 shows performance plateaus or declines going from 2 to 6 models on MathBench and DATE under both strong-to-weak and weak-to-strong orderings",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "ModelSwitch with two weak LLMs can match performance of a single much larger model",
    400       "evidence": "Figure 1 shows 9B+8B open-source combination (69%) matches Llama-3.1-70B (68.7%) on MathBench with only 7 samples",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "theoretical"
    407   ],
    408   "key_findings": "The paper finds a strong positive correlation between answer consistency (measured via the entropy of sampled answers) and accuracy across six diverse LLMs; ModelSwitch exploits this by switching to a second model when the first produces inconsistent answers. On seven benchmarks, ModelSwitch using two lightweight models (GPT-4o mini + Gemini 1.5 Flash) outperforms both individual models under self-consistency and five multi-agent debate methods, with a 10.2pp gain on MMLU-Pro and 34% average sampling reduction. The approach generalizes to reasoning LLMs (AIME24) and integrates cleanly with reward model verification. Theoretical analysis provides sufficient and necessary conditions for when mixing two models improves over single-model majority voting.",
    409   "red_flags": [
    410     {
    411       "flag": "No variance or CIs",
    412       "detail": "All results are single-run point estimates with no confidence intervals, error bars, or significance tests — comparative claims cannot be statistically evaluated."
    413     },
    414     {
    415       "flag": "Benchmark contamination unaddressed",
    416       "detail": "AIME24 (2024 competition) problems were publicly available before training cutoffs of GPT-4o mini and Gemini 1.5 Flash; no contamination analysis is provided."
    417     },
    418     {
    419       "flag": "External weights manually tuned per dataset",
    420       "detail": "Tables 1 and 2 show Wβ weights that vary by model and dataset — these appear hand-tuned, creating potential for over-fitting to specific benchmarks and inflating reported gains."
    421     },
    422     {
    423       "flag": "Math-heavy benchmark selection",
    424       "detail": "5 of 7 datasets are mathematics benchmarks; claims of generalizability to 'various reasoning and knowledge-based tasks' are not well-supported."
    425     },
    426     {
    427       "flag": "No limitations section",
    428       "detail": "The paper has no dedicated limitations or threats-to-validity section; failure modes (e.g., tasks without definitive answers, high-latency scenarios) are not discussed."
    429     },
    430     {
    431       "flag": "No funding disclosure",
    432       "detail": "No acknowledgments or funding source is declared despite institutional affiliations with Shanghai AI Laboratory."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "Self-consistency improves chain of thought reasoning in language models",
    438       "relevance": "Primary baseline — ModelSwitch is built on and compared against self-consistency throughout"
    439     },
    440     {
    441       "title": "Large language monkeys: Scaling inference compute with repeated sampling",
    442       "relevance": "Motivates the repeated sampling paradigm and establishes that scaling samples improves coverage"
    443     },
    444     {
    445       "title": "Improving factuality and reasoning in language models through multiagent debate",
    446       "relevance": "MAD — primary multi-agent debate baseline compared in Figure 5"
    447     },
    448     {
    449       "title": "Mixture-of-agents enhances large language model capabilities",
    450       "relevance": "MOA — strongest multi-agent baseline (52.6% on MMLU-Pro vs ModelSwitch's 63.2%)"
    451     },
    452     {
    453       "title": "MMLU-Pro: A more robust and challenging multi-task language understanding benchmark",
    454       "relevance": "Key evaluation benchmark where ModelSwitch shows its largest absolute gain (+10.2pp)"
    455     },
    456     {
    457       "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters",
    458       "relevance": "Motivates test-time compute scaling as alternative to training-time scaling"
    459     },
    460     {
    461       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    462       "relevance": "Reasoning LLM used in AIME24 experiments; shows self-consistency improvements with R1"
    463     },
    464     {
    465       "title": "If multi-agent debate is the answer, what is the question?",
    466       "relevance": "Co-authored by Hangfan Zhang (paper co-author); provides critical analysis of debate methods motivating ModelSwitch's approach"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "Directly actionable with existing API models — no training required, code released, and costs are quantified in dollars."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "Challenges the assumption that more samples from one model is optimal and that multi-agent debate adds value over simple switching."
    477     },
    478     "fear_safety": {
    479       "score": 0,
    480       "justification": "No safety or risk implications raised."
    481     },
    482     "drama_conflict": {
    483       "score": 1,
    484       "justification": "Implicitly critiques the multi-agent debate paradigm by showing a simpler approach outperforms it substantially on MMLU-Pro."
    485     },
    486     "demo_ability": {
    487       "score": 3,
    488       "justification": "Any developer with API keys for GPT-4o mini and Gemini Flash can run this immediately using the released code."
    489     },
    490     "brand_recognition": {
    491       "score": 2,
    492       "justification": "Uses GPT-4o, Gemini, and Claude models as both baselines and components, lending name recognition; Shanghai AI Lab is a prominent institution."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "40932006",
    499         "title": "An abundance of Katherines: The game theory of baby naming",
    500         "points": 288,
    501         "comments": 148,
    502         "url": "https://news.ycombinator.com/item?id=40932006"
    503       },
    504       {
    505         "hn_id": "44052041",
    506         "title": "Discord Unveiled: A Comprehensive Dataset of Public Communication (2015-2024)",
    507         "points": 152,
    508         "comments": 179,
    509         "url": "https://news.ycombinator.com/item?id=44052041"
    510       },
    511       {
    512         "hn_id": "43417530",
    513         "title": "Neurosymbolic Decision Trees",
    514         "points": 42,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=43417530"
    517       },
    518       {
    519         "hn_id": "39986540",
    520         "title": "A Survey on Red Teaming for Generative Models",
    521         "points": 16,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=39986540"
    524       },
    525       {
    526         "hn_id": "43986826",
    527         "title": "Bang for the Buck: Vector Search on Cloud CPUs",
    528         "points": 5,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=43986826"
    531       },
    532       {
    533         "hn_id": "31032132",
    534         "title": "A Study of Real-World Data Races in Golang",
    535         "points": 5,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=31032132"
    538       },
    539       {
    540         "hn_id": "43905563",
    541         "title": "(How) Do reasoning models reason?",
    542         "points": 3,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=43905563"
    545       },
    546       {
    547         "hn_id": "46386776",
    548         "title": "LitBench: A Benchmark and Dataset for Reliable Evaluation of Creative Writing",
    549         "points": 3,
    550         "comments": 0,
    551         "url": "https://news.ycombinator.com/item?id=46386776"
    552       },
    553       {
    554         "hn_id": "43751796",
    555         "title": "(How) Do reasoning models reason?",
    556         "points": 2,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=43751796"
    559       },
    560       {
    561         "hn_id": "44179940",
    562         "title": "Stop Anthropomorphizing Intermediate Tokens as Reasoning/Thinking Traces",
    563         "points": 1,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=44179940"
    566       }
    567     ],
    568     "top_points": 288,
    569     "total_points": 517,
    570     "total_comments": 327
    571   }
    572 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs