scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27442B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Local LLM Ensembles for Zero-shot Portuguese Named Entity Recognition",
      6     "authors": [
      7       "João Lucas Luz Lima Sarcinelli",
      8       "Diego Furtado Silva"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2512.10043",
     13     "doi": "10.48550/arXiv.2512.10043"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "Abstract claims (ensembles outperform individual models on 4/5 datasets, cross-dataset ensembles work) are directly supported by Table 3 and Figure 4 results.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Ablation study (Table 4) removes voting and disambiguation steps to test their causal impact on performance, meeting the standard for causal claims about pipeline components.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Claims explicitly bounded to 'five distinct Portuguese NER datasets' and 'zero-shot' settings; future work mentions extending to other languages, indicating current scope is Portuguese-specific.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 5.1 explains HAREM underperformance (generic domain, simpler entities) vs. domain-specific dataset success (complex entity types); Section 5.3 discusses validation set representativeness issues.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Measured outcome (micro-F1 on NER test sets) directly matches claimed outcome (NER performance); no conflation of proxy metrics with intended claims.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Dedicated Section 7 titled 'Limitations' discusses GPU memory requirements, processing time for evaluating 7,000+ combinations, and computational feasibility constraints.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5.3 identifies specific threat: heuristic-selected ensemble (0.591 F1) differs from optimal ensemble (0.622 F1 on HAREM), indicating validation set may not be representative of full test distribution.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Scope explicitly bounded to 'five distinct Portuguese NER datasets' and 'zero-shot' prompting; future work implies extension to other languages is outside current scope.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 8 explicitly states: 'financed in part by the Coordenação de Aperfeiçoamento de Pessoal de Nível Superior – Brasil (CAPES) – Finance Code 001.'",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors listed with affiliation 'Instituto de Ciências Matemáticas e Computação, Universidade de São Paulo' and no conflict with evaluated model organizations (Meta, Google, Microsoft, etc.).",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "CAPES is a Brazilian government research funding agency, independent of the paper's outcomes regarding ensemble performance.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement provided; no mention of patents, equity, consulting fees, or financial relationships with model publishers.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms established: NER as token-level task (standard in literature), ensemble defined via voting/disambiguation steps, zero-shot as 'no labeled examples,' in-context learning cited with reference [24].",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Three explicit contributions listed in Introduction: (1) novel three-step ensemble pipeline, (2) evaluation on 5 Portuguese datasets, (3) evidence that cross-dataset ensembles work.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 2 positions work vs. MoE models, fusion-based ensembles (LLM-Blender), routing approaches (FrugalGPT), prior LLM NER work [13,16], showing novel contribution in zero-shot Portuguese NER with similarly-sized models.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Abstract and Section 3 state 'Code is available at https://github.com/Joao-Luz/local-llm-ner-ensemble' with direct GitHub link provided.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "All five evaluation datasets (HAREM, LeNER-Br, UlyssesNER-Br, GeoCorpus-2, MariNER) are publicly available standard benchmarks with citations to original papers.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "Table 2 provides exact model versions with HuggingFace links and temperatures specified (t=0, 1.0, 1.5), but no requirements.txt, dependencies, or Python version documented in the paper.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Pipeline steps described in Section 3 with figures, but actual prompts for extraction and voting steps not provided in full (only structure shown in Figures 1-2); no step-by-step reproducibility guide in paper.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "Table 3 caption states 'averaged of three runs' to account for stochasticity, but no error bars, standard deviations, or confidence intervals reported in results.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No statistical significance tests (t-tests, ANOVA) reported for performance comparisons between ensemble and individual models; claims based only on point estimates.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Absolute micro-F1 scores provided but effect sizes not explicitly reported; e.g., ensemble improvement on LeNER-BR (0.541 vs 0.490) shown as raw numbers without effect size quantification.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "Validation set uses 100 random sentences per dataset with justification only as 'simulating a low-resource scenario'; no power analysis or sample size calculation provided.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "Only averages over three runs shown in Table 3; no standard deviations, variances, or confidence intervals across the runs reported.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Table 3 includes baseline results for all individual models (LLaMA3, Qwen2, Gemma2, Phi3, Mistral) and supervised RoBERTa model.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Model baselines are contemporary (LLaMA3/Qwen2/Gemma2/Phi3 from 2024, Mistral from 2023); RoBERTa (2019) is standard supervised baseline.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Table 4 ablates the voting step (no voting) and simplifies disambiguation, showing impact of each component across all datasets.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "Only micro-F1 reported as evaluation metric; no macro-F1, precision, recall, or per-entity-type metrics provided despite datasets having multiple entity types.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": false,
    202           "answer": false,
    203           "justification": "No human evaluation needed for NER task evaluated on gold-standard annotated datasets; not applicable.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Section 4.1 states 'available train split to perform ensemble configuration selection and test split for testing'; results in Table 3 are on held-out test sets.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": false,
    215           "justification": "Datasets (Table 1) specify multiple entity types per domain, but results (Table 3) only report overall micro-F1 without per-entity-type or per-category performance breakdowns.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "HAREM underperformance (ensemble 0.591 vs individual 0.609) analyzed in Section 5.1 with explanation; GeoCorpus2 heuristic brittleness discussed in Section 5.3.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "HAREM results (ensemble underperforms individual models) reported honestly in Table 3 and analyzed rather than omitted.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Table 2 specifies exact model names (LLaMA 3.1-8B, Qwen2-7B, Gemma-2-9B, Phi-3-Medium-128K, Mistral-7B) with HuggingFace links and parameter counts.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "Section 3 describes prompt structure ('task definition entity type descriptions...') and Figure 2 shows disambiguation prompt construction, but actual zero-shot prompts for extraction and voting steps not provided in full.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Temperature values for each model in extraction step explicitly reported (t=0 for LLaMA3/Mistral/Phi3, t=1.0 for Gemma2, t=1.5 for Qwen2); voting/disambiguation at t=0.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Three-step pipeline (extraction, voting, disambiguation) described in detail in Section 3 with mechanistic explanation of each step and Figures 1-2 illustrating the process.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Section 3.1 describes validation set selection (100 random sentences per dataset) and entity parsing ('validated... to remove those with inconsistent types or those that cannot be matched').",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "All five datasets (HAREM, LeNER-Br, UlyssesNER-Br, GeoCorpus-2, MariNER) are publicly available with citations to original papers in Table 1.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "While not collecting new data, Table 1 provides dataset descriptions with citations to original papers that document collection procedures.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "Not applicable—using existing annotated datasets, not recruiting participants.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "Section 4 documents dataset selection, train/test split usage, validation set creation (100-sentence sample), and processing pipeline steps (extraction → voting → disambiguation).",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "Model training cutoffs not stated; models listed with 2024 or 2023 release dates but no explicit knowledge cutoff documentation provided.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "No discussion of whether Portuguese NER datasets (especially 2025's MariNER) might appear in model training data; Portuguese is lower-resource but potential overlap not addressed.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "Not addressed; no discussion of whether evaluation datasets were in training data or possibility of benchmark leakage.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "applies": false,
    311         "answer": false
    312       },
    313       "cost_and_practicality": {
    314         "inference_cost_reported": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "Section 7 mentions 'high GPU memory requirements' and 'long processing times' qualitatively, but no concrete inference latency, tokens per second, or cost metrics reported.",
    318           "source": "haiku"
    319         },
    320         "compute_budget_stated": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "Section 7 states 'over 7,000 combinations' must be evaluated for configuration selection, but no total computational budget (GPU-hours, etc.) provided.",
    324           "source": "haiku"
    325         }
    326       }
    327     }
    328   },
    329   "claims": [
    330     {
    331       "claim": "Ensembles outperform individual LLMs for Portuguese NER on 4 out of 5 datasets",
    332       "evidence": "Table 3 shows ensemble F1 scores vs individual model F1 across LeNER-BR (0.541 vs 0.490), GeoCorpus2 (0.313 vs 0.270), UlyssesNER (0.353 vs 0.340), and MariNER (0.706 vs 0.699). HAREM is the exception (0.591 ensemble vs 0.609 best individual Gemma2).",
    333       "supported": "moderate"
    334     },
    335     {
    336       "claim": "Different models excel at different pipeline stages (extraction, voting, disambiguation)",
    337       "evidence": "Section 5.1 analysis shows LLaMA3/Gemma2 dominate as extractors (appear in 4/5 dataset extraction configs), while Phi3 appears mainly in voting/disambiguation (section 5.1: 'Phi3 shows up for most voting and disambiguation ensembles').",
    338       "supported": "moderate"
    339     },
    340     {
    341       "claim": "The voting step is critical to ensemble performance",
    342       "evidence": "Table 4 ablation shows removing voting step causes significant F1 drops: HAREM 0.592→0.410, LeNER-BR 0.541→0.478, GeoCorpus2 0.313→0.178, UlyssesNER 0.353→0.249, MariNER 0.706→0.621.",
    343       "supported": "strong"
    344     },
    345     {
    346       "claim": "Cross-dataset ensemble configurations perform as well as target-dataset ensembles",
    347       "evidence": "Figure 4 shows that, for HAREM and UlyssesNER, ensemble configurations selected on other datasets outperform the heuristic-best ensemble selected on the target dataset itself (text: 'ensemble configurations obtained in cross-dataset configurations may perform better than ones obtained from heuristic applied to same dataset').",
    348       "supported": "moderate"
    349     },
    350     {
    351       "claim": "The heuristic for selecting ensemble configuration is robust across train/test splits",
    352       "evidence": "Section 5.3 shows heuristic selects 0.591 F1 on HAREM test but optimal possible ensemble achieves 0.622 F1 (Figure 3), indicating validation set selection does not reliably predict test performance.",
    353       "supported": "weak"
    354     },
    355     {
    356       "claim": "Zero-shot LLM ensembles enable NER for lower-resource languages without fine-tuning",
    357       "evidence": "Paper evaluates all models zero-shot (no fine-tuning) on 5 Portuguese datasets. Results show ensembles competitive with individual models without labeled fine-tuning data.",
    358       "supported": "moderate"
    359     }
    360   ],
    361   "methodology_tags": [
    362     "benchmark-eval",
    363     "empirical"
    364   ],
    365   "key_findings": "The paper proposes a three-step ensemble pipeline (extraction → voting → disambiguation) for zero-shot Portuguese NER using five similarly-sized open-weight LLMs. Ensembles outperform individual models on 4 of 5 datasets (F1 gains most pronounced on domain-specific datasets with complex entity types), though a heuristic for configuration selection shows limited robustness: validation-set selection (0.591 F1 HAREM) underperforms the globally optimal ensemble (0.622 F1). Crucially, ensemble configurations selected on different source datasets often match or exceed target-dataset performance, enabling practical zero-shot NER without task-specific annotation.",
    366   "red_flags": [
    367     {
    368       "flag": "No statistical significance testing",
    369       "detail": "Results presented as point estimates (averages of 3 runs) without confidence intervals, standard deviations, or significance tests despite inherent variance in LLM outputs."
    370     },
    371     {
    372       "flag": "Heuristic selection brittleness",
    373       "detail": "Section 5.3 demonstrates that validation-set-based heuristic (0.591 F1) substantially underperforms globally optimal ensemble (0.622 F1 on HAREM), questioning robustness of configuration selection method."
    374     },
    375     {
    376       "flag": "Single evaluation metric",
    377       "detail": "Only micro-F1 reported; no macro-F1, precision, recall, or per-entity-type breakdowns despite datasets having 4-16 entity types each, limiting understanding of performance gaps."
    378     },
    379     {
    380       "flag": "Unjustified sample size",
    381       "detail": "100-sentence validation set used without power analysis or justification; section 5.3 hints this may be unrepresentative of full test distribution."
    382     },
    383     {
    384       "flag": "Training data contamination not addressed",
    385       "detail": "No discussion of whether Portuguese NER datasets (especially 2025's MariNER, released same year as paper) might be in model training data; Portuguese corpora are less likely than English ones to be excluded from training data, but this is not verified."
    386     },
    387     {
    388       "flag": "Prompts under-specified",
    389       "detail": "Actual zero-shot prompts for extraction and voting steps not fully provided; only structural descriptions in Section 3 and Figure 2, limiting reproducibility."
    390     },
    391     {
    392       "flag": "No per-entity-type analysis",
    393       "detail": "Despite datasets having 4-16 entity types (Table 1), results report only overall micro-F1, preventing analysis of which entity types benefit from ensembling."
    394     },
    395     {
    396       "flag": "Supervision leakage in zero-shot claim",
    397       "detail": "Configuration selection uses 100 labeled sentences per dataset (section 4.3), technically violating 'zero-shot' framing; though minimized, ensemble is supervised in selection phase."
    398     }
    399   ],
    400   "cited_papers": [
    401     {
    402       "title": "GPT-NER: Named Entity Recognition via Large Language Models",
    403       "authors": "Wang et al.",
    404       "year": 2023,
    405       "relevance": "Directly addresses zero-shot NER with LLMs; prior work establishing LLM underperformance on token-level tasks."
    406     },
    407     {
    408       "title": "Empirical Study of Zero-Shot NER with ChatGPT",
    409       "authors": "Xie et al.",
    410       "year": 2023,
    411       "relevance": "Baseline zero-shot NER evaluation methodology; compares zero-shot prompting strategies."
    412     },
    413     {
    414       "title": "LLM Chain Ensembles for Scalable and Accurate Data Annotation",
    415       "authors": "Farr et al.",
    416       "year": 2024,
    417       "relevance": "Ensemble routing approach for task-specific model selection; alternative aggregation strategy."
    418     },
    419     {
    420       "title": "Improving entity recognition using ensembles of deep learning and fine-tuned large language models: A case study on adverse event extraction",
    421       "authors": "Li et al.",
    422       "year": 2025,
    423       "relevance": "Recent LLM ensemble for NER; uses fine-tuning and larger models (GPT-3.5), contrasts with this paper's zero-shot approach."
    424     },
    425     {
    426       "title": "An Ensemble of LLMs Finetuned with LoRA for NER in Portuguese Legal Documents",
    427       "authors": "Nunes et al.",
    428       "year": 2025,
    429       "relevance": "Portuguese NER ensemble; most closely related work; uses fine-tuning vs. zero-shot, legal domain only."
    430     },
    431     {
    432       "title": "A Review of Hybrid and Ensemble in Deep Learning for Natural Language Processing",
    433       "authors": "Jia, Liang, Liang",
    434       "year": 2024,
    435       "relevance": "Comprehensive ensemble methods survey; provides taxonomies of fusion-based and routing strategies."
    436     },
    437     {
    438       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    439       "authors": "Chen, Zaharia, Zou",
    440       "year": 2023,
    441       "relevance": "Routing approach for model selection; cost-aware ensemble strategy alternative to voting."
    442     },
    443     {
    444       "title": "A Survey of Large Language Models",
    445       "authors": "Zhao et al.",
    446       "year": 2025,
    447       "relevance": "Foundational LLM capabilities and limitations survey; contextualizes zero-shot performance gap."
    448     }
    449   ],
    450   "engagement_factors": {
    451     "practical_relevance": {
    452       "score": 2,
    453       "justification": "Useful for Portuguese NER practitioners but requires GPU resources to run 5 models in parallel; no production inference optimization or model distillation discussed."
    454     },
    455     "surprise_contrarian": {
    456       "score": 1,
    457       "justification": "Ensemble improvement is expected; the main surprise (HAREM underperformance on simple domain) is noted but not deeply investigated or challenged."
    458     },
    459     "fear_safety": {
    460       "score": 0,
    461       "justification": "No safety concerns raised, addressed, or relevant to NER task."
    462     },
    463     "drama_conflict": {
    464       "score": 1,
    465       "justification": "Mild tension in Section 5.3 (heuristic fails to find optimal ensemble), but insufficient for social engagement."
    466     },
    467     "demo_ability": {
    468       "score": 2,
    469       "justification": "Code released on GitHub; datasets are public; but reproducibility requires GPU setup and full prompt specifications are missing from paper."
    470     },
    471     "brand_recognition": {
    472       "score": 1,
    473       "justification": "Authors from USP (reputable but not top-tier AI lab); uses well-known open models (Meta LLaMA, Google Gemma) but no partnership or industry affiliation."
    474     }
    475   },
    476   "hn_data": {
    477     "threads": [
    478       {
    479         "hn_id": "45745607",
    480         "title": "Collatz-Weyl Generators: Pseudorandom Number Generators (2023)",
    481         "points": 59,
    482         "comments": 1,
    483         "url": "https://news.ycombinator.com/item?id=45745607",
    484         "created_at": "2025-10-29T11:55:42Z"
    485       },
    486       {
    487         "hn_id": "38794757",
    488         "title": "Knowledge Graph Reasoning Based on Attention GCN",
    489         "points": 52,
    490         "comments": 10,
    491         "url": "https://news.ycombinator.com/item?id=38794757",
    492         "created_at": "2023-12-28T16:02:24Z"
    493       },
    494       {
    495         "hn_id": "46287626",
    496         "title": "Detailed balance in large language model-driven agents",
    497         "points": 48,
    498         "comments": 5,
    499         "url": "https://news.ycombinator.com/item?id=46287626",
    500         "created_at": "2025-12-16T12:17:08Z"
    501       },
    502       {
    503         "hn_id": "38722880",
    504         "title": "ReST Meets ReAct: Self-Improvement for Multi-Step Reasoning LLM Agent",
    505         "points": 3,
    506         "comments": 1,
    507         "url": "https://news.ycombinator.com/item?id=38722880",
    508         "created_at": "2023-12-21T16:45:46Z"
    509       },
    510       {
    511         "hn_id": "38681247",
    512         "title": "Point Transformer V3: Simpler, Faster, Stronger",
    513         "points": 2,
    514         "comments": 1,
    515         "url": "https://news.ycombinator.com/item?id=38681247",
    516         "created_at": "2023-12-18T11:29:41Z"
    517       },
    518       {
    519         "hn_id": "46279856",
    520         "title": "Detailed balance in large language model-driven agents",
    521         "points": 1,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=46279856",
    524         "created_at": "2025-12-15T20:11:28Z"
    525       },
    526       {
    527         "hn_id": "45930419",
    528         "title": "A Large-Scale Computational Analysis of Errors in ArXiv Papers",
    529         "points": 1,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=45930419",
    532         "created_at": "2025-11-14T18:52:29Z"
    533       },
    534       {
    535         "hn_id": "45927607",
    536         "title": "Black-Box On-Policy Distillation of Large Language Models",
    537         "points": 1,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=45927607",
    540         "created_at": "2025-11-14T15:18:22Z"
    541       },
    542       {
    543         "hn_id": "43204606",
    544         "title": "Strassen Multisystolic Array Hardware Architectures",
    545         "points": 1,
    546         "comments": 0,
    547         "url": "https://news.ycombinator.com/item?id=43204606",
    548         "created_at": "2025-02-28T11:48:06Z"
    549       },
    550       {
    551         "hn_id": "42418107",
    552         "title": "Towards Reasoning in Large Language Models: A Survey (2023)",
    553         "points": 1,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=42418107",
    556         "created_at": "2024-12-14T16:50:49Z"
    557       }
    558     ],
    559     "top_points": 59,
    560     "total_points": 169,
    561     "total_comments": 18
    562   }
    563 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs