scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28029B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Hierarchical Document Refinement for Long-context Retrieval-augmented Generation",
      6     "authors": [
      7       "Jiajie Jin",
      8       "Xiaoxi Li",
      9       "Guanting Dong",
     10       "Yuyao Zhang",
     11       "Yutao Zhu",
     12       "Yongkang Wu",
     13       "Zhonghua Li",
     14       "Qi Ye",
     15       "Zhicheng Dou"
     16     ],
     17     "year": 2025,
     18     "venue": "Annual Meeting of the Association for Computational Linguistics",
     19     "arxiv_id": "2505.10413",
     20     "doi": "10.48550/arXiv.2505.10413"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The '10x fewer computational costs' claim is supported by Table 2 (1,933 tokens for LongRefiner vs ~19,567 for Full Content); superiority across datasets is verified by results on all 7 benchmarks.",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Ablation studies in Table 3 systematically remove each of the three components and measure performance degradation, providing adequate causal support for component contribution claims.",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper makes broad claims about 'real-world RAG applications' throughout, but the system is trained and evaluated exclusively on Wikipedia-derived corpora; limitations are acknowledged only in Section 7.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper does not discuss whether training on the same Wikipedia corpus used for retrieval gives LongRefiner a structural advantage over baselines, nor whether the refiner model's larger parameter count affects fair comparison.",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper clearly distinguishes between what is measured (Accuracy, F1 on QA benchmarks, token count, latency) and the claimed benefit (improved RAG efficiency), with no conflation between proxy metrics and broader claims.",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 7 is explicitly titled 'Limitations' and discusses two concrete limitations: inability to handle non-text content (tables, images, hyperlinks) and domain transfer challenges beyond Wikipedia.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The limitations section specifically identifies that the system relies on general-domain Wikipedia corpus and explicitly states it 'is challenging to directly transfer to vertical domains such as enterprise or finance.'",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper explicitly bounds the system to plain text input (not tables, images) and Wikipedia-style structured documents, stated in the limitations section.",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No funding acknowledgment appears anywhere in the paper; despite co-authors from Huawei Poisson Lab, no explicit funding disclosure is made.",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Author affiliations are clearly stated on the first page: Gaoling School of Artificial Intelligence at Renmin University of China and Huawei Poisson Lab.",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "Three co-authors (Yongkang Wu, Zhonghua Li, Qi Ye) are from Huawei Poisson Lab; Huawei has commercial interest in efficient RAG systems, making the institutional funder not independent of the outcome.",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests statement, patent disclosure, or financial interests declaration is present anywhere in the paper.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "RAG is defined in the introduction, 'Local Level' and 'Global Level' query types are precisely defined in Section 3.1, and compression ratio γ is formally defined in Section 2.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Four explicit contributions are listed at the end of the introduction, including the framework, three key steps, training/inference paradigm, and empirical results.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 5 (Related Works) categorizes existing methods into two research streams and explicitly contrasts LongRefiner's structural modeling with prior chunk-based and perplexity-based approaches.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The abstract and introduction explicitly state 'Our code is available at https://github.com/ignorejjj/LongRefiner' with a live URL.",
    129           "source": "haiku"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "All seven evaluation datasets (NQ, TriviaQA, HotpotQA, 2WikiMultiHopQA, ASQA, ELI5, PopQA) are standard public benchmarks; the Wikipedia 2018 dump used for retrieval is also publicly available via KILT.",
    135           "source": "haiku"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The paper mentions specific model names and GPU type (4 NVIDIA A800 GPUs) but provides no requirements.txt, Dockerfile, or explicit dependency list; environment details must be sourced from the code repository.",
    141           "source": "haiku"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Appendix A.3 and A.4 provide detailed training and inference settings including hyperparameters, model names, dataset configurations, and VLLM/LlamaFactory framework usage, sufficient to reproduce experiments.",
    147           "source": "haiku"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No confidence intervals or error bars are reported in any table or figure; all results are single point estimates.",
    155           "source": "haiku"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "No statistical significance tests are conducted for the comparative claims across methods in Tables 2, 4, or 5; narrow margins on ELI5 (~0.2 F1) are treated as meaningful differences without testing.",
    161           "source": "haiku"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "The paper reports absolute performance differences (e.g., 'surpassing the performance of perplexity-based methods by more than 9%') and percentage comparisons throughout the results section.",
    167           "source": "haiku"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "The '10,000 samples per dataset' training set size is stated without justification; the data scaling analysis in Figure 3 varies the amount of training data but is not used to justify the final selection.",
    173           "source": "haiku"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "No variance, standard deviation, or multiple-run statistics are reported; greedy decoding (temp=0) eliminates sampling variance but other variance sources (retrieval stochasticity) are not addressed.",
    179           "source": "haiku"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Nine baselines across three categories (retrieval-based: 4 methods, semantic chunking: 2 methods, perplexity-based: 3 methods) are compared against LongRefiner in Table 2.",
    187           "source": "haiku"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Baselines include LLMLingua2 (ACL 2024), Meta-Chunking (2024), and Bge-reranker (SIGIR 2024), which are recent competitive methods in the document refinement space.",
    193           "source": "haiku"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Table 3 presents a systematic ablation removing each of the three core components (query analysis, document structuring, adaptive refinement) across all three dataset categories (single-hop, multi-hop, long-form).",
    199           "source": "haiku"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "The paper reports Accuracy, F1 Score, input token count, and online latency (ms), covering both quality and efficiency dimensions.",
    205           "source": "haiku"
    206         },
    207         "human_evaluation": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "No human evaluation is relevant for this automated QA benchmark evaluation; all metrics are automated.",
    211           "source": "haiku"
    212         },
    213         "held_out_test_set": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Standard held-out test sets are used for all seven benchmarks; PopQA is explicitly noted as out-of-domain data with no training split, serving as a generalization test.",
    217           "source": "haiku"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Table 3 breaks ablation results by dataset type (single-hop, multi-hop, long-form); Table 2 reports results per dataset; Figure 4 breaks results by document length bin.",
    223           "source": "haiku"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "The PopQA performance drop is noted as an exception but no systematic failure case analysis is presented; the data scaling recall dip is described briefly without example-level investigation.",
    229           "source": "haiku"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "The paper explicitly reports that LongRefiner performs below Full Content on PopQA due to information loss in low-noise short-document scenarios, and that recall temporarily declines with increased training data.",
    235           "source": "haiku"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Exact model versions are specified: Llama3.1-8B-Instruct (with citation to Dubey et al. 2024 arXiv), Qwen2.5-3B-Instruct (with citation to Yang et al. 2024), and bge-reranker-v2-m3 for local scoring.",
    243           "source": "haiku"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Appendix D provides four complete prompts: Prompt A (query analysis annotation), Prompt B (global search annotation), Prompt C.1 (short-form generation), Prompt C.2 (long-form generation).",
    249           "source": "haiku"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Training hyperparameters are fully reported in Appendix A.3: batch size 1, gradient accumulation 8, lr 3e-5, warmup ratio 0.1, bf16 precision, 1 epoch, max sequence lengths 2k/32k/4k per task.",
    255           "source": "haiku"
    256         },
    257         "scaffolding_described": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The three-step LongRefiner pipeline including offline/online task split, LoRA module switching, and inference paradigm is described in detail across Section 3 and Appendix A.4.",
    261           "source": "haiku"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Wikipedia dump preprocessing is described in Appendix A.3: removal of references, images, and links, and extraction of structural information via scripts available in the code repository.",
    267           "source": "haiku"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "All evaluation datasets are standard public benchmarks; the Wikipedia 2018 dump is publicly available via KILT, and training data construction scripts are in the released code.",
    275           "source": "haiku"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Data collection for training (Wikipedia dump via KILT preprocessing, FlashRAG's first 10,000 training samples per dataset) and label generation via Llama3.1-70B-Instruct is described in Appendix A.3.",
    281           "source": "haiku"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": false,
    285           "answer": false,
    286           "justification": "No human participants are recruited; all data comes from standard benchmarks and Wikipedia.",
    287           "source": "haiku"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "The full pipeline from Wikipedia preprocessing → training data construction → retrieval (top-8 MaxP) → refinement → generation is documented across Section 3 and Appendix A.",
    293           "source": "haiku"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "The paper uses Llama3.1-8B-Instruct and Qwen2.5-3B-Instruct as backbone models but does not state their training data cutoffs anywhere in the paper.",
    301           "source": "haiku"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "The refiner model is trained on Wikipedia 2018 dump-derived data and retrieval also uses the same Wikipedia 2018 corpus; this structural advantage over baselines trained on other data is not discussed.",
    307           "source": "haiku"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "NQ, TriviaQA, and HotpotQA predate Llama3.1 and Qwen2.5 training cutoffs; potential contamination of the base LLM with these benchmark examples is not addressed.",
    313           "source": "haiku"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants are involved in this study.",
    321           "source": "haiku"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants are involved in this study.",
    327           "source": "haiku"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants are involved in this study.",
    333           "source": "haiku"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants are involved in this study.",
    339           "source": "haiku"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants are involved in this study.",
    345           "source": "haiku"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants are involved in this study.",
    351           "source": "haiku"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "No human participants are involved in this study.",
    357           "source": "haiku"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Online latency (in ms) is reported for all methods in Table 2, showing LongRefiner at 10.8ms vs LongLLMLingua at 496.6ms and Full Content at 40.6ms.",
    365           "source": "haiku"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "The paper mentions training on 4 NVIDIA A800 GPUs for 1 epoch but does not state the total compute budget in GPU-hours, GPU-days, or equivalent cost.",
    371           "source": "haiku"
    372         }
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "LongRefiner achieves superior performance on 6 of 7 QA datasets while using 10x fewer tokens than full-content approach",
    379       "evidence": "Table 2 shows LongRefiner using 1,933 tokens vs 19,567 for Full Content, with higher or equal performance on NQ, TriviaQA, HotpotQA, 2Wiki, ASQA, and ELI5 with Llama3.1-8B generator",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LongRefiner achieves ~46x lower latency than the best-performing baseline (LongLLMLingua)",
    384       "evidence": "Table 2 reports 10.8ms latency for LongRefiner vs 496.6ms for LongLLMLingua; the abstract's '10x' claim actually refers to token reduction vs Full Content, not latency vs best baseline",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Hierarchical Document Structuring is the most critical component, with its removal causing ~20-26% performance degradation",
    389       "evidence": "Table 3 ablation shows single-hop EM drops from 62.3 to 45.7 (a 16.6-point, 26.6% relative drop) when removing document structuring, vs smaller drops for other ablations",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Performance scales with refiner model size, approaching full-content recall rates at larger model sizes",
    394       "evidence": "Figure 3 shows recall and accuracy improving monotonically from 0.5B to 7B parameters on TriviaQA, with 3B/7B models approaching full-content recall",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "LongRefiner significantly and consistently outperforms LongLLMLingua across all document lengths",
    399       "evidence": "Figure 4 shows LongRefiner consistently above LongLLMLingua across all document length bins from 10k to 40k tokens",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Any embedding or reranker scoring model outperforms the best prior baseline when used for local scoring",
    404       "evidence": "Table 4 shows all scoring models except BM25 exceed 'Best Baseline' (56.6 Acc single-hop), supporting plug-and-play scoring model flexibility",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval"
    410   ],
    411   "key_findings": "LongRefiner achieves state-of-the-art QA performance on 7 benchmarks while using only 10% of tokens compared to full-context approaches (1,933 vs ~19,567 tokens) and achieving ~46x lower latency than the best-performing prior method (LongLLMLingua at 496.6ms vs 10.8ms). Ablation studies confirm all three components are necessary, with hierarchical document structuring being most critical—its removal degrades single-hop performance by 26.6%, far more than removing query analysis (3.2%) or adaptive refinement (7.4%). The system scales with both model size and training data, with larger refiner models approaching full-content recall while achieving superior QA accuracy due to noise reduction. The method underperforms full-content on short, low-noise documents (PopQA), identifying the boundary condition where compression is net harmful.",
    412   "red_flags": [
    413     {
    414       "flag": "Training-retrieval corpus overlap undiscussed",
    415       "detail": "The refiner is trained on Wikipedia 2018 structural data and retrieval also uses the Wikipedia 2018 dump; baselines were not trained on this corpus, creating a potential unfair advantage that is not acknowledged."
    416     },
    417     {
    418       "flag": "No statistical significance tests",
    419       "detail": "All comparisons in Tables 2–5 lack significance tests; on ELI5 where margins are <0.3 F1 points, declared superiority is statistically unverified."
    420     },
    421     {
    422       "flag": "Ambiguous '10x' latency claim in abstract",
    423       "detail": "The abstract states '10x fewer computational costs and latency compared to the best baseline'—the 10x token reduction is vs Full Content (not a baseline), while actual latency improvement vs best baseline is ~46x, making the claim misleading."
    424     },
    425     {
    426       "flag": "No conflict of interest disclosure",
    427       "detail": "Three co-authors are from Huawei Poisson Lab, a commercial entity with interests in efficient RAG technology; no competing interests statement is provided."
    428     },
    429     {
    430       "flag": "No variance or multi-run reporting",
    431       "detail": "Only single-run results are presented; while greedy decoding eliminates sampling variance, retrieval and preprocessing variability are not characterized."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    437       "relevance": "Foundational RAG paper that establishes the paradigm LongRefiner is designed to improve"
    438     },
    439     {
    440       "title": "LongLLMLingua: Accelerating and enhancing LLMs in long context scenarios via prompt compression",
    441       "relevance": "Primary baseline and strongest prior method; LongRefiner's main comparison target with 46x latency improvement claimed"
    442     },
    443     {
    444       "title": "LLMLingua-2: Data distillation for efficient and faithful task-agnostic prompt compression",
    445       "relevance": "Key perplexity-based baseline representing recent state of the art in prompt compression"
    446     },
    447     {
    448       "title": "RECOMP: improving retrieval-augmented LMs with compression and selective augmentation",
    449       "relevance": "Representative extraction-based refinement baseline with both extractive and abstractive variants"
    450     },
    451     {
    452       "title": "LongRAG: Enhancing retrieval-augmented generation with long-context LLMs",
    453       "relevance": "Establishes the long-document retrieval paradigm (top-8 full documents, MaxP design) adopted in this work"
    454     },
    455     {
    456       "title": "LoRA: Low-rank adaptation of large language models",
    457       "relevance": "Core technique used for multi-task training across LongRefiner's three tasks on a single backbone"
    458     },
    459     {
    460       "title": "Meta-chunking: Learning efficient text segmentation via logical perception",
    461       "relevance": "Recent (2024) semantic chunking baseline that LongRefiner outperforms"
    462     },
    463     {
    464       "title": "Compressing context to enhance inference efficiency of large language models",
    465       "relevance": "Selective-Context baseline and foundational work on perplexity-based context compression that LongRefiner surpasses"
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 3,
    471       "justification": "Directly addresses a major RAG deployment bottleneck (long document inference cost) with a plug-and-play solution, released code, and 10x token reduction at competitive accuracy."
    472     },
    473     "surprise_contrarian": {
    474       "score": 1,
    475       "justification": "The finding that structural modeling outperforms perplexity-based approaches is intuitive; no surprising reversals of prior understanding or counterintuitive results are presented."
    476     },
    477     "fear_safety": {
    478       "score": 0,
    479       "justification": "No AI safety or risk concerns are raised; purely a systems efficiency paper focused on RAG cost reduction."
    480     },
    481     "drama_conflict": {
    482       "score": 0,
    483       "justification": "No controversy or conflict angle; standard benchmark comparison paper with incremental improvement framing."
    484     },
    485     "demo_ability": {
    486       "score": 2,
    487       "justification": "Code is publicly released on GitHub with implementation details sufficient for practitioners to integrate LongRefiner into their own RAG pipelines."
    488     },
    489     "brand_recognition": {
    490       "score": 1,
    491       "justification": "Renmin University of China is a respected NLP institution and Huawei Poisson Lab is notable, but neither is a top-tier AI lab like OpenAI, Google DeepMind, or Meta AI."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [
    496       {
    497         "hn_id": "44276041",
    498         "title": "Unsupervised Elicitation of Language Models",
    499         "points": 135,
    500         "comments": 24,
    501         "url": "https://news.ycombinator.com/item?id=44276041"
    502       },
    503       {
    504         "hn_id": "45970338",
    505         "title": "Show HN: RowboatX – open-source Claude Code for everyday automations",
    506         "points": 131,
    507         "comments": 42,
    508         "url": "https://news.ycombinator.com/item?id=45970338"
    509       },
    510       {
    511         "hn_id": "27355583",
    512         "title": "Recommendations and Results Organization in Netflix Search",
    513         "points": 41,
    514         "comments": 17,
    515         "url": "https://news.ycombinator.com/item?id=27355583"
    516       },
    517       {
    518         "hn_id": "27711165",
    519         "title": "Multi-Horizon Forecasting for Limit Order Books",
    520         "points": 32,
    521         "comments": 9,
    522         "url": "https://news.ycombinator.com/item?id=27711165"
    523       },
    524       {
    525         "hn_id": "40389527",
    526         "title": "How Far Are We from AGI",
    527         "points": 14,
    528         "comments": 7,
    529         "url": "https://news.ycombinator.com/item?id=40389527"
    530       },
    531       {
    532         "hn_id": "44272444",
    533         "title": "Unsupervised Elicitation of Language Models",
    534         "points": 7,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=44272444"
    537       },
    538       {
    539         "hn_id": "40484427",
    540         "title": "How Far Are We from AGI",
    541         "points": 5,
    542         "comments": 4,
    543         "url": "https://news.ycombinator.com/item?id=40484427"
    544       },
    545       {
    546         "hn_id": "44819042",
    547         "title": "Solving the compute crisis with physics-based ASICs",
    548         "points": 5,
    549         "comments": 2,
    550         "url": "https://news.ycombinator.com/item?id=44819042"
    551       },
    552       {
    553         "hn_id": "35836349",
    554         "title": "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness",
    555         "points": 3,
    556         "comments": 1,
    557         "url": "https://news.ycombinator.com/item?id=35836349"
    558       },
    559       {
    560         "hn_id": "27350075",
    561         "title": "Recommendations and Results Organization in Netflix Search",
    562         "points": 3,
    563         "comments": 0,
    564         "url": "https://news.ycombinator.com/item?id=27350075"
    565       }
    566     ],
    567     "top_points": 135,
    568     "total_points": 376,
    569     "total_comments": 106
    570   }
    571 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs