scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27545B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "HyperGraphRAG: Retrieval-Augmented Generation with Hypergraph-Structured Knowledge Representation",
      6     "authors": [
      7       "Haoran Luo",
      8       "Haihong E",
      9       "Guanting Chen",
     10       "Yandan Zheng",
     11       "Xiaobao Wu",
     12       "Yikai Guo",
     13       "Qika Lin",
     14       "Yu Feng",
     15       "Zemin Kuang",
     16       "Meina Song",
     17       "Yifan Zhu",
     18       "Luu Anh Tuan"
     19     ],
     20     "year": 2025,
     21     "venue": "NeurIPS 2025",
     22     "arxiv_id": "2503.21322",
     23     "doi": "10.48550/arXiv.2503.21322"
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Abstract claims of outperforming standard RAG and graph-based RAG methods are supported by Table 2 results across all metrics (F1, R-S, G-E) in 5 domains. Code/data availability claim is supported by the GitHub link.",
     31         "source": "haiku"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Causal claims like 'removing hyperedge retrieval drops F1 from 35.4 to 26.4' are supported by the ablation study in Figure 4 with controlled component removal across the same experimental setup.",
     37         "source": "haiku"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The conclusion claims the framework 'addresses limitations of binary graph-based RAG methods' broadly, but all experiments use a single LLM (GPT-4o-mini) and evaluation is limited to 5 pre-selected domains; generalization to other LLMs, open-domain settings, or different embedding models is not discussed.",
     43         "source": "haiku"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No alternative explanations are discussed—e.g., that improvements may stem from hyperedges providing longer, more complete text segments rather than the n-ary structure itself, or that GPT-4o-mini's use in both construction and LLM-judge evaluation (G-E) creates systematic bias favoring HyperGraphRAG.",
     49         "source": "haiku"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "F1 word overlap is labeled 'answer accuracy' and G-E (GPT-4o-mini-as-judge) is labeled 'generation quality' without discussing what these metrics fail to capture; the LLM judge circularity (same model used for construction and evaluation) is not acknowledged.",
     55         "source": "haiku"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Appendix I contains a dedicated 'Limitations and Future Work' section with five subsections discussing multimodal, RL, federated, foundation model, and scaling directions.",
     63         "source": "haiku"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Appendix I discusses only future work directions, not threats to the validity of current results; key threats such as LLM-judge circularity, single-model generalizability, lack of statistical significance testing, and potential training data contamination are not mentioned.",
     69         "source": "haiku"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No explicit statement of what the results do NOT show; for example, there is no acknowledgment that results are limited to GPT-4o-mini, to the five tested domains, or to the specific benchmark construction methodology.",
     75         "source": "haiku"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Acknowledgments disclose NSFC grants (62473271, 62176026, 62406036) and Engineering Research Center of Information Networks, Ministry of Education, China.",
     83         "source": "haiku"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "All author affiliations are listed on the title page: BUPT, NTU, Beijing Institute of Computer Technology, NUS, China Mobile Research Institute, Beijing Anzhen Hospital.",
     89         "source": "haiku"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "NSFC and Ministry of Education are government research funders with no direct commercial stake in the HyperGraphRAG system or its comparative performance.",
     95         "source": "haiku"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears anywhere in the paper.",
    101         "source": "haiku"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Key terms are formally defined: RAG (Definition 1), graph-based RAG (Definition 2), and hypergraph (Definition 3) with mathematical notation; n-ary relation, hyperedge, and entity are all precisely defined.",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper explicitly states its contribution as 'the first hypergraph-based RAG method that represents n-ary relational facts via hyperedges,' with a complete pipeline covering construction, retrieval, and generation.",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Related work section compares directly with GraphRAG, LightRAG, PathRAG, HippoRAG2, and hypergraph embedding methods, explaining how each is constrained to binary relations and how HyperGraphRAG addresses this gap.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "empirical": {
    127       "artifacts": {
    128         "code_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Abstract and footnote both state 'Our data and code are publicly available' with a GitHub link (https://github.com/LHRLAB/HyperGraphRAG).",
    132           "source": "haiku"
    133         },
    134         "data_released": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "The paper uses UltraDomain (publicly available) and 2024 ESC hypertension guidelines; the evaluation QA datasets are claimed available via GitHub alongside the code.",
    138           "source": "haiku"
    139         },
    140         "environment_specified": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "Hardware specs (80-core CPU, 512GB RAM) and model names (GPT-4o-mini, text-embedding-3-small) are mentioned, but no requirements.txt, Dockerfile, or dependency specification is provided.",
    144           "source": "haiku"
    145         },
    146         "reproduction_instructions": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Pseudocode (Algorithms 1 and 2) and hyperparameter tables are provided, but no step-by-step reproduction guide explaining how to run the pipeline end-to-end is included in the paper.",
    150           "source": "haiku"
    151         }
    152       },
    153       "statistical_methodology": {
    154         "confidence_intervals_or_error_bars": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "All results in Table 2 and Figures 4, 6, 7 are point estimates with no confidence intervals or error bars reported.",
    158           "source": "haiku"
    159         },
    160         "significance_tests": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "No statistical significance tests are applied to any comparative claims; all comparisons are raw point estimates.",
    164           "source": "haiku"
    165         },
    166         "effect_sizes_reported": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "Absolute improvements over baselines are reported (e.g., '+7.45 F1, +7.62 R-S, +3.69 G-E' vs StandardRAG) with baseline context, enabling meaningful interpretation of effect magnitude.",
    170           "source": "haiku"
    171         },
    172         "sample_size_justified": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "512 questions per domain (256 binary, 256 n-ary) are used without justification for this choice or any power analysis.",
    176           "source": "haiku"
    177         },
    178         "variance_reported": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "No variance, standard deviation, or spread across runs is reported for any experiment; all results are single-run point estimates.",
    182           "source": "haiku"
    183         }
    184       },
    185       "evaluation_design": {
    186         "baselines_included": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Six baselines are included: NaiveGeneration, StandardRAG, GraphRAG, LightRAG, PathRAG, and HippoRAG2, covering retrieval-free, chunk-based, and four graph-based approaches.",
    190           "source": "haiku"
    191         },
    192         "baselines_contemporary": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Baselines include 2024-2025 methods: GraphRAG (2024), LightRAG (2024), PathRAG (2025), HippoRAG2 (2025); these are competitive and recent.",
    196           "source": "haiku"
    197         },
    198         "ablation_study": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Figure 4 presents an ablation in the Medicine domain removing entity retrieval (w/o ER), hyperedge retrieval (w/o HR), chunk retrieval (w/o CR), and combinations.",
    202           "source": "haiku"
    203         },
    204         "multiple_metrics": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Three complementary metrics are used: F1 (word overlap), Retrieval Similarity (semantic similarity to ground-truth knowledge), and Generation Evaluation (LLM-judged across 7 dimensions).",
    208           "source": "haiku"
    209         },
    210         "human_evaluation": {
    211           "applies": true,
    212           "answer": false,
    213           "justification": "Generation quality (G-E) is assessed by GPT-4o-mini as judge, not human evaluators; human annotators were used only for verifying QA dataset correctness, not for evaluating system outputs.",
    214           "source": "haiku"
    215         },
    216         "held_out_test_set": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "The QA benchmark (512 questions per domain, verified by human annotators) serves as the held-out evaluation set; the RAG systems do not train on these questions.",
    220           "source": "haiku"
    221         },
    222         "per_category_breakdown": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Table 2 provides per-domain breakdowns (Medicine, Agriculture, CS, Legal, Mix) and per-question-type breakdowns (Binary Source, N-ary Source) for all metrics.",
    226           "source": "haiku"
    227         },
    228         "failure_cases_discussed": {
    229           "applies": true,
    230           "answer": false,
    231           "justification": "The case study (Appendix H) shows only a success case for HyperGraphRAG; no failure cases or examples where the method underperforms are presented.",
    232           "source": "haiku"
    233         },
    234         "negative_results_reported": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "The paper explicitly reports that existing graph-based RAG methods (GraphRAG, LightRAG, PathRAG, HippoRAG2) often underperform StandardRAG due to knowledge fragmentation from binary relations—a meaningful negative finding about prior work.",
    238           "source": "haiku"
    239         }
    240       },
    241       "setup_transparency": {
    242         "model_versions_specified": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "'GPT-4o-mini' and 'text-embedding-3-small' are named but no snapshot dates or version identifiers are provided; OpenAI updates models under the same name.",
    246           "source": "haiku"
    247         },
    248         "prompts_provided": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Full prompts are included in Appendix A: n-ary relation extraction prompt (Fig 8), entity extraction prompt (Fig 9), generation prompt (Fig 10), and all 7 G-E evaluation dimension prompts (Figs 11-12).",
    252           "source": "haiku"
    253         },
    254         "hyperparameters_reported": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "Retrieval hyperparameters (kV=60, τV=50, kH=60, τH=5, kC=5, τC=0.5) and generation hyperparameters (temperature=1.0, max 32k tokens, 16 parallel cores) are reported in Table 4 and Section 5.1.",
    258           "source": "haiku"
    259         },
    260         "scaffolding_described": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "The full pipeline is described with formal definitions (Equations 4-12), two pseudocode algorithms (construction and retrieval/generation), and detailed descriptions of bidirectional expansion, bipartite storage, and hybrid generation.",
    264           "source": "haiku"
    265         },
    266         "data_preprocessing_documented": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Appendix D describes the question dataset construction including hop-based sampling strategy, binary vs. n-ary categorization, GPT-based question generation, and manual verification; the n-ary extraction pipeline is documented via prompts and algorithms.",
    270           "source": "haiku"
    271         }
    272       },
    273       "data_integrity": {
    274         "raw_data_available": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "The paper states 'Our data and code are publicly available' via GitHub; UltraDomain is a public dataset and the generated QA pairs are claimed to be released.",
    278           "source": "haiku"
    279         },
    280         "data_collection_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Appendix D describes data sources (UltraDomain, ESC guidelines), per-domain question counts (512 each), hop-based sampling (1-hop: 128, 2-hop: 64, 3-hop: 64), GPT generation, and manual verification.",
    284           "source": "haiku"
    285         },
    286         "recruitment_methods_described": {
    287           "applies": false,
    288           "answer": false,
    289           "justification": "No human participants are recruited; the evaluation uses constructed QA benchmarks from existing knowledge sources, making this criterion not applicable.",
    290           "source": "haiku"
    291         },
    292         "data_pipeline_documented": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "The full pipeline from raw documents to hypergraph construction (Algorithm 1), QA benchmark creation (Appendix D), retrieval (Algorithm 2), and evaluation (Appendix E) is documented end-to-end.",
    296           "source": "haiku"
    297         }
    298       },
    299       "contamination": {
    300         "training_cutoff_stated": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "GPT-4o-mini's training cutoff is not stated; this is relevant because the NaiveGeneration baseline can answer some questions (F1 ~13-23) without retrieval, suggesting training data overlap with the tested domains.",
    304           "source": "haiku"
    305         },
    306         "train_test_overlap_discussed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No discussion of whether GPT-4o-mini may have seen the hypertension guidelines (2024 ESC) or UltraDomain content during pretraining, despite NaiveGeneration achieving non-trivial F1 scores without any retrieval.",
    310           "source": "haiku"
    311         },
    312         "benchmark_contamination_addressed": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "The 2024 ESC hypertension guidelines and UltraDomain datasets are public documents that could be in GPT-4o-mini's training data; this is not discussed, potentially confounding the NaiveGeneration baseline comparison.",
    316           "source": "haiku"
    317         }
    318       },
    319       "human_studies": {
    320         "pre_registered": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants; human annotators verified dataset quality but were not study subjects.",
    324           "source": "haiku"
    325         },
    326         "irb_or_ethics_approval": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human subjects research; not applicable.",
    330           "source": "haiku"
    331         },
    332         "demographics_reported": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants; not applicable.",
    336           "source": "haiku"
    337         },
    338         "inclusion_exclusion_criteria": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants; not applicable.",
    342           "source": "haiku"
    343         },
    344         "randomization_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants; not applicable.",
    348           "source": "haiku"
    349         },
    350         "blinding_described": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants; not applicable.",
    354           "source": "haiku"
    355         },
    356         "attrition_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "No human participants; not applicable.",
    360           "source": "haiku"
    361         }
    362       },
    363       "cost_and_practicality": {
    364         "inference_cost_reported": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Table 3 reports construction cost ($0.0063/1k tokens) and generation cost ($3.184/1k queries) with comparison to all baselines, enabling practitioners to estimate deployment costs.",
    368           "source": "haiku"
    369         },
    370         "compute_budget_stated": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "Per-query and per-token costs are reported, but total compute budget for the full set of experiments (5 domains × 512 questions × 7 methods) is not stated.",
    374           "source": "haiku"
    375         }
    376       }
    377     }
    378   },
    379   "claims": [
    380     {
    381       "claim": "HyperGraphRAG outperforms StandardRAG by +7.45 F1, +7.62 R-S, and +3.69 G-E averaged across domains.",
    382       "evidence": "Table 2 overall results show HyperGraphRAG scores (e.g., Medicine: 35.35 F1 vs. 27.90 for StandardRAG); gains are consistent across all 5 domains.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Existing binary graph-based RAG methods (GraphRAG, LightRAG, PathRAG, HippoRAG2) often underperform StandardRAG due to knowledge fragmentation.",
    387       "evidence": "Table 2 confirms this pattern: e.g., in Medicine, LightRAG achieves 12.79 F1 vs. StandardRAG's 27.90; paper attributes this to binary relation limitations.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Hyperedge retrieval is the most critical component, with its removal causing the largest F1 drop (35.4 → 26.4) compared to entity or chunk retrieval removal.",
    392       "evidence": "Figure 4 ablation in Medicine domain shows w/o HR → 26.4 F1, w/o ER → 29.8 F1, w/o CR → 29.2 F1.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "HyperGraphRAG constructs ~5x more relational units than LightRAG (e.g., 26,902 hyperedges vs. 5,632 relations in CS domain).",
    397       "evidence": "Figure 5(f) statistical comparison table shows counts for all 5 domains; HyperGraphRAG consistently produces more entities and hyperedges than competitors.",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "HyperGraphRAG achieves superior performance even under constrained retrieval length limits, demonstrating more information-efficient retrieval.",
    402       "evidence": "Figure 6(b) shows HyperGraphRAG outperforms all methods across all retrieval length limits (32 to full), achieving 35.4 F1 at full vs. next best 29.4 for StandardRAG.",
    403       "supported": "moderate"
    404     },
    405     {
    406       "claim": "HyperGraphRAG achieves favorable cost-quality tradeoff with $3.184/1k queries (lower than LightRAG $3.359 and PathRAG $3.496) while outperforming them.",
    407       "evidence": "Table 3 shows HyperGraphRAG generation cost is lower than LightRAG and PathRAG while Table 2 shows higher performance; comparison is direct.",
    408       "supported": "moderate"
    409     }
    410   ],
    411   "methodology_tags": [
    412     "benchmark-eval",
    413     "case-study"
    414   ],
    415   "key_findings": "HyperGraphRAG proposes hyperedge-based knowledge representation for RAG, enabling modeling of n-ary (3+ entity) relations that binary graphs cannot capture without information loss. Experiments across 5 domains (Medicine, Agriculture, CS, Legal, Mix) with 2,560 QA pairs show consistent improvements over 6 baselines on F1, retrieval similarity, and LLM-judged generation quality. Unexpectedly, existing binary graph-based RAG methods (GraphRAG, LightRAG, PathRAG, HippoRAG2) underperform standard chunk-based RAG—attributed to knowledge fragmentation from forcing n-ary facts into binary triples. An ablation study confirms hyperedge retrieval is the most critical component, contributing more than entity or chunk retrieval alone.",
    416   "red_flags": [
    417     {
    418       "flag": "LLM-judge circularity",
    419       "detail": "GPT-4o-mini is used for both hypergraph construction (n-ary relation extraction) and as the judge in the G-E generation quality metric; this creates systematic bias since the same model's output style and knowledge preferences influence both what is stored and how generation is scored."
    420     },
    421     {
    422       "flag": "No statistical significance testing",
    423       "detail": "All comparative claims across 7 methods (6 baselines plus HyperGraphRAG) and 5 domains are based on single-run point estimates with no confidence intervals, error bars, or significance tests. Differences as small as 0.5 F1 points are presented without any measure of reliability."
    424     },
    425     {
    426       "flag": "Single LLM evaluation",
    427       "detail": "All experiments use GPT-4o-mini exclusively for extraction, generation, and evaluation. Generalizability to other LLMs (open-source or different families) is untested and unacknowledged as a limitation."
    428     },
    429     {
    430       "flag": "Training data contamination unaddressed",
    431       "detail": "The 2024 ESC hypertension guidelines and UltraDomain are public documents likely in GPT-4o-mini's training data. NaiveGeneration achieves F1 of 12-22 across domains without any retrieval, yet the paper never discusses whether training data overlap inflates or deflates baseline comparisons."
    432     },
    433     {
    434       "flag": "No variance across runs",
    435       "detail": "No experiments are repeated with different seeds; all results are single runs with LLM temperature=1.0, meaning reported numbers could vary substantially across runs."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "From Local to Global: A Graph RAG Approach to Query-Focused Summarization",
    441       "relevance": "Primary baseline and inspiration; GraphRAG is the first graph-based RAG method being directly compared and improved upon."
    442     },
    443     {
    444       "title": "LightRAG: Simple and Fast Retrieval-Augmented Generation",
    445       "relevance": "Key baseline that adds entity+relation graph indexing to RAG; directly compared in all experiments."
    446     },
    447     {
    448       "title": "PathRAG: Pruning Graph-Based Retrieval Augmented Generation with Relational Paths",
    449       "relevance": "Contemporary graph-based RAG baseline using path pruning for retrieval."
    450     },
    451     {
    452       "title": "From RAG to Memory: Non-Parametric Continual Learning for Large Language Models (HippoRAG2)",
    453       "relevance": "Most recent graph-based RAG baseline using Personalized PageRank for retrieval."
    454     },
    455     {
    456       "title": "Retrieval-Augmented Generation for Large Language Models: A Survey",
    457       "relevance": "Provides survey context for the RAG landscape that HyperGraphRAG situates itself within."
    458     },
    459     {
    460       "title": "RAGAs: Automated Evaluation of Retrieval Augmented Generation",
    461       "relevance": "Source of the Retrieval Similarity (R-S) metric used in evaluation."
    462     },
    463     {
    464       "title": "FlashRAG: A Modular Toolkit for Efficient Retrieval-Augmented Generation Research",
    465       "relevance": "Source of the F1 evaluation metric used in the paper."
    466     },
    467     {
    468       "title": "Text2NKG: Fine-Grained N-ary Relation Extraction for N-ary Relational Knowledge Graph Construction",
    469       "relevance": "Directly relevant prior work on n-ary relation extraction from the same lead author group."
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 3,
    475       "justification": "RAG is widely deployed in production; the paper provides code, cost analysis, and a complete pipeline that practitioners can directly adopt."
    476     },
    477     "surprise_contrarian": {
    478       "score": 2,
    479       "justification": "The finding that all four prior graph-based RAG methods underperform plain StandardRAG is genuinely surprising and challenges assumptions about graph RAG superiority."
    480     },
    481     "fear_safety": {
    482       "score": 0,
    483       "justification": "No AI safety concerns raised; the paper is purely a system improvement paper."
    484     },
    485     "drama_conflict": {
    486       "score": 0,
    487       "justification": "No controversy or replication challenge; straightforward benchmark comparison paper."
    488     },
    489     "demo_ability": {
    490       "score": 2,
    491       "justification": "Code is publicly available on GitHub with the full pipeline; practitioners can run it on their own documents with GPT-4o-mini API access."
    492     },
    493     "brand_recognition": {
    494       "score": 2,
    495       "justification": "Published at NeurIPS 2025 (top venue); affiliations include NTU and NUS (well-known Asian research institutions)."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [
    500       {
    501         "hn_id": "43676837",
    502         "title": "NoProp: Training neural networks without back-propagation or forward-propagation",
    503         "points": 161,
    504         "comments": 49,
    505         "url": "https://news.ycombinator.com/item?id=43676837"
    506       },
    507       {
    508         "hn_id": "44747954",
    509         "title": "Fluidically Innervated Lattices: 3-D-printed tactile sensors for soft robots",
    510         "points": 3,
    511         "comments": 1,
    512         "url": "https://news.ycombinator.com/item?id=44747954"
    513       },
    514       {
    515         "hn_id": "43821040",
    516         "title": "A Coordination Framework of Small LLMs Matches Large LLMs in Data Synthesis",
    517         "points": 2,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=43821040"
    520       },
    521       {
    522         "hn_id": "47515635",
    523         "title": "Generalized Discrete Diffusion from Snapshots",
    524         "points": 1,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=47515635"
    527       },
    528       {
    529         "hn_id": "42816121",
    530         "title": "People Reduce Workers' Compensation for Using Artificial Intelligence [pdf]",
    531         "points": 1,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=42816121"
    534       }
    535     ],
    536     "top_points": 161,
    537     "total_points": 168,
    538     "total_comments": 50
    539   }
    540 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs