enrich-hn.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

enrich-hn.py (5388B)
      1 #!/usr/bin/env python3
      2 """
      3 Enrich papers with Hacker News discussion data.
      4 
      5 For each v2-scanned paper, searches HN Algolia API for submissions
      6 matching the arxiv ID (URL match) or paper title (text match fallback).
      7 Saves results to papers/{slug}/hn.json.
      8 
      9 Usage:
     10     python3 scripts/enrich-hn.py              # All v2 papers without hn.json
     11     python3 scripts/enrich-hn.py --force      # Re-fetch all
     12     python3 scripts/enrich-hn.py --limit 50   # First N only
     13 """
     14 
     15 import json
     16 import sys
     17 import urllib.request
     18 import urllib.parse
     19 from pathlib import Path
     20 
     21 ROOT = Path(__file__).resolve().parent.parent
     22 PAPERS_DIR = ROOT / "papers"
     23 
     24 HN_API = "https://hn.algolia.com/api/v1/search"
     25 
     26 
     27 def search_hn(query, tags="story", hits=10):
     28     """Search HN Algolia API. Returns list of hits."""
     29     params = urllib.parse.urlencode({
     30         "query": query,
     31         "tags": tags,
     32         "hitsPerPage": hits,
     33     })
     34     url = f"{HN_API}?{params}"
     35     try:
     36         req = urllib.request.Request(url, headers={"User-Agent": "research-survey/1.0"})
     37         resp = urllib.request.urlopen(req, timeout=15)
     38         data = json.loads(resp.read())
     39         return data.get("hits", [])
     40     except Exception as e:
     41         print(f"    API error: {e}")
     42         return []
     43 
     44 
     45 def extract_threads(hits):
     46     """Extract thread data from HN hits."""
     47     threads = []
     48     seen_ids = set()
     49     for h in hits:
     50         oid = h.get("objectID")
     51         if oid in seen_ids:
     52             continue
     53         seen_ids.add(oid)
     54         threads.append({
     55             "hn_id": oid,
     56             "title": h.get("title", ""),
     57             "points": h.get("points", 0) or 0,
     58             "comments": h.get("num_comments", 0) or 0,
     59             "url": f"https://news.ycombinator.com/item?id={oid}",
     60             "created_at": h.get("created_at", ""),
     61         })
     62     threads.sort(key=lambda t: -t["points"])
     63     return threads
     64 
     65 
     66 def main():
     67     args = sys.argv[1:]
     68     force = "--force" in args
     69     limit = None
     70     for i, arg in enumerate(args):
     71         if arg == "--limit" and i + 1 < len(args):
     72             limit = int(args[i + 1])
     73 
     74     # Collect v2 papers
     75     papers = []
     76     for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
     77         with open(scan_path) as f:
     78             s = json.load(f)
     79         if s.get("scan_version", 1) < 2:
     80             continue
     81         pid = scan_path.parent.name
     82         hn_path = scan_path.parent / "hn.json"
     83         if hn_path.exists() and not force:
     84             continue
     85         papers.append({
     86             "id": pid,
     87             "arxiv_id": s.get("paper", {}).get("arxiv_id", ""),
     88             "title": s.get("paper", {}).get("title", ""),
     89         })
     90 
     91     if limit:
     92         papers = papers[:limit]
     93 
     94     if not papers:
     95         print("No papers to enrich.")
     96         return
     97 
     98     print(f"Enriching {len(papers)} papers with HN data...\n")
     99 
    100     stats = {"arxiv_match": 0, "title_match": 0, "not_found": 0, "total_threads": 0}
    101 
    102     for i, p in enumerate(papers):
    103         threads = []
    104 
    105         # Strategy 1: search by arxiv ID (matches URL submissions)
    106         if p["arxiv_id"]:
    107             hits = search_hn(p["arxiv_id"])
    108             threads = extract_threads(hits)
    109 
    110         # Strategy 2: title search fallback if no arxiv match
    111         if not threads and p["title"]:
    112             # Use quoted title for exact-ish matching, truncated to avoid API issues
    113             title_query = p["title"][:80]
    114             hits = search_hn(title_query)
    115             # Filter to hits whose title has significant word overlap with our title
    116             title_words = set(p["title"].lower().split())
    117             filtered = []
    118             for h in hits:
    119                 hn_words = set((h.get("title", "") or "").lower().split())
    120                 overlap = len(title_words & hn_words)
    121                 if overlap >= min(3, len(title_words) // 2):
    122                     filtered.append(h)
    123             if filtered:
    124                 threads = extract_threads(filtered)
    125                 if threads:
    126                     stats["title_match"] += 1
    127             if not threads:
    128                 stats["not_found"] += 1
    129         elif not threads:
    130             stats["not_found"] += 1
    131 
    132         if threads and p["arxiv_id"]:
    133             stats["arxiv_match"] += 1
    134 
    135         stats["total_threads"] += len(threads)
    136 
    137         # Write hn.json
    138         hn_data = {
    139             "threads": threads,
    140             "top_points": threads[0]["points"] if threads else 0,
    141             "total_points": sum(t["points"] for t in threads),
    142             "total_comments": sum(t["comments"] for t in threads),
    143         }
    144         hn_path = PAPERS_DIR / p["id"] / "hn.json"
    145         with open(hn_path, "w") as f:
    146             json.dump(hn_data, f, ensure_ascii=False, indent=2)
    147 
    148         if threads:
    149             top = threads[0]
    150             print(f"  [{i+1:3d}/{len(papers)}] {p['id'][:45]:45s} {top['points']:>4d}pts {hn_data['total_comments']:>4d}cmt {len(threads)}t")
    151         else:
    152             pass  # silent for not-found to keep output clean
    153 
    154         # Print progress every 100
    155         if (i + 1) % 100 == 0:
    156             print(f"  ... {i+1}/{len(papers)} done")
    157 
    158     print(f"\nDone. {len(papers)} papers processed.")
    159     print(f"  Matched via arxiv_id: {stats['arxiv_match']}")
    160     print(f"  Matched via title: {stats['title_match']}")
    161     print(f"  Not found: {stats['not_found']}")
    162     print(f"  Total HN threads: {stats['total_threads']}")
    163 
    164 
# Script entry point: run the enrichment only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs