enrich-hn.py (5388B)
#!/usr/bin/env python3
"""
Enrich papers with Hacker News discussion data.

For each v2-scanned paper, searches HN Algolia API for submissions
matching the arxiv ID (URL match) or paper title (text match fallback).
Saves results to papers/{slug}/hn.json.

A paper whose lookup fails (network/API error) is NOT written, so it is
picked up again on the next run instead of being recorded as "not found".

Usage:
    python3 scripts/enrich-hn.py              # All v2 papers without hn.json
    python3 scripts/enrich-hn.py --force      # Re-fetch all
    python3 scripts/enrich-hn.py --limit 50   # First N only
"""

import json
import sys
import urllib.request
import urllib.parse
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
PAPERS_DIR = ROOT / "papers"

HN_API = "https://hn.algolia.com/api/v1/search"


class HNSearchError(Exception):
    """Raised when an HN Algolia request fails or its response is unusable."""


def _fetch_hits(query, tags="story", hits=10):
    """Query the HN Algolia API and return the list of hits.

    Raises:
        HNSearchError: on any network, HTTP, or response-parsing failure.
    """
    params = urllib.parse.urlencode({
        "query": query,
        "tags": tags,
        "hitsPerPage": hits,
    })
    url = f"{HN_API}?{params}"
    req = urllib.request.Request(url, headers={"User-Agent": "research-survey/1.0"})
    try:
        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
        # "hits" may be absent or null; normalise both to an empty list.
        return data.get("hits") or []
    except Exception as e:
        # Boundary translation: callers only need to know the lookup failed.
        raise HNSearchError(str(e)) from e


def search_hn(query, tags="story", hits=10):
    """Search HN Algolia API. Returns list of hits.

    Best-effort wrapper: on any API failure the error is printed and an
    empty list is returned.
    """
    try:
        return _fetch_hits(query, tags=tags, hits=hits)
    except HNSearchError as e:
        print(f" API error: {e}")
        return []


def extract_threads(hits):
    """Extract thread data from HN hits.

    Hits are de-duplicated by ``objectID`` (first occurrence wins) and the
    result is sorted by points, highest first. Missing or null ``points`` /
    ``num_comments`` are treated as 0; missing or null ``title`` /
    ``created_at`` become "".
    """
    threads = []
    seen_ids = set()
    for h in hits:
        oid = h.get("objectID")
        if oid in seen_ids:
            continue
        seen_ids.add(oid)
        threads.append({
            "hn_id": oid,
            "title": h.get("title") or "",
            "points": h.get("points", 0) or 0,
            "comments": h.get("num_comments", 0) or 0,
            "url": f"https://news.ycombinator.com/item?id={oid}",
            "created_at": h.get("created_at") or "",
        })
    # list.sort is stable, so ties keep their original (API) order.
    threads.sort(key=lambda t: t["points"], reverse=True)
    return threads


def filter_hits_by_title(hits, title):
    """Return the hits whose title shares enough words with *title*.

    The required overlap is half the paper title's distinct words, capped
    at 3 and never less than 1. The lower bound matters: without it a
    one-word title yields a threshold of 0 and every hit would "match".
    """
    title_words = set(title.lower().split())
    threshold = max(1, min(3, len(title_words) // 2))
    matched = []
    for h in hits:
        hn_words = set((h.get("title") or "").lower().split())
        if len(title_words & hn_words) >= threshold:
            matched.append(h)
    return matched


def find_threads(paper, search=None):
    """Look up HN threads for one paper.

    Args:
        paper: dict with "arxiv_id" and "title" keys.
        search: callable taking a query string and returning a list of
            hits; defaults to the HN Algolia API. It may raise
            HNSearchError, which is propagated to the caller.

    Returns:
        ``(threads, source)`` where source is exactly one of
        "arxiv_match", "title_match" or "not_found".
    """
    if search is None:
        search = _fetch_hits

    # Strategy 1: search by arxiv ID (matches URL submissions).
    if paper["arxiv_id"]:
        threads = extract_threads(search(paper["arxiv_id"]))
        if threads:
            return threads, "arxiv_match"

    # Strategy 2: title search fallback if no arxiv match.
    if paper["title"]:
        # Query is truncated to 80 chars to avoid API issues; the overlap
        # filter below still compares against the full title.
        hits = search(paper["title"][:80])
        threads = extract_threads(filter_hits_by_title(hits, paper["title"]))
        if threads:
            return threads, "title_match"

    return [], "not_found"


def _collect_papers(force):
    """Return the v2-scanned papers that still need an hn.json.

    Papers that already have hn.json are skipped unless *force* is true.
    """
    papers = []
    for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
        with open(scan_path, encoding="utf-8") as f:
            s = json.load(f)
        if s.get("scan_version", 1) < 2:
            continue
        hn_path = scan_path.parent / "hn.json"
        if hn_path.exists() and not force:
            continue
        meta = s.get("paper") or {}
        papers.append({
            "id": scan_path.parent.name,
            "arxiv_id": meta.get("arxiv_id", ""),
            "title": meta.get("title", ""),
        })
    return papers


def _summarize(threads):
    """Build the hn.json payload for a (points-sorted) list of threads."""
    return {
        "threads": threads,
        "top_points": threads[0]["points"] if threads else 0,
        "total_points": sum(t["points"] for t in threads),
        "total_comments": sum(t["comments"] for t in threads),
    }


def main():
    """CLI entry point: enrich every pending paper and print a summary."""
    args = sys.argv[1:]
    force = "--force" in args
    limit = None
    for i, arg in enumerate(args):
        if arg == "--limit" and i + 1 < len(args):
            limit = int(args[i + 1])

    papers = _collect_papers(force)

    if limit:
        papers = papers[:limit]

    if not papers:
        print("No papers to enrich.")
        return

    print(f"Enriching {len(papers)} papers with HN data...\n")

    stats = {"arxiv_match": 0, "title_match": 0, "not_found": 0, "total_threads": 0}
    errors = 0

    for i, p in enumerate(papers):
        try:
            threads, source = find_threads(p)
        except HNSearchError as e:
            # Do not write hn.json: an empty file would mark the paper as
            # done and hide it from future runs (or clobber good data
            # under --force).
            print(f" API error: {e}")
            errors += 1
        else:
            # Each paper is counted under exactly one outcome.
            stats[source] += 1
            stats["total_threads"] += len(threads)

            hn_data = _summarize(threads)
            hn_path = PAPERS_DIR / p["id"] / "hn.json"
            # ensure_ascii=False emits raw non-ASCII, so pin the encoding.
            with open(hn_path, "w", encoding="utf-8") as f:
                json.dump(hn_data, f, ensure_ascii=False, indent=2)

            # Silent for not-found to keep output clean.
            if threads:
                top = threads[0]
                print(f" [{i+1:3d}/{len(papers)}] {p['id'][:45]:45s} {top['points']:>4d}pts {hn_data['total_comments']:>4d}cmt {len(threads)}t")

        # Print progress every 100
        if (i + 1) % 100 == 0:
            print(f" ... {i+1}/{len(papers)} done")

    print(f"\nDone. {len(papers)} papers processed.")
    print(f" Matched via arxiv_id: {stats['arxiv_match']}")
    print(f" Matched via title: {stats['title_match']}")
    print(f" Not found: {stats['not_found']}")
    print(f" Total HN threads: {stats['total_threads']}")
    if errors:
        print(f" API errors (not saved, retried next run): {errors}")


if __name__ == "__main__":
    main()