harvest-citations.py (5086B)
#!/usr/bin/env python3
"""
Harvest cited papers from scan.json files and propose new registry entries.

Reads all papers/*/scan.json files, collects cited_papers arrays, deduplicates
against the existing registry.jsonl, and prints proposed new entries.

Usage:
    python scripts/harvest-citations.py           # Print proposed entries
    python scripts/harvest-citations.py --apply   # Append to registry.jsonl
"""

import json
import sys
import os
from pathlib import Path
from datetime import date

ROOT = Path(__file__).resolve().parent.parent
REGISTRY_PATH = ROOT / "registry.jsonl"
PAPERS_DIR = ROOT / "papers"


def _normalize_title(title):
    """Return the lower-cased, stripped form of a title used for comparisons."""
    return title.lower().strip()


def load_registry():
    """Load existing registry entries and build lookup indexes.

    Returns:
        A tuple ``(entries, arxiv_ids, dois, titles_lower)``: the parsed
        registry rows in file order, followed by the sets of arXiv ids, DOIs
        and normalized titles found in them. All four are empty when
        registry.jsonl does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank registry line is not valid JSON.
    """
    entries = []
    arxiv_ids = set()
    dois = set()
    titles_lower = set()

    if not REGISTRY_PATH.exists():
        return entries, arxiv_ids, dois, titles_lower

    # The registry is written with ensure_ascii=False, so read it as UTF-8
    # rather than relying on the platform's default encoding.
    with open(REGISTRY_PATH, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)
            entries.append(entry)
            if entry.get("arxiv_id"):
                arxiv_ids.add(entry["arxiv_id"])
            if entry.get("doi"):
                dois.add(entry["doi"])
            # A row without a usable title must not abort the whole load.
            title = entry.get("title")
            if isinstance(title, str):
                titles_lower.add(_normalize_title(title))

    return entries, arxiv_ids, dois, titles_lower


def collect_cited_papers():
    """Read all scan.json files and collect cited_papers.

    Each collected object is tagged with a ``_cited_by`` key holding the name
    of the directory that contains the scan.json it came from.

    Returns:
        A list of cited-paper dicts; empty when the papers directory is
        missing.

    Raises:
        json.JSONDecodeError: If a scan.json file is not valid JSON.
    """
    cited = []
    if not PAPERS_DIR.exists():
        return cited

    # Sort so that results (and which paper "wins" the _cited_by credit for a
    # duplicated citation) do not depend on filesystem enumeration order.
    for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
        with open(scan_path, encoding="utf-8") as f:
            scan = json.load(f)
        paper_id = scan_path.parent.name
        # `or []` also covers an explicit `"cited_papers": null`.
        for cp in scan.get("cited_papers") or []:
            if not isinstance(cp, dict):
                # Not a citation object; nothing to tag or harvest.
                continue
            cp["_cited_by"] = paper_id
            cited.append(cp)

    return cited


def make_slug(title, year=None):
    """Generate a registry slug from a title.

    Args:
        title: Paper title.
        year: Optional publication year, appended as a ``-<year>`` suffix
            when truthy.

    Returns:
        The first five non-stopword words of the lower-cased title, joined by
        hyphens, with every character that is neither alphanumeric nor a
        hyphen removed.
    """
    words = title.lower().split()
    # Take the first few meaningful words, skipping articles and connectives.
    skip = {"a", "an", "the", "of", "in", "on", "for", "and", "with", "to", "is", "are"}
    meaningful = [w for w in words if w not in skip][:5]
    slug = "-".join(meaningful)
    # Clean non-alphanumeric chars
    slug = "".join(c if c.isalnum() or c == "-" else "" for c in slug)
    # Words that were pure punctuation leave empty segments; collapse them.
    while "--" in slug:
        slug = slug.replace("--", "-")
    slug = slug.strip("-")
    if year:
        slug = f"{slug}-{year}"
    return slug


def is_duplicate(cp, arxiv_ids, dois, titles_lower):
    """Check if a cited paper already exists in the registry.

    Args:
        cp: Cited-paper dict; ``arxiv_id``, ``doi`` and ``title`` are optional.
        arxiv_ids: Set of known arXiv ids.
        dois: Set of known DOIs.
        titles_lower: Set of known normalized titles.

    Returns:
        True if any of the paper's arXiv id, DOI or normalized title is
        already present in the corresponding set, otherwise False.
    """
    if cp.get("arxiv_id") and cp["arxiv_id"] in arxiv_ids:
        return True
    if cp.get("doi") and cp["doi"] in dois:
        return True
    if cp.get("title") and _normalize_title(cp["title"]) in titles_lower:
        return True
    return False


def make_registry_entry(cp):
    """Convert a cited_paper object into a registry entry.

    Args:
        cp: Cited-paper dict. ``title`` is required; ``year``, ``authors``,
            ``arxiv_id``, ``doi``, ``relevance`` and ``_cited_by`` are
            optional.

    Returns:
        A dict ready to be serialized as one registry.jsonl line.

    Raises:
        KeyError: If ``cp`` has no ``title``.
    """
    year = cp.get("year")
    slug = make_slug(cp["title"], year)

    # `relevance` and `_cited_by` are optional like every other field except
    # the title; their absence must not raise.
    notes = f"Citation-chased from {cp.get('_cited_by') or 'unknown'}."
    relevance = cp.get("relevance")
    if relevance:
        notes = f"{notes} {relevance}"

    entry = {
        "id": slug,
        "title": cp["title"],
        "authors": cp.get("authors", ["Unknown"]),
        "year": year or 0,
        "venue": "Unknown",
        "source": "arxiv" if cp.get("arxiv_id") else "manual",
        "status": "queued",
        "tags": [],
        "added": date.today().isoformat(),
        "notes": notes,
    }

    if cp.get("arxiv_id"):
        entry["arxiv_id"] = cp["arxiv_id"]
        entry["source_url"] = f"https://arxiv.org/abs/{cp['arxiv_id']}"
    if cp.get("doi"):
        entry["doi"] = cp["doi"]

    return entry


def main():
    """Print proposed registry entries; append them when --apply is given."""
    apply = "--apply" in sys.argv

    _, arxiv_ids, dois, titles_lower = load_registry()
    cited = collect_cited_papers()

    if not cited:
        print("No cited_papers found in any scan.json files.")
        return

    # Deduplicate cited papers against registry AND against each other
    seen_arxiv = set(arxiv_ids)
    seen_doi = set(dois)
    seen_title = set(titles_lower)
    new_entries = []
    untitled = 0

    for cp in cited:
        title = cp.get("title")
        if not isinstance(title, str) or not title.strip():
            # Neither a slug nor a registry row can be built without a title.
            untitled += 1
            continue

        if is_duplicate(cp, seen_arxiv, seen_doi, seen_title):
            continue

        new_entries.append(make_registry_entry(cp))

        # Track to avoid duplicates within this batch
        if cp.get("arxiv_id"):
            seen_arxiv.add(cp["arxiv_id"])
        if cp.get("doi"):
            seen_doi.add(cp["doi"])
        seen_title.add(_normalize_title(title))

    if untitled:
        print(f"Skipped {untitled} cited paper(s) without a title.", file=sys.stderr)

    if not new_entries:
        print("All cited papers already in registry. Nothing to add.")
        return

    print(f"Found {len(new_entries)} new paper(s) from citation chasing:\n")

    for entry in new_entries:
        print(f" {entry['id']}: {entry['title']}")

    if apply:
        # Open the registry once for the whole batch; UTF-8 is required
        # because the entries are serialized with ensure_ascii=False.
        with open(REGISTRY_PATH, "a", encoding="utf-8") as f:
            f.writelines(
                json.dumps(entry, ensure_ascii=False) + "\n" for entry in new_entries
            )
        print(f"\nAppended {len(new_entries)} entries to {REGISTRY_PATH}")
    else:
        print(f"\nDry run. Use --apply to append to {REGISTRY_PATH}")


if __name__ == "__main__":
    main()