build-citation-graph.py (5210B)
#!/usr/bin/env python3
"""
Build a citation graph from cited_papers in all scan.json files.

Each cited paper is matched against the registry by arxiv_id, then doi
(case-insensitive), then normalized title; the first hit wins.
Outputs analysis/citation-graph.json with:
- nodes (every paper directory under papers/ with a readable scan.json)
- edges (unique citing → cited relationships; self-citations are dropped)
- most_cited (top 30 papers by incoming citation count)
- connected_components (groups of papers linked by citations)

Usage:
    python scripts/build-citation-graph.py
"""

import json
import re
import sys
from collections import defaultdict
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
PAPERS_DIR = ROOT / "papers"
REGISTRY = ROOT / "registry.jsonl"
OUTPUT = ROOT / "analysis" / "citation-graph.json"

# Compiled once: these run for every registry entry and every reference.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_title(title):
    """Normalize a title for fuzzy matching.

    Lower-cases the title, removes every character that is not an ASCII
    letter, digit or whitespace, and collapses whitespace runs to a single
    space so that "Foo - Bar" and "Foo Bar" compare equal.

    Returns an empty string when nothing survives (e.g. a title made only
    of punctuation or non-ASCII characters); callers must not use an empty
    result as a lookup key.
    """
    stripped = _NON_ALNUM_RE.sub("", title.lower())
    return _WHITESPACE_RE.sub(" ", stripped).strip()


def load_registry():
    """Load the registry and build lookup indices.

    Reads REGISTRY as JSON Lines (blank lines are ignored).

    Returns:
        A tuple ``(entries, by_title, by_arxiv, by_doi)`` where ``entries``
        is the list of parsed records and the three dicts map a normalized
        title, an arxiv_id and a lower-cased doi to the entry's ``id``.
        When several entries share a key, the last one in the file wins.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an entry has no ``id`` field.
    """
    entries = []
    by_title = {}
    by_arxiv = {}
    by_doi = {}

    with open(REGISTRY, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)
            entries.append(entry)
            slug = entry["id"]

            title = entry.get("title", "")
            if title:
                norm = normalize_title(title)
                # An empty key would make every unnormalizable title
                # collide on (and match) the same registry entry.
                if norm:
                    by_title[norm] = slug

            arxiv_id = entry.get("arxiv_id", "")
            if arxiv_id:
                by_arxiv[arxiv_id] = slug

            doi = entry.get("doi", "")
            if doi:
                by_doi[doi.lower()] = slug

    return entries, by_title, by_arxiv, by_doi


def find_connected_components(adjacency, all_nodes):
    """Find connected components in an undirected graph.

    Args:
        adjacency: mapping of node -> iterable of neighbouring nodes.
            Nodes without an entry are treated as isolated.
        all_nodes: iterable of nodes to start traversals from.

    Returns:
        A list of components, each a sorted list of nodes, in the order
        their first node is encountered in ``all_nodes``.
    """
    visited = set()
    components = []

    for start in all_nodes:
        if start in visited:
            continue

        # Explicit stack instead of recursion: a long citation chain would
        # otherwise exceed the interpreter's recursion limit.
        visited.add(start)
        stack = [start]
        component = []
        while stack:
            node = stack.pop()
            component.append(node)
            for neighbor in adjacency.get(node, ()):
                if neighbor not in visited:
                    visited.add(neighbor)
                    stack.append(neighbor)

        components.append(sorted(component))

    return components


def match_reference(ref, by_title, by_arxiv, by_doi):
    """Resolve one cited-paper record to a registry id.

    Tries arxiv_id, then doi (case-insensitive), then normalized title.

    Args:
        ref: one element of a scan's ``cited_papers`` list.
        by_title, by_arxiv, by_doi: indices as built by load_registry().

    Returns:
        The matching registry id, or None when ``ref`` is not a dict or
        nothing matches.
    """
    if not isinstance(ref, dict):
        return None

    arxiv_id = ref.get("arxiv_id")
    if arxiv_id:
        target = by_arxiv.get(arxiv_id)
        if target:
            return target

    doi = ref.get("doi")
    if doi and isinstance(doi, str):
        target = by_doi.get(doi.lower())
        if target:
            return target

    ref_title = ref.get("title")
    if ref_title and isinstance(ref_title, str):
        norm = normalize_title(ref_title)
        # Never look up the empty key (see normalize_title).
        if norm:
            target = by_title.get(norm)
            if target:
                return target

    return None


def main():
    """Scan every papers/*/scan.json and write the citation graph to OUTPUT."""
    _entries, by_title, by_arxiv, by_doi = load_registry()

    nodes = []
    edges = []
    seen_edges = set()
    incoming_count = defaultdict(int)

    # Undirected adjacency, used only for connected components.
    adjacency = defaultdict(set)

    for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
        slug = scan_path.parent.name
        try:
            data = json.loads(scan_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError, FileNotFoundError) as exc:
            print(f"Warning: skipping {scan_path}: {exc}", file=sys.stderr)
            continue
        if not isinstance(data, dict):
            print(
                f"Warning: skipping {scan_path}: top-level JSON is not an object",
                file=sys.stderr,
            )
            continue

        paper = data.get("paper")
        title = paper.get("title", slug) if isinstance(paper, dict) else slug
        nodes.append({"id": slug, "title": title})

        # "cited_papers" may be absent or null.
        for ref in data.get("cited_papers") or []:
            target = match_reference(ref, by_title, by_arxiv, by_doi)
            if not target or target == slug:
                continue

            # A reference list can hit the same registry paper more than
            # once; count each citing → cited pair a single time.
            pair = (slug, target)
            if pair in seen_edges:
                continue
            seen_edges.add(pair)

            edges.append({"source": slug, "target": target})
            incoming_count[target] += 1
            adjacency[slug].add(target)
            adjacency[target].add(slug)

    # Most cited
    most_cited = sorted(incoming_count.items(), key=lambda x: -x[1])[:30]
    most_cited = [
        {"slug": slug, "incoming_citations": count} for slug, count in most_cited
    ]

    # Connected components (only among papers that appear in edges)
    edge_nodes = set()
    for e in edges:
        edge_nodes.add(e["source"])
        edge_nodes.add(e["target"])
    # Sorted start order and an explicit tie-break keep the output stable
    # across runs despite string hash randomisation.
    components = find_connected_components(adjacency, sorted(edge_nodes))
    # Sort by size descending
    components.sort(key=lambda c: (-len(c), c))

    result = {
        "node_count": len(nodes),
        "edge_count": len(edges),
        "nodes": nodes,
        "edges": edges,
        "most_cited": most_cited,
        "connected_components": {
            "count": len(components),
            "largest_size": len(components[0]) if components else 0,
            "components": components[:20],  # Top 20 by size
        },
    }

    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII text, so the encoding must not
    # depend on the locale.
    OUTPUT.write_text(
        json.dumps(result, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print(f"Citation graph written to {OUTPUT}")
    print(f"  Nodes: {len(nodes)}")
    print(f"  Edges: {len(edges)}")
    print(f"  Components: {len(components)}")
    if most_cited:
        print(f"  Most cited: {most_cited[0]['slug']} ({most_cited[0]['incoming_citations']} citations)")


if __name__ == "__main__":
    main()