build-citation-graph.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

build-citation-graph.py (5210B)
      1 #!/usr/bin/env python3
      2 """
      3 Build a citation graph from cited_papers in all scan.json files.
      4 
      5 Matches cited papers against registry entries by title (case-insensitive),
      6 arxiv_id, or doi. Outputs analysis/citation-graph.json with:
      7 - nodes (every paper under papers/ that has a readable scan.json)
      8 - edges (citing → cited relationships)
      9 - most_cited (top papers by incoming citation count)
     10 - connected_components (groups of papers linked by citations)
     11 
     12 Usage:
     13     python scripts/build-citation-graph.py
     14 """
     15 
     16 import json
     17 import re
     18 from collections import defaultdict
     19 from pathlib import Path
     20 
     21 ROOT = Path(__file__).resolve().parent.parent
     22 PAPERS_DIR = ROOT / "papers"
     23 REGISTRY = ROOT / "registry.jsonl"
     24 OUTPUT = ROOT / "analysis" / "citation-graph.json"
     25 
     26 
     27 def normalize_title(title):
     28     """Normalize title for fuzzy matching."""
     29     return re.sub(r'[^a-z0-9\s]', '', title.lower()).strip()
     30 
     31 
     32 def load_registry():
     33     """Load registry and build lookup indices."""
     34     entries = []
     35     by_title = {}
     36     by_arxiv = {}
     37     by_doi = {}
     38 
     39     with open(REGISTRY) as f:
     40         for line in f:
     41             line = line.strip()
     42             if not line:
     43                 continue
     44             entry = json.loads(line)
     45             entries.append(entry)
     46             slug = entry["id"]
     47 
     48             title = entry.get("title", "")
     49             if title:
     50                 by_title[normalize_title(title)] = slug
     51 
     52             arxiv_id = entry.get("arxiv_id", "")
     53             if arxiv_id:
     54                 by_arxiv[arxiv_id] = slug
     55 
     56             doi = entry.get("doi", "")
     57             if doi:
     58                 by_doi[doi.lower()] = slug
     59 
     60     return entries, by_title, by_arxiv, by_doi
     61 
     62 
     63 def find_connected_components(adjacency, all_nodes):
     64     """Find connected components in an undirected graph."""
     65     visited = set()
     66     components = []
     67 
     68     def dfs(node, component):
     69         visited.add(node)
     70         component.append(node)
     71         for neighbor in adjacency.get(node, []):
     72             if neighbor not in visited:
     73                 dfs(neighbor, component)
     74 
     75     for node in all_nodes:
     76         if node not in visited:
     77             component = []
     78             dfs(node, component)
     79             components.append(sorted(component))
     80 
     81     return components
     82 
     83 
     84 def main():
     85     entries, by_title, by_arxiv, by_doi = load_registry()
     86 
     87     nodes = []
     88     edges = []
     89     incoming_count = defaultdict(int)
     90 
     91     # Build undirected adjacency for connected components
     92     adjacency = defaultdict(set)
     93     scanned_slugs = set()
     94 
     95     for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
     96         slug = scan_path.parent.name
     97         try:
     98             data = json.loads(scan_path.read_text())
     99         except (json.JSONDecodeError, FileNotFoundError):
    100             continue
    101 
    102         scanned_slugs.add(slug)
    103         title = data.get("paper", {}).get("title", slug)
    104         nodes.append({"id": slug, "title": title})
    105 
    106         cited = data.get("cited_papers", [])
    107         for ref in cited:
    108             target = None
    109 
    110             # Match by arxiv_id
    111             arxiv_id = ref.get("arxiv_id", "")
    112             if arxiv_id and arxiv_id in by_arxiv:
    113                 target = by_arxiv[arxiv_id]
    114 
    115             # Match by doi
    116             if not target:
    117                 doi = ref.get("doi", "")
    118                 if doi and doi.lower() in by_doi:
    119                     target = by_doi[doi.lower()]
    120 
    121             # Match by title
    122             if not target:
    123                 ref_title = ref.get("title", "")
    124                 if ref_title:
    125                     norm = normalize_title(ref_title)
    126                     if norm in by_title:
    127                         target = by_title[norm]
    128 
    129             if target and target != slug:
    130                 edges.append({"source": slug, "target": target})
    131                 incoming_count[target] += 1
    132                 adjacency[slug].add(target)
    133                 adjacency[target].add(slug)
    134 
    135     # Most cited
    136     most_cited = sorted(incoming_count.items(), key=lambda x: -x[1])[:30]
    137     most_cited = [{"slug": slug, "incoming_citations": count} for slug, count in most_cited]
    138 
    139     # Connected components (only among scanned papers that appear in edges)
    140     edge_nodes = set()
    141     for e in edges:
    142         edge_nodes.add(e["source"])
    143         edge_nodes.add(e["target"])
    144     components = find_connected_components(adjacency, edge_nodes)
    145     # Sort by size descending
    146     components.sort(key=len, reverse=True)
    147 
    148     result = {
    149         "node_count": len(nodes),
    150         "edge_count": len(edges),
    151         "nodes": nodes,
    152         "edges": edges,
    153         "most_cited": most_cited,
    154         "connected_components": {
    155             "count": len(components),
    156             "largest_size": len(components[0]) if components else 0,
    157             "components": components[:20],  # Top 20 by size
    158         },
    159     }
    160 
    161     OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    162     OUTPUT.write_text(json.dumps(result, indent=2, ensure_ascii=False) + "\n")
    163     print(f"Citation graph written to {OUTPUT}")
    164     print(f"  Nodes: {len(nodes)}")
    165     print(f"  Edges: {len(edges)}")
    166     print(f"  Components: {len(components)}")
    167     if most_cited:
    168         print(f"  Most cited: {most_cited[0]['slug']} ({most_cited[0]['incoming_citations']} citations)")
    169 
    170 
    171 if __name__ == "__main__":
    172     main()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs