harvest-citations.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

harvest-citations.py (5086B)
      1 #!/usr/bin/env python3
      2 """
      3 Harvest cited papers from scan.json files and propose new registry entries.
      4 
      5 Reads all papers/*/scan.json files, collects cited_papers arrays, deduplicates
      6 against the existing registry.jsonl, and prints proposed new entries.
      7 
      8 Usage:
      9     python scripts/harvest-citations.py              # Print proposed entries
     10     python scripts/harvest-citations.py --apply      # Append to registry.jsonl
     11 """
     12 
     13 import json
     14 import sys
     15 import os
     16 from pathlib import Path
     17 from datetime import date
     18 
# Directory two levels above this file's resolved location (the parent of the
# directory that contains this script).
ROOT = Path(__file__).resolve().parent.parent
# JSON Lines registry that proposed entries are deduplicated against and,
# with --apply, appended to.
REGISTRY_PATH = ROOT / "registry.jsonl"
# Directory whose immediate subdirectories are searched for scan.json files.
PAPERS_DIR = ROOT / "papers"
     22 
     23 
     24 def load_registry():
     25     """Load existing registry entries and build lookup indexes."""
     26     entries = []
     27     arxiv_ids = set()
     28     dois = set()
     29     titles_lower = set()
     30 
     31     if REGISTRY_PATH.exists():
     32         with open(REGISTRY_PATH) as f:
     33             for line in f:
     34                 line = line.strip()
     35                 if not line:
     36                     continue
     37                 entry = json.loads(line)
     38                 entries.append(entry)
     39                 if entry.get("arxiv_id"):
     40                     arxiv_ids.add(entry["arxiv_id"])
     41                 if entry.get("doi"):
     42                     dois.add(entry["doi"])
     43                 titles_lower.add(entry["title"].lower().strip())
     44 
     45     return entries, arxiv_ids, dois, titles_lower
     46 
     47 
     48 def collect_cited_papers():
     49     """Read all scan.json files and collect cited_papers."""
     50     cited = []
     51     if not PAPERS_DIR.exists():
     52         return cited
     53 
     54     for scan_path in PAPERS_DIR.glob("*/scan.json"):
     55         with open(scan_path) as f:
     56             scan = json.load(f)
     57         paper_id = scan_path.parent.name
     58         for cp in scan.get("cited_papers", []):
     59             cp["_cited_by"] = paper_id
     60             cited.append(cp)
     61 
     62     return cited
     63 
     64 
     65 def make_slug(title, year=None):
     66     """Generate a registry slug from a title."""
     67     words = title.lower().split()
     68     # Take first 4-5 meaningful words, skip articles
     69     skip = {"a", "an", "the", "of", "in", "on", "for", "and", "with", "to", "is", "are"}
     70     meaningful = [w for w in words if w not in skip][:5]
     71     slug = "-".join(meaningful)
     72     # Clean non-alphanumeric chars
     73     slug = "".join(c if c.isalnum() or c == "-" else "" for c in slug)
     74     # Remove double hyphens
     75     while "--" in slug:
     76         slug = slug.replace("--", "-")
     77     slug = slug.strip("-")
     78     if year:
     79         slug = f"{slug}-{year}"
     80     return slug
     81 
     82 
     83 def is_duplicate(cp, arxiv_ids, dois, titles_lower):
     84     """Check if a cited paper already exists in the registry."""
     85     if cp.get("arxiv_id") and cp["arxiv_id"] in arxiv_ids:
     86         return True
     87     if cp.get("doi") and cp["doi"] in dois:
     88         return True
     89     if cp.get("title") and cp["title"].lower().strip() in titles_lower:
     90         return True
     91     return False
     92 
     93 
     94 def make_registry_entry(cp):
     95     """Convert a cited_paper object into a registry entry."""
     96     year = cp.get("year")
     97     slug = make_slug(cp["title"], year)
     98 
     99     entry = {
    100         "id": slug,
    101         "title": cp["title"],
    102         "authors": cp.get("authors", ["Unknown"]),
    103         "year": year or 0,
    104         "venue": "Unknown",
    105         "source": "arxiv" if cp.get("arxiv_id") else "manual",
    106         "status": "queued",
    107         "tags": [],
    108         "added": date.today().isoformat(),
    109         "notes": f"Citation-chased from {cp['_cited_by']}. {cp['relevance']}",
    110     }
    111 
    112     if cp.get("arxiv_id"):
    113         entry["arxiv_id"] = cp["arxiv_id"]
    114         entry["source_url"] = f"https://arxiv.org/abs/{cp['arxiv_id']}"
    115     if cp.get("doi"):
    116         entry["doi"] = cp["doi"]
    117 
    118     return entry
    119 
    120 
    121 def main():
    122     apply = "--apply" in sys.argv
    123 
    124     _, arxiv_ids, dois, titles_lower = load_registry()
    125     cited = collect_cited_papers()
    126 
    127     if not cited:
    128         print("No cited_papers found in any scan.json files.")
    129         return
    130 
    131     # Deduplicate cited papers against registry AND against each other
    132     seen_arxiv = set(arxiv_ids)
    133     seen_doi = set(dois)
    134     seen_title = set(titles_lower)
    135     new_entries = []
    136 
    137     for cp in cited:
    138         if is_duplicate(cp, seen_arxiv, seen_doi, seen_title):
    139             continue
    140 
    141         entry = make_registry_entry(cp)
    142         new_entries.append(entry)
    143 
    144         # Track to avoid duplicates within this batch
    145         if cp.get("arxiv_id"):
    146             seen_arxiv.add(cp["arxiv_id"])
    147         if cp.get("doi"):
    148             seen_doi.add(cp["doi"])
    149         seen_title.add(cp["title"].lower().strip())
    150 
    151     if not new_entries:
    152         print("All cited papers already in registry. Nothing to add.")
    153         return
    154 
    155     print(f"Found {len(new_entries)} new paper(s) from citation chasing:\n")
    156 
    157     for entry in new_entries:
    158         line = json.dumps(entry, ensure_ascii=False)
    159         print(f"  {entry['id']}: {entry['title']}")
    160         if apply:
    161             with open(REGISTRY_PATH, "a") as f:
    162                 f.write(line + "\n")
    163 
    164     if apply:
    165         print(f"\nAppended {len(new_entries)} entries to {REGISTRY_PATH}")
    166     else:
    167         print(f"\nDry run. Use --apply to append to {REGISTRY_PATH}")
    168 
    169 
    170 if __name__ == "__main__":
    171     main()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs