Add citation-chasing pipeline: cited_papers in scan + harvest script - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

commit aaa7097d653f63eb5a6a63611954e3c1ec4c4887
parent ed00b8092b4a223958bce1880699cb75d0dc4fe7
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Fri, 27 Feb 2026 20:57:52 +0100

Add citation-chasing pipeline: cited_papers in scan + harvest script

- Add cited_papers array to scan.schema.json (required field)
- Update scan-agent.md with instructions to extract survey-relevant
  references from each scanned paper (expect 3-15 per paper)
- Add scripts/harvest-citations.py: reads cited_papers from all
  scan.json files, deduplicates against registry by arxiv_id/doi/title,
  and proposes or appends new registry entries (--apply flag)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
M agents/scan-agent.md  | 17 ++++++++++++++++-
M schema/scan.schema.json  | 39 ++++++++++++++++++++++++++++++++++++++-
A scripts/harvest-citations.py  | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 225 insertions(+), 2 deletions(-)
diff --git a/agents/scan-agent.md b/agents/scan-agent.md
@@ -61,7 +61,22 @@ Assign one or more tags describing the research methodology:
 
 Write a 2-4 sentence summary of the paper's most important findings. Be factual and specific.
 
-### 6. Flag Red Flags
+### 6. Extract Cited Papers
+
+Scan the paper's references for other papers that fall within the survey scope (AI/LLM capability, productivity, safety, code generation, agentic workflows). For each relevant cited paper, extract:
+
+- **title**: As it appears in the references section
+- **authors**: If listed (at least first author)
+- **year**: If available
+- **arxiv_id**: If an arXiv URL or ID appears in the reference, record only the bare ID (e.g. `2301.12345` — no URL prefix and no version suffix such as `v2`)
+- **doi**: If available
+- **relevance**: One sentence on why this paper belongs in the survey
+
+Do NOT include every reference. Only include papers that meet the survey's inclusion criteria from `context/methodology.md`. A typical paper might cite 30-60 references; you should extract 3-15 relevant ones.
+
+These cited papers feed a citation-chasing pipeline: the `scripts/harvest-citations.py` script reads them from all scan.json files and proposes new registry entries for papers we haven't seen yet.
+
+### 7. Flag Red Flags
 
 Note any methodological concerns, including but not limited to:
 - Cherry-picked results or selective reporting
diff --git a/schema/scan.schema.json b/schema/scan.schema.json
@@ -10,7 +10,8 @@
     "claims",
     "methodology_tags",
     "key_findings",
-    "red_flags"
+    "red_flags",
+    "cited_papers"
   ],
   "properties": {
     "paper": {
@@ -109,6 +110,42 @@
           }
         }
       }
+    },
+    "cited_papers": {
+      "type": "array",
+      "description": "Papers cited in this paper that are relevant to the survey scope. Used for citation-chasing: these become candidates for the registry.",
+      "items": {
+        "type": "object",
+        "required": ["title", "relevance"],
+        "properties": {
+          "title": {
+            "type": "string",
+            "description": "Title of the cited paper as it appears in the references."
+          },
+          "authors": {
+            "type": "array",
+            "items": { "type": "string" },
+            "description": "Author names if available from the reference."
+          },
+          "year": {
+            "type": "integer",
+            "description": "Publication year if available."
+          },
+          "arxiv_id": {
+            "type": "string",
+            "pattern": "^\\d{4}\\.\\d{4,5}$",
+            "description": "arXiv ID if available."
+          },
+          "doi": {
+            "type": "string",
+            "description": "DOI if available."
+          },
+          "relevance": {
+            "type": "string",
+            "description": "Why this cited paper is relevant to the survey (1 sentence)."
+          }
+        }
+      }
     }
   },
   "$defs": {
diff --git a/scripts/harvest-citations.py b/scripts/harvest-citations.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""
+Harvest cited papers from scan.json files and propose new registry entries.
+
+Reads all papers/*/scan.json files, collects cited_papers arrays, deduplicates
+against the existing registry.jsonl, and prints proposed new entries.
+
+Usage:
+    python scripts/harvest-citations.py              # Print proposed entries
+    python scripts/harvest-citations.py --apply      # Append to registry.jsonl
+"""
+
+import json
+import sys
+import os
+from pathlib import Path
+from datetime import date
+
+# Repository root: two directory levels above this script's resolved path
+# (the script itself lives in scripts/).
+ROOT = Path(__file__).resolve().parent.parent
+# Registry file, one JSON object per line; read for deduplication and
+# appended to when --apply is given.
+REGISTRY_PATH = ROOT / "registry.jsonl"
+# Directory whose immediate subdirectories are searched for scan.json files.
+PAPERS_DIR = ROOT / "papers"
+
+
+def load_registry():
+    """Load existing registry entries and build lookup indexes.
+
+    Reads ``REGISTRY_PATH`` as JSON Lines (one JSON object per non-blank
+    line). A missing registry file is treated as an empty registry.
+
+    Returns:
+        A 4-tuple ``(entries, arxiv_ids, dois, titles_lower)``: the parsed
+        entries in file order, then sets holding every truthy ``arxiv_id``,
+        every truthy ``doi``, and every non-empty ``title`` lower-cased and
+        stripped of surrounding whitespace.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    entries = []
+    arxiv_ids = set()
+    dois = set()
+    titles_lower = set()
+
+    if not REGISTRY_PATH.exists():
+        return entries, arxiv_ids, dois, titles_lower
+
+    # This script appends entries with ensure_ascii=False, so the file may
+    # contain raw non-ASCII text; read it as UTF-8 explicitly rather than
+    # with the platform's default encoding.
+    with open(REGISTRY_PATH, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            entry = json.loads(line)
+            entries.append(entry)
+            if entry.get("arxiv_id"):
+                arxiv_ids.add(entry["arxiv_id"])
+            if entry.get("doi"):
+                dois.add(entry["doi"])
+            # An entry without a title should not abort the whole run; it
+            # simply contributes nothing to the title index.
+            title = entry.get("title")
+            if title:
+                titles_lower.add(title.lower().strip())
+
+    return entries, arxiv_ids, dois, titles_lower
+
+
+def collect_cited_papers():
+    """Read all scan.json files and collect their cited_papers.
+
+    Looks at ``PAPERS_DIR/*/scan.json``. Each cited-paper dict is tagged
+    with a ``_cited_by`` key holding the name of the directory that
+    contains the scan file it came from.
+
+    Returns:
+        A list of cited-paper dicts, ordered by scan file path and then by
+        position within each file's ``cited_papers`` array. Empty when
+        ``PAPERS_DIR`` does not exist or no scan lists any cited papers.
+
+    Raises:
+        json.JSONDecodeError: If a scan.json file is not valid JSON.
+    """
+    cited = []
+    if not PAPERS_DIR.exists():
+        return cited
+
+    # glob() yields paths in filesystem order; sorting keeps the result
+    # (and anything derived from its order) the same from run to run.
+    for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
+        with open(scan_path, encoding="utf-8") as f:
+            scan = json.load(f)
+        paper_id = scan_path.parent.name
+        # `or []` also covers an explicit `"cited_papers": null`.
+        for cp in scan.get("cited_papers") or []:
+            cp["_cited_by"] = paper_id
+            cited.append(cp)
+
+    return cited
+
+
+def make_slug(title, year=None):
+    """Build a registry slug from a paper title and an optional year.
+
+    The title is lower-cased and split on whitespace; common stop words are
+    dropped and at most the first five remaining words are kept. The words
+    are hyphen-joined, every character that is neither alphanumeric nor a
+    hyphen is removed, runs of hyphens collapse to one, and leading and
+    trailing hyphens are trimmed. A truthy ``year`` is appended as
+    ``-<year>``.
+    """
+    stop_words = frozenset(
+        ("a", "an", "the", "of", "in", "on", "for", "and", "with", "to", "is", "are")
+    )
+
+    kept = []
+    for word in title.lower().split():
+        if word in stop_words:
+            continue
+        kept.append(word)
+        if len(kept) == 5:
+            break
+
+    joined = "-".join(kept)
+    cleaned = "".join(ch for ch in joined if ch == "-" or ch.isalnum())
+    # Splitting on "-" and discarding empty pieces both collapses repeated
+    # hyphens and trims any at either end.
+    base = "-".join(piece for piece in cleaned.split("-") if piece)
+
+    return f"{base}-{year}" if year else base
+
+
+def is_duplicate(cp, arxiv_ids, dois, titles_lower):
+    """Return True when the cited paper matches something already known.
+
+    A paper counts as a duplicate if its ``arxiv_id`` is in ``arxiv_ids``,
+    or its ``doi`` is in ``dois``, or its title (lower-cased and stripped)
+    is in ``titles_lower``. Missing or empty fields never match.
+    """
+    arxiv_id = cp.get("arxiv_id")
+    doi = cp.get("doi")
+    title = cp.get("title")
+
+    return bool(
+        (arxiv_id and arxiv_id in arxiv_ids)
+        or (doi and doi in dois)
+        or (title and title.lower().strip() in titles_lower)
+    )
+
+
+def make_registry_entry(cp):
+    """Convert a cited_paper object into a registry entry.
+
+    Args:
+        cp: A cited-paper dict. ``title``, ``relevance`` and ``_cited_by``
+            must be present; ``authors``, ``year``, ``arxiv_id`` and ``doi``
+            are optional.
+
+    Returns:
+        A new dict with ``status`` set to ``"queued"``, ``added`` set to
+        today's ISO date, and ``source`` set to ``"arxiv"`` when an arXiv ID
+        is present (``"manual"`` otherwise). ``arxiv_id``, ``source_url``
+        and ``doi`` keys are included only when the corresponding input
+        value is truthy.
+
+    Raises:
+        KeyError: If ``title``, ``relevance`` or ``_cited_by`` is missing.
+    """
+    year = cp.get("year")
+    arxiv_id = cp.get("arxiv_id")
+    doi = cp.get("doi")
+
+    entry = {
+        "id": make_slug(cp["title"], year),
+        "title": cp["title"],
+        # `or` rather than a .get() default, so an explicit null or an empty
+        # author list also falls back to the placeholder.
+        "authors": cp.get("authors") or ["Unknown"],
+        # 0 marks an unknown publication year.
+        "year": year or 0,
+        "venue": "Unknown",
+        "source": "arxiv" if arxiv_id else "manual",
+        "status": "queued",
+        "tags": [],
+        "added": date.today().isoformat(),
+        "notes": f"Citation-chased from {cp['_cited_by']}. {cp['relevance']}",
+    }
+
+    if arxiv_id:
+        entry["arxiv_id"] = arxiv_id
+        entry["source_url"] = f"https://arxiv.org/abs/{arxiv_id}"
+    if doi:
+        entry["doi"] = doi
+
+    return entry
+
+
+def main():
+    """Propose registry entries for newly cited papers; append with --apply.
+
+    Loads the registry, gathers cited papers from every scan, and drops any
+    that match the registry or an earlier paper in the same batch (by
+    arXiv ID, DOI, or lower-cased title). The remaining papers are printed
+    as ``id: title`` lines. When ``--apply`` is among the command-line
+    arguments the new entries are also appended to ``REGISTRY_PATH`` as
+    JSON Lines; otherwise this is a dry run.
+    """
+    apply = "--apply" in sys.argv
+
+    entries, arxiv_ids, dois, titles_lower = load_registry()
+    cited = collect_cited_papers()
+
+    if not cited:
+        print("No cited_papers found in any scan.json files.")
+        return
+
+    # Deduplicate cited papers against registry AND against each other
+    seen_arxiv = set(arxiv_ids)
+    seen_doi = set(dois)
+    seen_title = set(titles_lower)
+    # Slugs are built from only the first few title words, so two distinct
+    # papers can produce the same id. Track every id in use and add a
+    # numeric suffix on collision so ids stay unique.
+    used_ids = {entry.get("id") for entry in entries}
+    new_entries = []
+
+    for cp in cited:
+        if is_duplicate(cp, seen_arxiv, seen_doi, seen_title):
+            continue
+
+        entry = make_registry_entry(cp)
+        base_id = entry["id"]
+        suffix = 2
+        while entry["id"] in used_ids:
+            entry["id"] = f"{base_id}-{suffix}"
+            suffix += 1
+        used_ids.add(entry["id"])
+        new_entries.append(entry)
+
+        # Track to avoid duplicates within this batch
+        if cp.get("arxiv_id"):
+            seen_arxiv.add(cp["arxiv_id"])
+        if cp.get("doi"):
+            seen_doi.add(cp["doi"])
+        seen_title.add(cp["title"].lower().strip())
+
+    if not new_entries:
+        print("All cited papers already in registry. Nothing to add.")
+        return
+
+    print(f"Found {len(new_entries)} new paper(s) from citation chasing:\n")
+
+    for entry in new_entries:
+        print(f"  {entry['id']}: {entry['title']}")
+
+    if apply:
+        # Open the registry once for the whole batch. UTF-8 is explicit
+        # because ensure_ascii=False emits raw non-ASCII characters.
+        with open(REGISTRY_PATH, "a", encoding="utf-8") as f:
+            f.writelines(
+                json.dumps(entry, ensure_ascii=False) + "\n"
+                for entry in new_entries
+            )
+        print(f"\nAppended {len(new_entries)} entries to {REGISTRY_PATH}")
+    else:
+        print(f"\nDry run. Use --apply to append to {REGISTRY_PATH}")
+
+
+# Run the harvest only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()

	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs

M	agents/scan-agent.md	\|	17	++++++++++++++++-
M	schema/scan.schema.json	\|	39	++++++++++++++++++++++++++++++++++++++-
A	scripts/harvest-citations.py	\|	171	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++