commit aaa7097d653f63eb5a6a63611954e3c1ec4c4887
parent ed00b8092b4a223958bce1880699cb75d0dc4fe7
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Fri, 27 Feb 2026 20:57:52 +0100
Add citation-chasing pipeline: cited_papers in scan + harvest script
- Add cited_papers array to scan.schema.json (required field)
- Update scan-agent.md with instructions to extract survey-relevant
references from each scanned paper (expect 3-15 per paper)
- Add scripts/harvest-citations.py: reads cited_papers from all
scan.json files, deduplicates against registry by arxiv_id/doi/title,
and proposes or appends new registry entries (--apply flag)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat:
3 files changed, 225 insertions(+), 2 deletions(-)
diff --git a/agents/scan-agent.md b/agents/scan-agent.md
@@ -61,7 +61,22 @@ Assign one or more tags describing the research methodology:
Write a 2-4 sentence summary of the paper's most important findings. Be factual and specific.
-### 6. Flag Red Flags
+### 6. Extract Cited Papers
+
+Scan the paper's references for other papers that fall within the survey scope (AI/LLM capability, productivity, safety, code generation, agentic workflows). For each relevant cited paper, extract:
+
+- **title**: As it appears in the references section
+- **authors**: If listed (at least first author)
+- **year**: If available
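+- **arxiv_id**: If an arXiv URL or ID appears in the reference. Record only the bare new-style ID (e.g. `2301.12345`), with no `arXiv:`/URL prefix and no version suffix such as `v2`, so that it matches the `arxiv_id` pattern in `schema/scan.schema.json`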
+- **doi**: If available
+- **relevance**: One sentence on why this paper belongs in the survey
+
+Do NOT include every reference. Only include papers that meet the survey's inclusion criteria from `context/methodology.md`. A typical paper might cite 30-60 references; you should extract 3-15 relevant ones.
+
+These cited papers feed a citation-chasing pipeline: the `scripts/harvest-citations.py` script reads them from all scan.json files and proposes new registry entries for papers we haven't seen yet.
+
+### 7. Flag Red Flags
Note any methodological concerns, including but not limited to:
- Cherry-picked results or selective reporting
diff --git a/schema/scan.schema.json b/schema/scan.schema.json
@@ -10,7 +10,8 @@
"claims",
"methodology_tags",
"key_findings",
- "red_flags"
+ "red_flags",
+ "cited_papers"
],
"properties": {
"paper": {
@@ -109,6 +110,42 @@
}
}
}
+ },
+ "cited_papers": {
+ "type": "array",
+ "description": "Papers cited in this paper that are relevant to the survey scope. Used for citation-chasing: these become candidates for the registry.",
+ "items": {
+ "type": "object",
+ "required": ["title", "relevance"],
+ "properties": {
+ "title": {
+ "type": "string",
+ "description": "Title of the cited paper as it appears in the references."
+ },
+ "authors": {
+ "type": "array",
+ "items": { "type": "string" },
+ "description": "Author names if available from the reference."
+ },
+ "year": {
+ "type": "integer",
+ "description": "Publication year if available."
+ },
+ "arxiv_id": {
+ "type": "string",
+ "pattern": "^\\d{4}\\.\\d{4,5}$",
+ "description": "arXiv ID if available."
+ },
+ "doi": {
+ "type": "string",
+ "description": "DOI if available."
+ },
+ "relevance": {
+ "type": "string",
+ "description": "Why this cited paper is relevant to the survey (1 sentence)."
+ }
+ }
+ }
}
},
"$defs": {
diff --git a/scripts/harvest-citations.py b/scripts/harvest-citations.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""
+Harvest cited papers from scan.json files and propose new registry entries.
+
+Reads all papers/*/scan.json files, collects cited_papers arrays, deduplicates
+against the existing registry.jsonl, and prints proposed new entries.
+
+Usage:
+ python scripts/harvest-citations.py # Print proposed entries
+ python scripts/harvest-citations.py --apply # Append to registry.jsonl
+"""
+
+import json
+import sys
+import os
+from pathlib import Path
+from datetime import date
+
+# Repository root: this script lives one directory below it (scripts/).
+ROOT = Path(__file__).resolve().parents[1]
+# Registry of known papers, read and written as one JSON object per line.
+REGISTRY_PATH = ROOT.joinpath("registry.jsonl")
+# Scanned papers: each immediate sub-directory may hold a scan.json.
+PAPERS_DIR = ROOT.joinpath("papers")
+
+
+def load_registry():
+    """Load existing registry entries and build lookup indexes.
+
+    Reads REGISTRY_PATH as JSON Lines, skipping blank lines. A missing
+    registry file is treated as an empty registry.
+
+    Returns:
+        A 4-tuple ``(entries, arxiv_ids, dois, titles_lower)``: the parsed
+        entries in file order, followed by sets of their arXiv IDs, their
+        DOIs, and their lower-cased, stripped titles.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    entries = []
+    arxiv_ids = set()
+    dois = set()
+    titles_lower = set()
+
+    if not REGISTRY_PATH.exists():
+        return entries, arxiv_ids, dois, titles_lower
+
+    # JSON text is UTF-8 and titles/author names may be non-ASCII, so do not
+    # rely on the platform's default encoding when decoding the file.
+    with open(REGISTRY_PATH, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            entry = json.loads(line)
+            entries.append(entry)
+            if entry.get("arxiv_id"):
+                arxiv_ids.add(entry["arxiv_id"])
+            if entry.get("doi"):
+                dois.add(entry["doi"])
+            # Tolerate an entry without a title instead of raising KeyError
+            # and aborting the whole harvest; it just has no title index key.
+            title = entry.get("title")
+            if title:
+                titles_lower.add(title.lower().strip())
+
+    return entries, arxiv_ids, dois, titles_lower
+
+
+def collect_cited_papers():
+    """Read all scan.json files and collect cited_papers.
+
+    Looks at every ``<dir>/scan.json`` directly under PAPERS_DIR. Each
+    cited-paper object is tagged in place with a ``_cited_by`` key holding
+    the name of the directory whose scan listed it.
+
+    Returns:
+        A flat list of cited-paper dicts, ordered by scan path and then by
+        position within each scan. Empty when PAPERS_DIR does not exist.
+
+    Raises:
+        json.JSONDecodeError: If a scan.json file is not valid JSON.
+    """
+    cited = []
+    if not PAPERS_DIR.exists():
+        return cited
+
+    # Directory listing order is filesystem-dependent. Sort so the proposed
+    # entries (and which citing paper is credited when several scans cite the
+    # same work) are reproducible across machines and runs.
+    for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
+        with open(scan_path, encoding="utf-8") as f:
+            scan = json.load(f)
+        paper_id = scan_path.parent.name
+        # "or []" covers both a missing key and an explicit null value.
+        for cp in scan.get("cited_papers") or []:
+            cp["_cited_by"] = paper_id
+            cited.append(cp)
+
+    return cited
+
+
+def make_slug(title, year=None):
+    """Generate a registry slug from a title.
+
+    The title is lower-cased and split on whitespace. Each word is reduced to
+    its alphanumeric characters and internal hyphens, stop words are dropped,
+    and the first five remaining words are joined with hyphens.
+
+    Args:
+        title: Paper title.
+        year: Optional publication year; appended as ``-<year>`` when truthy.
+
+    Returns:
+        The slug string. Falls back to ``"untitled"`` (plus the year suffix,
+        if any) when no usable word remains, so the id is never empty.
+    """
+    skip = {"a", "an", "the", "of", "in", "on", "for", "and", "with", "to", "is", "are"}
+
+    meaningful = []
+    for word in title.lower().split():
+        # Strip punctuation BEFORE the stop-word test so that "(the" or "a,"
+        # are still recognised, and collapse stray/repeated hyphens.
+        kept = "".join(c for c in word if c.isalnum() or c == "-")
+        token = "-".join(part for part in kept.split("-") if part)
+        # Punctuation-only tokens such as "&" become empty here and must not
+        # use up one of the five word slots.
+        if token and token not in skip:
+            meaningful.append(token)
+
+    slug = "-".join(meaningful[:5]) or "untitled"
+    if year:
+        slug = f"{slug}-{year}"
+    return slug
+
+
+def is_duplicate(cp, arxiv_ids, dois, titles_lower):
+    """Report whether a cited paper matches something already known.
+
+    A paper counts as a duplicate when its arXiv ID is in ``arxiv_ids``, or
+    its DOI is in ``dois``, or its lower-cased, stripped title is in
+    ``titles_lower``. Missing or empty fields never match.
+
+    Returns:
+        True if any of the three identifiers matches, otherwise False.
+    """
+    arxiv_id = cp.get("arxiv_id")
+    doi = cp.get("doi")
+    title = cp.get("title")
+    return bool(
+        (arxiv_id and arxiv_id in arxiv_ids)
+        or (doi and doi in dois)
+        or (title and title.lower().strip() in titles_lower)
+    )
+
+
+def make_registry_entry(cp):
+    """Convert a cited_paper object into a registry entry.
+
+    Args:
+        cp: Cited-paper dict. Must contain "title", "relevance" and
+            "_cited_by"; "authors", "year", "arxiv_id" and "doi" are
+            optional.
+
+    Returns:
+        A new dict with a slug "id", status "queued", today's date as
+        "added", and a "notes" string naming the citing paper. "arxiv_id",
+        "source_url" and "doi" are only present when cp supplies them.
+
+    Raises:
+        KeyError: If "title", "relevance" or "_cited_by" is missing from cp.
+    """
+    year = cp.get("year")
+    arxiv_id = cp.get("arxiv_id")
+    doi = cp.get("doi")
+
+    entry = {
+        "id": make_slug(cp["title"], year),
+        "title": cp["title"],
+        # Fall back to the placeholder when authors is absent, null or an
+        # empty list; copy so the entry does not alias the scan's list.
+        "authors": list(cp.get("authors") or ["Unknown"]),
+        "year": year or 0,
+        "venue": "Unknown",
+        "source": "arxiv" if arxiv_id else "manual",
+        "status": "queued",
+        "tags": [],
+        "added": date.today().isoformat(),
+        "notes": f"Citation-chased from {cp['_cited_by']}. {cp['relevance']}",
+    }
+
+    if arxiv_id:
+        entry["arxiv_id"] = arxiv_id
+        entry["source_url"] = f"https://arxiv.org/abs/{arxiv_id}"
+    if doi:
+        entry["doi"] = doi
+
+    return entry
+
+
+def main():
+    """Propose, or with --apply append, registry entries for cited papers.
+
+    Collects cited papers from all scans, drops those already in the registry
+    or already seen earlier in this run (by arXiv ID, DOI or title), and
+    prints one line per remaining paper. With ``--apply`` on the command line
+    the new entries are also appended to REGISTRY_PATH as JSON Lines;
+    otherwise nothing is written.
+    """
+    apply = "--apply" in sys.argv
+
+    entries, arxiv_ids, dois, titles_lower = load_registry()
+    cited = collect_cited_papers()
+
+    if not cited:
+        print("No cited_papers found in any scan.json files.")
+        return
+
+    # Deduplicate cited papers against registry AND against each other
+    seen_arxiv = set(arxiv_ids)
+    seen_doi = set(dois)
+    seen_title = set(titles_lower)
+    # Slugs use only the first few title words, so two different papers can
+    # produce the same id; track every id in use to keep ids unique.
+    used_ids = {e["id"] for e in entries if e.get("id")}
+    new_entries = []
+
+    for cp in cited:
+        # A cited paper without a title or relevance cannot become a registry
+        # entry; skip it with a warning instead of crashing the whole run.
+        if not cp.get("title") or not cp.get("relevance"):
+            print(
+                f"Skipping cited paper without title/relevance "
+                f"(cited by {cp.get('_cited_by')}).",
+                file=sys.stderr,
+            )
+            continue
+
+        if is_duplicate(cp, seen_arxiv, seen_doi, seen_title):
+            continue
+
+        entry = make_registry_entry(cp)
+
+        # Disambiguate colliding slugs with a numeric suffix (-2, -3, ...).
+        base_id = entry["id"]
+        suffix = 2
+        while entry["id"] in used_ids:
+            entry["id"] = f"{base_id}-{suffix}"
+            suffix += 1
+        used_ids.add(entry["id"])
+
+        new_entries.append(entry)
+
+        # Track to avoid duplicates within this batch
+        if cp.get("arxiv_id"):
+            seen_arxiv.add(cp["arxiv_id"])
+        if cp.get("doi"):
+            seen_doi.add(cp["doi"])
+        seen_title.add(cp["title"].lower().strip())
+
+    if not new_entries:
+        print("All cited papers already in registry. Nothing to add.")
+        return
+
+    print(f"Found {len(new_entries)} new paper(s) from citation chasing:\n")
+
+    for entry in new_entries:
+        print(f"  {entry['id']}: {entry['title']}")
+
+    if apply:
+        # If the existing file does not end with a newline, appending would
+        # fuse the first new entry onto the last existing line.
+        needs_newline = False
+        if REGISTRY_PATH.exists() and REGISTRY_PATH.stat().st_size > 0:
+            with open(REGISTRY_PATH, "rb") as f:
+                f.seek(-1, os.SEEK_END)
+                needs_newline = f.read(1) != b"\n"
+
+        # ensure_ascii=False emits raw non-ASCII characters, so the file must
+        # be written as UTF-8 rather than in the platform default encoding.
+        # Open once for the whole batch instead of once per entry.
+        with open(REGISTRY_PATH, "a", encoding="utf-8") as f:
+            if needs_newline:
+                f.write("\n")
+            f.writelines(
+                json.dumps(entry, ensure_ascii=False) + "\n"
+                for entry in new_entries
+            )
+
+        print(f"\nAppended {len(new_entries)} entries to {REGISTRY_PATH}")
+    else:
+        print(f"\nDry run. Use --apply to append to {REGISTRY_PATH}")
+
+
+if __name__ == "__main__":
+ main()