ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit aaa7097d653f63eb5a6a63611954e3c1ec4c4887
parent ed00b8092b4a223958bce1880699cb75d0dc4fe7
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Fri, 27 Feb 2026 20:57:52 +0100

Add citation-chasing pipeline: cited_papers in scan + harvest script

- Add cited_papers array to scan.schema.json (required field)
- Update scan-agent.md with instructions to extract survey-relevant
  references from each scanned paper (expect 3-15 per paper)
- Add scripts/harvest-citations.py: reads cited_papers from all
  scan.json files, deduplicates against registry by arxiv_id/doi/title,
  and proposes or appends new registry entries (--apply flag)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
M agents/scan-agent.md | 17 ++++++++++++++++-
M schema/scan.schema.json | 39 ++++++++++++++++++++++++++++++++++++++-
A scripts/harvest-citations.py | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 225 insertions(+), 2 deletions(-)

diff --git a/agents/scan-agent.md b/agents/scan-agent.md @@ -61,7 +61,22 @@ Assign one or more tags describing the research methodology: Write a 2-4 sentence summary of the paper's most important findings. Be factual and specific. -### 6. Flag Red Flags +### 6. Extract Cited Papers + +Scan the paper's references for other papers that fall within the survey scope (AI/LLM capability, productivity, safety, code generation, agentic workflows). For each relevant cited paper, extract: + +- **title**: As it appears in the references section +- **authors**: If listed (at least first author) +- **year**: If available +- **arxiv_id**: If an arXiv URL or ID appears in the reference +- **doi**: If available +- **relevance**: One sentence on why this paper belongs in the survey + +Do NOT include every reference. Only include papers that meet the survey's inclusion criteria from `context/methodology.md`. A typical paper might cite 30-60 references; you should extract 3-15 relevant ones. + +These cited papers feed a citation-chasing pipeline: the `scripts/harvest-citations.py` script reads them from all scan.json files and proposes new registry entries for papers we haven't seen yet. + +### 7. Flag Red Flags Note any methodological concerns, including but not limited to: - Cherry-picked results or selective reporting diff --git a/schema/scan.schema.json b/schema/scan.schema.json @@ -10,7 +10,8 @@ "claims", "methodology_tags", "key_findings", - "red_flags" + "red_flags", + "cited_papers" ], "properties": { "paper": { @@ -109,6 +110,42 @@ } } } + }, + "cited_papers": { + "type": "array", + "description": "Papers cited in this paper that are relevant to the survey scope. Used for citation-chasing: these become candidates for the registry.", + "items": { + "type": "object", + "required": ["title", "relevance"], + "properties": { + "title": { + "type": "string", + "description": "Title of the cited paper as it appears in the references." 
+ }, + "authors": { + "type": "array", + "items": { "type": "string" }, + "description": "Author names if available from the reference." + }, + "year": { + "type": "integer", + "description": "Publication year if available." + }, + "arxiv_id": { + "type": "string", + "pattern": "^\\d{4}\\.\\d{4,5}$", + "description": "arXiv ID if available." + }, + "doi": { + "type": "string", + "description": "DOI if available." + }, + "relevance": { + "type": "string", + "description": "Why this cited paper is relevant to the survey (1 sentence)." + } + } + } } }, "$defs": { diff --git a/scripts/harvest-citations.py b/scripts/harvest-citations.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +""" +Harvest cited papers from scan.json files and propose new registry entries. + +Reads all papers/*/scan.json files, collects cited_papers arrays, deduplicates +against the existing registry.jsonl, and prints proposed new entries. + +Usage: + python scripts/harvest-citations.py # Print proposed entries + python scripts/harvest-citations.py --apply # Append to registry.jsonl +""" + +import json +import sys +import os +from pathlib import Path +from datetime import date + +ROOT = Path(__file__).resolve().parent.parent +REGISTRY_PATH = ROOT / "registry.jsonl" +PAPERS_DIR = ROOT / "papers" + + +def load_registry(): + """Load existing registry entries and build lookup indexes.""" + entries = [] + arxiv_ids = set() + dois = set() + titles_lower = set() + + if REGISTRY_PATH.exists(): + with open(REGISTRY_PATH) as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + entries.append(entry) + if entry.get("arxiv_id"): + arxiv_ids.add(entry["arxiv_id"]) + if entry.get("doi"): + dois.add(entry["doi"]) + titles_lower.add(entry["title"].lower().strip()) + + return entries, arxiv_ids, dois, titles_lower + + +def collect_cited_papers(): + """Read all scan.json files and collect cited_papers.""" + cited = [] + if not PAPERS_DIR.exists(): + return cited + + for 
scan_path in PAPERS_DIR.glob("*/scan.json"): + with open(scan_path) as f: + scan = json.load(f) + paper_id = scan_path.parent.name + for cp in scan.get("cited_papers", []): + cp["_cited_by"] = paper_id + cited.append(cp) + + return cited + + +def make_slug(title, year=None): + """Generate a registry slug from a title.""" + words = title.lower().split() + # Take first 4-5 meaningful words, skip articles + skip = {"a", "an", "the", "of", "in", "on", "for", "and", "with", "to", "is", "are"} + meaningful = [w for w in words if w not in skip][:5] + slug = "-".join(meaningful) + # Clean non-alphanumeric chars + slug = "".join(c if c.isalnum() or c == "-" else "" for c in slug) + # Remove double hyphens + while "--" in slug: + slug = slug.replace("--", "-") + slug = slug.strip("-") + if year: + slug = f"{slug}-{year}" + return slug + + +def is_duplicate(cp, arxiv_ids, dois, titles_lower): + """Check if a cited paper already exists in the registry.""" + if cp.get("arxiv_id") and cp["arxiv_id"] in arxiv_ids: + return True + if cp.get("doi") and cp["doi"] in dois: + return True + if cp.get("title") and cp["title"].lower().strip() in titles_lower: + return True + return False + + +def make_registry_entry(cp): + """Convert a cited_paper object into a registry entry.""" + year = cp.get("year") + slug = make_slug(cp["title"], year) + + entry = { + "id": slug, + "title": cp["title"], + "authors": cp.get("authors", ["Unknown"]), + "year": year or 0, + "venue": "Unknown", + "source": "arxiv" if cp.get("arxiv_id") else "manual", + "status": "queued", + "tags": [], + "added": date.today().isoformat(), + "notes": f"Citation-chased from {cp['_cited_by']}. 
{cp['relevance']}", + } + + if cp.get("arxiv_id"): + entry["arxiv_id"] = cp["arxiv_id"] + entry["source_url"] = f"https://arxiv.org/abs/{cp['arxiv_id']}" + if cp.get("doi"): + entry["doi"] = cp["doi"] + + return entry + + +def main(): + apply = "--apply" in sys.argv + + _, arxiv_ids, dois, titles_lower = load_registry() + cited = collect_cited_papers() + + if not cited: + print("No cited_papers found in any scan.json files.") + return + + # Deduplicate cited papers against registry AND against each other + seen_arxiv = set(arxiv_ids) + seen_doi = set(dois) + seen_title = set(titles_lower) + new_entries = [] + + for cp in cited: + if is_duplicate(cp, seen_arxiv, seen_doi, seen_title): + continue + + entry = make_registry_entry(cp) + new_entries.append(entry) + + # Track to avoid duplicates within this batch + if cp.get("arxiv_id"): + seen_arxiv.add(cp["arxiv_id"]) + if cp.get("doi"): + seen_doi.add(cp["doi"]) + seen_title.add(cp["title"].lower().strip()) + + if not new_entries: + print("All cited papers already in registry. Nothing to add.") + return + + print(f"Found {len(new_entries)} new paper(s) from citation chasing:\n") + + for entry in new_entries: + line = json.dumps(entry, ensure_ascii=False) + print(f" {entry['id']}: {entry['title']}") + if apply: + with open(REGISTRY_PATH, "a") as f: + f.write(line + "\n") + + if apply: + print(f"\nAppended {len(new_entries)} entries to {REGISTRY_PATH}") + else: + print(f"\nDry run. Use --apply to append to {REGISTRY_PATH}") + + +if __name__ == "__main__": + main()

Impressum · Datenschutz