commit 3a05a3aea1e82e39df3a495ed95e9004e4d25b8f
parent 69c92da1bfbb276ddd27e9ba8256d0087e01c43a
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Fri, 27 Feb 2026 21:27:14 +0100
Add build pipeline: text extraction, summary aggregation, venue list
- scripts/extract-text.py: pymupdf text extraction with Sonnet fallback
for low-quality results. Outputs paper.txt co-located with PDFs.
- scripts/build-summary.py: aggregates all scan.json into
analysis/summary.json + summary.md (score distributions, ranked lists,
red flags, breakdowns by year/tag). Static artifact for narrative work.
- context/requirements.md: full pipeline diagram, venue brainstorm
(TOSEM, EMSE, NeurIPS D&B, ICSE, Nature MI, etc.), output format (LaTeX)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat:
5 files changed, 506 insertions(+), 0 deletions(-)
diff --git a/analysis/summary.json b/analysis/summary.json
@@ -0,0 +1,8 @@
+{
+ "total_registry": 155,
+ "total_scanned": 0,
+ "status_counts": {
+ "queued": 155
+ },
+ "note": "No scans completed yet."
+}
+\ No newline at end of file
diff --git a/analysis/summary.md b/analysis/summary.md
@@ -0,0 +1,6 @@
+# Survey Summary
+
+Registry: 155 papers
+Scanned: 0 papers
+
+*No scans completed yet.*
diff --git a/context/requirements.md b/context/requirements.md
@@ -73,6 +73,65 @@ discover -> download -> scan -> deep eval (optional) -> aggregate
- `theoretical` - Theoretical / analytical
- `qualitative` - Qualitative research
+## Build Pipeline
+
+```
+registry.jsonl (queued)
+ → scripts/download-arxiv.py → papers/<slug>/paper.pdf (downloaded)
+ → scripts/extract-text.py → papers/<slug>/paper.txt
+ → scan agent (Opus) → papers/<slug>/scan.json (scanned)
+ → scripts/harvest-citations.py → new registry entries
+ → scripts/build-summary.py → analysis/summary.json + summary.md
+ → LaTeX build → paper.pdf
+```
+
+Text extraction uses pymupdf (free, fast). It falls back to Sonnet via the `claude` CLI if the pymupdf output fails the quality checks (too short, garbled, or too few words per page).
+
+The summary artifact (`analysis/summary.json` and `analysis/summary.md`) is built before the LaTeX paper. It contains score distributions, ranked lists, red flag counts, and breakdowns by year/tag/methodology. This is the working document for developing the narrative sections.
+
+## Output Format
+
+The output is a LaTeX paper, submittable to academic venues.
+
+### Venue Brainstorm
+
+**Top targets (SE + AI intersection):**
+- **ICSE** (International Conference on Software Engineering) — premier SE venue, has had AI4SE tracks
+- **FSE/ESEC** (Foundations of Software Engineering) — strong SE venue, accepts empirical studies
+- **ASE** (Automated Software Engineering) — good fit for tooling/methodology papers
+- **MSR** (Mining Software Repositories) — empirical SE, data-heavy studies welcome
+
+**AI/ML venues:**
+- **NeurIPS** (Datasets and Benchmarks track) — good fit for "the benchmarks are broken" angle
+- **ICML** — possible but harder sell for a survey
+- **AAAI** — broad AI, accepts surveys and position papers
+- **COLM** (Conference on Language Modeling) — new venue, directly relevant
+
+**NLP venues:**
+- **ACL** — accepts survey/position papers, has done "methodology critique" before
+- **EMNLP** — similar to ACL, slightly more empirical focus
+- **NAACL** — regional but well-regarded
+
+**Journals:**
+- **TOSEM** (ACM Transactions on Software Engineering and Methodology) — perfect fit for a systematic review
+- **TSE** (IEEE Transactions on Software Engineering) — prestigious, accepts surveys
+- **EMSE** (Empirical Software Engineering) — Springer journal, literally made for this
+- **Nature Machine Intelligence** — high impact, accepts perspective/review articles
+- **Communications of the ACM** — broad reach, good for "the field has a methodology problem" message
+
+**Meta-research / open science:**
+- **MetaArXiv** — preprint server for meta-research
+- **Royal Society Open Science** — open access, accepts methodological critiques across fields
+
+**Workshop / special tracks:**
+- **NeurIPS Datasets and Benchmarks** — if framed as benchmark quality assessment
+- **ICSE NIER** (New Ideas and Emerging Results) — shorter format, good for early results
+- **LLM4Code** workshop (co-located with ICSE) — directly on topic
+
+### Venue Strategy
+
+TOSEM and EMSE are the most natural fits: they publish systematic reviews routinely, the reviewers understand the format, and "methodological quality of AI/SE research" is exactly their beat. For maximum impact outside SE, NeurIPS Datasets & Benchmarks or Nature Machine Intelligence would reach the ML audience that needs to hear it most.
+
## Virality Tracking
Deferred until after initial data collection. Future work may track citation counts, social media mentions, and media coverage to correlate with methodological quality.
diff --git a/scripts/build-summary.py b/scripts/build-summary.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""
+Aggregate all scan.json results into a summary artifact.
+
+Produces analysis/summary.json (machine-readable) and analysis/summary.md
+(human-readable). These are static artifacts for working on the narrative
+sections of the paper before the LaTeX build runs.
+
+Usage:
+ python scripts/build-summary.py
+"""
+
+import json
+from collections import Counter, defaultdict
+from pathlib import Path
+
+# Repository root: this script lives in <root>/scripts/, hence two levels up.
+ROOT = Path(__file__).resolve().parent.parent
+# Inputs: the paper registry (one JSON object per line) and the per-paper
+# directories that hold scan.json. Output: the analysis/ directory.
+REGISTRY_PATH = ROOT / "registry.jsonl"
+PAPERS_DIR = ROOT / "papers"
+ANALYSIS_DIR = ROOT / "analysis"
+
+# Keys looked up in each scan's "rubric" object. The order here is also the
+# row order of the score table in summary.md and of the console overview.
+RUBRIC_DIMENSIONS = [
+ "artifacts_reproducibility",
+ "statistical_rigor",
+ "benchmark_quality",
+ "claim_to_evidence",
+ "setup_transparency",
+ "limitations_discussion",
+]
+
+# Label for each integer rubric score; the labels become the keys of every
+# per-dimension "distribution" in the summary.
+SCORE_LABELS = {0: "absent", 1: "weak", 2: "adequate", 3: "strong"}
+
+
+def load_registry():
+ entries = {}
+ with open(REGISTRY_PATH) as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ entry = json.loads(line)
+ entries[entry["id"]] = entry
+ return entries
+
+
+def load_scans():
+ scans = {}
+ for scan_path in PAPERS_DIR.glob("*/scan.json"):
+ paper_id = scan_path.parent.name
+ with open(scan_path) as f:
+ scans[paper_id] = json.load(f)
+ return scans
+
+
+def compute_summary(registry, scans):
+ summary = {
+ "total_registry": len(registry),
+ "total_scanned": len(scans),
+ "status_counts": Counter(e["status"] for e in registry.values()),
+ }
+
+ if not scans:
+ summary["note"] = "No scans completed yet."
+ return summary
+
+ # Score distributions per dimension
+ dimension_scores = {d: [] for d in RUBRIC_DIMENSIONS}
+ for scan in scans.values():
+ rubric = scan.get("rubric", {})
+ for dim in RUBRIC_DIMENSIONS:
+ if dim in rubric:
+ dimension_scores[dim].append(rubric[dim]["score"])
+
+ summary["dimensions"] = {}
+ for dim, scores in dimension_scores.items():
+ if not scores:
+ continue
+ dist = Counter(scores)
+ summary["dimensions"][dim] = {
+ "mean": round(sum(scores) / len(scores), 2),
+ "distribution": {SCORE_LABELS[k]: dist.get(k, 0) for k in range(4)},
+ "n": len(scores),
+ }
+
+ # Overall scores per paper
+ paper_scores = {}
+ for paper_id, scan in scans.items():
+ rubric = scan.get("rubric", {})
+ scores = [rubric[d]["score"] for d in RUBRIC_DIMENSIONS if d in rubric]
+ if scores:
+ paper_scores[paper_id] = {
+ "total": sum(scores),
+ "mean": round(sum(scores) / len(scores), 2),
+ "title": scan.get("paper", {}).get("title", registry.get(paper_id, {}).get("title", "?")),
+ }
+
+ ranked = sorted(paper_scores.items(), key=lambda x: x[1]["total"])
+ summary["bottom_10"] = [
+ {"id": pid, "title": ps["title"], "total": ps["total"], "mean": ps["mean"]}
+ for pid, ps in ranked[:10]
+ ]
+ summary["top_10"] = [
+ {"id": pid, "title": ps["title"], "total": ps["total"], "mean": ps["mean"]}
+ for pid, ps in ranked[-10:]
+ ][::-1]
+
+ # Red flags
+ all_flags = []
+ for paper_id, scan in scans.items():
+ for rf in scan.get("red_flags", []):
+ all_flags.append(rf["flag"])
+ summary["red_flag_counts"] = dict(Counter(all_flags).most_common(20))
+
+ # Methodology tag distribution
+ all_method_tags = []
+ for scan in scans.values():
+ all_method_tags.extend(scan.get("methodology_tags", []))
+ summary["methodology_tag_counts"] = dict(Counter(all_method_tags).most_common())
+
+ # Topic tag distribution (from registry)
+ all_topic_tags = []
+ for paper_id in scans:
+ if paper_id in registry:
+ all_topic_tags.extend(registry[paper_id].get("tags", []))
+ summary["topic_tag_counts"] = dict(Counter(all_topic_tags).most_common())
+
+ # Claims support breakdown
+ support_counts = Counter()
+ for scan in scans.values():
+ for claim in scan.get("claims", []):
+ support_counts[claim.get("supported", "unknown")] += 1
+ summary["claim_support"] = dict(support_counts)
+
+ # Year breakdown of scores
+ year_scores = defaultdict(list)
+ for paper_id, ps in paper_scores.items():
+ year = registry.get(paper_id, {}).get("year") or scans[paper_id].get("paper", {}).get("year")
+ if year:
+ year_scores[year].append(ps["mean"])
+ summary["mean_score_by_year"] = {
+ str(y): round(sum(s) / len(s), 2) for y, s in sorted(year_scores.items())
+ }
+
+ # Citation-chased papers count
+ total_cited = 0
+ for scan in scans.values():
+ total_cited += len(scan.get("cited_papers", []))
+ summary["total_cited_papers_extracted"] = total_cited
+
+ return summary
+
+
+def render_markdown(summary):
+ lines = ["# Survey Summary\n"]
+ lines.append(f"Registry: {summary['total_registry']} papers")
+ lines.append(f"Scanned: {summary['total_scanned']} papers\n")
+
+ if summary.get("note"):
+ lines.append(f"*{summary['note']}*\n")
+ return "\n".join(lines)
+
+ status = summary.get("status_counts", {})
+ lines.append("## Pipeline Status\n")
+ for s, c in sorted(status.items()):
+ lines.append(f"- {s}: {c}")
+ lines.append("")
+
+ dims = summary.get("dimensions", {})
+ if dims:
+ lines.append("## Rubric Score Distributions\n")
+ lines.append("| Dimension | Mean | Absent | Weak | Adequate | Strong | N |")
+ lines.append("|-----------|------|--------|------|----------|--------|---|")
+ for dim in RUBRIC_DIMENSIONS:
+ if dim not in dims:
+ continue
+ d = dims[dim]
+ dist = d["distribution"]
+ label = dim.replace("_", " ").title()
+ lines.append(
+ f"| {label} | {d['mean']} | {dist['absent']} | "
+ f"{dist['weak']} | {dist['adequate']} | {dist['strong']} | {d['n']} |"
+ )
+ lines.append("")
+
+ if summary.get("bottom_10"):
+ lines.append("## Bottom 10 (Lowest Total Scores)\n")
+ for p in summary["bottom_10"]:
+ lines.append(f"1. **{p['title']}** ({p['id']}) — {p['total']}/18, mean {p['mean']}")
+ lines.append("")
+
+ if summary.get("top_10"):
+ lines.append("## Top 10 (Highest Total Scores)\n")
+ for p in summary["top_10"]:
+ lines.append(f"1. **{p['title']}** ({p['id']}) — {p['total']}/18, mean {p['mean']}")
+ lines.append("")
+
+ if summary.get("red_flag_counts"):
+ lines.append("## Most Common Red Flags\n")
+ for flag, count in summary["red_flag_counts"].items():
+ lines.append(f"- {flag}: {count}")
+ lines.append("")
+
+ if summary.get("claim_support"):
+ lines.append("## Claim Support Breakdown\n")
+ for level, count in summary["claim_support"].items():
+ lines.append(f"- {level}: {count}")
+ lines.append("")
+
+ if summary.get("methodology_tag_counts"):
+ lines.append("## Methodology Types\n")
+ for tag, count in summary["methodology_tag_counts"].items():
+ lines.append(f"- {tag}: {count}")
+ lines.append("")
+
+ if summary.get("mean_score_by_year"):
+ lines.append("## Mean Score by Year\n")
+ for year, score in summary["mean_score_by_year"].items():
+ lines.append(f"- {year}: {score}")
+ lines.append("")
+
+ return "\n".join(lines)
+
+
+def main():
+ ANALYSIS_DIR.mkdir(exist_ok=True)
+
+ registry = load_registry()
+ scans = load_scans()
+ summary = compute_summary(registry, scans)
+
+ json_path = ANALYSIS_DIR / "summary.json"
+ md_path = ANALYSIS_DIR / "summary.md"
+
+ with open(json_path, "w") as f:
+ json.dump(summary, f, indent=2, ensure_ascii=False)
+ print(f"Wrote {json_path}")
+
+ md = render_markdown(summary)
+ with open(md_path, "w") as f:
+ f.write(md)
+ print(f"Wrote {md_path}")
+
+ # Print a quick overview
+ print(f"\n{summary['total_scanned']}/{summary['total_registry']} papers scanned")
+ if summary.get("dimensions"):
+ print("Mean scores:")
+ for dim in RUBRIC_DIMENSIONS:
+ if dim in summary["dimensions"]:
+ label = dim.replace("_", " ").title()
+ print(f" {label}: {summary['dimensions'][dim]['mean']}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/extract-text.py b/scripts/extract-text.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+"""
+Extract text from paper PDFs. Uses pymupdf (fast, free) with optional
+Sonnet fallback for papers where extraction quality is poor.
+
+Output: papers/<slug>/paper.txt co-located with the PDF.
+
+Usage:
+ python scripts/extract-text.py # All downloaded papers
+ python scripts/extract-text.py --id metr-rct-2025 # Specific paper
+ python scripts/extract-text.py --force # Re-extract even if .txt exists
+ python scripts/extract-text.py --dry-run # Show what would be extracted
+"""
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+# Repository root: this script lives in <root>/scripts/, hence two levels up.
+ROOT = Path(__file__).resolve().parent.parent
+REGISTRY_PATH = ROOT / "registry.jsonl"
+PAPERS_DIR = ROOT / "papers"
+
+# Heuristics for bad extraction (thresholds applied by quality_check;
+# MIN_CHARS is also the minimum length accepted from the Sonnet fallback)
+MIN_CHARS = 500 # Fewer characters than this = extraction probably failed
+MIN_WORDS_PER_PAGE = 30 # Fewer words per page than this = probably garbled
+MAX_GARBLE_RATIO = 0.15 # More than 15% non-ASCII, non-alphabetic characters = probably garbled
+
+
+def load_registry():
+ entries = []
+ with open(REGISTRY_PATH) as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ entries.append(json.loads(line))
+ return entries
+
+
+def extract_with_pymupdf(pdf_path):
+ """Extract text using pymupdf. Returns (text, page_count) or raises."""
+ import fitz
+ doc = fitz.open(str(pdf_path))
+ pages = []
+ for page in doc:
+ pages.append(page.get_text())
+ doc.close()
+ return "\n\n".join(pages), len(pages)
+
+
+def quality_check(text, page_count):
+ """Check if extracted text looks reasonable. Returns (ok, reason)."""
+ if len(text) < MIN_CHARS:
+ return False, f"too short ({len(text)} chars)"
+
+ words = text.split()
+ words_per_page = len(words) / max(page_count, 1)
+ if words_per_page < MIN_WORDS_PER_PAGE:
+ return False, f"too few words per page ({words_per_page:.0f})"
+
+ non_ascii = sum(1 for c in text if ord(c) > 127 and not c.isalpha())
+ garble_ratio = non_ascii / max(len(text), 1)
+ if garble_ratio > MAX_GARBLE_RATIO:
+ return False, f"too much garble ({garble_ratio:.1%} non-ASCII)"
+
+ return True, "ok"
+
+
+def extract_with_sonnet(pdf_path, txt_path):
+ """Fall back to Claude Sonnet for text extraction."""
+ print(" Falling back to Sonnet for extraction...")
+ prompt = (
+ "Extract all text content from this PDF. Preserve the structure: "
+ "section headings, paragraphs, lists, tables (as plain text), "
+ "figure captions, and references. Do not summarize or omit anything. "
+ "Output plain text only, no markdown formatting."
+ )
+ try:
+ result = subprocess.run(
+ [
+ "claude", "-p", prompt,
+ "--model", "sonnet",
+ "--output-format", "text",
+ str(pdf_path),
+ ],
+ capture_output=True, text=True, timeout=300,
+ )
+ if result.returncode == 0 and len(result.stdout.strip()) > MIN_CHARS:
+ return result.stdout.strip()
+ else:
+ print(f" Sonnet extraction failed: exit={result.returncode}, "
+ f"len={len(result.stdout.strip())}")
+ if result.stderr:
+ print(f" stderr: {result.stderr[:200]}")
+ return None
+ except FileNotFoundError:
+ print(" 'claude' CLI not found, cannot fall back to Sonnet")
+ return None
+ except subprocess.TimeoutExpired:
+ print(" Sonnet extraction timed out (300s)")
+ return None
+
+
+def main():
+ args = sys.argv[1:]
+ dry_run = "--dry-run" in args
+ force = "--force" in args
+ specific_id = None
+ for i, arg in enumerate(args):
+ if arg == "--id" and i + 1 < len(args):
+ specific_id = args[i + 1]
+
+ entries = load_registry()
+
+ candidates = []
+ for entry in entries:
+ if specific_id and entry["id"] != specific_id:
+ continue
+ if entry["status"] not in ("downloaded", "scanned", "deep_eval") and not specific_id:
+ continue
+ pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
+ txt_path = PAPERS_DIR / entry["id"] / "paper.txt"
+ if not pdf_path.exists():
+ continue
+ if txt_path.exists() and not force:
+ continue
+ candidates.append(entry)
+
+ if not candidates:
+ print("No papers to extract.")
+ return
+
+ print(f"{'Would extract' if dry_run else 'Extracting'} {len(candidates)} paper(s):\n")
+
+ extracted = 0
+ fallback = 0
+ failed = 0
+
+ for i, entry in enumerate(candidates):
+ pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
+ txt_path = PAPERS_DIR / entry["id"] / "paper.txt"
+ print(f"[{i+1}/{len(candidates)}] {entry['id']}")
+
+ if dry_run:
+ continue
+
+ # Try pymupdf first
+ try:
+ text, page_count = extract_with_pymupdf(pdf_path)
+ ok, reason = quality_check(text, page_count)
+
+ if ok:
+ txt_path.write_text(text, encoding="utf-8")
+ print(f" OK: {len(text)} chars, {page_count} pages")
+ extracted += 1
+ continue
+ else:
+ print(f" pymupdf quality check failed: {reason}")
+ except Exception as e:
+ print(f" pymupdf error: {e}")
+
+ # Fall back to Sonnet
+ sonnet_text = extract_with_sonnet(pdf_path, txt_path)
+ if sonnet_text:
+ txt_path.write_text(sonnet_text, encoding="utf-8")
+ print(f" OK (via Sonnet): {len(sonnet_text)} chars")
+ extracted += 1
+ fallback += 1
+ else:
+ print(f" FAILED: could not extract text")
+ failed += 1
+
+ if not dry_run:
+ print(f"\nDone. Extracted: {extracted} (pymupdf: {extracted - fallback}, "
+ f"sonnet fallback: {fallback}), Failed: {failed}")
+
+
+if __name__ == "__main__":
+ main()