extract-text.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

extract-text.py (6752B)
      1 #!/usr/bin/env python3
      2 """
      3 Extract text from paper PDFs. Uses pymupdf (fast, free) with optional
      4 Sonnet fallback for papers where extraction quality is poor.
      5 
Output: papers/<id>/paper.txt co-located with the PDF (<id> is the registry id).
      7 
      8 Usage:
      9     python scripts/extract-text.py                    # All downloaded papers
     10     python scripts/extract-text.py --id metr-rct-2025 # Specific paper
     11     python scripts/extract-text.py --force             # Re-extract even if .txt exists
     12     python scripts/extract-text.py --dry-run           # Show what would be extracted
     13 """
     14 
     15 import json
     16 import subprocess
     17 import sys
     18 from pathlib import Path
     19 
# Repository root: two levels up from this file (the script's directory's parent).
ROOT = Path(__file__).resolve().parent.parent
# Registry file, parsed as one JSON object per non-blank line.
REGISTRY_PATH = ROOT / "registry.jsonl"
# Each paper lives in papers/<id>/ (paper.pdf as input, paper.txt as output).
PAPERS_DIR = ROOT / "papers"

# Heuristics for bad extraction
MIN_CHARS = 500          # Shorter than this = probably failed
MIN_WORDS_PER_PAGE = 30  # Fewer than this per page = probably garbled
MAX_GARBLE_RATIO = 0.15  # More than 15% non-ASCII, non-alphabetic chars = probably garbled
     28 
     29 
     30 def load_registry():
     31     entries = []
     32     with open(REGISTRY_PATH) as f:
     33         for line in f:
     34             line = line.strip()
     35             if line:
     36                 entries.append(json.loads(line))
     37     return entries
     38 
     39 
     40 def extract_with_pymupdf(pdf_path):
     41     """Extract text using pymupdf. Returns (text, page_count) or raises."""
     42     import fitz
     43     doc = fitz.open(str(pdf_path))
     44     pages = []
     45     for page in doc:
     46         pages.append(page.get_text())
     47     doc.close()
     48     return "\n\n".join(pages), len(pages)
     49 
     50 
     51 def quality_check(text, page_count):
     52     """Check if extracted text looks reasonable. Returns (ok, reason)."""
     53     if len(text) < MIN_CHARS:
     54         return False, f"too short ({len(text)} chars)"
     55 
     56     words = text.split()
     57     words_per_page = len(words) / max(page_count, 1)
     58     if words_per_page < MIN_WORDS_PER_PAGE:
     59         return False, f"too few words per page ({words_per_page:.0f})"
     60 
     61     non_ascii = sum(1 for c in text if ord(c) > 127 and not c.isalpha())
     62     garble_ratio = non_ascii / max(len(text), 1)
     63     if garble_ratio > MAX_GARBLE_RATIO:
     64         return False, f"too much garble ({garble_ratio:.1%} non-ASCII)"
     65 
     66     return True, "ok"
     67 
     68 
     69 def extract_with_sonnet(pdf_path, txt_path):
     70     """Fall back to Claude Sonnet for text extraction."""
     71     print("  Falling back to Sonnet for extraction...")
     72     prompt = (
     73         "Extract all text content from this PDF. Preserve the structure: "
     74         "section headings, paragraphs, lists, tables (as plain text), "
     75         "figure captions, and references. Do not summarize or omit anything. "
     76         "Output plain text only, no markdown formatting."
     77     )
     78     try:
     79         result = subprocess.run(
     80             [
     81                 "claude", "-p", prompt,
     82                 "--model", "sonnet",
     83                 "--output-format", "text",
     84                 str(pdf_path),
     85             ],
     86             capture_output=True, text=True, timeout=300,
     87         )
     88         if result.returncode == 0 and len(result.stdout.strip()) > MIN_CHARS:
     89             return result.stdout.strip()
     90         else:
     91             print(f"  Sonnet extraction failed: exit={result.returncode}, "
     92                   f"len={len(result.stdout.strip())}")
     93             if result.stderr:
     94                 print(f"  stderr: {result.stderr[:200]}")
     95             return None
     96     except FileNotFoundError:
     97         print("  'claude' CLI not found, cannot fall back to Sonnet")
     98         return None
     99     except subprocess.TimeoutExpired:
    100         print("  Sonnet extraction timed out (300s)")
    101         return None
    102 
    103 
    104 def main():
    105     args = sys.argv[1:]
    106     dry_run = "--dry-run" in args
    107     force = "--force" in args
    108     specific_id = None
    109     for i, arg in enumerate(args):
    110         if arg == "--id" and i + 1 < len(args):
    111             specific_id = args[i + 1]
    112 
    113     entries = load_registry()
    114 
    115     candidates = []
    116     for entry in entries:
    117         if specific_id and entry["id"] != specific_id:
    118             continue
    119         if entry["status"] not in ("downloaded", "scanned", "deep_eval") and not specific_id:
    120             continue
    121         pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
    122         txt_path = PAPERS_DIR / entry["id"] / "paper.txt"
    123         if not pdf_path.exists():
    124             continue
    125         if txt_path.exists() and not force:
    126             continue
    127         candidates.append(entry)
    128 
    129     if not candidates:
    130         print("No papers to extract.")
    131         return
    132 
    133     print(f"{'Would extract' if dry_run else 'Extracting'} {len(candidates)} paper(s):\n")
    134 
    135     extracted = 0
    136     fallback = 0
    137     failed = 0
    138 
    139     for i, entry in enumerate(candidates):
    140         pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
    141         txt_path = PAPERS_DIR / entry["id"] / "paper.txt"
    142         print(f"[{i+1}/{len(candidates)}] {entry['id']}")
    143 
    144         if dry_run:
    145             continue
    146 
    147         # Try pymupdf first
    148         try:
    149             text, page_count = extract_with_pymupdf(pdf_path)
    150             ok, reason = quality_check(text, page_count)
    151 
    152             if ok:
    153                 txt_path.write_text(text, encoding="utf-8")
    154                 print(f"  OK: {len(text)} chars, {page_count} pages")
    155                 extracted += 1
    156                 continue
    157             else:
    158                 print(f"  pymupdf quality check failed: {reason}")
    159         except Exception as e:
    160             print(f"  pymupdf error: {e}")
    161 
    162         # Fall back to Sonnet
    163         sonnet_text = extract_with_sonnet(pdf_path, txt_path)
    164         if sonnet_text:
    165             txt_path.write_text(sonnet_text, encoding="utf-8")
    166             print(f"  OK (via Sonnet): {len(sonnet_text)} chars")
    167             extracted += 1
    168             fallback += 1
    169         else:
    170             print(f"  FAILED: could not extract text")
    171             failed += 1
    172 
    173     if not dry_run:
    174         print(f"\nDone. Extracted: {extracted} (pymupdf: {extracted - fallback}, "
    175               f"sonnet fallback: {fallback}), Failed: {failed}")
    176 
    177         # Write failure log
    178         if failed > 0:
    179             failed_entries = [
    180                 entry for entry in candidates
    181                 if not (PAPERS_DIR / entry["id"] / "paper.txt").exists()
    182             ]
    183             failure_path = ROOT / "extraction-failures.txt"
    184             with open(failure_path, "w") as f:
    185                 f.write(f"# Text extraction failures ({len(failed_entries)} total)\n")
    186                 f.write(f"# pymupdf failed quality check and Sonnet fallback also failed.\n")
    187                 f.write(f"# These papers need manual text extraction or alternative tools.\n\n")
    188                 for e in failed_entries:
    189                     f.write(f"{e['id']}\n")
    190                     f.write(f"  {e['title']}\n")
    191                     f.write(f"  papers/{e['id']}/paper.pdf\n\n")
    192             print(f"Failure log written to {failure_path}")
    193 
    194 
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs