extract-text.py (6752B)
#!/usr/bin/env python3
"""
Extract text from paper PDFs. Uses pymupdf (fast, free) with optional
Sonnet fallback for papers where extraction quality is poor.

Output: papers/<slug>/paper.txt co-located with the PDF.

Usage:
    python scripts/extract-text.py                     # All downloaded papers
    python scripts/extract-text.py --id metr-rct-2025  # Specific paper
    python scripts/extract-text.py --force             # Re-extract even if .txt exists
    python scripts/extract-text.py --dry-run           # Show what would be extracted
"""

import json
import subprocess
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
REGISTRY_PATH = ROOT / "registry.jsonl"
PAPERS_DIR = ROOT / "papers"

# Heuristics for bad extraction
MIN_CHARS = 500  # Shorter than this = probably failed
MIN_WORDS_PER_PAGE = 30  # Fewer than this per page = probably garbled
MAX_GARBLE_RATIO = 0.15  # More than 15% non-ASCII, non-letter chars = probably garbled

# Registry statuses that are processed when no --id is given.
EXTRACTABLE_STATUSES = ("downloaded", "scanned", "deep_eval")

# Upper bound, in seconds, on a single Sonnet fallback invocation.
SONNET_TIMEOUT_SECONDS = 300


def load_registry():
    """Read the JSONL registry and return its entries as a list of dicts.

    Blank lines are skipped. The file is read as UTF-8 so the result does
    not depend on the platform's default locale encoding.

    Raises:
        OSError: if the registry file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(REGISTRY_PATH, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def extract_with_pymupdf(pdf_path):
    """Extract text using pymupdf. Returns (text, page_count) or raises.

    Pages are joined with a blank line between them.
    """
    # Imported lazily so --dry-run and the Sonnet fallback still work on a
    # machine where pymupdf is not installed.
    import fitz

    doc = fitz.open(str(pdf_path))
    try:
        pages = [page.get_text() for page in doc]
    finally:
        # Close the document even when a page fails to render to text.
        doc.close()
    return "\n\n".join(pages), len(pages)


def quality_check(text, page_count):
    """Check if extracted text looks reasonable. Returns (ok, reason).

    Three heuristics are applied in order; the first one that fails
    determines the reason string:
      1. total length below MIN_CHARS
      2. average whitespace-separated words per page below MIN_WORDS_PER_PAGE
      3. share of non-ASCII, non-alphabetic characters above MAX_GARBLE_RATIO
    """
    if len(text) < MIN_CHARS:
        return False, f"too short ({len(text)} chars)"

    # max(..., 1) guards against a zero page count.
    words_per_page = len(text.split()) / max(page_count, 1)
    if words_per_page < MIN_WORDS_PER_PAGE:
        return False, f"too few words per page ({words_per_page:.0f})"

    # Accented and other non-Latin letters are legitimate text, so only
    # non-ASCII characters that are not letters count as garble.
    non_ascii = sum(1 for c in text if ord(c) > 127 and not c.isalpha())
    garble_ratio = non_ascii / max(len(text), 1)
    if garble_ratio > MAX_GARBLE_RATIO:
        return False, f"too much garble ({garble_ratio:.1%} non-ASCII)"

    return True, "ok"


def extract_with_sonnet(pdf_path, txt_path):
    """Fall back to Claude Sonnet for text extraction.

    Runs the `claude` CLI as a subprocess and returns its stripped stdout
    when the process exits with 0 and produced more than MIN_CHARS
    characters. Returns None on any failure (non-zero exit, short output,
    CLI not installed, timeout).

    `txt_path` is accepted for call-site compatibility but is not used;
    the caller is responsible for writing the returned text.
    """
    print(" Falling back to Sonnet for extraction...")
    prompt = (
        "Extract all text content from this PDF. Preserve the structure: "
        "section headings, paragraphs, lists, tables (as plain text), "
        "figure captions, and references. Do not summarize or omit anything. "
        "Output plain text only, no markdown formatting."
    )
    try:
        result = subprocess.run(
            [
                "claude", "-p", prompt,
                "--model", "sonnet",
                "--output-format", "text",
                str(pdf_path),
            ],
            capture_output=True, text=True, timeout=SONNET_TIMEOUT_SECONDS,
        )
    except FileNotFoundError:
        print(" 'claude' CLI not found, cannot fall back to Sonnet")
        return None
    except subprocess.TimeoutExpired:
        print(" Sonnet extraction timed out (300s)")
        return None

    output = result.stdout.strip()
    if result.returncode == 0 and len(output) > MIN_CHARS:
        return output

    print(f" Sonnet extraction failed: exit={result.returncode}, "
          f"len={len(output)}")
    if result.stderr:
        print(f" stderr: {result.stderr[:200]}")
    return None


def parse_args(args):
    """Parse CLI arguments. Returns (dry_run, force, specific_id).

    `specific_id` is None when --id is absent; when --id is repeated the
    last occurrence wins. Exits with an error when --id is the final
    argument: silently ignoring it would process every paper instead of
    the single one the user asked for.
    """
    dry_run = "--dry-run" in args
    force = "--force" in args
    specific_id = None
    for i, arg in enumerate(args):
        if arg != "--id":
            continue
        if i + 1 >= len(args):
            sys.exit("error: --id requires a paper id")
        specific_id = args[i + 1]
    return dry_run, force, specific_id


def select_candidates(entries, specific_id, force):
    """Return the registry entries whose PDF should be extracted.

    An entry is selected when it matches `specific_id` (if given) or has
    one of EXTRACTABLE_STATUSES (otherwise), its paper.pdf exists, and
    either paper.txt does not exist yet or `force` is set.
    """
    candidates = []
    for entry in entries:
        if specific_id:
            if entry["id"] != specific_id:
                continue
        elif entry.get("status") not in EXTRACTABLE_STATUSES:
            continue
        paper_dir = PAPERS_DIR / entry["id"]
        if not (paper_dir / "paper.pdf").exists():
            continue
        if (paper_dir / "paper.txt").exists() and not force:
            continue
        candidates.append(entry)
    return candidates


def write_failure_log(failed_entries, failure_path):
    """Write the ids, titles and PDF paths of `failed_entries` to `failure_path`.

    The file is overwritten and written as UTF-8 so that non-ASCII titles
    do not depend on the platform's default encoding. Entries without a
    "title" key get an empty title line.
    """
    with open(failure_path, "w", encoding="utf-8") as f:
        f.write(f"# Text extraction failures ({len(failed_entries)} total)\n")
        f.write("# pymupdf failed quality check and Sonnet fallback also failed.\n")
        f.write("# These papers need manual text extraction or alternative tools.\n\n")
        for e in failed_entries:
            f.write(f"{e['id']}\n")
            f.write(f" {e.get('title', '')}\n")
            f.write(f" papers/{e['id']}/paper.pdf\n\n")


def main():
    """Extract text for every selected paper and report a summary.

    For each candidate, pymupdf is tried first; when it raises or its
    output fails quality_check, the Sonnet fallback is tried. Papers for
    which both fail are listed in ROOT/extraction-failures.txt.
    """
    dry_run, force, specific_id = parse_args(sys.argv[1:])

    candidates = select_candidates(load_registry(), specific_id, force)
    if not candidates:
        print("No papers to extract.")
        return

    print(f"{'Would extract' if dry_run else 'Extracting'} {len(candidates)} paper(s):\n")

    extracted = 0
    fallback = 0
    # Failures are recorded as they happen. Checking afterwards whether
    # paper.txt exists would miss failures under --force, where a stale
    # paper.txt from a previous run is still on disk.
    failed_entries = []

    for i, entry in enumerate(candidates):
        pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
        txt_path = PAPERS_DIR / entry["id"] / "paper.txt"
        print(f"[{i+1}/{len(candidates)}] {entry['id']}")

        if dry_run:
            continue

        # Try pymupdf first. Only the extraction itself is guarded, so an
        # error while writing paper.txt is not misreported as a pymupdf error.
        try:
            text, page_count = extract_with_pymupdf(pdf_path)
        except Exception as e:
            # Deliberately broad: covers a missing pymupdf install as well
            # as whatever the library raises on a damaged PDF.
            print(f" pymupdf error: {e}")
        else:
            ok, reason = quality_check(text, page_count)
            if ok:
                txt_path.write_text(text, encoding="utf-8")
                print(f" OK: {len(text)} chars, {page_count} pages")
                extracted += 1
                continue
            print(f" pymupdf quality check failed: {reason}")

        # Fall back to Sonnet
        sonnet_text = extract_with_sonnet(pdf_path, txt_path)
        if sonnet_text:
            txt_path.write_text(sonnet_text, encoding="utf-8")
            print(f" OK (via Sonnet): {len(sonnet_text)} chars")
            extracted += 1
            fallback += 1
        else:
            print(" FAILED: could not extract text")
            failed_entries.append(entry)

    if dry_run:
        return

    print(f"\nDone. Extracted: {extracted} (pymupdf: {extracted - fallback}, "
          f"sonnet fallback: {fallback}), Failed: {len(failed_entries)}")

    # Write failure log
    if failed_entries:
        failure_path = ROOT / "extraction-failures.txt"
        write_failure_log(failed_entries, failure_path)
        print(f"Failure log written to {failure_path}")


if __name__ == "__main__":
    main()