generate.py (5252B)
#!/usr/bin/env python3
"""Generate noise files for context_noise axis.

Creates wikipedia and lorem ipsum text files at 25/50/75% of context window.
Target sizes are measured in characters: 25%=200,000, 50%=400,000,
75%=600,000 (roughly 200KB/400KB/600KB on disk; files are written as UTF-8,
so non-ASCII Wikipedia text can make the byte size slightly larger).

Usage: python3 tasks/tetris/noise/generate.py
"""

import json
import os
import urllib.parse
import urllib.request
from typing import Dict, Iterable, Optional

# Output files are written next to this script by default.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Target sizes in characters, keyed by the level label used in file names.
SIZES = {
    "25": 200_000,
    "50": 400_000,
    "75": 600_000,
}

# Article titles, fetched in this order until enough text has been collected.
WIKI_ARTICLES = [
    "Roman_Empire", "French_Revolution", "World_War_II", "Ancient_Egypt",
    "Quantum_mechanics", "Theory_of_relativity", "Evolution", "DNA",
    "Amazon_rainforest", "Pacific_Ocean", "Himalayas",
    "Philosophy_of_mind", "Ethics", "Epistemology",
    "Computer_science", "Algorithm", "Machine_learning", "Cryptography",
    "Renaissance", "Industrial_Revolution", "Cold_War", "Byzantine_Empire",
    "Solar_System", "Photosynthesis", "Plate_tectonics", "Periodic_table",
    "Democracy", "Capitalism", "Feudalism", "Imperialism",
    "Mozart", "Shakespeare", "Leonardo_da_Vinci", "Isaac_Newton",
    "Olympic_Games", "Chess", "Association_football",
    "Universe", "Black_hole", "Milky_Way", "Jupiter",
]

WIKI_API_URL = "https://en.wikipedia.org/w/api.php"

LOREM_BLOCK = (
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod "
    "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim "
    "veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea "
    "commodo consequat. Duis aute irure dolor in reprehenderit in voluptate "
    "velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint "
    "occaecat cupidatat non proident, sunt in culpa qui officia deserunt "
    "mollit anim id est laborum.\n\n"
    "Sed ut perspiciatis unde omnis iste natus error sit voluptatem "
    "accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae "
    "ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt "
    "explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut "
    "odit aut fugit, sed quia consequuntur magni dolores eos qui ratione "
    "voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum "
    "quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam "
    "eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat "
    "voluptatem.\n\n"
    "At vero eos et accusamus et iusto odio dignissimos ducimus qui "
    "blanditiis praesentium voluptatum deleniti atque corrupti quos dolores "
    "et quas molestias excepturi sint occaecati cupiditate non provident, "
    "similique sunt in culpa qui officia deserunt mollitia animi, id est "
    "laborum et dolorum fuga. Et harum quidem rerum facilis est et expedita "
    "distinctio. Nam libero tempore, cum soluta nobis est eligendi optio "
    "cumque nihil impedit quo minus id quod maxime placeat facere possimus, "
    "omnis voluptas assumenda est, omnis dolor repellendus.\n\n"
)


def _repeat_to_length(text: str, length: int) -> str:
    """Return *text* repeated cyclically and cut to exactly *length* chars.

    Raises:
        ValueError: if *text* is empty while *length* is positive, because
            no amount of repetition can reach the requested length.
    """
    if length <= 0:
        return ""
    if not text:
        raise ValueError("cannot pad an empty string to a positive length")
    repeats = -(-length // len(text))  # ceiling division
    return (text * repeats)[:length]


def _write_noise_file(out_dir: str, filename: str, content: str) -> None:
    """Write *content* to ``out_dir/filename`` and print a size summary."""
    path = os.path.join(out_dir, filename)
    # Explicit UTF-8: Wikipedia text contains non-ASCII characters, and the
    # platform default encoding (e.g. cp1252) may be unable to encode them.
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f" {filename}: {len(content)} chars ({len(content)//1000}KB)")


def fetch_wiki_text(title: str) -> str:
    """Fetch plain text extract of a Wikipedia article.

    Args:
        title: Article title as used in Wikipedia URLs (underscores allowed).

    Returns:
        The plain-text extract, or ``""`` if the request failed or the
        response contained no extract. Failures are reported on stdout and
        never raised, so one bad article does not abort the whole run.
    """
    query = urllib.parse.urlencode({
        "action": "query",
        "titles": title,  # urlencode escapes characters unsafe in a URL
        "prop": "extracts",
        "explaintext": "true",
        # Some configured titles may be redirects (e.g. a short name pointing
        # at the full article); ask the API to resolve them to the target.
        "redirects": "1",
        "format": "json",
    })
    url = f"{WIKI_API_URL}?{query}"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "LoopBench/1.0"})
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
            extract = page.get("extract", "")
            if extract:
                return extract
    except Exception as e:  # deliberate best-effort boundary: report and skip
        print(f" Failed to fetch {title}: {e}")
        return ""
    print(f" No extract returned for {title}")
    return ""


def generate_wikipedia_files(
    out_dir: str = SCRIPT_DIR,
    sizes: Optional[Dict[str, int]] = None,
    articles: Optional[Iterable[str]] = None,
) -> None:
    """Generate wikipedia noise files at 25/50/75% sizes.

    Articles are fetched in order until the largest target size is reached.
    If the fetched text is too short it is repeated cyclically to fill the
    gap. Each output file ``wikipedia_<level>.txt`` is a prefix of that text.

    Args:
        out_dir: Directory to write into (defaults to this script's folder).
        sizes: Mapping of level label to size in characters
            (defaults to ``SIZES``).
        articles: Titles to fetch (defaults to ``WIKI_ARTICLES``).

    Raises:
        RuntimeError: if no article text could be fetched at all. Padding an
            empty string can never reach the target size, so fail loudly
            instead of looping forever.
    """
    sizes = SIZES if sizes is None else sizes
    articles = WIKI_ARTICLES if articles is None else articles
    needed = max(sizes.values(), default=0)

    print("Fetching Wikipedia articles...")
    sections = []
    total = 0
    for title in articles:
        if total >= needed:
            break
        text = fetch_wiki_text(title)
        if text:
            section = f"\n\n--- {title.replace('_', ' ')} ---\n\n{text}"
            sections.append(section)
            total += len(section)
            print(f" {title}: {len(text)} chars (total: {total})")
    all_text = "".join(sections)

    if len(all_text) < needed:
        print(f" Warning: only got {len(all_text)} chars, need {needed}")
        if not all_text:
            raise RuntimeError(
                "no Wikipedia text could be fetched; cannot generate "
                "wikipedia noise files"
            )
        # Pad by repeating
        all_text = _repeat_to_length(all_text, needed)

    for level, target in sizes.items():
        _write_noise_file(out_dir, f"wikipedia_{level}.txt", all_text[:target])


def generate_lorem_files(
    out_dir: str = SCRIPT_DIR,
    sizes: Optional[Dict[str, int]] = None,
) -> None:
    """Generate lorem ipsum noise files at 25/50/75% sizes.

    Each output file ``lorem_<level>.txt`` holds ``LOREM_BLOCK`` repeated and
    truncated to exactly the target number of characters.

    Args:
        out_dir: Directory to write into (defaults to this script's folder).
        sizes: Mapping of level label to size in characters
            (defaults to ``SIZES``).
    """
    sizes = SIZES if sizes is None else sizes
    print("Generating lorem ipsum files...")
    for level, target in sizes.items():
        content = _repeat_to_length(LOREM_BLOCK, target)
        _write_noise_file(out_dir, f"lorem_{level}.txt", content)


def main() -> None:
    """Generate all noise files; lorem first because it needs no network."""
    generate_lorem_files()
    generate_wikipedia_files()
    print("Done.")


if __name__ == "__main__":
    main()