First good level of progress

2025-06-29 12:36:25 -04:00
commit 159be1eb82
15 changed files with 10628 additions and 0 deletions

tools/book_downloader.py Normal file

@@ -0,0 +1,38 @@
# tools/book_downloader.py
import requests
import os

DATA_DIR = os.path.join("data", "books")
GUTENBERG_URL = "https://www.gutenberg.org/files/{id}/{id}-0.txt"


def download_book(gutenberg_id, title_hint="book"):
    os.makedirs(DATA_DIR, exist_ok=True)
    url = GUTENBERG_URL.format(id=gutenberg_id)
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"❌ Failed to download book ID {gutenberg_id}")
            return
        filename = os.path.join(DATA_DIR, f"{title_hint}_{gutenberg_id}.txt")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"✅ Saved: {filename}")
    except Exception as e:
        print(f"❌ Error: {e}")


if __name__ == "__main__":
    books = [
        (1342, "PrideAndPrejudice"),  # Jane Austen
        (11, "AliceInWonderland"),    # Lewis Carroll
        (98, "AesopFables"),          # Aesop
        (1661, "SherlockHolmes"),     # Doyle
        (76, "HuckFinn"),             # Mark Twain
    ]
    for gutenberg_id, name in books:
        download_book(gutenberg_id, name)
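
Not part of this commit, but a minimal companion sketch: the raw dumps saved by download_book still carry the Project Gutenberg header and license footer, so a later step would likely trim them before use. The marker strings below are assumptions (their exact wording varies per book), and load_clean_books is a hypothetical helper name, not something this repo defines.

# Hypothetical helper (not in this commit): load the saved books and trim the
# Project Gutenberg boilerplate. Marker wording varies per book, so this is a
# best-effort heuristic rather than the project's actual cleaning step.
import glob
import os

DATA_DIR = os.path.join("data", "books")

def load_clean_books(data_dir=DATA_DIR):
    texts = {}
    for path in glob.glob(os.path.join(data_dir, "*.txt")):
        with open(path, encoding="utf-8") as f:
            raw = f.read()
        start = raw.find("*** START OF")  # header marker (wording varies)
        end = raw.find("*** END OF")      # footer marker
        body = raw[start:end] if start != -1 and end != -1 else raw
        texts[os.path.basename(path)] = body
    return texts

if __name__ == "__main__":
    for name, text in sorted(load_clean_books().items()):
        print(f"{name}: {len(text):,} characters")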

tools/openwebtext_fetcher.py Normal file

@@ -0,0 +1,21 @@
# tools/openwebtext_fetcher.py
from datasets import load_dataset
import os
from tqdm import tqdm

TARGET_DIR = "data/openwebtext"
os.makedirs(TARGET_DIR, exist_ok=True)


def fetch_subset(n=10000, split="train"):
    ds = load_dataset("stas/openwebtext-10k", split=split)
    with open(os.path.join(TARGET_DIR, f"owt_{n}.jsonl"), "w", encoding="utf-8") as f:
        for i, item in tqdm(enumerate(ds), total=n, desc="Writing JSONL"):
            # one document per line; internal newlines replaced with spaces
            f.write(f"{item['text'].replace(chr(10), ' ')}\n")
            if i + 1 >= n:
                break


if __name__ == "__main__":
    fetch_subset(20000)  # requests 20K, but the stas/openwebtext-10k subset holds ~10K documents
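
A small read-back sketch (an assumption, not included in the commit): since fetch_subset writes one document per line with internal newlines flattened to spaces, consuming the output is plain line iteration. iter_documents and the default path are hypothetical names chosen for illustration.

# Hypothetical reader (not in this commit) for the file fetch_subset produces:
# one document per line, newlines already replaced with spaces.
import os

TARGET_DIR = "data/openwebtext"

def iter_documents(n=20000, path=None):
    path = path or os.path.join(TARGET_DIR, f"owt_{n}.jsonl")
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield line

if __name__ == "__main__":
    docs = list(iter_documents())
    print(f"Loaded {len(docs)} documents")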