Updated the model capacity

This commit is contained in:
2025-06-30 18:08:11 -04:00
parent 159be1eb82
commit 6366f72716
6 changed files with 95 additions and 10058 deletions

View File

@ -1,5 +1,3 @@
# tools/openwebtext_fetcher.py
from datasets import load_dataset
import os
from tqdm import tqdm
@ -8,14 +6,29 @@ TARGET_DIR = "data/openwebtext"
# Ensure the output directory exists; exist_ok=True makes this a no-op on re-runs.
os.makedirs(TARGET_DIR, exist_ok=True)
def fetch_subset(total=20000, chunk_size=5000, split="train"):
    """Fetch up to *total* OpenWebText examples into chunked JSONL files.

    Streams the ``stas/openwebtext-10k`` dataset and writes one example per
    line into files named ``owt_00000.jsonl``, ``owt_00001.jsonl``, ... under
    ``TARGET_DIR``, with at most ``chunk_size`` examples per file. Embedded
    newlines inside a sample are flattened to spaces so each sample stays on
    one line.

    Args:
        total: Maximum number of examples to write.
        chunk_size: Examples per output file; must be >= 1.
        split: Dataset split passed to ``load_dataset``.

    Returns:
        int: Number of examples actually written. May be fewer than ``total``
        when the source dataset is smaller (the 10k subset caps this).
    """
    if chunk_size < 1:
        # Fail loudly instead of the opaque ZeroDivisionError from `count % 0`.
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    ds = load_dataset("stas/openwebtext-10k", split=split)
    print(f"[INFO] Total to fetch: {total} | Chunk size: {chunk_size}")
    count = 0
    file_index = 0
    f = None
    try:
        for item in tqdm(ds, desc="Downloading"):
            # Roll over to a fresh chunk file every `chunk_size` examples.
            if count % chunk_size == 0:
                if f:
                    f.close()
                file_path = os.path.join(TARGET_DIR, f"owt_{file_index:05d}.jsonl")
                f = open(file_path, "w", encoding="utf-8")
                print(f"[INFO] Created {file_path}")
                file_index += 1
            # chr(10) == "\n": flatten multi-line samples to a single line.
            f.write(f"{item['text'].replace(chr(10), ' ')}\n")
            count += 1
            if count >= total:
                break
    finally:
        # Close the last (possibly partial) chunk even if the loop raised,
        # so no file handle leaks and buffered data is flushed.
        if f:
            f.close()
    print(f"[INFO] ✅ Done. {count} samples across {file_index} files.")
    return count
if __name__ == "__main__":
    # Request 100K examples in 5K-example chunk files; the actual count is
    # capped by the dataset size (the -10k subset holds fewer than 100K).
    fetch_subset(total=100000, chunk_size=5000)