Updated the model capacity

This commit is contained in:
2025-06-30 18:08:11 -04:00
parent 159be1eb82
commit 6366f72716
6 changed files with 95 additions and 10058 deletions

View File

@ -1,5 +1,3 @@
# tools/openwebtext_fetcher.py
from datasets import load_dataset
import os
from tqdm import tqdm
@ -8,14 +6,29 @@ TARGET_DIR = "data/openwebtext"
# Ensure the output directory exists; exist_ok=True makes this a no-op on re-runs.
os.makedirs(TARGET_DIR, exist_ok=True)
def fetch_subset(total=20000, chunk_size=5000, split="train"):
    """Fetch up to *total* OpenWebText examples into chunked JSONL files.

    Streams the ``stas/openwebtext-10k`` dataset and writes one example per
    line into files named ``owt_00000.jsonl``, ``owt_00001.jsonl``, ... under
    ``TARGET_DIR``, with at most ``chunk_size`` examples per file. Embedded
    newlines inside a sample are flattened to spaces so each sample stays on
    one line.

    Args:
        total: Maximum number of examples to write.
        chunk_size: Examples per output file; must be >= 1.
        split: Dataset split passed to ``load_dataset``.

    Returns:
        int: Number of examples actually written. May be fewer than ``total``
        when the source dataset is smaller (the 10k subset caps this).
    """
    if chunk_size < 1:
        # Fail loudly instead of the opaque ZeroDivisionError from `count % 0`.
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    ds = load_dataset("stas/openwebtext-10k", split=split)
    print(f"[INFO] Total to fetch: {total} | Chunk size: {chunk_size}")
    count = 0
    file_index = 0
    f = None
    try:
        for item in tqdm(ds, desc="Downloading"):
            # Roll over to a fresh chunk file every `chunk_size` examples.
            if count % chunk_size == 0:
                if f:
                    f.close()
                file_path = os.path.join(TARGET_DIR, f"owt_{file_index:05d}.jsonl")
                f = open(file_path, "w", encoding="utf-8")
                print(f"[INFO] Created {file_path}")
                file_index += 1
            # chr(10) == "\n": flatten multi-line samples to a single line.
            f.write(f"{item['text'].replace(chr(10), ' ')}\n")
            count += 1
            if count >= total:
                break
    finally:
        # Close the last (possibly partial) chunk even if the loop raised,
        # so no file handle leaks and buffered data is flushed.
        if f:
            f.close()
    print(f"[INFO] ✅ Done. {count} samples across {file_index} files.")
    return count
if __name__ == "__main__":
    # Request 100K examples in 5K-example chunk files; the actual count is
    # capped by the dataset size (the -10k subset holds fewer than 100K).
    fetch_subset(total=100000, chunk_size=5000)