# Catlin/tools/openwebtext_fetcher.py

from datasets import load_dataset
import os
from tqdm import tqdm

# Directory (relative to the current working directory) that receives the
# generated owt_*.jsonl chunk files.
TARGET_DIR = "data/openwebtext"
# Created at import time so fetch_subset() can open files in it right away;
# exist_ok=True makes repeated imports/runs a no-op when it already exists.
os.makedirs(TARGET_DIR, exist_ok=True)


def fetch_subset(total=20000, chunk_size=5000, split="train"):
    """Dump up to ``total`` dataset samples into chunked text files.

    Loads the ``stas/openwebtext-10k`` dataset and writes the ``text`` field
    of each sample as a single line into files named ``owt_00000.jsonl``,
    ``owt_00001.jsonl``, ... inside ``TARGET_DIR``. A new file is started
    every ``chunk_size`` samples. Existing files with the same names are
    overwritten.

    NOTE: despite the ``.jsonl`` extension, each line is the raw sample text
    (with line breaks flattened to spaces), not a JSON object.

    Args:
        total: Maximum number of samples to write. Iteration also stops when
            the dataset is exhausted, so fewer samples may be written. A
            value <= 0 writes nothing.
        chunk_size: Number of samples per output file; must be >= 1.
        split: Dataset split name passed through to ``load_dataset``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Validate before the (potentially slow) dataset load; previously a zero
    # chunk size surfaced as a ZeroDivisionError inside the loop.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")
    if total <= 0:
        # The limit is only checked after a write below, so without this guard
        # one sample would be written even though none were requested.
        print("[INFO] Nothing to fetch (total <= 0).")
        return

    ds = load_dataset("stas/openwebtext-10k", split=split)
    print(f"[INFO] Total to fetch: {total} | Chunk size: {chunk_size}")

    count = 0
    file_index = 0
    f = None

    # try/finally guarantees the current chunk file is closed (and flushed)
    # even if iteration, open() or write() raises part-way through.
    try:
        for item in tqdm(ds, desc="Downloading"):
            if count % chunk_size == 0:
                # Chunk boundary: finish the previous file, start the next one.
                if f is not None:
                    f.close()
                file_path = os.path.join(TARGET_DIR, f"owt_{file_index:05d}.jsonl")
                f = open(file_path, "w", encoding="utf-8")
                print(f"[INFO] ➕ Created {file_path}")
                file_index += 1

            # One sample per line: flatten "\r\n", lone "\r" and "\n" to a
            # space. A stray "\r" would otherwise be seen as a line break by
            # readers using universal newlines, splitting the sample in two.
            text = (
                item["text"]
                .replace("\r\n", "\n")
                .replace("\r", "\n")
                .replace("\n", " ")
            )
            f.write(f"{text}\n")
            count += 1
            if count >= total:
                break
    finally:
        if f is not None:
            f.close()

    # Reported unconditionally so an empty split is visible as "0 samples".
    print(f"[INFO] ✅ Done. {count} samples across {file_index} files.")

# Script entry point: runs only when the file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    # NOTE(review): the dataset id used by fetch_subset ends in "-10k"; if it
    # really holds ~10k rows, total=100000 is never reached and the loop simply
    # stops when the dataset is exhausted — confirm the intended sample count.
    fetch_subset(total=100000, chunk_size=5000)