Files
Catlin/tools/openwebtext_fetcher.py
2025-06-29 12:36:25 -04:00

22 lines
633 B
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# tools/openwebtext_fetcher.py
import os
from itertools import islice

from datasets import load_dataset
from tqdm import tqdm
# Directory that fetch_subset writes its output file into. The path is
# relative, so it resolves against the current working directory.
TARGET_DIR = "data/openwebtext"
# Runs at import time; exist_ok=True makes this a no-op when the directory
# already exists.
os.makedirs(TARGET_DIR, exist_ok=True)
def fetch_subset(n=10000, split="train"):
    """Write up to ``n`` examples of ``stas/openwebtext-10k`` to a text file.

    The output file is ``TARGET_DIR/owt_{n}.jsonl`` and is overwritten if it
    already exists. Despite the ``.jsonl`` extension, each line is the
    example's raw ``text`` field with line feeds replaced by spaces, not a
    JSON object; that on-disk format is kept unchanged here.

    Args:
        n: Maximum number of examples to write. Values below 1 produce an
            empty file. The file name always uses the requested ``n``, even
            when the split holds fewer examples than that.
        split: Dataset split name passed through to ``load_dataset``.
    """
    ds = load_dataset("stas/openwebtext-10k", split=split)

    # islice rejects a negative stop, so clamp; this also makes n <= 0 write
    # nothing instead of one stray record.
    limit = max(n, 0)
    try:
        # Keep the progress bar honest when the split is smaller than n.
        total = min(limit, len(ds))
    except TypeError:
        # The dataset object does not support len(); fall back to the
        # requested count for the progress bar.
        total = limit

    out_path = os.path.join(TARGET_DIR, f"owt_{n}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for item in tqdm(islice(ds, limit), total=total, desc="Writing JSONL"):
            # One record per line: flatten embedded line feeds to spaces.
            text = item["text"].replace("\n", " ")
            f.write(f"{text}\n")
if __name__ == "__main__":
    # Script entry point: request 20K examples (the original note estimates
    # roughly 100MB of output).
    fetch_subset(n=20000)