# tools/openwebtext_fetcher.py
import json
import os

from datasets import load_dataset
from tqdm import tqdm

TARGET_DIR = "data/openwebtext"
os.makedirs(TARGET_DIR, exist_ok=True)


def fetch_subset(n=10000, split="train"):
    """Fetch up to `n` OpenWebText documents and write them as JSON Lines."""
    ds = load_dataset("stas/openwebtext-10k", split=split)
    out_path = os.path.join(TARGET_DIR, f"owt_{n}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for i, item in tqdm(enumerate(ds), total=min(n, len(ds)), desc="Writing JSONL"):
            # json.dumps escapes embedded newlines, so each record stays on one line.
            f.write(json.dumps({"text": item["text"]}, ensure_ascii=False) + "\n")
            if i + 1 >= n:
                break


if __name__ == "__main__":
    # The source dataset (stas/openwebtext-10k) contains only 10K documents,
    # so requesting 20K writes everything that is available.
    fetch_subset(20000)
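
# Optional sanity check, as a minimal sketch: after running
#   python tools/openwebtext_fetcher.py
# every line of the output should parse as a JSON object with a "text" field.
# The path below assumes the default fetch_subset(20000) call above.
#
#   import json
#   with open("data/openwebtext/owt_20000.jsonl", encoding="utf-8") as f:
#       for line_no, line in enumerate(f, 1):
#           record = json.loads(line)  # raises json.JSONDecodeError on a bad line
#           assert "text" in record, f"line {line_no} is missing the 'text' field"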