import json
import os

from datasets import load_dataset
from tqdm import tqdm

TARGET_DIR = "data/openwebtext"
os.makedirs(TARGET_DIR, exist_ok=True)


def fetch_subset(total=20000, chunk_size=5000, split="train"):
    """Stream samples and shard them into JSONL files of at most `chunk_size` lines each."""
    # stas/openwebtext-10k holds roughly 10k samples, so `total` is effectively
    # capped at the dataset size.
    ds = load_dataset("stas/openwebtext-10k", split=split)
    print(f"[INFO] Total to fetch: {total} | Chunk size: {chunk_size}")

    count = 0
    file_index = 0
    f = None
    for item in tqdm(ds, desc="Downloading"):
        # Roll over to a new shard file every `chunk_size` samples.
        if count % chunk_size == 0:
            if f:
                f.close()
            file_path = os.path.join(TARGET_DIR, f"owt_{file_index:05d}.jsonl")
            f = open(file_path, "w", encoding="utf-8")
            print(f"[INFO] ➕ Created {file_path}")
            file_index += 1
        # One JSON object per line keeps the shards valid JSONL.
        f.write(json.dumps({"text": item["text"]}, ensure_ascii=False) + "\n")
        count += 1
        if count >= total:
            break

    if f:
        f.close()
    print(f"[INFO] ✅ Done. {count} samples across {file_index} files.")


if __name__ == "__main__":
    fetch_subset(total=100000, chunk_size=5000)
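

# A minimal usage sketch (not part of the original script): reloading the shards
# with the `datasets` JSON loader. `load_shards` and the glob pattern are
# illustrative names, assuming the shards written above are valid JSONL.
def load_shards():
    # Glob over every shard produced by fetch_subset() and load them as one split.
    shards = load_dataset(
        "json",
        data_files=os.path.join(TARGET_DIR, "owt_*.jsonl"),
        split="train",
    )
    print(f"[INFO] Reloaded {len(shards)} samples from {TARGET_DIR}.")
    return shards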