import json
import os

from datasets import load_dataset
from tqdm import tqdm

TARGET_DIR = "data/openwebtext"
os.makedirs(TARGET_DIR, exist_ok=True)


def fetch_subset(total=20000, chunk_size=5000, split="train"):
    # stas/openwebtext-10k is a 10,000-sample subset of OpenWebText, so the
    # loop ends when either `total` is reached or the dataset is exhausted,
    # whichever comes first.
    ds = load_dataset("stas/openwebtext-10k", split=split)
    print(f"[INFO] Total to fetch: {total} | Chunk size: {chunk_size}")

    count = 0
    file_index = 0
    f = None

    for item in tqdm(ds, desc="Downloading"):
        # Roll over to a new shard every `chunk_size` samples.
        if count % chunk_size == 0:
            if f:
                f.close()
            file_path = os.path.join(TARGET_DIR, f"owt_{file_index:05d}.jsonl")
            f = open(file_path, "w", encoding="utf-8")
            print(f"[INFO] ➕ Created {file_path}")
            file_index += 1
        # Write one JSON object per line so the .jsonl extension is accurate;
        # json.dumps also escapes embedded newlines, instead of lossily
        # replacing them with spaces.
        f.write(json.dumps({"text": item["text"]}, ensure_ascii=False) + "\n")
        count += 1
        if count >= total:
            break

    if f:
        f.close()
    print(f"[INFO] ✅ Done. {count} samples across {file_index} files.")


if __name__ == "__main__":
    fetch_subset(total=100000, chunk_size=5000)
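

# ---------------------------------------------------------------------------
# Reading the shards back: a minimal sketch, not part of the original script.
# It assumes the one-JSON-object-per-line layout written by fetch_subset
# above; the iter_samples name and the glob pattern are illustrative.
# ---------------------------------------------------------------------------
import glob


def iter_samples(pattern="data/openwebtext/owt_*.jsonl"):
    """Yield raw text samples from the shards written by fetch_subset."""
    for path in sorted(glob.glob(pattern)):
        with open(path, encoding="utf-8") as f:
            for line in f:
                yield json.loads(line)["text"]


# Example usage: peek at the first sample without loading everything.
for text in iter_samples():
    print(text[:200])
    break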