# tools/openwebtext_fetcher.py
import os
from itertools import islice

from datasets import load_dataset
from tqdm import tqdm
# Output directory for the exported dataset files, relative to the current
# working directory.
TARGET_DIR = "data/openwebtext"
# Created at import time so fetch_subset can open files inside it directly;
# exist_ok=True makes repeated imports/runs harmless.
os.makedirs(TARGET_DIR, exist_ok=True)
def fetch_subset(n=10000, split="train"):
    """Write up to ``n`` documents from the dataset, one document per line.

    Loads ``stas/openwebtext-10k`` through ``load_dataset`` and writes the
    ``text`` field of the first ``n`` records to ``TARGET_DIR/owt_{n}.jsonl``.

    NOTE(review): despite the ``.jsonl`` extension and the progress label,
    each line is the raw document text, not a JSON object. The format is kept
    as-is so existing readers of the file keep working -- confirm whether
    real JSON Lines output is wanted.

    Args:
        n: Maximum number of records to write. Values <= 0 produce an empty
            file. The file name always uses the requested ``n``, even when
            the dataset supplies fewer records.
        split: Dataset split name passed to ``load_dataset``.
    """
    ds = load_dataset("stas/openwebtext-10k", split=split)

    limit = max(n, 0)
    try:
        # Cap the progress-bar total at what the dataset can actually supply.
        total = min(limit, len(ds))
    except TypeError:
        # len() raises TypeError for objects without a length (e.g. a
        # streaming-style dataset); fall back to the requested count.
        total = limit

    # A bare "\r" is treated as a line ending when the file is read back in
    # text mode, so it is flattened to a space just like "\n".
    line_breaks = str.maketrans({"\n": " ", "\r": " "})

    out_path = os.path.join(TARGET_DIR, f"owt_{n}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        # islice yields nothing for limit == 0; the previous loop checked the
        # limit only after writing, so it always emitted at least one record.
        for item in tqdm(islice(ds, limit), total=total, desc="Writing JSONL"):
            f.write(item["text"].translate(line_breaks) + "\n")
if __name__ == "__main__":
    # Request 20,000 documents (the original note estimated ~100 MB of output;
    # unverified).
    # NOTE(review): the dataset id used by fetch_subset ends in "-10k", so
    # fewer than 20,000 rows may be available -- confirm the expected count.
    fetch_subset(20000)