First good level of progress
This commit is contained in:
21
tools/openwebtext_fetcher.py
Normal file
21
tools/openwebtext_fetcher.py
Normal file
@ -0,0 +1,21 @@
|
||||
# tools/openwebtext_fetcher.py
|
||||
|
||||
from datasets import load_dataset
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
|
||||
# Directory that receives the exported subset files; the path is relative,
# so it is resolved against the current working directory.
TARGET_DIR = "data/openwebtext"
|
||||
# Runs at import time: make sure the output directory exists before
# fetch_subset opens a file inside it (exist_ok avoids an error on reruns).
os.makedirs(TARGET_DIR, exist_ok=True)
|
||||
|
||||
|
||||
def fetch_subset(n=10000, split="train"):
    """Export up to ``n`` records of ``stas/openwebtext-10k`` to a text file.

    The output is written to ``TARGET_DIR/owt_{n}.jsonl`` with one record per
    line: the record's ``text`` field with its line breaks replaced by spaces.

    NOTE(review): despite the ``.jsonl`` extension and the progress-bar label,
    the lines are raw text, not JSON objects. The format is kept as-is here;
    confirm what downstream readers expect before changing it.

    Args:
        n: Maximum number of records to write. ``0`` produces an empty file.
            If the dataset holds fewer than ``n`` records, all of them are
            written (the file name still uses ``n``).
        split: Dataset split name passed through to ``load_dataset``.

    Raises:
        ValueError: If ``n`` is negative.
    """
    # Function-scope import so this block does not depend on a change to the
    # file's top-level import list.
    from itertools import islice

    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    ds = load_dataset("stas/openwebtext-10k", split=split)

    # The dataset may be smaller than the request; size the progress bar to
    # what will actually be written so it can reach 100%.
    try:
        total = min(n, len(ds))
    except TypeError:
        # Object without a length (e.g. a streamed dataset): fall back to n.
        total = n

    # Do not rely solely on the import-time directory creation.
    os.makedirs(TARGET_DIR, exist_ok=True)
    out_path = os.path.join(TARGET_DIR, f"owt_{n}.jsonl")

    with open(out_path, "w", encoding="utf-8") as f:
        # islice stops *before* reading record n, so n=0 writes nothing
        # (checking the limit after the write always emitted one record).
        for item in tqdm(islice(ds, n), total=total, desc="Writing JSONL"):
            # Flatten every line-break style, not just "\n": a lone "\r"
            # would otherwise split one record across lines on read-back.
            text = (
                item["text"]
                .replace("\r\n", " ")
                .replace("\r", " ")
                .replace("\n", " ")
            )
            f.write(text + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: request 20000 records, which also fixes the output
    # file name to owt_20000.jsonl.
    # NOTE(review): the dataset id "stas/openwebtext-10k" suggests roughly
    # 10k records, so fewer than 20000 lines may be written — confirm the
    # intended size.
    fetch_subset(n=20000)
|
Reference in New Issue
Block a user