Switched the OpenWebText fetcher to chunked multi-file downloads
This commit is contained in:
@ -1,5 +1,3 @@
|
||||
# tools/openwebtext_fetcher.py
|
||||
|
||||
from datasets import load_dataset
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
# Output directory for the fetched OpenWebText chunk files; created up
# front so the writer never has to check for it.
TARGET_DIR = "data/openwebtext"
os.makedirs(TARGET_DIR, exist_ok=True)
|
||||
|
||||
|
||||
def fetch_subset(total=20000, chunk_size=5000, split="train"):
    """Download up to *total* examples of OpenWebText and write them to
    rotating chunk files under TARGET_DIR.

    Each output file ``owt_NNNNN.jsonl`` holds at most *chunk_size*
    examples, one per line, with embedded newlines flattened to spaces.

    Args:
        total: Maximum number of examples to write across all files.
        chunk_size: Number of examples per output file; must be > 0.
        split: Dataset split passed through to ``load_dataset``.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        # A non-positive chunk size would break the rotation arithmetic
        # (count % chunk_size) — fail loudly instead.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # NOTE(review): this dataset mirror only holds ~10K samples, so
    # requesting a larger `total` ends early — confirm intended source.
    ds = load_dataset("stas/openwebtext-10k", split=split)
    print(f"[INFO] Total to fetch: {total} | Chunk size: {chunk_size}")

    count = 0
    file_index = 0
    f = None
    try:
        for item in tqdm(ds, desc="Downloading"):
            # Rotate to a fresh chunk file every `chunk_size` examples.
            if count % chunk_size == 0:
                if f:
                    f.close()
                file_path = os.path.join(TARGET_DIR, f"owt_{file_index:05d}.jsonl")
                f = open(file_path, "w", encoding="utf-8")
                print(f"[INFO] ➕ Created {file_path}")
                file_index += 1
            # NOTE(review): despite the .jsonl extension, lines are raw
            # text (newlines flattened), not JSON objects — kept as-is
            # for compatibility with existing consumers; confirm.
            f.write(f"{item['text'].replace(chr(10), ' ')}\n")
            count += 1
            if count >= total:
                break
    finally:
        # Previously the current chunk file leaked if an exception was
        # raised mid-loop; always close the last open handle.
        if f:
            f.close()
    print(f"[INFO] ✅ Done. {count} samples across {file_index} files.")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: pull 100K examples, rotating output files
    # every 5K examples.
    fetch_subset(total=100000, chunk_size=5000)
Reference in New Issue
Block a user