First good level of progress
This commit is contained in:
38
tools/book_downloader.py
Normal file
38
tools/book_downloader.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# tools/book_downloader.py
|
||||
|
||||
import requests
|
||||
import os
|
||||
|
||||
# Directory, relative to the current working directory, where books are saved.
DATA_DIR = os.path.join("data", "books")
# URL template for a book's plain-text file; ``{id}`` is filled in twice with
# the Project Gutenberg book ID via ``str.format(id=...)``.
GUTENBERG_URL = "https://www.gutenberg.org/files/{id}/{id}-0.txt"
|
||||
|
||||
|
||||
def download_book(gutenberg_id, title_hint="book"):
    """Download one Project Gutenberg plain-text book into ``DATA_DIR``.

    The book is fetched from ``GUTENBERG_URL`` and saved as
    ``<title_hint>_<gutenberg_id>.txt``, encoded as UTF-8. Failures are
    reported on stdout instead of being raised, so a batch of downloads can
    carry on with the next book.

    Args:
        gutenberg_id: Book ID substituted into ``GUTENBERG_URL``.
        title_hint: Prefix used for the saved file name.

    Returns:
        None, on success and on failure alike.
    """
    os.makedirs(DATA_DIR, exist_ok=True)
    url = GUTENBERG_URL.format(id=gutenberg_id)
    filename = os.path.join(DATA_DIR, f"{title_hint}_{gutenberg_id}.txt")

    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        # Connection problems, timeouts, invalid URLs, ...
        print(f"❌ Error: {e}")
        return

    if response.status_code != 200:
        print(f"❌ Failed to download book ID {gutenberg_id}")
        return

    # When a text/* response carries no charset, requests decodes
    # ``response.text`` as ISO-8859-1, which would garble non-ASCII
    # characters before they are re-encoded below.
    # NOTE(review): assumes the "-0.txt" files are UTF-8 -- confirm.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = "utf-8"

    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)
    except OSError as e:
        # Disk full, permission denied, invalid characters in title_hint, ...
        print(f"❌ Error: {e}")
        return

    print(f"✅ Saved: {filename}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # (Gutenberg book ID, file-name prefix) pairs to fetch, in order.
    classics = (
        (1342, "PrideAndPrejudice"),  # Jane Austen
        (11, "AliceInWonderland"),  # Lewis Carroll
        (98, "AesopFables"),  # Aesop
        (1661, "SherlockHolmes"),  # Doyle
        (76, "HuckFinn"),  # Mark Twain
    )

    for book_id, label in classics:
        download_book(book_id, title_hint=label)
|
21
tools/openwebtext_fetcher.py
Normal file
21
tools/openwebtext_fetcher.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# tools/openwebtext_fetcher.py
|
||||
|
||||
from datasets import load_dataset
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
|
||||
# Output directory, relative to the current working directory.
TARGET_DIR = "data/openwebtext"
# Created at import time so the output file can be opened without further setup.
os.makedirs(TARGET_DIR, exist_ok=True)
|
||||
|
||||
|
||||
def fetch_subset(n=10000, split="train"):
    """Write up to ``n`` records of ``stas/openwebtext-10k`` to a text file.

    Each record's ``text`` field is flattened onto a single line (newline
    characters become spaces) and written to ``TARGET_DIR/owt_<n>.jsonl``.
    At most ``n`` records are written; if the split holds fewer, all of them
    are written. ``n <= 0`` produces an empty file.

    NOTE(review): despite the ``.jsonl`` suffix, each line is raw text rather
    than a JSON object. The format is left unchanged because downstream
    readers may depend on it -- confirm before switching to real JSON lines.

    Args:
        n: Maximum number of records to write; also used in the file name.
        split: Dataset split passed to ``load_dataset``.
    """
    from itertools import islice

    ds = load_dataset("stas/openwebtext-10k", split=split)
    # Clamp to what is actually available so the progress bar total is right
    # and a non-positive ``n`` writes nothing instead of one stray record.
    limit = min(max(n, 0), len(ds))
    out_path = os.path.join(TARGET_DIR, f"owt_{n}.jsonl")

    with open(out_path, "w", encoding="utf-8") as f:
        for item in tqdm(islice(ds, limit), total=limit, desc="Writing JSONL"):
            f.write(f"{item['text'].replace(chr(10), ' ')}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Request up to 20,000 records.
    # NOTE(review): the dataset id ends in "-10k", so fewer records may be
    # available than requested -- confirm the intended count.
    fetch_subset(n=20000)
|
Reference in New Issue
Block a user