Files
vivi/download_corpus.py

36 lines
1.1 KiB
Python

import gutenbergpy.textget
import re
import glob
# Download books by Gutenberg ID
def download_gutenberg_book(book_id, output_file):
    """Download one Project Gutenberg book and save its cleaned text.

    The raw book is fetched with ``gutenbergpy.textget.get_text_by_id``, the
    Project Gutenberg header/footer boilerplate is stripped, runs of
    consecutive line breaks are collapsed into a single newline, and the
    result is written to *output_file* as UTF-8.

    This is a best-effort helper: any exception raised while downloading,
    cleaning or writing is reported on stdout and swallowed, so a batch of
    downloads can carry on with the next book.

    Args:
        book_id: Project Gutenberg ID of the book to fetch.
        output_file: Path of the text file to create (or overwrite).
    """
    try:
        raw_text = gutenbergpy.textget.get_text_by_id(book_id)
        # Remove headers/footers. The previous r'\*\*\*.*?\*\*\*' regex only
        # deleted the "*** START/END OF ... ***" marker lines themselves and
        # left the licence text around them in place (and could eat book text
        # sitting between two "***" separators); strip_headers() removes the
        # whole boilerplate instead.
        body = gutenbergpy.textget.strip_headers(raw_text)
        if isinstance(body, bytes):
            body = body.decode('utf-8')
        # Normalise CRLF / lone CR first: otherwise the '\r' between two
        # newlines stops r'\n+' from collapsing blank lines.
        text = body.replace('\r\n', '\n').replace('\r', '\n')
        text = re.sub(r'\n+', '\n', text).strip()
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)
    except Exception as e:
        # Deliberately broad: network, decoding and filesystem errors are all
        # reported the same way and must not abort the caller.
        print(f"Error downloading book {book_id}: {e}")
# Download selected books: (Project Gutenberg ID, output filename) pairs.
books = [
    (1342, 'pride_and_prejudice.txt'),
    (45, 'anne_of_green_gables.txt'),
    (74, 'tom_sawyer.txt'),
]
for book_id, filename in books:
    print(f"Downloading book ID {book_id}...")
    download_gutenberg_book(book_id, filename)

# Combine into corpus.
# Only the files listed in `books` are read, in list order. Globbing '*.txt'
# would also pick up corpus.txt from a previous run (so the corpus would
# contain itself and grow on every re-run) as well as any unrelated .txt file
# in the working directory, in an unspecified order.
parts = []
for _, filename in books:
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            parts.append(f.read())
    except FileNotFoundError:
        # A failed download is only reported, not raised, so the file may be
        # missing here; skip it rather than abort the whole corpus build.
        print(f"Skipping {filename}: file not found")
# Each book is followed by a single newline, as before.
corpus = ''.join(part + '\n' for part in parts)
with open('corpus.txt', 'w', encoding='utf-8') as f:
    f.write(corpus)
print("Corpus created at corpus.txt")