Added the basics of her code, updated to not include any extra files

This commit is contained in:
2025-06-10 10:56:56 -04:00
parent 6d18d21f2a
commit 3d0e5410f1
7 changed files with 364 additions and 0 deletions

36
download_corpus.py Normal file
View File

@ -0,0 +1,36 @@
import gutenbergpy.textget
import re
import glob
# Download books by Gutenberg ID
def download_gutenberg_book(book_id, output_file):
    """Download one Project Gutenberg book and save its cleaned text.

    The raw download has the Project Gutenberg header/footer boilerplate
    removed, its line endings normalized to ``\\n`` and runs of blank lines
    collapsed, and is then written to ``output_file`` as UTF-8.

    This is best-effort: any failure (download, decoding, writing) is
    reported on stdout and the function returns without raising.

    Args:
        book_id: Project Gutenberg ebook ID to fetch.
        output_file: Path of the text file to write.
    """
    try:
        raw_text = gutenbergpy.textget.get_text_by_id(book_id)
        # Strip the boilerplate with the library's own helper. The previous
        # regex (r'\*\*\*.*?\*\*\*') only deleted the "*** START ... ***" and
        # "*** END ... ***" marker lines themselves, leaving the header and
        # licence text in place, and could delete real content lying between
        # two "***" section breaks inside the book.
        body = gutenbergpy.textget.strip_headers(raw_text)
        text = body.decode('utf-8')
        # Collapse any run of line breaks (LF, CRLF or CR) into a single LF so
        # blank lines are dropped regardless of the source's line endings.
        text = re.sub(r'(?:\r\n|\r|\n)+', '\n', text).strip()
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)
    except Exception as e:
        print(f"Error downloading book {book_id}: {e}")
# Books to fetch: (Project Gutenberg ebook ID, local file to save it as).
books = [
    (1342, 'pride_and_prejudice.txt'),
    (45, 'anne_of_green_gables.txt'),
    (74, 'tom_sawyer.txt'),
]

# Fetch every listed title in order, announcing each ID before its download.
for entry in books:
    book_id, filename = entry
    print(f"Downloading book ID {book_id}...")
    download_gutenberg_book(book_id, filename)
# Combine every .txt file in the working directory into a single corpus file.
output_name = 'corpus.txt'
parts = []
# Sorted so the corpus order is reproducible (glob order is OS-dependent).
for file in sorted(glob.glob('*.txt')):
    # A corpus.txt left over from an earlier run also matches '*.txt'; skip it
    # so the old corpus is not folded back into the new one on every rerun.
    if file == output_name:
        continue
    with open(file, 'r', encoding='utf-8') as f:
        parts.append(f.read() + '\n')
# Join once instead of growing a string with += inside the loop.
corpus = ''.join(parts)
with open(output_name, 'w', encoding='utf-8') as f:
    f.write(corpus)
print("Corpus created at corpus.txt")