Added the basics of her code, updated to not include any extra files

2025-06-10 10:56:56 -04:00
parent 6d18d21f2a
commit 3d0e5410f1
7 changed files with 364 additions and 0 deletions
--- a/download_corpus.py
+++ b/download_corpus.py
@ -0,0 +1,36 @@
+import gutenbergpy.textget
+import re
+import glob
+
+
+# Download books by Gutenberg ID
+def download_gutenberg_book(book_id, output_file):
+    """Download one Project Gutenberg book and save its cleaned text.
+
+    The raw bytes are fetched with ``gutenbergpy.textget.get_text_by_id``,
+    the Project Gutenberg header and footer are removed with
+    ``gutenbergpy.textget.strip_headers``, every run of line breaks is
+    collapsed to a single newline, and the result is written to
+    ``output_file`` as UTF-8.
+
+    Args:
+        book_id: Project Gutenberg numeric ID of the book to fetch.
+        output_file: Path of the text file to create or overwrite.
+
+    Returns:
+        True if the book was downloaded and written, False otherwise.
+        Failures are printed rather than raised so that one bad book does
+        not abort a batch of downloads.
+    """
+    try:
+        raw_text = gutenbergpy.textget.get_text_by_id(book_id)
+        # Use the library's own header stripper: a regex on "*** ... ***"
+        # only deletes the marker lines themselves (leaving the licence
+        # boilerplate in place) and, with DOTALL, can swallow real book text
+        # that happens to sit between two stray "***" runs.
+        body = gutenbergpy.textget.strip_headers(raw_text)
+        text = body.decode('utf-8')
+        # Collapse blank lines whether the text uses LF or CRLF line endings.
+        text = re.sub(r'[\r\n]+', '\n', text).strip()
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(text)
+    except Exception as e:
+        # Broad on purpose: this is the best-effort boundary for network,
+        # decoding and file-system errors alike.
+        print(f"Error downloading book {book_id}: {e}")
+        return False
+    return True
+
+
+# Books to fetch, as (Gutenberg ID, output filename) pairs
+books = [
+    (1342, 'pride_and_prejudice.txt'),
+    (45, 'anne_of_green_gables.txt'),
+    (74, 'tom_sawyer.txt'),
+]
+# Fetch each listed book in turn, announcing it before the download starts
+for entry in books:
+    book_id, filename = entry
+    print(f"Downloading book ID {book_id}...")
+    download_gutenberg_book(book_id, filename)
+
+# Combine every .txt file in the working directory into one corpus file
+output_path = 'corpus.txt'
+# Skip the output file itself: otherwise a re-run would fold the previous
+# corpus.txt back into the new one and duplicate its contents. Sorting makes
+# the order of books in the corpus reproducible (glob order is unspecified).
+source_files = sorted(
+    name for name in glob.glob('*.txt') if name != output_path
+)
+pieces = []
+for file in source_files:
+    with open(file, 'r', encoding='utf-8') as f:
+        # Each book is followed by a newline so consecutive texts do not run
+        # together.
+        pieces.append(f.read() + '\n')
+corpus = ''.join(pieces)
+with open(output_path, 'w', encoding='utf-8') as f:
+    f.write(corpus)
+print("Corpus created at corpus.txt")