import gutenbergpy.textget import re import glob # Download books by Gutenberg ID def download_gutenberg_book(book_id, output_file): try: raw_text = gutenbergpy.textget.get_text_by_id(book_id) # Remove headers/footers text = re.sub(r'\*\*\*.*?\*\*\*', '', raw_text.decode('utf-8'), flags=re.DOTALL) text = re.sub(r'\n+', '\n', text).strip() with open(output_file, 'w', encoding='utf-8') as f: f.write(text) except Exception as e: print(f"Error downloading book {book_id}: {e}") # Download selected books books = [ (1342, 'pride_and_prejudice.txt'), (45, 'anne_of_green_gables.txt'), (74, 'tom_sawyer.txt') ] for book_id, filename in books: print(f"Downloading book ID {book_id}...") download_gutenberg_book(book_id, filename) # Combine into corpus corpus = '' for file in glob.glob('*.txt'): with open(file, 'r', encoding='utf-8') as f: corpus += f.read() + '\n' with open('corpus.txt', 'w', encoding='utf-8') as f: f.write(corpus) print("Corpus created at corpus.txt")