"""
Download Training Data Script

Downloads public domain datasets for training Rosie's base language model.

Usage (flags may be combined):
    --info      print a table of well-known public text datasets
    --books     download the sample Project Gutenberg books
    --combine   merge the downloaded books and personality data into one JSON
    --all       --books followed by --combine
"""

import argparse
import json
import os
from pathlib import Path

import requests
from tqdm import tqdm


def download_file(url: str, filepath: str, description: str = "", timeout: float = 30.0):
    """Download ``url`` to ``filepath``, showing a progress bar.

    The body is streamed into ``<filepath>.part`` and only renamed to
    ``filepath`` once the transfer has finished, so an interrupted or failed
    download never leaves a truncated file at the final location.

    Args:
        url: Address to fetch.
        filepath: Destination path of the finished download.
        description: Label used in the console messages and on the progress bar.
        timeout: Value passed to ``requests.get(timeout=...)``.

    Raises:
        requests.RequestException: The connection failed, timed out, or the
            server answered with an HTTP error status.
        OSError: The destination could not be written.
    """
    print(f"Downloading {description}...")
    partial_path = f"{filepath}.part"

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this an HTML error page would be stored as the file.
            response.raise_for_status()

            # A missing/zero Content-Length means "unknown"; tqdm expects None
            # for that rather than a total of 0.
            total_size = int(response.headers.get('content-length', 0)) or None

            with open(partial_path, 'wb') as f, tqdm(
                desc=description,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    size = f.write(chunk)
                    pbar.update(size)

        os.replace(partial_path, filepath)
    finally:
        # After a successful rename the partial file is gone; if anything
        # above raised, remove whatever was written so far.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"✓ Downloaded to {filepath}\n")


def download_openwebtext_sample():
    """Print OpenWebText download instructions and create its data directory.

    Nothing is fetched here: the function prints where to obtain the corpus
    manually and creates ``data/openwebtext/`` as the place to extract it.
    """
    print("=" * 60)
    print("OpenWebText Sample")
    print("=" * 60)
    print("OpenWebText is a large web-scraped dataset (~40GB)")
    print("We'll download a small sample for initial training\n")

    # Note: You'll need to download the full dataset from:
    # https://skylion007.github.io/OpenWebTextCorpus/
    print("To get the full OpenWebText dataset:")
    print("1. Visit: https://skylion007.github.io/OpenWebTextCorpus/")
    print("2. Download the .xz files")
    print("3. Extract to data/openwebtext/\n")

    # For now, we only create a placeholder directory
    os.makedirs('data/openwebtext', exist_ok=True)
    print("✓ Created data/openwebtext/ directory")
    print(" Please download OpenWebText files here\n")


def download_gutenberg_books():
    """Download the sample books from Project Gutenberg into ``data/books/``.

    Books whose target file already exists are skipped. A failed download is
    reported and does not stop the remaining books; a summary of failures is
    printed at the end.
    """
    print("=" * 60)
    print("Project Gutenberg Books")
    print("=" * 60)
    print("Downloading public domain books for language training\n")

    os.makedirs('data/books', exist_ok=True)

    # Sample books (all public domain)
    books = [
        {
            'url': 'https://www.gutenberg.org/files/1342/1342-0.txt',
            'name': 'Pride and Prejudice',
            'file': 'pride_and_prejudice.txt',
        },
        {
            'url': 'https://www.gutenberg.org/files/11/11-0.txt',
            'name': 'Alice in Wonderland',
            'file': 'alice_in_wonderland.txt',
        },
        {
            'url': 'https://www.gutenberg.org/files/84/84-0.txt',
            'name': 'Frankenstein',
            'file': 'frankenstein.txt',
        },
        {
            'url': 'https://www.gutenberg.org/files/1661/1661-0.txt',
            'name': 'Sherlock Holmes',
            'file': 'sherlock_holmes.txt',
        },
        {
            'url': 'https://www.gutenberg.org/files/2701/2701-0.txt',
            'name': 'Moby Dick',
            'file': 'moby_dick.txt',
        },
    ]

    failed = []
    for book in books:
        filepath = f"data/books/{book['file']}"

        if os.path.exists(filepath):
            print(f"✓ {book['name']} already downloaded")
            continue

        try:
            download_file(book['url'], filepath, book['name'])
        except Exception as e:
            # Deliberately best-effort: one bad book must not abort the rest.
            print(f"✗ Failed to download {book['name']}: {e}\n")
            failed.append(book['name'])

    if failed:
        print(f"✗ {len(failed)} of {len(books)} books could not be downloaded: "
              f"{', '.join(failed)}\n")
    else:
        print("✓ Books downloaded\n")


def create_combined_dataset():
    """Combine all downloaded data into ``data/combined_training.json``.

    Sources:
      * every ``*.txt`` file in ``data/books/``, split on blank lines; only
        paragraphs longer than 100 characters (after stripping) are kept;
      * the ``texts`` list of ``data/personality_base.json``, if that file
        exists.

    The result is written as ``{"texts": [...]}``. Unreadable or malformed
    input files are reported and skipped. A rough token estimate
    (characters // 4) is printed at the end.
    """
    print("=" * 60)
    print("Creating Combined Dataset")
    print("=" * 60)

    texts = []

    # Load books
    books_dir = Path('data/books')
    if books_dir.exists():
        print("Processing books...")
        # Sorted so the output does not depend on directory listing order.
        for book_file in sorted(books_dir.glob('*.txt')):
            try:
                # utf-8-sig drops a leading byte-order mark if present and
                # reads BOM-less UTF-8 unchanged.
                with open(book_file, 'r', encoding='utf-8-sig') as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                print(f" ✗ Error reading {book_file.name}: {e}")
                continue

            # Split into paragraphs
            stripped = (p.strip() for p in content.split('\n\n'))
            paragraphs = [p for p in stripped if len(p) > 100]
            texts.extend(paragraphs)
            print(f" ✓ {book_file.name}: {len(paragraphs)} paragraphs")

    # Load personality data
    personality_files = ['data/personality_base.json']
    for pfile in personality_files:
        if not os.path.exists(pfile):
            continue

        print(f"Loading {pfile}...")
        try:
            with open(pfile, 'r', encoding='utf-8') as f:
                data = json.load(f)
            personality_texts = data['texts']
        except (OSError, ValueError, KeyError, TypeError) as e:
            # ValueError covers invalid JSON / bad encoding; KeyError and
            # TypeError cover a document without a top-level "texts" entry.
            print(f" ✗ Error reading {pfile}: {e}")
            continue

        texts.extend(personality_texts)
        print(f" ✓ {len(personality_texts)} personality examples")

    print(f"\nTotal texts collected: {len(texts)}")

    # Save combined dataset
    output_file = 'data/combined_training.json'
    # Make sure the target directory exists when this is not called via main().
    os.makedirs('data', exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({'texts': texts}, f, indent=2)

    print(f"✓ Saved to {output_file}\n")

    # Calculate approximate token count (rough estimate: 1 token ≈ 4 characters)
    total_chars = sum(len(text) for text in texts)
    approx_tokens = total_chars // 4
    print(f"Approximate tokens: {approx_tokens:,} ({approx_tokens/1e6:.1f}M)")
    print("This is a SMALL dataset. For full training, you'll need 10-50B tokens.")
    print("Consider downloading OpenWebText or The Pile for complete training.\n")


def show_dataset_info():
    """Print a summary of public datasets and a training recommendation."""
    print("\n" + "=" * 60)
    print("Available Public Datasets for Training")
    print("=" * 60)
    print()

    datasets = [
        {
            'name': 'OpenWebText',
            'size': '~40GB (38GB compressed)',
            'tokens': '~8B tokens',
            'url': 'https://skylion007.github.io/OpenWebTextCorpus/',
            'description': 'Web-scraped text from Reddit links',
        },
        {
            'name': 'The Pile',
            'size': '~800GB',
            'tokens': '~300B tokens',
            'url': 'https://pile.eleuther.ai/',
            'description': 'Massive diverse text dataset',
        },
        {
            'name': 'BookCorpus',
            'size': '~5GB',
            'tokens': '~1B tokens',
            'url': 'HuggingFace: bookcorpus',
            'description': 'Books corpus (11K books)',
        },
        {
            'name': 'Wikipedia',
            'size': '~20GB',
            'tokens': '~3B tokens',
            'url': 'https://dumps.wikimedia.org/',
            'description': 'Wikipedia dumps (all languages)',
        },
        {
            'name': 'Project Gutenberg',
            'size': '~10GB',
            'tokens': '~2B tokens',
            'url': 'https://www.gutenberg.org/',
            'description': 'Public domain books (60K+ books)',
        },
    ]

    for dataset in datasets:
        print(f"[*] {dataset['name']}")
        print(f" Size: {dataset['size']}")
        print(f" Tokens: {dataset['tokens']}")
        print(f" URL: {dataset['url']}")
        print(f" Description: {dataset['description']}")
        print()

    print("Recommendation for Rosie training:")
    print(" - Start: Books + Personality data (~500M tokens)")
    print(" - Better: + OpenWebText (~8B tokens)")
    print(" - Best: + The Pile subset (~50B tokens)")
    print()


def main():
    """Parse the command line and run the requested steps.

    With no flags (or with ``--info``) the dataset overview is shown.
    ``--books`` downloads the sample books and prints the OpenWebText
    instructions; ``--combine`` builds the combined dataset; ``--all`` does
    both. The "Next Steps" hints are always printed last.
    """
    parser = argparse.ArgumentParser(description="Download training data for Rosie")
    parser.add_argument('--books', action='store_true', help='Download sample books')
    parser.add_argument('--info', action='store_true', help='Show dataset information')
    parser.add_argument('--combine', action='store_true', help='Combine downloaded data')
    parser.add_argument('--all', action='store_true', help='Download all available samples')

    args = parser.parse_args()

    # Create data directory
    os.makedirs('data', exist_ok=True)

    if args.info or (not any([args.books, args.combine, args.all])):
        show_dataset_info()

    if args.books or args.all:
        download_gutenberg_books()
        download_openwebtext_sample()

    if args.combine or args.all:
        create_combined_dataset()

    print("=" * 60)
    print("Next Steps:")
    print("=" * 60)
    print("1. Download more data (see --info for sources)")
    print("2. Run: python train_rosie.py --data_path data/combined_training.json")
    print("3. Monitor training progress")
    print("4. Test the model with test_rosie.py")
    print()


if __name__ == "__main__":
    main()