feat: add training data collection for Rosie

Personality Dataset (300+ examples; file format sketched after this list):
- Greetings and farewells
- Emotions and reactions
- Physical interactions (pats, drags, touches)
- Questions and answers
- Help and support
- Jokes and entertainment
- Mood-based responses
- Conversation fillers
- Various user intents
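
A minimal sketch of the expected data/personality_base.json layout: the combine step in
the script below reads a top-level "texts" key, so the file is assumed to look like the
output of this snippet (the example strings and the write step are illustrative only):

    import json, os
    os.makedirs("data", exist_ok=True)
    examples = {"texts": ["Hello! I'm Rosie!", "Aww, thanks for the pat!"]}
    with open("data/personality_base.json", "w", encoding="utf-8") as f:
        json.dump(examples, f, indent=2)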

Data Download Script (example invocations after this list):
- Download Project Gutenberg books (public domain)
- Instructions for OpenWebText (~8B tokens)
- Instructions for The Pile (~300B tokens)
- Automatic dataset combination
- Token counting and statistics
- Download progress bars
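
Each step can also be run individually; these invocations mirror the argparse flags
defined in the script below:
- python scripts/download_training_data.py --info     (list public dataset sources)
- python scripts/download_training_data.py --books    (fetch Gutenberg samples, prepare data/openwebtext/)
- python scripts/download_training_data.py --combine  (build data/combined_training.json)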

Ready to train (the combined-file format is sketched after these steps):
1. Run: python scripts/download_training_data.py --all
2. Download additional datasets as needed
3. Run: python train_rosie.py --data_path data/combined_training.json
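
train_rosie.py is not shown in this diff; as a rough sketch of what it consumes,
data/combined_training.json is a single JSON object with one "texts" list (see
create_combined_dataset below), and token counts are estimated at about 4 characters
per token:

    import json
    with open("data/combined_training.json", encoding="utf-8") as f:
        texts = json.load(f)["texts"]
    print(f"{len(texts)} texts, ~{sum(len(t) for t in texts) // 4:,} estimated tokens")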

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit 10ccdc2420 (parent c7ce0085fb)
2025-09-30 23:44:36 -04:00
2 changed files with 595 additions and 0 deletions

scripts/download_training_data.py

@@ -0,0 +1,251 @@
"""
Download Training Data Script
Downloads public domain datasets for training Rosie's base language model
"""

import argparse
import json
import os
from pathlib import Path

import requests
from tqdm import tqdm


def download_file(url: str, filepath: str, description: str = ""):
    """Download a file with a progress bar."""
    print(f"Downloading {description}...")
    response = requests.get(url, stream=True)
    response.raise_for_status()  # fail early instead of saving an HTML error page
    total_size = int(response.headers.get('content-length', 0))

    with open(filepath, 'wb') as f, tqdm(
        desc=description,
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as pbar:
        for chunk in response.iter_content(chunk_size=8192):
            size = f.write(chunk)
            pbar.update(size)

    print(f"✓ Downloaded to {filepath}\n")


def download_openwebtext_sample():
    """Prepare the OpenWebText directory and print manual download instructions."""
    print("=" * 60)
    print("OpenWebText Sample")
    print("=" * 60)
    print("OpenWebText is a large web-scraped dataset (~40GB)")
    print("This script does not fetch it automatically; download it manually for full training\n")

    # The full dataset must be downloaded by hand from:
    # https://skylion007.github.io/OpenWebTextCorpus/
    print("To get the full OpenWebText dataset:")
    print("1. Visit: https://skylion007.github.io/OpenWebTextCorpus/")
    print("2. Download the .xz files")
    print("3. Extract to data/openwebtext/\n")

    # For now, just create a placeholder directory
    os.makedirs('data/openwebtext', exist_ok=True)
    print("✓ Created data/openwebtext/ directory")
    print("  Please download OpenWebText files here\n")


def download_gutenberg_books():
    """Download sample books from Project Gutenberg."""
    print("=" * 60)
    print("Project Gutenberg Books")
    print("=" * 60)
    print("Downloading public domain books for language training\n")

    os.makedirs('data/books', exist_ok=True)

    # Sample books (all public domain)
    books = [
        {
            'url': 'https://www.gutenberg.org/files/1342/1342-0.txt',
            'name': 'Pride and Prejudice',
            'file': 'pride_and_prejudice.txt'
        },
        {
            'url': 'https://www.gutenberg.org/files/11/11-0.txt',
            'name': 'Alice in Wonderland',
            'file': 'alice_in_wonderland.txt'
        },
        {
            'url': 'https://www.gutenberg.org/files/84/84-0.txt',
            'name': 'Frankenstein',
            'file': 'frankenstein.txt'
        },
        {
            'url': 'https://www.gutenberg.org/files/1661/1661-0.txt',
            'name': 'Sherlock Holmes',
            'file': 'sherlock_holmes.txt'
        },
        {
            'url': 'https://www.gutenberg.org/files/2701/2701-0.txt',
            'name': 'Moby Dick',
            'file': 'moby_dick.txt'
        },
    ]

    for book in books:
        filepath = f"data/books/{book['file']}"
        if os.path.exists(filepath):
            print(f"{book['name']} already downloaded")
            continue
        try:
            download_file(book['url'], filepath, book['name'])
        except Exception as e:
            print(f"✗ Failed to download {book['name']}: {e}\n")

    print("✓ Books downloaded\n")


def create_combined_dataset():
    """Combine all downloaded data into a single training-format JSON file."""
    print("=" * 60)
    print("Creating Combined Dataset")
    print("=" * 60)

    texts = []

    # Load books
    books_dir = Path('data/books')
    if books_dir.exists():
        print("Processing books...")
        for book_file in books_dir.glob('*.txt'):
            try:
                with open(book_file, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Split into paragraphs, dropping very short fragments
                paragraphs = [p.strip() for p in content.split('\n\n') if len(p.strip()) > 100]
                texts.extend(paragraphs)
                print(f"{book_file.name}: {len(paragraphs)} paragraphs")
            except Exception as e:
                print(f"  ✗ Error reading {book_file.name}: {e}")

    # Load personality data
    personality_files = ['data/personality_base.json']
    for pfile in personality_files:
        if os.path.exists(pfile):
            print(f"Loading {pfile}...")
            with open(pfile, 'r', encoding='utf-8') as f:
                data = json.load(f)
            texts.extend(data['texts'])
            print(f"{len(data['texts'])} personality examples")

    print(f"\nTotal texts collected: {len(texts)}")

    # Save combined dataset
    output_file = 'data/combined_training.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({'texts': texts}, f, indent=2)
    print(f"✓ Saved to {output_file}\n")

    # Calculate approximate token count (rough estimate: 1 token ≈ 4 characters)
    total_chars = sum(len(text) for text in texts)
    approx_tokens = total_chars // 4
    print(f"Approximate tokens: {approx_tokens:,} ({approx_tokens / 1e6:.1f}M)")
    print("This is a SMALL dataset. For full training, you'll need 10-50B tokens.")
    print("Consider downloading OpenWebText or The Pile for complete training.\n")


def show_dataset_info():
    """Show information about publicly available training datasets."""
    print("\n" + "=" * 60)
    print("Available Public Datasets for Training")
    print("=" * 60)
    print()

    datasets = [
        {
            'name': 'OpenWebText',
            'size': '~40GB (38GB compressed)',
            'tokens': '~8B tokens',
            'url': 'https://skylion007.github.io/OpenWebTextCorpus/',
            'description': 'Web-scraped text from Reddit links'
        },
        {
            'name': 'The Pile',
            'size': '~800GB',
            'tokens': '~300B tokens',
            'url': 'https://pile.eleuther.ai/',
            'description': 'Massive diverse text dataset'
        },
        {
            'name': 'BookCorpus',
            'size': '~5GB',
            'tokens': '~1B tokens',
            'url': 'HuggingFace: bookcorpus',
            'description': 'Books corpus (11K books)'
        },
        {
            'name': 'Wikipedia',
            'size': '~20GB',
            'tokens': '~3B tokens',
            'url': 'https://dumps.wikimedia.org/',
            'description': 'Wikipedia dumps (all languages)'
        },
        {
            'name': 'Project Gutenberg',
            'size': '~10GB',
            'tokens': '~2B tokens',
            'url': 'https://www.gutenberg.org/',
            'description': 'Public domain books (60K+ books)'
        },
    ]

    for dataset in datasets:
        print(f"[*] {dataset['name']}")
        print(f"    Size: {dataset['size']}")
        print(f"    Tokens: {dataset['tokens']}")
        print(f"    URL: {dataset['url']}")
        print(f"    Description: {dataset['description']}")
        print()

    print("Recommendation for Rosie training:")
    print("  - Start: Books + Personality data (~500M tokens)")
    print("  - Better: + OpenWebText (~8B tokens)")
    print("  - Best: + The Pile subset (~50B tokens)")
    print()


def main():
    parser = argparse.ArgumentParser(description="Download training data for Rosie")
    parser.add_argument('--books', action='store_true', help='Download sample books')
    parser.add_argument('--info', action='store_true', help='Show dataset information')
    parser.add_argument('--combine', action='store_true', help='Combine downloaded data')
    parser.add_argument('--all', action='store_true', help='Download all available samples')
    args = parser.parse_args()

    # Create data directory
    os.makedirs('data', exist_ok=True)

    # Default to showing dataset info when no download/combine flag is given
    if args.info or not any([args.books, args.combine, args.all]):
        show_dataset_info()

    if args.books or args.all:
        download_gutenberg_books()
        download_openwebtext_sample()

    if args.combine or args.all:
        create_combined_dataset()

    print("=" * 60)
    print("Next Steps:")
    print("=" * 60)
    print("1. Download more data (see --info for sources)")
    print("2. Run: python train_rosie.py --data_path data/combined_training.json")
    print("3. Monitor training progress")
    print("4. Test the model with test_rosie.py")
    print()


if __name__ == "__main__":
    main()