""" Rosie Training Script Train the custom transformer model from scratch """ import os import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import Dataset, DataLoader from typing import List, Dict import json from tqdm import tqdm import argparse from src.llm.model import RosieModel, RosieConfig, create_rosie_model from src.llm.tokenizer import RosieTokenizer, create_tokenizer class TextDataset(Dataset): """Dataset for language modeling""" def __init__(self, texts: List[str], tokenizer: RosieTokenizer, max_length: int = 512): self.tokenizer = tokenizer self.max_length = max_length self.examples = [] print(f"Tokenizing {len(texts)} texts...") for text in tqdm(texts): token_ids = tokenizer.encode(text, add_special_tokens=True) # Split into chunks of max_length for i in range(0, len(token_ids), max_length): chunk = token_ids[i:i + max_length] if len(chunk) > 1: # Need at least 2 tokens (input + target) self.examples.append(chunk) print(f"Created {len(self.examples)} training examples") def __len__(self): return len(self.examples) def __getitem__(self, idx): tokens = self.examples[idx] # Pad to max_length if len(tokens) < self.max_length: tokens = tokens + [self.tokenizer.pad_token_id] * (self.max_length - len(tokens)) # Input and target (shifted by 1) input_ids = torch.tensor(tokens[:-1]) target_ids = torch.tensor(tokens[1:]) return input_ids, target_ids def train_epoch( model: RosieModel, dataloader: DataLoader, optimizer: optim.Optimizer, device: torch.device, epoch: int, ): """Train for one epoch""" model.train() total_loss = 0 criterion = nn.CrossEntropyLoss(ignore_index=0) # Ignore padding progress_bar = tqdm(dataloader, desc=f"Epoch {epoch}") for batch_idx, (input_ids, target_ids) in enumerate(progress_bar): input_ids = input_ids.to(device) target_ids = target_ids.to(device) # Forward pass optimizer.zero_grad() logits, _ = model(input_ids) # Calculate loss loss = criterion(logits.view(-1, model.config.vocab_size), target_ids.view(-1)) # Backward pass loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient clipping optimizer.step() total_loss += loss.item() # Update progress bar progress_bar.set_postfix({'loss': loss.item()}) avg_loss = total_loss / len(dataloader) return avg_loss def main(): parser = argparse.ArgumentParser(description="Train Rosie model") parser.add_argument('--data_path', type=str, required=True, help="Path to training data (JSON file)") parser.add_argument('--output_dir', type=str, default='./models/rosie_model', help="Output directory") parser.add_argument('--vocab_size', type=int, default=32000, help="Vocabulary size") parser.add_argument('--hidden_size', type=int, default=768, help="Hidden size") parser.add_argument('--num_layers', type=int, default=12, help="Number of layers") parser.add_argument('--num_heads', type=int, default=12, help="Number of attention heads") parser.add_argument('--max_length', type=int, default=512, help="Maximum sequence length") parser.add_argument('--batch_size', type=int, default=4, help="Batch size") parser.add_argument('--epochs', type=int, default=10, help="Number of epochs") parser.add_argument('--lr', type=float, default=1e-4, help="Learning rate") parser.add_argument('--device', type=str, default='cuda', help="Device (cuda/cpu)") args = parser.parse_args() # Create output directory os.makedirs(args.output_dir, exist_ok=True) # Load training data print(f"Loading training data from {args.data_path}...") with open(args.data_path, 'r', encoding='utf-8') as f: data = 
json.load(f) if isinstance(data, list): texts = data elif isinstance(data, dict) and 'texts' in data: texts = data['texts'] else: raise ValueError("Data must be a list of texts or dict with 'texts' key") print(f"Loaded {len(texts)} texts") # Create/load tokenizer tokenizer_path = os.path.join(args.output_dir, 'tokenizer') if os.path.exists(tokenizer_path): print(f"Loading existing tokenizer from {tokenizer_path}") tokenizer = create_tokenizer(args.vocab_size) tokenizer.load(tokenizer_path) else: print("Training new tokenizer...") tokenizer = create_tokenizer(args.vocab_size) tokenizer.train(texts, save_path=tokenizer_path) # Create dataset dataset = TextDataset(texts, tokenizer, max_length=args.max_length) dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) # Create model config = RosieConfig( vocab_size=len(tokenizer.vocab), hidden_size=args.hidden_size, num_layers=args.num_layers, num_heads=args.num_heads, max_position_embeddings=args.max_length, ) model = create_rosie_model(config) # Move to device device = torch.device(args.device if torch.cuda.is_available() else 'cpu') print(f"Using device: {device}") model = model.to(device) # Optimizer optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=0.01) # Training loop print(f"\nStarting training for {args.epochs} epochs...") print(f"Batch size: {args.batch_size}") print(f"Total batches per epoch: {len(dataloader)}") print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}\n") for epoch in range(1, args.epochs + 1): avg_loss = train_epoch(model, dataloader, optimizer, device, epoch) print(f"Epoch {epoch}/{args.epochs} - Average Loss: {avg_loss:.4f}") # Save checkpoint every epoch checkpoint_path = os.path.join(args.output_dir, f'checkpoint_epoch_{epoch}.pth') torch.save({ 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': avg_loss, 'config': config.__dict__, }, checkpoint_path) print(f"Checkpoint saved to {checkpoint_path}\n") # Save final model final_path = os.path.join(args.output_dir, 'rosie_final.pth') torch.save(model.state_dict(), final_path) print(f"\nTraining complete! Model saved to {final_path}") if __name__ == "__main__": main()
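
# Example invocation (a sketch; the script filename and data path below are
# illustrative assumptions, not documented elsewhere in this file):
#
#   python train.py --data_path ./data/texts.json --output_dir ./models/rosie_model \
#       --batch_size 4 --epochs 10 --lr 1e-4 --device cuda
#
# The JSON data file may be either a plain list of strings or an object with a
# "texts" key, matching the check in main().
#
# Resuming from a saved checkpoint (a sketch, assuming RosieConfig accepts the
# saved __dict__ back as keyword arguments):
#
#   ckpt = torch.load('./models/rosie_model/checkpoint_epoch_5.pth', map_location='cpu')
#   model = create_rosie_model(RosieConfig(**ckpt['config']))
#   model.load_state_dict(ckpt['model_state_dict'])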