# Reviewer notes. Text ahead of the first "diff --git" header is commentary
# and is skipped by git apply / patch.
#
# * Layout: restored to one diff record per line. Leading whitespace and blank
#   context lines were reconstructed from the Python structure and the hunk
#   line counts; confirm against the target files, or apply with
#   `git apply --ignore-whitespace` if context indentation differs.
# * gpt_model.py, load_model: torch.load() passes map_location="cpu" again, so
#   a checkpoint written on a CUDA host still loads on a CPU-only host.
# * train_gpt_model.py, load_or_initialize_model: torch.load() passes
#   map_location=device for the same reason.
# * gpt_model.py, GPT.generate: rejects temperature <= 0 up front, because the
#   logits are divided by temperature before softmax.
# * The post-image hashes on the "index" lines predate these corrections.
# * The "" token in encode() is reproduced exactly as received; it may be an
#   extraction artefact - TODO confirm against the repository.
diff --git a/phoebe/discord_bot.py b/phoebe/discord_bot.py
index 80be9e1..83e6c29 100644
--- a/phoebe/discord_bot.py
+++ b/phoebe/discord_bot.py
@@ -1,13 +1,14 @@
+import os
 import discord
-from train_gpt_model import process_message
-from gpt_model import load_model
 import torch
 from dotenv import load_dotenv
-import os
+from train_gpt_model import process_message
+from gpt_model import load_model
 
+# Load environment variables from .env file
 load_dotenv()
 
-# Discord bot token
+# Get the Discord bot token from environment variables
 TOKEN = os.getenv("DISCORD_TOKEN")
 
 # Load the vocabulary
diff --git a/phoebe/gpt_model.py b/phoebe/gpt_model.py
index e6e7034..13b22f1 100644
--- a/phoebe/gpt_model.py
+++ b/phoebe/gpt_model.py
@@ -1,7 +1,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import os
 
 # Hyperparameters
 batch_size = 64
@@ -120,14 +119,13 @@ class GPT(nn.Module):
             loss = F.cross_entropy(logits, targets)
         return logits, loss
 
-    def generate(self, idx, max_new_tokens):
+    def generate(self, idx, max_new_tokens, temperature=1.0):
+        if temperature <= 0:
+            raise ValueError("temperature must be positive.")
         for _ in range(max_new_tokens):
             idx_cond = idx[:, -block_size:]
             logits, _ = self(idx_cond)
-            print(f"Logits shape: {logits.shape}")  # Debug print
-            if logits.size(1) == 0:
-                raise ValueError("Logits tensor is empty.")
-            logits = logits[:, -1, :]
+            logits = logits[:, -1, :] / temperature
             probs = F.softmax(logits, dim=-1)
             idx_next = torch.multinomial(probs, num_samples=1)
             idx = torch.cat((idx, idx_next), dim=1)
@@ -136,27 +134,19 @@ class GPT(nn.Module):
 
 def encode(s, string_to_int):
     # Replace unknown characters with a special token (e.g., "")
-    encoded = []
-    for c in s:
-        if c in string_to_int:
-            encoded.append(string_to_int[c])
-        else:
-            print(f"Unknown character encountered during encoding: {c}")
-            encoded.append(string_to_int[""])
-    return encoded
+    return [string_to_int.get(c, string_to_int[""]) for c in s]
 
 
 def decode(lst, int_to_string):
     return "".join([int_to_string[i] for i in lst])
 
 
-def load_model(vocab_size, model_path="phoebe_model.pt"):
+def load_model(vocab_size, model_path=None):
     model = GPT(vocab_size)
-    if os.path.exists(model_path):
-        model.load_state_dict(
-            torch.load(model_path, map_location=torch.device("cpu"))
-        )
-        print("Model loaded successfully.")
-    else:
-        print("No pre-trained model found. Initialized a new model.")
+    if model_path:
+        try:
+            model.load_state_dict(torch.load(model_path, map_location="cpu"))
+            print("Model loaded successfully.")
+        except FileNotFoundError:
+            print("No pre-trained model found. Initialized a new model.")
     return model
diff --git a/phoebe/train_gpt_model.py b/phoebe/train_gpt_model.py
index 27ad604..fc07fd6 100644
--- a/phoebe/train_gpt_model.py
+++ b/phoebe/train_gpt_model.py
@@ -1,5 +1,8 @@
+import re
 import torch
+import torch.optim as optim
 import random
+import os
 from gpt_model import encode, decode, load_model
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -7,10 +10,11 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Hyperparameters
 batch_size = 64
 block_size = 256
-max_iters = 3000
-learning_rate = 2e-4
-eval_iters = 250
+max_iters = 5000
+learning_rate = 1e-5  # Adjusted learning rate
+eval_iters = 100
 dropout = 0.2
+patience = 500  # Number of iterations to wait for improvement before stopping
 
 # Load the vocabulary and encoded data
 with open("vocab.txt", "r", encoding="utf-8") as f:
@@ -32,12 +36,25 @@ vocab_size = len(chars)
 string_to_int = {ch: i for i, ch in enumerate(chars)}
 int_to_string = {i: ch for i, ch in enumerate(chars)}
 
-# Load and preprocess training and validation data from .txt files
-with open("train_split.txt", "r", encoding="utf-8") as f:
-    train_data = f.read()
-with open("eval_split.txt", "r", encoding="utf-8") as f:
-    val_data = f.read()
 
+def clean_text(text):
+    """Remove special characters and unwanted symbols from the text."""
+    text = re.sub(r"[^a-zA-Z0-9\s.,;!?\'\"]+", "", text)
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+# Load and preprocess training and validation data from cleaned .txt files
+def load_and_clean_data(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
+        text = f.read()
+    cleaned_text = clean_text(text)
+    return cleaned_text
+
+
+train_data = load_and_clean_data("train_split_cleaned.txt")
+val_data = load_and_clean_data("eval_split_cleaned.txt")
 train_data = torch.tensor(encode(train_data, string_to_int), dtype=torch.long)
 val_data = torch.tensor(encode(val_data, string_to_int), dtype=torch.long)
 
@@ -58,7 +75,17 @@ def get_batch(data, block_size, batch_size):
     return x, y
 
 
-model = load_model(vocab_size).to(device)
+def load_or_initialize_model(vocab_size):
+    model = load_model(vocab_size)
+    if os.path.exists("phoebe_model.pt"):
+        model.load_state_dict(torch.load("phoebe_model.pt", map_location=device))
+        print("Model loaded from phoebe_model.pt")
+    else:
+        print("Initialized a new model")
+    return model
+
+
+model = load_or_initialize_model(vocab_size).to(device)
 
 
 @torch.no_grad()
@@ -78,7 +105,11 @@ def estimate_loss():
 
 
 def train_model():
-    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
+    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)
+    best_val_loss = float("inf")
+    patience_counter = 0
+
     for iter in range(max_iters):
         if iter % eval_iters == 0:
             losses = estimate_loss()
@@ -86,15 +117,31 @@ def train_model():
                 f"step {iter}: train loss {losses['train']:.3f}, "
                 f"val loss {losses['val']:.3f}"
             )
+
+            # Check for improvement in validation loss
+            if losses["val"] < best_val_loss:
+                best_val_loss = losses["val"]
+                patience_counter = 0
+                torch.save(model.state_dict(), "phoebe_model.pt")
+                print("Model Saved!")
+            else:
+                patience_counter += eval_iters
+
+            # Early stopping
+            if patience_counter >= patience:
+                print("Early stopping triggered.")
+                break
+
         xb, yb = get_batch(train_data, block_size, batch_size)
         logits, loss = model(xb, yb)
         optimizer.zero_grad(set_to_none=True)
         loss.backward()
         optimizer.step()
+        scheduler.step()
 
-    print(loss.item())
-    torch.save(model.state_dict(), "phoebe_model.pt")
-    print("Model Saved!")
+    if patience_counter < patience:
+        print("Training completed without early stopping.")
+    print(f"Final loss: {loss.item()}")
 
 
 def check_input_chars(s, string_to_int):
@@ -124,7 +171,7 @@ def process_message(message):
         print("Message could not be processed.")  # Debug print
         return "Message could not be processed."
 
-    response = model.generate(encoded_text, max_new_tokens=50)
+    response = model.generate(encoded_text, max_new_tokens=50, temperature=0.7)
     decoded_response = decode(response[0].tolist(), int_to_string)
     print(f"Generated response: '{decoded_response}'")  # Debug print
     return decoded_response