From 5cef36a2df06efea812bbc310a052e244404065d Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 2 Oct 2024 16:44:09 -0400
Subject: [PATCH] Based code figured out. Now to figure out how to train her.

---
 .vscode/launch.json |  15 ++++
 main.py             | 123 +++++++++++++++++++++++++++++
 model.py            | 183 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 321 insertions(+)
 create mode 100644 .vscode/launch.json
 create mode 100644 main.py
 create mode 100644 model.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..3299cb9
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Jade",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "E:\\Development\\AI Development\\Jade\\main.py",
+            "console": "integratedTerminal"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..39e23e0
--- /dev/null
+++ b/main.py
@@ -0,0 +1,123 @@
+import discord
+import torch
+from model import SimpleTokenizer, initialize_model, train_on_conversation, save_model, update_model_vocab
+import torch.nn.functional as F
+import os
+from dotenv import load_dotenv
+
+
+load_dotenv()
+
+
+class DiscordBot(discord.Client):
+    def __init__(self, **options):
+        super().__init__(**options)
+        self.tokenizer = SimpleTokenizer()
+        self.tokenizer_vocab_path = 'tokenizer_vocab.json'
+        self.tokenizer.load_vocab(self.tokenizer_vocab_path)
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.model, self.optimizer, self.criterion = initialize_model(self.tokenizer, self.device)
+        self.conversation_history = []  # Keep track of conversations for learning
+        self.previous_reply = None  # Store last reply for pattern recognition
+
+    async def on_ready(self):
+        print(f'Logged in as {self.user.name}')
+
+    async def on_message(self, message):
+        if message.author == self.user:
+            return
+
+        print(f"Received message from {message.author}: {message.content}")
+
+        # Update tokenizer vocabulary with the new message
+        previous_vocab_size = len(self.tokenizer.token2idx)
+        self.tokenizer.build_vocab([message.content])
+        new_vocab_size = len(self.tokenizer.token2idx)
+
+        # Update model if vocabulary has changed
+        if new_vocab_size != previous_vocab_size:
+            self.model = update_model_vocab(self.model, self.tokenizer, self.device)
+            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
+            print("Model vocabulary updated.")
+
+        # Generate a reply
+        self.model.eval()
+        with torch.no_grad():
+            reply = self.generate_reply(message.content)
+        print(f"Sending reply: {reply}")
+        await message.channel.send(reply)
+
+        # Append conversation to history for future learning
+        self.conversation_history.append({
+            "user_message": message.content,
+            "bot_reply": reply,
+            "channel": message.channel
+        })
+
+        # Continuous learning: Train on this conversation pair
+        loss = train_on_conversation(
+            self.model,
+            self.optimizer,
+            self.criterion,
+            self.tokenizer,
+            message.content,
+            reply,
+            self.device
+        )
+
+        # Save the model and tokenizer for future sessions
+        save_model(self.model)
+        self.tokenizer.save_vocab(self.tokenizer_vocab_path)
+
+        # Store this reply to help Jade learn from repetition in future responses
+        self.previous_reply = reply
+
+    def generate_reply(self, input_text, max_length=20, temperature=1.0, top_k=10):
+        # Prepare the input sequence with special tokens
+        input_sequence = ['<sos>'] + input_text.split() + ['<eos>']
+        input_indices = self.tokenizer.encode(' '.join(input_sequence))
+        input_tensor = torch.tensor([input_indices], dtype=torch.long, device=self.device)
+
+        generated_indices = []
+        for _ in range(max_length):
+            output = self.model(input_tensor)
+            if output.size(0) == 0:
+                print("Model output is empty. Breaking out of generation loop.")
+                break
+            next_token_logits = output[-1, 0, :] / temperature
+
+            # Penalize <unk> so it is never sampled
+            unk_token_idx = self.tokenizer.token2idx.get('<unk>', None)
+            if unk_token_idx is not None:
+                next_token_logits[unk_token_idx] = -float('inf')
+
+            # Apply Top-K sampling
+            top_k = min(top_k, next_token_logits.size(-1))
+            values, indices = torch.topk(next_token_logits, top_k)
+            probabilities = F.softmax(values, dim=-1)
+            predicted_index = indices[torch.multinomial(probabilities, 1)].item()
+
+            # Stop if <eos> token is generated
+            if predicted_index == self.tokenizer.token2idx.get('<eos>'):
+                break
+
+            generated_indices.append(predicted_index)
+            input_indices.append(predicted_index)
+            input_tensor = torch.tensor([input_indices], dtype=torch.long, device=self.device)
+
+        # Filter out special tokens from generated indices
+        special_token_indices = set(self.tokenizer.token2idx[token] for token in ['<pad>', '<unk>', '<sos>', '<eos>'])
+        filtered_indices = [idx for idx in generated_indices if idx not in special_token_indices]
+
+        # Decode the filtered indices
+        reply = self.tokenizer.decode(filtered_indices)
+        return reply
+
+
+DISCORD_TOKEN = os.getenv('DISCORD_TOKEN')
+
+# Initialize and run the Discord bot
+intents = discord.Intents.default()
+intents.message_content = True  # privileged intent needed in discord.py 2.x to read message text
+bot = DiscordBot(intents=intents)
+bot.run(DISCORD_TOKEN)
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..313b9fc
--- /dev/null
+++ b/model.py
@@ -0,0 +1,183 @@
+import torch
+import torch.nn as nn
+import threading
+import os
+import json
+
+
+# Simple Tokenizer
+class SimpleTokenizer:
+    def __init__(self):
+        self.token2idx = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
+        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
+        self.lock = threading.Lock()
+
+    def build_vocab(self, texts):
+        with self.lock:
+            for text in texts:
+                tokens = text.split()
+                for token in tokens:
+                    if token not in self.token2idx:
+                        idx = len(self.token2idx)
+                        self.token2idx[token] = idx
+                        self.idx2token[idx] = token
+
+    def encode(self, text):
+        with self.lock:
+            return [self.token2idx.get(token, self.token2idx['<unk>']) for token in text.split()]
+
+    def decode(self, indices):
+        with self.lock:
+            return ' '.join([self.idx2token.get(idx, '<unk>') for idx in indices])
+
+    def save_vocab(self, path):
+        with open(path, 'w') as f:
+            json.dump({'token2idx': self.token2idx, 'idx2token': self.idx2token}, f)
+
+    def load_vocab(self, path):
+        if os.path.exists(path):
+            with open(path, 'r') as f:
+                vocab = json.load(f)
+            self.token2idx = vocab['token2idx']
+            self.idx2token = {int(k): v for k, v in vocab['idx2token'].items()}
+            print('Tokenizer vocabulary loaded from', path)
+            # Ensure special tokens are present
+            special_tokens = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
+            for token, idx in special_tokens.items():
+                if token not in self.token2idx:
+                    self.token2idx[token] = idx
+                    self.idx2token[idx] = token
+        else:
+            print('No existing tokenizer vocabulary found. Starting fresh.')
+            self.token2idx = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
+            self.idx2token = {idx: token for token, idx in self.token2idx.items()}
+
+
+# Positional Encoding
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=5000):
+        super(PositionalEncoding, self).__init__()
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len).unsqueeze(1).float()
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        if d_model % 2 == 1:
+            pe[:, -1] = torch.cos(position.squeeze() * div_term[-1])
+        else:
+            pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(1)  # Shape: [max_len, 1, d_model]
+        self.register_buffer('pe', pe)
+
+    def forward(self, x):
+        # x: [seq_len, batch_size, d_model]
+        x = x + self.pe[:x.size(0)]
+        return x
+
+
+# GPT Model
+class GPTModel(nn.Module):
+    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=2):
+        super(GPTModel, self).__init__()
+        self.model_type = 'Transformer'
+        self.d_model = d_model
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.pos_encoder = PositionalEncoding(d_model)
+        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
+        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
+        self.fc_out = nn.Linear(d_model, vocab_size)
+        self.src_mask = None
+
+    def _generate_square_subsequent_mask(self, sz):
+        mask = torch.triu(torch.ones(sz, sz) == 1).transpose(0, 1)
+        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+        return mask
+
+    def forward(self, src):
+        # src: [batch_size, seq_len]
+        src = src.transpose(0, 1)  # Shape: [seq_len, batch_size]
+        src = self.embedding(src) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32))
+        src = self.pos_encoder(src)
+        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
+            device = src.device
+            self.src_mask = self._generate_square_subsequent_mask(src.size(0)).to(device)
+        output = self.transformer_encoder(src, self.src_mask)
+        logits = self.fc_out(output)
+        return logits  # Shape: [seq_len, batch_size, vocab_size]
+
+
+# Training function
+def train_step(model, optimizer, criterion, input_tensor, target_tensor):
+    model.train()
+    optimizer.zero_grad()
+    output = model(input_tensor)  # [seq_len, batch_size, vocab_size]
+    output = output.view(-1, output.size(-1))  # [seq_len * batch_size, vocab_size]
+    target = target_tensor.transpose(0, 1).contiguous().view(-1)  # [seq_len * batch_size]
+    loss = criterion(output, target)
+    loss.backward()
+    optimizer.step()
+    print(f'Training loss: {loss.item():.4f}')
+    return loss.item()
+
+
+def initialize_model(tokenizer, device):
+    vocab_size = len(tokenizer.token2idx)
+    model = GPTModel(vocab_size=vocab_size).to(device)
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+    criterion = nn.CrossEntropyLoss(ignore_index=0)
+    model_path = 'gpt_model.pth'
+
+    # Load existing model if available
+    if os.path.exists(model_path):
+        model.load_state_dict(torch.load(model_path, map_location=device))
+        print('Model loaded from', model_path)
+    else:
+        print('No existing model found. Starting fresh.')
+    return model, optimizer, criterion
+
+
+def save_model(model):
+    model_path = 'gpt_model.pth'
+    torch.save(model.state_dict(), model_path)
+
+
+def update_model_vocab(model, tokenizer, device):
+    vocab_size = len(tokenizer.token2idx)
+
+    old_embedding_weight = model.embedding.weight.data
+    old_vocab_size, embedding_dim = old_embedding_weight.shape
+    new_embedding = nn.Embedding(vocab_size, model.d_model).to(device)
+    new_embedding.weight.data[:old_vocab_size] = old_embedding_weight
+    model.embedding = new_embedding
+
+    old_fc_out_weight = model.fc_out.weight.data
+    old_fc_out_bias = model.fc_out.bias.data
+    new_fc_out = nn.Linear(model.d_model, vocab_size).to(device)
+    new_fc_out.weight.data[:old_vocab_size] = old_fc_out_weight
+    new_fc_out.bias.data[:old_vocab_size] = old_fc_out_bias
+    model.fc_out = new_fc_out
+
+    return model
+
+
+def train_on_conversation(model, optimizer, criterion, tokenizer, input_text, target_text, device):
+    tokenizer.build_vocab([input_text, target_text])
+    input_indices = tokenizer.encode(input_text)
+    target_indices = tokenizer.encode(target_text)
+
+    # Concatenate input and target indices to create a single sequence
+    full_indices = input_indices + target_indices
+
+    # Create input and target sequences for training
+    input_sequence = full_indices[:-1]  # All tokens except the last
+    target_sequence = full_indices[1:]  # All tokens except the first
+
+    # Update model if vocabulary has changed
+    if len(tokenizer.token2idx) != model.embedding.num_embeddings:
+        model = update_model_vocab(model, tokenizer, device)
+        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+
+    input_tensor = torch.tensor([input_sequence], dtype=torch.long, device=device)
+    target_tensor = torch.tensor([target_sequence], dtype=torch.long, device=device)
+
+    loss = train_step(model, optimizer, criterion, input_tensor, target_tensor)
+    return loss
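
Editor's note (not part of the patch): the commit message leaves training as the open question, so below is a minimal offline training sketch that exercises the new model.py without Discord. The file name offline_train.py and the toy conversation pairs are made up for illustration; the imported functions and their signatures are the ones added in model.py above.

# offline_train.py -- a minimal sketch, not part of this patch.
# Assumes model.py from the patch is importable and that no stale
# gpt_model.pth with a different vocabulary size is lying around.
import torch

from model import SimpleTokenizer, initialize_model, train_on_conversation, save_model

# Hypothetical toy conversation pairs used only as a smoke test.
PAIRS = [
    ("hello how are you", "i am fine thanks"),
    ("what is your name", "my name is jade"),
]


def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = SimpleTokenizer()
    tokenizer.load_vocab('tokenizer_vocab.json')

    # Build the vocabulary up front so the embedding is sized once.
    tokenizer.build_vocab([text for pair in PAIRS for text in pair])
    model, optimizer, criterion = initialize_model(tokenizer, device)

    for epoch in range(10):
        for user_text, bot_text in PAIRS:
            loss = train_on_conversation(
                model, optimizer, criterion, tokenizer,
                user_text, bot_text, device,
            )
        print(f'epoch {epoch}: last loss {loss:.4f}')

    save_model(model)
    tokenizer.save_vocab('tokenizer_vocab.json')


if __name__ == '__main__':
    main()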