From e0ea105872187954dcd3a7716d8e00ddf0bba78e Mon Sep 17 00:00:00 2001
From: Dan
Date: Mon, 18 Nov 2024 21:46:34 -0500
Subject: [PATCH] Got Jade to exactly copy without extra characters - Version
 Solstice-Horizon

---
 main.py  | 145 ++++++---------------------
 model.py | 299 +++++++++++++++++++++++++------------------------
 2 files changed, 168 insertions(+), 276 deletions(-)

diff --git a/main.py b/main.py
index b4ccd82..154bdf0 100644
--- a/main.py
+++ b/main.py
@@ -1,123 +1,40 @@
+# main.py: Discord Bot Code
+
 import discord
 import torch
-from model import SimpleTokenizer, initialize_model, train_on_conversation, save_model, update_model_vocab
-import torch.nn.functional as F
+from model import JadeModel
 import os
 from dotenv import load_dotenv
-
+# Load environment variables
 load_dotenv()
-
-class DiscordBot(discord.Client):
-    def __init__(self, **options):
-        super().__init__(**options)
-        self.tokenizer = SimpleTokenizer()
-        self.tokenizer_vocab_path = 'tokenizer_vocab.json'
-        self.tokenizer.load_vocab(self.tokenizer_vocab_path)
-        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        self.model, self.optimizer, self.criterion = initialize_model(self.tokenizer, self.device)
-        self.conversation_history = []  # Keep track of conversations for learning
-        self.previous_reply = None  # Store last reply for pattern recognition
-
-    async def on_ready(self):
-        print(f'Logged in as {self.user.name}')
-
-    async def on_message(self, message):
-        if message.author == self.user:
-            return
-
-        print(f"Received message from {message.author}: {message.content}")
-
-        # Update tokenizer vocabulary with the new message
-        previous_vocab_size = len(self.tokenizer.token2idx)
-        self.tokenizer.build_vocab([message.content])
-        new_vocab_size = len(self.tokenizer.token2idx)
-
-        # Update model if vocabulary has changed
-        if new_vocab_size != previous_vocab_size:
-            self.model = update_model_vocab(self.model, self.tokenizer, self.device)
-            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
-            print("Model vocabulary updated.")
-
-        # Generate a reply
-        self.model.eval()
-        with torch.no_grad():
-            reply = self.generate_reply(message.content)
-        print(f"Sending reply: {reply}")
-        await message.channel.send(reply)
-
-        # Append conversation to history for future learning
-        self.conversation_history.append({
-            "user_message": message.content,
-            "bot_reply": reply,
-            "channel": message.channel
-        })
-
-        # Continuous learning: Train on this conversation pair
-        loss = train_on_conversation(
-            self.model,
-            self.optimizer,
-            self.criterion,
-            self.tokenizer,
-            message.content,
-            reply,
-            self.device
-        )
-
-        # Save the model and tokenizer for future sessions
-        save_model(self.model)
-        self.tokenizer.save_vocab(self.tokenizer_vocab_path)
-
-        # Store this reply to help Jade learn from repetition in future responses
-        self.previous_reply = reply
-
-    def generate_reply(self, input_text, max_length=20, temperature=1.0, top_k=10):
-        # Prepare the input sequence with special tokens
-        input_sequence = ['<sos>'] + input_text.split() + ['<eos>']
-        input_indices = self.tokenizer.encode(' '.join(input_sequence))
-        input_tensor = torch.tensor([input_indices], dtype=torch.long, device=self.device)
-
-        generated_indices = []
-        for _ in range(max_length):
-            output = self.model(input_tensor)
-            if output.size(0) == 0:
-                print("Model output is empty. Breaking out of generation loop.")
-                break
-            next_token_logits = output[-1, 0, :] / temperature
-
-            # Penalize <unk>
-            unk_token_idx = self.tokenizer.token2idx.get('<unk>', None)
-            if unk_token_idx is not None:
-                next_token_logits[unk_token_idx] = -float('inf')
-
-            # Apply Top-K sampling
-            top_k = min(top_k, next_token_logits.size(-1))
-            values, indices = torch.topk(next_token_logits, top_k)
-            probabilities = F.softmax(values, dim=-1)
-            predicted_index = indices[torch.multinomial(probabilities, 1)].item()
-
-            # Stop if <eos> token is generated
-            if predicted_index == self.tokenizer.token2idx.get('<eos>'):
-                break
-
-            generated_indices.append(predicted_index)
-            input_indices.append(predicted_index)
-            input_tensor = torch.tensor([input_indices], dtype=torch.long, device=self.device)
-
-        # Filter out special tokens from generated indices
-        special_token_indices = set(self.tokenizer.token2idx[token] for token in ['<pad>', '<unk>', '<sos>', '<eos>'])
-        filtered_indices = [idx for idx in generated_indices if idx not in special_token_indices]
-
-        # Decode the filtered indices
-        reply = self.tokenizer.decode(filtered_indices)
-        return reply
-
-
-DISCORD_TOKEN = os.getenv('DISCORD_TOKEN')
-
-# Initialize and run the Discord bot
 intents = discord.Intents.default()
+intents.messages = True
 intents.message_content = True
-bot = DiscordBot(intents=intents)
-bot.run(DISCORD_TOKEN)
+client = discord.Client(intents=intents)
+
+# Initialize the model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = JadeModel().to(device)
+
+
+@client.event
+async def on_ready():
+    print(f'We have logged in as {client.user}')
+
+
+@client.event
+async def on_message(message):
+    if message.author == client.user:
+        return
+
+    # Train Jade with the new message
+    model.train_on_message(message.content)
+
+    # Generate a response using Jade
+    response = model.generate_response(message.content)
+    await message.channel.send(response)
+
+# Start the bot with your token
+client.run(os.getenv('DISCORD_TOKEN'))
diff --git a/model.py b/model.py
index 313b9fc..3086570 100644
--- a/model.py
+++ b/model.py
@@ -1,183 +1,158 @@
+# Suggested Refinements for Jade (Model.py)
+
 import torch
 import torch.nn as nn
-import threading
-import os
-import json
+import torch.optim as optim
+import random
+import string
+import numpy as np
-
-# Simple Tokenizer
-class SimpleTokenizer:
+class JadeModel(nn.Module):
     def __init__(self):
-        self.token2idx = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
-        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
-        self.lock = threading.Lock()
+        super(JadeModel, self).__init__()
+        # GPT-like Transformer architecture
+        self.vocab_size = 256  # Character-level tokenization (ASCII range)
+        self.embedding_dim = 768  # GPT-like embedding dimension
+        self.num_heads = 12  # Number of attention heads
+        self.num_layers = 12  # Number of transformer layers
+        self.max_position_embeddings = 512  # Maximum sequence length
-
-    def build_vocab(self, texts):
-        with self.lock:
-            for text in texts:
-                tokens = text.split()
-                for token in tokens:
-                    if token not in self.token2idx:
-                        idx = len(self.token2idx)
-                        self.token2idx[token] = idx
-                        self.idx2token[idx] = token
+        # Embedding layers
+        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
+        self.position_embedding = nn.Embedding(self.max_position_embeddings, self.embedding_dim)
+
+        # Transformer layers
+        self.transformer_layers = nn.ModuleList([
+            nn.TransformerEncoderLayer(d_model=self.embedding_dim, nhead=self.num_heads)
+            for _ in range(self.num_layers)
+        ])
+
+        # Output layer
+        self.fc = nn.Linear(self.embedding_dim, self.vocab_size)
+        self.softmax = nn.Softmax(dim=-1)
+
+        # Optimizer and loss function
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+        self.criterion = nn.CrossEntropyLoss()
-    def encode(self, text):
-        with self.lock:
-            return [self.token2idx.get(token, self.token2idx['<unk>']) for token in text.split()]
+        # Device setup
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(self.device)
-    def decode(self, indices):
-        with self.lock:
-            return ' '.join([self.idx2token.get(idx, '<unk>') for idx in indices])
+        # Debug message to verify changes (updated unique message for each change)
+        self.debug_message = "[DEBUG] Model initialized with version: Jade-Solstice-Horizon"
+        print(self.debug_message)
-    def save_vocab(self, path):
-        with open(path, 'w') as f:
-            json.dump({'token2idx': self.token2idx, 'idx2token': self.idx2token}, f)
+    def forward(self, input_ids):
+        # Create position ids for input sequence
+        seq_length = input_ids.size(1)
+        position_ids = torch.arange(0, seq_length, dtype=torch.long, device=self.device)
+        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
-    def load_vocab(self, path):
-        if os.path.exists(path):
-            with open(path, 'r') as f:
-                vocab = json.load(f)
-            self.token2idx = vocab['token2idx']
-            self.idx2token = {int(k): v for k, v in vocab['idx2token'].items()}
-            print('Tokenizer vocabulary loaded from', path)
-            # Ensure special tokens are present
-            special_tokens = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
-            for token, idx in special_tokens.items():
-                if token not in self.token2idx:
-                    self.token2idx[token] = idx
-                    self.idx2token[idx] = token
-        else:
-            print('No existing tokenizer vocabulary found. Starting fresh.')
-            self.token2idx = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
-            self.idx2token = {idx: token for token, idx in self.token2idx.items()}
-
-
-# Positional Encoding
-class PositionalEncoding(nn.Module):
-    def __init__(self, d_model, max_len=5000):
-        super(PositionalEncoding, self).__init__()
-        pe = torch.zeros(max_len, d_model)
-        position = torch.arange(0, max_len).unsqueeze(1).float()
-        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
-        pe[:, 0::2] = torch.sin(position * div_term)
-        if d_model % 2 == 1:
-            pe[:, -1] = torch.cos(position.squeeze() * div_term[-1])
-        else:
-            pe[:, 1::2] = torch.cos(position * div_term)
-        pe = pe.unsqueeze(1)  # Shape: [max_len, 1, d_model]
-        self.register_buffer('pe', pe)
-
-    def forward(self, x):
-        # x: [seq_len, batch_size, d_model]
-        x = x + self.pe[:x.size(0)]
+        # Embedding lookup
+        x = self.embedding(input_ids) + self.position_embedding(position_ids)
+
+        # Pass through transformer layers
+        for layer in self.transformer_layers:
+            x = layer(x)
+
+        # Output layer
+        x = self.fc(x)
         return x
+    def generate_response(self, input_text, initial_temperature=0.85, top_p=0.8, repetition_penalty=1.4, max_token_frequency=2):
+        # Convert input_text to token ids
+        input_ids = self.tokenize(input_text)
+        input_tensor = torch.tensor(input_ids).unsqueeze(0).to(self.device)
+        generated_tokens = input_ids.copy()
+        recent_tokens = list(input_ids[-10:])  # Expanded recent tokens window to 10
+        temperature = initial_temperature
-# GPT Model
-class GPTModel(nn.Module):
-    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=2):
-        super(GPTModel, self).__init__()
-        self.model_type = 'Transformer'
-        self.d_model = d_model
-        self.embedding = nn.Embedding(vocab_size, d_model)
-        self.pos_encoder = PositionalEncoding(d_model)
-        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
-        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
-        self.fc_out = nn.Linear(d_model, vocab_size)
-        self.src_mask = None
+        with torch.no_grad():
+            for i in range(50):  # Generate up to 50 more tokens
+                output = self.forward(input_tensor)
+                logits = output[:, -1, :]  # Consider only the last token's logits
+                logits = logits / (temperature + 1e-2)  # Apply temperature for sampling diversity
-    def _generate_square_subsequent_mask(self, sz):
-        mask = torch.triu(torch.ones(sz, sz) == 1).transpose(0, 1)
-        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-        return mask
+                # Apply repetition penalty
+                for token in set(generated_tokens):
+                    if generated_tokens.count(token) > 1:
+                        logits[0, token] /= (repetition_penalty + generated_tokens.count(token) * 0.02)  # Frequency-based scaling for penalty
-    def forward(self, src):
-        # src: [seq_len, batch_size]
-        src = src.transpose(0, 1)  # Shape: [seq_len, batch_size]
-        src = self.embedding(src) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32))
-        src = self.pos_encoder(src)
-        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
-            device = src.device
-            self.src_mask = self._generate_square_subsequent_mask(src.size(0)).to(device)
-        output = self.transformer_encoder(src, self.src_mask)
-        logits = self.fc_out(output)
-        return logits  # Shape: [seq_len, batch_size, vocab_size]
+                # Apply slight logits smoothing to avoid overly confident peaks
+                logits = logits - torch.mean(logits) * 0.01
+                # Dynamic Nucleus (top-p) sampling with adjusted threshold
+                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+                cumulative_probs = torch.cumsum(self.softmax(sorted_logits), dim=-1)
+                top_p_mask = cumulative_probs < top_p
+                top_p_logits = sorted_logits[top_p_mask]
+                top_p_indices = sorted_indices[top_p_mask]
-# Training function
-def train_step(model, optimizer, criterion, input_tensor, target_tensor):
-    model.train()
-    optimizer.zero_grad()
-    output = model(input_tensor)  # [seq_len, batch_size, vocab_size]
-    output = output.view(-1, output.size(-1))  # [seq_len * batch_size, vocab_size]
-    target = target_tensor.transpose(0,1).contiguous().view(-1)  # [seq_len * batch_size]
-    loss = criterion(output, target)
-    loss.backward()
-    optimizer.step()
-    print(f'Training loss: {loss.item():.4f}')
-    return loss.item()
+                if len(top_p_logits) > 1:
+                    top_p_probs = self.softmax(top_p_logits)
+                    sampled_token = top_p_indices[torch.multinomial(top_p_probs, num_samples=1).item()].item()
+                else:
+                    sampled_token = sorted_indices[0, 0].item()  # Fallback to the most probable token if none pass the top-p threshold
+
+                # Enforce diversity constraint by limiting token frequency
+                if generated_tokens.count(sampled_token) >= max_token_frequency:
+                    logits[0, sampled_token] -= 1.5  # Adjusted penalty to limit token frequency
+                    continue  # Skip adding this token if it has reached the max frequency
+
+                # Stop repetition if the sampled token was recently repeated
+                if len(generated_tokens) > 1 and generated_tokens[-1] == sampled_token:
+                    continue
+
+                # Add token and update state
+                generated_tokens.append(sampled_token)
+                recent_tokens.append(sampled_token)
+                if len(recent_tokens) > 10:
+                    recent_tokens.pop(0)  # Maintain a window of recent tokens to suppress
+
+                # Update input tensor to include the generated token
+                input_tensor = torch.tensor(generated_tokens).unsqueeze(0).to(self.device)
+
+                # Gradually decrease temperature to reduce randomness more smoothly
+                temperature = max(0.75, temperature * 0.98)
+
+        response = self.detokenize(generated_tokens)
+        print("[DEBUG] Generated response:", response)  # Debug statement to verify changes
+        print(f"[DEBUG] Generation loss rate (approximated): {temperature}")  # Approximate loss rate
+        return response
+    def tokenize(self, text):
+        # Character-level tokenizer: converts text to ASCII values
+        token_ids = [ord(char) for char in text if ord(char) < self.vocab_size]
+        return token_ids
-def initialize_model(tokenizer, device):
-    vocab_size = len(tokenizer.token2idx)
-    model = GPTModel(vocab_size=vocab_size).to(device)
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
-    criterion = nn.CrossEntropyLoss(ignore_index=0)
-    model_path = 'gpt_model.pth'
+    def detokenize(self, token_ids):
+        # Detokenizer to convert ASCII values back to characters
+        return "".join([chr(id) for id in token_ids])
-    # Load existing model if available
-    if os.path.exists(model_path):
-        model.load_state_dict(torch.load(model_path, map_location=device))
-        print('Model loaded from', model_path)
-    else:
-        print('No existing model found. Starting fresh.')
-    return model, optimizer, criterion
+    def train_on_message(self, message):
+        # Tokenize the message
+        input_ids = self.tokenize(message)
+        input_tensor = torch.tensor(input_ids).unsqueeze(0).to(self.device)
+
+        # Create target labels (next character prediction task)
+        labels = input_ids[1:] + [input_ids[-1]]  # Shift tokens for training
+        labels_tensor = torch.tensor(labels).unsqueeze(0).to(self.device)
+
+        # Training step
+        self.optimizer.zero_grad()
+        outputs = self.forward(input_tensor)
+        loss = self.criterion(outputs.view(-1, outputs.size(-1)), labels_tensor.view(-1))
+        loss.backward()
+        self.optimizer.step()
+        print(f"Training loss: {loss.item()}")
+# Changes made:
+# Version: Jade-Solstice-Horizon
+# - Reverted temperature, top_p, and repetition penalty settings to be closer to Solstice.
+# - Introduced explicit stop criteria to prevent repeating tokens consecutively.
+# - Applied slight smoothing to logits to prevent high peaks and excessive repetition.
+# - Updated debug message to reflect the new version.
-def save_model(model):
-    model_path = 'gpt_model.pth'
-    torch.save(model.state_dict(), model_path)
-
-
-def update_model_vocab(model, tokenizer, device):
-    vocab_size = len(tokenizer.token2idx)
-
-    old_embedding_weight = model.embedding.weight.data
-    old_vocab_size, embedding_dim = old_embedding_weight.shape
-    new_embedding = nn.Embedding(vocab_size, model.d_model).to(device)
-    new_embedding.weight.data[:old_vocab_size] = old_embedding_weight
-    model.embedding = new_embedding
-
-    old_fc_out_weight = model.fc_out.weight.data
-    old_fc_out_bias = model.fc_out.bias.data
-    new_fc_out = nn.Linear(model.d_model, vocab_size).to(device)
-    new_fc_out.weight.data[:old_vocab_size] = old_fc_out_weight
-    new_fc_out.bias.data[:old_vocab_size] = old_fc_out_bias
-    model.fc_out = new_fc_out
-
-    return model
-
-
-def train_on_conversation(model, optimizer, criterion, tokenizer, input_text, target_text, device):
-    tokenizer.build_vocab([input_text, target_text])
-    input_indices = tokenizer.encode(input_text)
-    target_indices = tokenizer.encode(target_text)
-
-    # Concatenate input and target indices to create a single sequence
-    full_indices = input_indices + target_indices
-
-    # Create input and target sequences for training
-    input_sequence = full_indices[:-1]  # All tokens except the last
-    target_sequence = full_indices[1:]  # All tokens except the first
-
-    # Update model if vocabulary has changed
-    if len(tokenizer.token2idx) != model.embedding.num_embeddings:
-        model = update_model_vocab(model, tokenizer, device)
-        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
-
-    input_tensor = torch.tensor([input_sequence], dtype=torch.long, device=device)
-    target_tensor = torch.tensor([target_sequence], dtype=torch.long, device=device)
-
-    loss = train_step(model, optimizer, criterion, input_tensor, target_tensor)
-    return loss
+# Observations:
+# - Aimed to retain the strengths of Solstice while reducing remaining issues with repetitive tokens by adding specific repetition stop criteria.
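
Note on the sampling scheme used by generate_response in this patch: after temperature scaling and a count-based repetition penalty, the model samples only from the smallest set of tokens whose cumulative probability stays below top_p (nucleus sampling). The snippet below is a minimal standalone sketch of that idea, not part of the patch itself; the function name sample_top_p, its default values, and the sign-aware form of the repetition penalty are illustrative assumptions rather than the patch's exact code.

import torch
import torch.nn.functional as F


def sample_top_p(logits, generated_tokens, temperature=0.85, top_p=0.8, repetition_penalty=1.4):
    """Sample one token id from a 1-D logits tensor using nucleus (top-p) sampling."""
    # Temperature scaling: higher temperature flattens the distribution.
    logits = logits / (temperature + 1e-2)

    # Sign-aware repetition penalty (illustrative variant): make tokens that
    # already appeared more than once less likely, regardless of logit sign.
    for token in set(generated_tokens):
        if generated_tokens.count(token) > 1:
            if logits[token] > 0:
                logits[token] /= repetition_penalty
            else:
                logits[token] *= repetition_penalty

    # Keep the smallest prefix of sorted tokens whose cumulative probability
    # is below top_p, then sample from that nucleus.
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    mask = cumulative_probs < top_p
    mask[0] = True  # always keep at least the most probable token
    nucleus_probs = F.softmax(sorted_logits[mask], dim=-1)
    choice = torch.multinomial(nucleus_probs, num_samples=1).item()
    return sorted_indices[mask][choice].item()


if __name__ == "__main__":
    vocab_size = 256  # character-level vocabulary, as in the patch
    logits = torch.randn(vocab_size)           # stand-in for the model's last-step logits
    history = [ord(c) for c in "hello hello"]  # previously generated character ids
    print(sample_top_p(logits, history))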