Got Jade to exactly copy without extra characters - Version Solstice-Horizon
parent d7f116620a · commit e0ea105872

main.py · 135

@@ -1,123 +1,40 @@
 # main.py: Discord Bot Code

 import discord
 import torch
-from model import SimpleTokenizer, initialize_model, train_on_conversation, save_model, update_model_vocab
-import torch.nn.functional as F
+from model import JadeModel
 import os
 from dotenv import load_dotenv


 # Load environment variables
 load_dotenv()

+intents = discord.Intents.default()
+intents.messages = True
+intents.message_content = True
+client = discord.Client(intents=intents)

-class DiscordBot(discord.Client):
-    def __init__(self, **options):
-        super().__init__(**options)
-        self.tokenizer = SimpleTokenizer()
-        self.tokenizer_vocab_path = 'tokenizer_vocab.json'
-        self.tokenizer.load_vocab(self.tokenizer_vocab_path)
-        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        self.model, self.optimizer, self.criterion = initialize_model(self.tokenizer, self.device)
-        self.conversation_history = []  # Keep track of conversations for learning
-        self.previous_reply = None  # Store last reply for pattern recognition
+# Initialize the model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = JadeModel().to(device)

-    async def on_ready(self):
-        print(f'Logged in as {self.user.name}')

-    async def on_message(self, message):
-        if message.author == self.user:
-            return
+@client.event
+async def on_ready():
+    print(f'We have logged in as {client.user}')


+@client.event
+async def on_message(message):
+    if message.author == client.user:
+        return

+    print(f"Received message from {message.author}: {message.content}")
+    # Train Jade with the new message
+    model.train_on_message(message.content)

-        # Update tokenizer vocabulary with the new message
-        previous_vocab_size = len(self.tokenizer.token2idx)
-        self.tokenizer.build_vocab([message.content])
-        new_vocab_size = len(self.tokenizer.token2idx)
+    # Generate a response using Jade
+    response = model.generate_response(message.content)
+    await message.channel.send(response)

-        # Update model if vocabulary has changed
-        if new_vocab_size != previous_vocab_size:
-            self.model = update_model_vocab(self.model, self.tokenizer, self.device)
-            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
-            print("Model vocabulary updated.")

-        # Generate a reply
-        self.model.eval()
-        with torch.no_grad():
-            reply = self.generate_reply(message.content)
-        print(f"Sending reply: {reply}")
-        await message.channel.send(reply)
-        # Append conversation to history for future learning
-        self.conversation_history.append({
-            "user_message": message.content,
-            "bot_reply": reply,
-            "channel": message.channel
-        })

-        # Continuous learning: Train on this conversation pair
-        loss = train_on_conversation(
-            self.model,
-            self.optimizer,
-            self.criterion,
-            self.tokenizer,
-            message.content,
-            reply,
-            self.device
-        )

-        # Save the model and tokenizer for future sessions
-        save_model(self.model)
-        self.tokenizer.save_vocab(self.tokenizer_vocab_path)

-        # Store this reply to help Jade learn from repetition in future responses
-        self.previous_reply = reply

-    def generate_reply(self, input_text, max_length=20, temperature=1.0, top_k=10):
-        # Prepare the input sequence with special tokens
-        input_sequence = ['<SOS>'] + input_text.split() + ['<EOS>']
-        input_indices = self.tokenizer.encode(' '.join(input_sequence))
-        input_tensor = torch.tensor([input_indices], dtype=torch.long, device=self.device)

-        generated_indices = []
-        for _ in range(max_length):
-            output = self.model(input_tensor)
-            if output.size(0) == 0:
-                print("Model output is empty. Breaking out of generation loop.")
-                break
-            next_token_logits = output[-1, 0, :] / temperature

-            # Penalize <UNK>
-            unk_token_idx = self.tokenizer.token2idx.get('<UNK>', None)
-            if unk_token_idx is not None:
-                next_token_logits[unk_token_idx] = -float('inf')

-            # Apply Top-K sampling
-            top_k = min(top_k, next_token_logits.size(-1))
-            values, indices = torch.topk(next_token_logits, top_k)
-            probabilities = F.softmax(values, dim=-1)
-            predicted_index = indices[torch.multinomial(probabilities, 1)].item()

-            # Stop if <EOS> token is generated
-            if predicted_index == self.tokenizer.token2idx.get('<EOS>'):
-                break

-            generated_indices.append(predicted_index)
-            input_indices.append(predicted_index)
-            input_tensor = torch.tensor([input_indices], dtype=torch.long, device=self.device)

-        # Filter out special tokens from generated indices
-        special_token_indices = set(self.tokenizer.token2idx[token] for token in ['<PAD>', '<UNK>', '<SOS>', '<EOS>'])
-        filtered_indices = [idx for idx in generated_indices if idx not in special_token_indices]

-        # Decode the filtered indices
-        reply = self.tokenizer.decode(filtered_indices)
-        return reply


-DISCORD_TOKEN = os.getenv('DISCORD_TOKEN')

-# Initialize and run the Discord bot
-intents = discord.Intents.default()
-intents.message_content = True
-bot = DiscordBot(intents=intents)
-bot.run(DISCORD_TOKEN)
+# Start the bot with your token
+client.run(os.getenv('DISCORD_TOKEN'))
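A minimal sketch (not part of this commit) of how the new train-then-respond flow above can be exercised without a Discord connection, assuming the updated model.py below is importable; the script name and sample text are made up for illustration:

# offline_check.py -- hypothetical smoke test for the new main.py flow
from model import JadeModel

model = JadeModel()                      # prints its version debug message on init
sample = "hello jade"                    # stands in for message.content
model.train_on_message(sample)           # one training step on the raw text
reply = model.generate_response(sample)  # character-level generation, up to 50 new tokens
print(repr(reply))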
model.py · 275

@@ -1,183 +1,158 @@
 # Suggested Refinements for Jade (Model.py)

 import torch
 import torch.nn as nn
-import threading
-import os
-import json
+import torch.optim as optim
+import random
+import string
+import numpy as np


-# Simple Tokenizer
-class SimpleTokenizer:
+class JadeModel(nn.Module):
     def __init__(self):
-        self.token2idx = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
-        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
-        self.lock = threading.Lock()
+        super(JadeModel, self).__init__()
+        # GPT-like Transformer architecture
+        self.vocab_size = 256  # Character-level tokenization (ASCII range)
+        self.embedding_dim = 768  # GPT-like embedding dimension
+        self.num_heads = 12  # Number of attention heads
+        self.num_layers = 12  # Number of transformer layers
+        self.max_position_embeddings = 512  # Maximum sequence length

-    def build_vocab(self, texts):
-        with self.lock:
-            for text in texts:
-                tokens = text.split()
-                for token in tokens:
-                    if token not in self.token2idx:
-                        idx = len(self.token2idx)
-                        self.token2idx[token] = idx
-                        self.idx2token[idx] = token
+        # Embedding layers
+        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
+        self.position_embedding = nn.Embedding(self.max_position_embeddings, self.embedding_dim)

-    def encode(self, text):
-        with self.lock:
-            return [self.token2idx.get(token, self.token2idx['<UNK>']) for token in text.split()]
+        # Transformer layers
+        self.transformer_layers = nn.ModuleList([
+            nn.TransformerEncoderLayer(d_model=self.embedding_dim, nhead=self.num_heads)
+            for _ in range(self.num_layers)
+        ])

-    def decode(self, indices):
-        with self.lock:
-            return ' '.join([self.idx2token.get(idx, '<UNK>') for idx in indices])
+        # Output layer
+        self.fc = nn.Linear(self.embedding_dim, self.vocab_size)
+        self.softmax = nn.Softmax(dim=-1)

-    def save_vocab(self, path):
-        with open(path, 'w') as f:
-            json.dump({'token2idx': self.token2idx, 'idx2token': self.idx2token}, f)
+        # Optimizer and loss function
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+        self.criterion = nn.CrossEntropyLoss()

-    def load_vocab(self, path):
-        if os.path.exists(path):
-            with open(path, 'r') as f:
-                vocab = json.load(f)
-            self.token2idx = vocab['token2idx']
-            self.idx2token = {int(k): v for k, v in vocab['idx2token'].items()}
-            print('Tokenizer vocabulary loaded from', path)
-            # Ensure special tokens are present
-            special_tokens = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
-            for token, idx in special_tokens.items():
-                if token not in self.token2idx:
-                    self.token2idx[token] = idx
-                    self.idx2token[idx] = token
-        else:
-            print('No existing tokenizer vocabulary found. Starting fresh.')
-            self.token2idx = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
-            self.idx2token = {idx: token for token, idx in self.token2idx.items()}
+        # Device setup
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(self.device)

+        # Debug message to verify changes (updated unique message for each change)
+        self.debug_message = "[DEBUG] Model initialized with version: Jade-Solstice-Horizon"
+        print(self.debug_message)
-# Positional Encoding
-class PositionalEncoding(nn.Module):
-    def __init__(self, d_model, max_len=5000):
-        super(PositionalEncoding, self).__init__()
-        pe = torch.zeros(max_len, d_model)
-        position = torch.arange(0, max_len).unsqueeze(1).float()
-        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
-        pe[:, 0::2] = torch.sin(position * div_term)
-        if d_model % 2 == 1:
-            pe[:, -1] = torch.cos(position.squeeze() * div_term[-1])
-        else:
-            pe[:, 1::2] = torch.cos(position * div_term)
-        pe = pe.unsqueeze(1)  # Shape: [max_len, 1, d_model]
-        self.register_buffer('pe', pe)
+    def forward(self, input_ids):
+        # Create position ids for input sequence
+        seq_length = input_ids.size(1)
+        position_ids = torch.arange(0, seq_length, dtype=torch.long, device=self.device)
+        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

-    def forward(self, x):
-        # x: [seq_len, batch_size, d_model]
-        x = x + self.pe[:x.size(0)]
+        # Embedding lookup
+        x = self.embedding(input_ids) + self.position_embedding(position_ids)

+        # Pass through transformer layers
+        for layer in self.transformer_layers:
+            x = layer(x)

+        # Output layer
+        x = self.fc(x)
         return x

+    def generate_response(self, input_text, initial_temperature=0.85, top_p=0.8, repetition_penalty=1.4, max_token_frequency=2):
+        # Convert input_text to token ids
+        input_ids = self.tokenize(input_text)
+        input_tensor = torch.tensor(input_ids).unsqueeze(0).to(self.device)
+        generated_tokens = input_ids.copy()
+        recent_tokens = list(input_ids[-10:])  # Expanded recent tokens window to 10
+        temperature = initial_temperature

-# GPT Model
-class GPTModel(nn.Module):
-    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=2):
-        super(GPTModel, self).__init__()
-        self.model_type = 'Transformer'
-        self.d_model = d_model
-        self.embedding = nn.Embedding(vocab_size, d_model)
-        self.pos_encoder = PositionalEncoding(d_model)
-        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
-        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
-        self.fc_out = nn.Linear(d_model, vocab_size)
-        self.src_mask = None
+        with torch.no_grad():
+            for i in range(50):  # Generate up to 50 more tokens
+                output = self.forward(input_tensor)
+                logits = output[:, -1, :]  # Consider only the last token's logits
+                logits = logits / (temperature + 1e-2)  # Apply temperature for sampling diversity

-    def _generate_square_subsequent_mask(self, sz):
-        mask = torch.triu(torch.ones(sz, sz) == 1).transpose(0, 1)
-        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-        return mask
+                # Apply repetition penalty
+                for token in set(generated_tokens):
+                    if generated_tokens.count(token) > 1:
+                        logits[0, token] /= (repetition_penalty + generated_tokens.count(token) * 0.02)  # Frequency-based scaling for penalty

-    def forward(self, src):
-        # src: [seq_len, batch_size]
-        src = src.transpose(0, 1)  # Shape: [seq_len, batch_size]
-        src = self.embedding(src) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32))
-        src = self.pos_encoder(src)
-        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
-            device = src.device
-            self.src_mask = self._generate_square_subsequent_mask(src.size(0)).to(device)
-        output = self.transformer_encoder(src, self.src_mask)
-        logits = self.fc_out(output)
-        return logits  # Shape: [seq_len, batch_size, vocab_size]
+                # Apply slight logits smoothing to avoid overly confident peaks
+                logits = logits - torch.mean(logits) * 0.01

+                # Dynamic Nucleus (top-p) sampling with adjusted threshold
+                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+                cumulative_probs = torch.cumsum(self.softmax(sorted_logits), dim=-1)
+                top_p_mask = cumulative_probs < top_p
+                top_p_logits = sorted_logits[top_p_mask]
+                top_p_indices = sorted_indices[top_p_mask]

-# Training function
-def train_step(model, optimizer, criterion, input_tensor, target_tensor):
-    model.train()
-    optimizer.zero_grad()
-    output = model(input_tensor)  # [seq_len, batch_size, vocab_size]
-    output = output.view(-1, output.size(-1))  # [seq_len * batch_size, vocab_size]
-    target = target_tensor.transpose(0, 1).contiguous().view(-1)  # [seq_len * batch_size]
-    loss = criterion(output, target)
-    loss.backward()
-    optimizer.step()
-    print(f'Training loss: {loss.item():.4f}')
-    return loss.item()
-def initialize_model(tokenizer, device):
-    vocab_size = len(tokenizer.token2idx)
-    model = GPTModel(vocab_size=vocab_size).to(device)
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
-    criterion = nn.CrossEntropyLoss(ignore_index=0)
-    model_path = 'gpt_model.pth'

-    # Load existing model if available
-    if os.path.exists(model_path):
-        model.load_state_dict(torch.load(model_path, map_location=device))
-        print('Model loaded from', model_path)
-    else:
-        print('No existing model found. Starting fresh.')
-    return model, optimizer, criterion
+                if len(top_p_logits) > 1:
+                    top_p_probs = self.softmax(top_p_logits)
+                    sampled_token = top_p_indices[torch.multinomial(top_p_probs, num_samples=1).item()].item()
+                else:
+                    sampled_token = sorted_indices[0, 0].item()  # Fallback to the most probable token if none pass the top-p threshold

+                # Enforce diversity constraint by limiting token frequency
+                if generated_tokens.count(sampled_token) >= max_token_frequency:
+                    logits[0, sampled_token] -= 1.5  # Adjusted penalty to limit token frequency
+                    continue  # Skip adding this token if it has reached the max frequency

-def save_model(model):
-    model_path = 'gpt_model.pth'
-    torch.save(model.state_dict(), model_path)
+                # Stop repetition if the sampled token was recently repeated
+                if len(generated_tokens) > 1 and generated_tokens[-1] == sampled_token:
+                    continue

+                # Add token and update state
+                generated_tokens.append(sampled_token)
+                recent_tokens.append(sampled_token)
+                if len(recent_tokens) > 10:
+                    recent_tokens.pop(0)  # Maintain a window of recent tokens to suppress

-def update_model_vocab(model, tokenizer, device):
-    vocab_size = len(tokenizer.token2idx)
+                # Update input tensor to include the generated token
+                input_tensor = torch.tensor(generated_tokens).unsqueeze(0).to(self.device)

-    old_embedding_weight = model.embedding.weight.data
-    old_vocab_size, embedding_dim = old_embedding_weight.shape
-    new_embedding = nn.Embedding(vocab_size, model.d_model).to(device)
-    new_embedding.weight.data[:old_vocab_size] = old_embedding_weight
-    model.embedding = new_embedding
+                # Gradually decrease temperature to reduce randomness more smoothly
+                temperature = max(0.75, temperature * 0.98)

-    old_fc_out_weight = model.fc_out.weight.data
-    old_fc_out_bias = model.fc_out.bias.data
-    new_fc_out = nn.Linear(model.d_model, vocab_size).to(device)
-    new_fc_out.weight.data[:old_vocab_size] = old_fc_out_weight
-    new_fc_out.bias.data[:old_vocab_size] = old_fc_out_bias
-    model.fc_out = new_fc_out
+        response = self.detokenize(generated_tokens)
+        print("[DEBUG] Generated response:", response)  # Debug statement to verify changes
+        print(f"[DEBUG] Generation loss rate (approximated): {temperature}")  # Approximate loss rate
+        return response

-    return model
+    def tokenize(self, text):
+        # Character-level tokenizer: converts text to ASCII values
+        token_ids = [ord(char) for char in text if ord(char) < self.vocab_size]
+        return token_ids

+    def detokenize(self, token_ids):
+        # Detokenizer to convert ASCII values back to characters
+        return "".join([chr(id) for id in token_ids])

-def train_on_conversation(model, optimizer, criterion, tokenizer, input_text, target_text, device):
-    tokenizer.build_vocab([input_text, target_text])
-    input_indices = tokenizer.encode(input_text)
-    target_indices = tokenizer.encode(target_text)
+    def train_on_message(self, message):
+        # Tokenize the message
+        input_ids = self.tokenize(message)
+        input_tensor = torch.tensor(input_ids).unsqueeze(0).to(self.device)

-    # Concatenate input and target indices to create a single sequence
-    full_indices = input_indices + target_indices
+        # Create target labels (next character prediction task)
+        labels = input_ids[1:] + [input_ids[-1]]  # Shift tokens for training
+        labels_tensor = torch.tensor(labels).unsqueeze(0).to(self.device)

-    # Create input and target sequences for training
-    input_sequence = full_indices[:-1]  # All tokens except the last
-    target_sequence = full_indices[1:]  # All tokens except the first
+        # Training step
+        self.optimizer.zero_grad()
+        outputs = self.forward(input_tensor)
+        loss = self.criterion(outputs.view(-1, outputs.size(-1)), labels_tensor.view(-1))
+        loss.backward()
+        self.optimizer.step()
+        print(f"Training loss: {loss.item()}")

-    # Update model if vocabulary has changed
-    if len(tokenizer.token2idx) != model.embedding.num_embeddings:
-        model = update_model_vocab(model, tokenizer, device)
-        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+# Changes made:
+# Version: Jade-Solstice-Horizon
+# - Reverted temperature, top_p, and repetition penalty settings to be closer to Solstice.
+# - Introduced explicit stop criteria to prevent repeating tokens consecutively.
+# - Applied slight smoothing to logits to prevent high peaks and excessive repetition.
+# - Updated debug message to reflect the new version.

-    input_tensor = torch.tensor([input_sequence], dtype=torch.long, device=device)
-    target_tensor = torch.tensor([target_sequence], dtype=torch.long, device=device)

-    loss = train_step(model, optimizer, criterion, input_tensor, target_tensor)
-    return loss
+# Observations:
+# - Aimed to retain the strengths of Solstice while reducing remaining issues with repetitive tokens by adding specific repetition stop criteria.
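The generation settings summarized in the notes above (temperature, top-p threshold, repetition penalty) can be looked at in isolation. Below is a minimal sketch, independent of JadeModel, of a single top-p sampling step with the same count-based repetition penalty; it omits the frequency cap and the consecutive-repeat check, and every name and value in it is illustrative only:

import torch
import torch.nn.functional as F

def sample_top_p(logits, generated, temperature=0.85, top_p=0.8, repetition_penalty=1.4):
    # One sampling step over a 1-D logits vector, in the spirit of generate_response above
    logits = logits / (temperature + 1e-2)            # temperature scaling
    for tok in set(generated):                        # count-based repetition penalty
        if generated.count(tok) > 1:
            logits[tok] /= (repetition_penalty + generated.count(tok) * 0.02)
    sorted_logits, sorted_idx = torch.sort(logits, descending=True)
    cumulative = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    mask = cumulative < top_p                         # smallest prefix of tokens under the top_p mass
    if mask.sum() > 1:
        probs = F.softmax(sorted_logits[mask], dim=-1)
        return sorted_idx[mask][torch.multinomial(probs, 1)].item()
    return sorted_idx[0].item()                       # fallback: most probable token

# Illustrative call: random logits over the 256-character vocabulary
next_token = sample_top_p(torch.randn(256), generated=[104, 105, 104])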
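The training target in train_on_message is plain next-character prediction: the labels are the input ids shifted left by one position, with the final id repeated so the two sequences stay the same length. A small worked example of that label construction (the values are just the ASCII codes of "hey"):

input_ids = [ord(c) for c in "hey"]        # [104, 101, 121]
labels = input_ids[1:] + [input_ids[-1]]   # [101, 121, 121], each position predicts the next character
assert len(labels) == len(input_ids)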