# Suggested Refinements for Jade (Model.py)
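# Character-level, GPT-style transformer with simple training and sampling helpers.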
import torch
import torch.nn as nn
import torch.optim as optim
import random
import string
import numpy as np


class JadeModel(nn.Module):
    def __init__(self):
        super(JadeModel, self).__init__()

        # GPT-like Transformer architecture
        self.vocab_size = 256  # Character-level tokenization (ASCII range)
        self.embedding_dim = 768  # GPT-like embedding dimension
        self.num_heads = 12  # Number of attention heads
        self.num_layers = 12  # Number of transformer layers
        self.max_position_embeddings = 512  # Maximum sequence length

        # Embedding layers
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.position_embedding = nn.Embedding(self.max_position_embeddings, self.embedding_dim)

        # Transformer layers (batch_first=True so inputs are shaped (batch, seq, embedding),
        # matching the tensors produced in forward())
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=self.embedding_dim, nhead=self.num_heads, batch_first=True)
            for _ in range(self.num_layers)
        ])

        # Output layer (softmax is only used during generation; training uses the raw logits)
        self.fc = nn.Linear(self.embedding_dim, self.vocab_size)
        self.softmax = nn.Softmax(dim=-1)

        # Optimizer and loss function
        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
        self.criterion = nn.CrossEntropyLoss()

        # Device setup
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(self.device)

        # Debug message to verify changes (a unique message is set for each revision)
        self.debug_message = "[DEBUG] Model initialized with version: Jade-Solstice-Horizon"
        print(self.debug_message)

    def forward(self, input_ids):
        # Create position ids for the input sequence
        seq_length = input_ids.size(1)
        position_ids = torch.arange(0, seq_length, dtype=torch.long, device=self.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        # Embedding lookup
        x = self.embedding(input_ids) + self.position_embedding(position_ids)

        # Pass through transformer layers
        for layer in self.transformer_layers:
            x = layer(x)

        # Output layer: returns raw logits of shape (batch, seq_length, vocab_size)
        x = self.fc(x)
        return x

    def generate_response(self, input_text, initial_temperature=0.85, top_p=0.8, repetition_penalty=1.4, max_token_frequency=2):
        # Convert input_text to token ids
        input_ids = self.tokenize(input_text)
        input_tensor = torch.tensor(input_ids).unsqueeze(0).to(self.device)
        generated_tokens = input_ids.copy()
        recent_tokens = list(input_ids[-10:])  # Expanded recent-tokens window to 10
        temperature = initial_temperature

        with torch.no_grad():
            for i in range(50):  # Generate up to 50 more tokens
                output = self.forward(input_tensor)
                logits = output[:, -1, :]  # Consider only the last token's logits
                logits = logits / (temperature + 1e-2)  # Apply temperature for sampling diversity

                # Apply a frequency-scaled repetition penalty (sign-aware, so penalising
                # a negative logit does not accidentally increase its probability)
                for token in set(generated_tokens):
                    count = generated_tokens.count(token)
                    if count > 1:
                        penalty = repetition_penalty + count * 0.02  # Frequency-based scaling for the penalty
                        logits[0, token] = logits[0, token] / penalty if logits[0, token] > 0 else logits[0, token] * penalty

                # Apply slight logits smoothing to avoid overly confident peaks
                # (interpolate toward the mean; subtracting a constant would leave the softmax unchanged)
                logits = 0.99 * logits + 0.01 * torch.mean(logits)

                # Dynamic nucleus (top-p) sampling with adjusted threshold
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(self.softmax(sorted_logits), dim=-1)
                top_p_mask = cumulative_probs < top_p
                top_p_logits = sorted_logits[top_p_mask]
                top_p_indices = sorted_indices[top_p_mask]

                if len(top_p_logits) > 1:
                    top_p_probs = self.softmax(top_p_logits)
                    sampled_token = top_p_indices[torch.multinomial(top_p_probs, num_samples=1).item()].item()
                else:
                    sampled_token = sorted_indices[0, 0].item()  # Fall back to the most probable token if none pass the top-p threshold

                # Enforce a diversity constraint by capping how often a token may appear
                # (the cap is enforced by skipping the token; logits are rebuilt on the next step)
                if generated_tokens.count(sampled_token) >= max_token_frequency:
                    continue  # Skip adding this token if it has reached the max frequency

                # Stop repetition if the sampled token would immediately repeat the previous token
                if len(generated_tokens) > 1 and generated_tokens[-1] == sampled_token:
                    continue

                # Add token and update state
                generated_tokens.append(sampled_token)
                recent_tokens.append(sampled_token)
                if len(recent_tokens) > 10:
                    recent_tokens.pop(0)  # Maintain a sliding window of recent tokens

                # Update the input tensor to include the generated token, truncated to the
                # maximum sequence length supported by the position embedding
                input_tensor = torch.tensor(generated_tokens[-self.max_position_embeddings:]).unsqueeze(0).to(self.device)

                # Gradually decrease temperature to reduce randomness more smoothly
                temperature = max(0.75, temperature * 0.98)

        response = self.detokenize(generated_tokens)
        print("[DEBUG] Generated response:", response)  # Debug statement to verify changes
        print(f"[DEBUG] Final sampling temperature: {temperature}")  # Temperature after decay, not a loss estimate
        return response

    def tokenize(self, text):
        # Character-level tokenizer: converts text to ASCII values
        token_ids = [ord(char) for char in text if ord(char) < self.vocab_size]
        return token_ids

    def detokenize(self, token_ids):
        # Detokenizer to convert ASCII values back to characters
        return "".join(chr(token_id) for token_id in token_ids)
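    # Illustrative round trip (standard ASCII values): tokenize("Hi") returns [72, 105],
    # and detokenize([72, 105]) returns "Hi"; characters with code points >= 256 are dropped.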

    def train_on_message(self, message):
        # Tokenize the message
        input_ids = self.tokenize(message)
        input_tensor = torch.tensor(input_ids).unsqueeze(0).to(self.device)

        # Create target labels for next-character prediction:
        # shift the tokens left by one and repeat the final token as its own target
        labels = input_ids[1:] + [input_ids[-1]]
        labels_tensor = torch.tensor(labels).unsqueeze(0).to(self.device)

        # Training step
        self.optimizer.zero_grad()
        outputs = self.forward(input_tensor)
        loss = self.criterion(outputs.view(-1, outputs.size(-1)), labels_tensor.view(-1))
        loss.backward()
        self.optimizer.step()
        print(f"Training loss: {loss.item()}")


# Changes made:
# Version: Jade-Solstice-Horizon
# - Reverted temperature, top_p, and repetition penalty settings to be closer to Solstice.
# - Introduced explicit stop criteria to prevent repeating tokens consecutively.
# - Applied slight smoothing to logits to prevent high peaks and excessive repetition.
# - Updated debug message to reflect the new version.

# Observations:
# - Aimed to retain the strengths of Solstice while reducing remaining issues with repetitive tokens by adding specific repetition stop criteria.
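
# Example usage sketch (illustrative only; the sample strings below are arbitrary):
if __name__ == "__main__":
    model = JadeModel()

    # One training step on a short sample message (next-character prediction)
    model.train_on_message("hello jade")

    # Sample a continuation of a prompt with the nucleus-sampling defaults above
    reply = model.generate_response("hello")
    print(reply)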