Fix: Moved the files around because the imports were not resolving correctly.
Feat: Phoebe now replies, but the output is still gibberish. This is a breaking version change due to the file structure change.
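With the module now living at phoebe/gpt_model.py, callers would pull it in roughly like this (a sketch; the consuming scripts and any phoebe/__init__.py are not part of this diff):

# Hypothetical caller elsewhere in the repo (not shown in this commit):
from phoebe.gpt_model import GPT, encode, decode, load_model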
phoebe/gpt_model.py (new file, 162 lines)
@@ -0,0 +1,162 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import os


# Hyperparameters
batch_size = 64
block_size = 256
num_embed = 384  # Ensure consistency in naming
num_heads = 8
num_layers = 8
dropout = 0.2


class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(num_embed, head_size)
        self.query = nn.Linear(num_embed, head_size)
        self.value = nn.Linear(num_embed, head_size)
        self.register_buffer(
            "tril", torch.tril(torch.ones(block_size, block_size))
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_embed, num_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    def __init__(self, num_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(num_embed, 4 * num_embed),
            nn.ReLU(),
            nn.Linear(4 * num_embed, num_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    def __init__(self, num_embed, num_head):
        super().__init__()
        head_size = num_embed // num_head
        self.sa = MultiHeadAttention(num_head, head_size)
        self.ff = FeedForward(num_embed)
        self.ln1 = nn.LayerNorm(num_embed)
        self.ln2 = nn.LayerNorm(num_embed)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ff(x)
        x = self.ln2(x + y)
        return x


class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, num_embed)
        self.position_embedding_table = nn.Embedding(block_size, num_embed)
        self.blocks = nn.Sequential(
            *[Block(num_embed, num_heads) for _ in range(num_layers)]
        )
        self.ln = nn.LayerNorm(num_embed)
        self.lm_head = nn.Linear(num_embed, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(
            torch.arange(T, device=idx.device)
        )
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            print(f"Logits shape: {logits.shape}")  # Debug print
            if logits.size(1) == 0:
                raise ValueError("Logits tensor is empty.")
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


def encode(s, string_to_int):
    # Replace unknown characters with a special token (e.g., "<unk>")
    encoded = []
    for c in s:
        if c in string_to_int:
            encoded.append(string_to_int[c])
        else:
            print(f"Unknown character encountered during encoding: {c}")
            encoded.append(string_to_int["<unk>"])
    return encoded


def decode(lst, int_to_string):
    return "".join([int_to_string[i] for i in lst])


def load_model(vocab_size, model_path="phoebe_model.pt"):
    model = GPT(vocab_size)
    if os.path.exists(model_path):
        model.load_state_dict(
            torch.load(model_path, map_location=torch.device("cpu"))
        )
        print("Model loaded successfully.")
    else:
        print("No pre-trained model found. Initialized a new model.")
    return model
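For context, a minimal sketch of how these pieces could be wired together at inference time. The character vocabulary (string_to_int / int_to_string) is an assumption here, since it is built elsewhere from the training data; it only needs to contain an "<unk>" entry for encode() to fall back on:

# Usage sketch under an assumed toy vocab; the real vocab is built elsewhere.
import torch
from phoebe.gpt_model import load_model, encode, decode

chars = sorted(set("abcdefghijklmnopqrstuvwxyz ")) + ["<unk>"]
string_to_int = {c: i for i, c in enumerate(chars)}
int_to_string = {i: c for c, i in string_to_int.items()}

model = load_model(vocab_size=len(chars))  # loads phoebe_model.pt if present
model.eval()

prompt = torch.tensor([encode("hello phoebe ", string_to_int)], dtype=torch.long)
with torch.no_grad():
    reply_ids = model.generate(prompt, max_new_tokens=50)
print(decode(reply_ids[0].tolist(), int_to_string))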