Fix: Moved the files around because the imports were not resolving correctly.
Feat: Phoebe now replies, but the output is still gibberish. This is a breaking version change due to the file structure change.
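With the module now living at phoebe/gpt_model.py, callers would pull it in roughly like this (a sketch; the consuming scripts and any phoebe/__init__.py are not part of this diff):

# Hypothetical caller elsewhere in the repo (not shown in this commit):
from phoebe.gpt_model import GPT, encode, decode, load_model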
phoebe/gpt_model.py (new file, 162 lines)
@@ -0,0 +1,162 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import os


# Hyperparameters
batch_size = 64
block_size = 256
num_embed = 384  # Ensure consistency in naming
num_heads = 8
num_layers = 8
dropout = 0.2


class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(num_embed, head_size)
        self.query = nn.Linear(num_embed, head_size)
        self.value = nn.Linear(num_embed, head_size)
        self.register_buffer(
            "tril", torch.tril(torch.ones(block_size, block_size))
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_embed, num_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    def __init__(self, num_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(num_embed, 4 * num_embed),
            nn.ReLU(),
            nn.Linear(4 * num_embed, num_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    def __init__(self, num_embed, num_head):
        super().__init__()
        head_size = num_embed // num_head
        self.sa = MultiHeadAttention(num_head, head_size)
        self.ff = FeedForward(num_embed)
        self.ln1 = nn.LayerNorm(num_embed)
        self.ln2 = nn.LayerNorm(num_embed)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ff(x)
        x = self.ln2(x + y)
        return x


class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, num_embed)
        self.position_embedding_table = nn.Embedding(block_size, num_embed)
        self.blocks = nn.Sequential(
            *[Block(num_embed, num_heads) for _ in range(num_layers)]
        )
        self.ln = nn.LayerNorm(num_embed)
        self.lm_head = nn.Linear(num_embed, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(
            torch.arange(T, device=idx.device)
        )
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            print(f"Logits shape: {logits.shape}")  # Debug print
            if logits.size(1) == 0:
                raise ValueError("Logits tensor is empty.")
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


def encode(s, string_to_int):
    # Replace unknown characters with a special token (e.g., "<unk>")
    encoded = []
    for c in s:
        if c in string_to_int:
            encoded.append(string_to_int[c])
        else:
            print(f"Unknown character encountered during encoding: {c}")
            encoded.append(string_to_int["<unk>"])
    return encoded


def decode(lst, int_to_string):
    return "".join([int_to_string[i] for i in lst])


def load_model(vocab_size, model_path="phoebe_model.pt"):
    model = GPT(vocab_size)
    if os.path.exists(model_path):
        model.load_state_dict(
            torch.load(model_path, map_location=torch.device("cpu"))
        )
        print("Model loaded successfully.")
    else:
        print("No pre-trained model found. Initialized a new model.")
    return model
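For context, a minimal sketch of how these pieces could be wired together at inference time. The character vocabulary (string_to_int / int_to_string) is an assumption here, since it is built elsewhere from the training data; it only needs to contain an "<unk>" entry for encode() to fall back on:

# Usage sketch under an assumed toy vocab; the real vocab is built elsewhere.
import torch
from phoebe.gpt_model import load_model, encode, decode

chars = sorted(set("abcdefghijklmnopqrstuvwxyz ")) + ["<unk>"]
string_to_int = {c: i for i, c in enumerate(chars)}
int_to_string = {i: c for c, i in string_to_int.items()}

model = load_model(vocab_size=len(chars))  # loads phoebe_model.pt if present
model.eval()

prompt = torch.tensor([encode("hello phoebe ", string_to_int)], dtype=torch.long)
with torch.no_grad():
    reply_ids = model.generate(prompt, max_new_tokens=50)
print(decode(reply_ids[0].tolist(), int_to_string))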