From 9bf650ca79658d1cc8f324eeafbc4d4673eb55fa Mon Sep 17 00:00:00 2001
From: Dani
Date: Sun, 11 May 2025 22:20:07 -0400
Subject: [PATCH] Trying a new plan of only adding small features one at a time.

---
 .gitignore      |   1 +
 core/brain.py   | 115 ++++++++++++++++++++++++++++++++++++++++++++++
 core/dataset.py |  37 +++++++++++++
 core/model.py   | 130 +++++++++++++++++++++++++++++++++++++++++++++++++
 main.py         |  39 ++++++++++++++
 train.py        |  46 +++++++++++++++++
 6 files changed, 368 insertions(+)
 create mode 100644 core/brain.py
 create mode 100644 core/dataset.py
 create mode 100644 core/model.py
 create mode 100644 main.py
 create mode 100644 train.py

diff --git a/.gitignore b/.gitignore
index 134ee59..f702cb0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,3 +171,4 @@ cython_debug/
 books/*
 *.json
 models/best_gen.pt
+/model.pth
\ No newline at end of file
diff --git a/core/brain.py b/core/brain.py
new file mode 100644
index 0000000..33460da
--- /dev/null
+++ b/core/brain.py
@@ -0,0 +1,115 @@
+import os
+import time
+import asyncio
+import torch
+from torch.utils.data import DataLoader
+from torch.optim import AdamW
+import discord
+from core.dataset import CharDataset
+from core.model import GPT, GPTConfig
+
+
+class Brain:
+    """
+    Loads model and dataset, serves generate_response() to Discord,
+    and runs an async online training loop whenever Ruby is idle.
+    """
+    def __init__(
+        self,
+        books_dir: str = './books',
+        model_path: str = './model.pth',
+        block_size: int = 128,
+        train_batch_size: int = 8,
+        idle_threshold: float = 60.0,  # seconds of idle before training
+        lr: float = 3e-4,
+        client: discord.Client = None,
+        status_channel_id: int = None
+    ):
+        # device
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # dataset + loader
+        ds = CharDataset(books_dir, block_size)
+        self.stoi, self.itos = ds.stoi, ds.itos
+        self.block_size = block_size
+        self.train_loader = DataLoader(ds, batch_size=train_batch_size, shuffle=True)
+        self._train_iter = iter(self.train_loader)
+        # model & optimizer
+        config = GPTConfig(
+            vocab_size=ds.vocab_size,
+            block_size=block_size,
+            n_layer=6,
+            n_head=6,
+            n_embd=384,
+        )
+        self.model = GPT(config).to(self.device)
+        if os.path.exists(model_path):
+            self.model.load_state_dict(torch.load(model_path, map_location=self.device))
+        self.optimizer = AdamW(self.model.parameters(), lr=lr)
+        self.model.train()
+        # tracking idle time
+        self.last_active = time.time()
+        self.idle_threshold = idle_threshold
+        self.model_path = model_path
+        # discord hooks
+        self.client = client
+        self.status_channel_id = status_channel_id
+
+    async def generate_response(self, prompt: str, **gen_kwargs) -> str:
+        self.last_active = time.time()
+        idx = torch.tensor(
+            [[self.stoi.get(ch, 0) for ch in prompt[-self.block_size:]]],
+            dtype=torch.long,
+            device=self.device
+        )
+        self.model.eval()
+        out = self.model.generate(idx, **gen_kwargs)[0]
+        self.model.train()
+        return ''.join(self.itos[i] for i in out.tolist())
+
+    async def train_online(self):
+        """
+        Background task: whenever idle >= idle_threshold,
+        perform one training batch, save checkpoint, then loop.
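+        Note: the forward/backward pass below runs synchronously on the
+        event loop, so Ruby may pause handling messages while a batch
+        trains; offloading the step to an executor is a possible follow-up.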
+        """
+        while True:
+            if time.time() - self.last_active >= self.idle_threshold:
+                # 1) log & presence
+                print("⚙️ [Brain] Idle threshold reached—starting training batch.")
+                if self.client:
+                    await self.client.change_presence(
+                        activity=discord.Activity(
+                            type=discord.ActivityType.watching,
+                            name="Training Ruby…"
+                        )
+                    )
+
+                # 2) pull next batch
+                try:
+                    xb, yb = next(self._train_iter)
+                except StopIteration:
+                    self._train_iter = iter(self.train_loader)
+                    xb, yb = next(self._train_iter)
+                xb, yb = xb.to(self.device), yb.to(self.device)
+
+                # 3) forward/backward
+                logits, loss = self.model(xb, yb)
+                self.optimizer.zero_grad()
+                loss.backward()
+                self.optimizer.step()
+
+                # 4) save & log
+                torch.save(self.model.state_dict(), self.model_path)
+                print(f"✅ [Brain] Finished batch. Loss: {loss.item():.4f}")
+
+                # 5) optional Discord ping
+                if self.client and self.status_channel_id:
+                    chan = self.client.get_channel(self.status_channel_id)
+                    if chan:
+                        await chan.send(f"🤖 Trained one batch, loss: {loss.item():.4f}")
+
+                # 6) reset presence & idle timer
+                if self.client:
+                    await self.client.change_presence(activity=None)
+                self.last_active = time.time()
+
+            await asyncio.sleep(1)
diff --git a/core/dataset.py b/core/dataset.py
new file mode 100644
index 0000000..5e58206
--- /dev/null
+++ b/core/dataset.py
@@ -0,0 +1,37 @@
+import os
+import torch
+from torch.utils.data import Dataset
+
+
+class CharDataset(Dataset):
+    """
+    Builds a char-level dataset from all .txt files under books_dir.
+    Returns sequences of length block_size for next-char prediction.
+    """
+    def __init__(self, books_dir: str, block_size: int):
+        texts = []
+        for fn in os.listdir(books_dir):
+            if fn.lower().endswith('.txt'):
+                path = os.path.join(books_dir, fn)
+                with open(path, 'r', encoding='utf8') as f:
+                    texts.append(f.read())
+        data = '\n'.join(texts)
+        # build vocab
+        chars = sorted(set(data))
+        self.stoi = {ch: i for i, ch in enumerate(chars)}
+        self.itos = {i: ch for ch, i in self.stoi.items()}
+        self.vocab_size = len(self.stoi)
+        # encode all data as a single tensor
+        self.data = torch.tensor(
+            [self.stoi[ch] for ch in data],
+            dtype=torch.long
+        )
+        self.block_size = block_size
+
+    def __len__(self):
+        return len(self.data) - self.block_size
+
+    def __getitem__(self, idx):
+        x = self.data[idx: idx + self.block_size]
+        y = self.data[idx + 1: idx + 1 + self.block_size]
+        return x, y
diff --git a/core/model.py b/core/model.py
new file mode 100644
index 0000000..e6e3c09
--- /dev/null
+++ b/core/model.py
@@ -0,0 +1,130 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+
+class GPTConfig:
+    """Configuration for our GPT model."""
+    def __init__(
+        self,
+        vocab_size: int,
+        block_size: int,
+        n_layer: int = 8,
+        n_head: int = 8,
+        n_embd: int = 512,
+    ):
+        self.vocab_size = vocab_size
+        self.block_size = block_size
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_embd = n_embd
+
+
+class CausalSelfAttention(nn.Module):
+    """A single multi-head causal self-attention layer."""
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        self.key = nn.Linear(config.n_embd, config.n_embd)
+        self.query = nn.Linear(config.n_embd, config.n_embd)
+        self.value = nn.Linear(config.n_embd, config.n_embd)
+        self.proj = nn.Linear(config.n_embd, config.n_embd)
+        self.n_head = config.n_head
+        self.head_dim = config.n_embd // config.n_head
+        # causal mask, buffer not a parameter
+        mask = torch.tril(torch.ones(config.block_size, config.block_size))
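+        # the (block_size, block_size) lower-triangular mask is sliced to
+        # [:T, :T] in forward() and broadcasts over the (B, n_head, T, T)
+        # attention scores, blocking attention to future positions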
+        self.register_buffer("mask", mask)
+
+    def forward(self, x):
+        B, T, C = x.size()
+        k = self.key(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        q = self.query(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        # compute attention scores
+        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
+        att = att.masked_fill(self.mask[:T, :T] == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        v = self.value(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        out = att @ v
+        out = out.transpose(1, 2).contiguous().view(B, T, C)
+        return self.proj(out)
+
+
+class MLP(nn.Module):
+    """Feed-forward layer."""
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        self.fc1 = nn.Linear(config.n_embd, 4 * config.n_embd)
+        self.fc2 = nn.Linear(4 * config.n_embd, config.n_embd)
+
+    def forward(self, x):
+        return self.fc2(F.gelu(self.fc1(x)))
+
+
+class Block(nn.Module):
+    """Transformer block: attention + feed-forward."""
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(config.n_embd)
+        self.ln2 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln1(x))
+        x = x + self.mlp(self.ln2(x))
+        return x
+
+
+class GPT(nn.Module):
+    """GPT language model from scratch."""
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        self.token_emb = nn.Embedding(config.vocab_size, config.n_embd)
+        self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
+        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
+        self.ln_f = nn.LayerNorm(config.n_embd)
+        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.block_size = config.block_size
+
+    def forward(self, idx, targets=None):
+        B, T = idx.size()
+        tok_emb = self.token_emb(idx)     # (B,T,C)
+        pos_emb = self.pos_emb[:, :T, :]  # (1,T,C)
+        x = tok_emb + pos_emb
+        for block in self.blocks:
+            x = block(x)
+        x = self.ln_f(x)
+        logits = self.head(x)             # (B,T,vocab)
+        loss = None
+        if targets is not None:
+            # targets arrive already shifted by the dataset; flatten for cross-entropy
+            logits = logits.view(B * T, -1)
+            targets = targets.view(B * T)
+            loss = F.cross_entropy(logits, targets)
+        return logits, loss
+
+    @torch.no_grad()
+    def generate(
+        self,
+        idx,
+        max_new_tokens: int,
+        temperature: float = 1.0,
+        top_k: int = None
+    ):
+        """
+        Iteratively predict next token and append to sequence.
+        - idx is (B,T) starting context.
+        - Returns (B, T+max_new_tokens).
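+        - temperature scales the logits before softmax (< 1.0 is greedier,
+          > 1.0 is more random).
+        - top_k, if given, restricts sampling to the k most likely tokens.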
+        """
+        for _ in range(max_new_tokens):
+            idx_cond = idx[:, -self.block_size:]
+            logits, _ = self(idx_cond)
+            logits = logits[:, -1, :] / temperature
+            if top_k is not None:
+                v, _ = torch.topk(logits, top_k)
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            probs = F.softmax(logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat([idx, next_token], dim=1)
+        return idx
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..38f35c6
--- /dev/null
+++ b/main.py
@@ -0,0 +1,39 @@
+import os
+import asyncio
+from dotenv import load_dotenv
+import discord
+from core.brain import Brain
+
+load_dotenv()
+TOKEN = os.getenv('DISCORD_TOKEN')
+if not TOKEN:
+    raise RuntimeError('DISCORD_TOKEN not set in .env')
+
+STATUS_CHANNEL_ID = 1371307441400184883  # ← replace with your channel ID
+
+intents = discord.Intents.default()
+intents.message_content = True
+
+client = discord.Client(intents=intents)
+brain = Brain(client=client, status_channel_id=STATUS_CHANNEL_ID)
+
+@client.event
+async def on_ready():
+    print(f'🚀 Logged in as {client.user} (ID: {client.user.id})')
+    # fire-and-forget the online trainer
+    asyncio.create_task(brain.train_online())
+
+@client.event
+async def on_message(message):
+    if message.author.bot:
+        return
+    reply = await brain.generate_response(
+        message.content,
+        max_new_tokens=200,
+        temperature=1.0,
+        top_k=50
+    )
+    await message.channel.send(reply)
+
+if __name__ == '__main__':
+    client.run(TOKEN)
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..490c6af
--- /dev/null
+++ b/train.py
@@ -0,0 +1,46 @@
+import torch
+from torch.utils.data import DataLoader
+from core.dataset import CharDataset
+from core.model import GPT, GPTConfig
+
+
+def train():
+    # hyperparameters
+    books_dir = './books'
+    block_size = 128
+    batch_size = 32
+    epochs = 10
+    lr = 3e-4
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    # dataset & model
+    dataset = CharDataset(books_dir, block_size)
+    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+    config = GPTConfig(
+        vocab_size=dataset.vocab_size,
+        block_size=block_size,
+        n_layer=6,
+        n_head=6,
+        n_embd=384
+    )
+    model = GPT(config).to(device)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
+
+    model.train()
+    for epoch in range(1, epochs + 1):
+        total_loss = 0.0
+        for xb, yb in loader:
+            xb, yb = xb.to(device), yb.to(device)
+            optimizer.zero_grad()
+            _, loss = model(xb, yb)
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+        avg = total_loss / len(loader)
+        print(f'Epoch {epoch}/{epochs} — avg loss: {avg:.4f}')
+        # save checkpoint each epoch
+        torch.save(model.state_dict(), 'model.pth')
+
+
+if __name__ == '__main__':
+    train()
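+    # note: 'model.pth' is the same checkpoint path Brain loads by default,
+    # so the Discord bot picks up weights trained here the next time it starts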