Trying a new plan of only adding small features one at a time.
parent bf6706c72c
commit 9bf650ca79
.gitignore (vendored): 1 line added
@@ -171,3 +171,4 @@ cython_debug/
 books/*
 *.json
 models/best_gen.pt
+/model.pth
core/brain.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import os
import time
import asyncio
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import discord
from core.dataset import CharDataset
from core.model import GPT, GPTConfig


class Brain:
    """
    Loads model and dataset, serves generate_response() to Discord,
    and runs an async online training loop whenever Ruby is idle.
    """
    def __init__(
        self,
        books_dir: str = './books',
        model_path: str = './model.pth',
        block_size: int = 128,
        train_batch_size: int = 8,
        idle_threshold: float = 60.0,  # seconds of idle before training
        lr: float = 3e-4,
        client: discord.Client = None,
        status_channel_id: int = None
    ):
        # device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # dataset + loader
        ds = CharDataset(books_dir, block_size)
        self.stoi, self.itos = ds.stoi, ds.itos
        self.block_size = block_size
        self.train_loader = DataLoader(ds, batch_size=train_batch_size, shuffle=True)
        self._train_iter = iter(self.train_loader)

        # model & optimizer
        config = GPTConfig(
            vocab_size=ds.vocab_size,
            block_size=block_size,
            n_layer=6,
            n_head=6,
            n_embd=384,
        )
        self.model = GPT(config).to(self.device)
        if os.path.exists(model_path):
            self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.optimizer = AdamW(self.model.parameters(), lr=lr)
        self.model.train()

        # tracking idle time
        self.last_active = time.time()
        self.idle_threshold = idle_threshold
        self.model_path = model_path

        # discord hooks
        self.client = client
        self.status_channel_id = status_channel_id

    async def generate_response(self, prompt: str, **gen_kwargs) -> str:
        self.last_active = time.time()
        idx = torch.tensor(
            [[self.stoi.get(ch, 0) for ch in prompt[-self.block_size:]]],
            dtype=torch.long,
            device=self.device
        )
        self.model.eval()
        out = self.model.generate(idx, **gen_kwargs)[0]
        self.model.train()
        return ''.join(self.itos[i] for i in out.tolist())

    async def train_online(self):
        """
        Background task: whenever idle >= idle_threshold,
        perform one training batch, save checkpoint, then loop.
        """
        while True:
            if time.time() - self.last_active >= self.idle_threshold:
                # 1) log & presence
                print("⚙️ [Brain] Idle threshold reached—starting training batch.")
                if self.client:
                    await self.client.change_presence(
                        activity=discord.Activity(
                            type=discord.ActivityType.watching,
                            name="Training Ruby…"
                        )
                    )

                # 2) pull next batch
                try:
                    xb, yb = next(self._train_iter)
                except StopIteration:
                    self._train_iter = iter(self.train_loader)
                    xb, yb = next(self._train_iter)
                xb, yb = xb.to(self.device), yb.to(self.device)

                # 3) forward/backward
                logits, loss = self.model(xb, yb)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # 4) save & log
                torch.save(self.model.state_dict(), self.model_path)
                print(f"✅ [Brain] Finished batch. Loss: {loss.item():.4f}")

                # 5) optional Discord ping
                if self.client and self.status_channel_id:
                    chan = self.client.get_channel(self.status_channel_id)
                    if chan:
                        await chan.send(f"🤖 Trained one batch, loss: {loss.item():.4f}")

                # 6) reset presence & idle timer
                if self.client:
                    await self.client.change_presence(activity=None)
                self.last_active = time.time()

            await asyncio.sleep(1)
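For orientation, here is a minimal sketch of driving Brain outside of Discord (illustrative only, not part of this commit); it assumes ./books contains at least one .txt file and relies on the constructor defaults shown above:

# Illustrative sketch, not part of this commit.
import asyncio

from core.brain import Brain

async def demo():
    brain = Brain()  # no Discord client attached; loads ./books and ./model.pth if present
    reply = await brain.generate_response("Hello Ruby", max_new_tokens=50, top_k=50)
    print(reply)

asyncio.run(demo())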
core/dataset.py (new file, 37 lines)
@@ -0,0 +1,37 @@
import os
import torch
from torch.utils.data import Dataset


class CharDataset(Dataset):
    """
    Builds a char-level dataset from all .txt files under books_dir.
    Returns sequences of length block_size for next-char prediction.
    """
    def __init__(self, books_dir: str, block_size: int):
        texts = []
        for fn in os.listdir(books_dir):
            if fn.lower().endswith('.txt'):
                path = os.path.join(books_dir, fn)
                with open(path, 'r', encoding='utf8') as f:
                    texts.append(f.read())
        data = '\n'.join(texts)
        # build vocab
        chars = sorted(set(data))
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(self.stoi)
        # encode all data as a single tensor
        self.data = torch.tensor(
            [self.stoi[ch] for ch in data],
            dtype=torch.long
        )
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        x = self.data[idx: idx + self.block_size]
        y = self.data[idx + 1: idx + 1 + self.block_size]
        return x, y
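For reference, a minimal sketch of how CharDataset pairs with a DataLoader (illustrative only, not part of this commit); it assumes ./books contains at least one .txt file:

# Illustrative sketch, not part of this commit.
import torch
from torch.utils.data import DataLoader

from core.dataset import CharDataset

ds = CharDataset('./books', block_size=128)           # reads every ./books/*.txt
loader = DataLoader(ds, batch_size=8, shuffle=True)
xb, yb = next(iter(loader))                           # xb, yb: (8, 128) LongTensors
assert torch.equal(xb[:, 1:], yb[:, :-1])             # y is x shifted left by one character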
core/model.py (new file, 130 lines)
@@ -0,0 +1,130 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class GPTConfig:
    """Configuration for our GPT model."""
    def __init__(
        self,
        vocab_size: int,
        block_size: int,
        n_layer: int = 8,
        n_head: int = 8,
        n_embd: int = 512,
    ):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd


class CausalSelfAttention(nn.Module):
    """A single multi-head causal self-attention layer."""
    def __init__(self, config: GPTConfig):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.head_dim = config.n_embd // config.n_head
        # causal mask, buffer not a parameter
        mask = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer("mask", mask)

    def forward(self, x):
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        # compute attention scores
        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        att = att.masked_fill(self.mask[:T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        v = self.value(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        out = att @ v
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.proj(out)


class MLP(nn.Module):
    """Feed-forward layer."""
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.fc1 = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.fc2 = nn.Linear(4 * config.n_embd, config.n_embd)

    def forward(self, x):
        return self.fc2(F.gelu(self.fc1(x)))


class Block(nn.Module):
    """Transformer block: attention + feed-forward."""
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class GPT(nn.Module):
    """GPT language model from scratch."""
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.token_emb = nn.Embedding(config.vocab_size, config.n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.block_size = config.block_size

    def forward(self, idx, targets=None):
        B, T = idx.size()
        tok_emb = self.token_emb(idx)     # (B,T,C)
        pos_emb = self.pos_emb[:, :T, :]  # (1,T,C)
        x = tok_emb + pos_emb
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)             # (B,T,vocab)
        loss = None
        if targets is not None:
            # targets are already shifted by the dataset; just flatten for cross-entropy
            logits = logits.view(B * T, -1)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(
        self,
        idx,
        max_new_tokens: int,
        temperature: float = 1.0,
        top_k: int = None
    ):
        """
        Iteratively predict next token and append to sequence.
        - idx is (B,T) starting context.
        - Returns (B, T+max_new_tokens).
        """
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_token], dim=1)
        return idx
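A quick shape sanity check for the model (illustrative only, not part of this commit); the config values below are arbitrary small numbers chosen just for the demo:

# Illustrative sketch, not part of this commit; tiny arbitrary config.
import torch

from core.model import GPT, GPTConfig

cfg = GPTConfig(vocab_size=65, block_size=32, n_layer=2, n_head=2, n_embd=64)
model = GPT(cfg)
ctx = torch.randint(0, cfg.vocab_size, (1, 8))           # (B=1, T=8) random context
logits, loss = model(ctx, targets=ctx)                    # logits flattened to (B*T, vocab) when targets given
out = model.generate(ctx, max_new_tokens=16, top_k=10)    # (1, 8 + 16)
print(logits.shape, loss.item(), out.shape)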
main.py (new file, 39 lines)
@@ -0,0 +1,39 @@
import os
import asyncio
from dotenv import load_dotenv
import discord
from core.brain import Brain

load_dotenv()
TOKEN = os.getenv('DISCORD_TOKEN')
if not TOKEN:
    raise RuntimeError('DISCORD_TOKEN not set in .env')

STATUS_CHANNEL_ID = 1371307441400184883  # ← replace with your channel ID

intents = discord.Intents.default()
intents.message_content = True

client = discord.Client(intents=intents)
brain = Brain(client=client, status_channel_id=STATUS_CHANNEL_ID)

@client.event
async def on_ready():
    print(f'🚀 Logged in as {client.user} (ID: {client.user.id})')
    # fire-and-forget the online trainer
    asyncio.create_task(brain.train_online())

@client.event
async def on_message(message):
    if message.author.bot:
        return
    reply = await brain.generate_response(
        message.content,
        max_new_tokens=200,
        temperature=1.0,
        top_k=50
    )
    await message.channel.send(reply)

if __name__ == '__main__':
    client.run(TOKEN)
train.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import torch
from torch.utils.data import DataLoader
from core.dataset import CharDataset
from core.model import GPT, GPTConfig


def train():
    # hyperparameters
    books_dir = './books'
    block_size = 128
    batch_size = 32
    epochs = 10
    lr = 3e-4
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # dataset & model
    dataset = CharDataset(books_dir, block_size)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    config = GPTConfig(
        vocab_size=dataset.vocab_size,
        block_size=block_size,
        n_layer=6,
        n_head=6,
        n_embd=384
    )
    model = GPT(config).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    model.train()
    for epoch in range(1, epochs + 1):
        total_loss = 0.0
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            _, loss = model(xb, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg = total_loss / len(loader)
        print(f'Epoch {epoch}/{epochs} — avg loss: {avg:.4f}')
        # save checkpoint each epoch
        torch.save(model.state_dict(), 'model.pth')


if __name__ == '__main__':
    train()