From ae546a60a12fd94635b28ceb39ce95550c6851d0 Mon Sep 17 00:00:00 2001
From: Dani
Date: Fri, 25 Apr 2025 12:45:30 -0400
Subject: [PATCH] Add periodic cleanup task and multi-head TinyTransformer

---
 main.py          | 10 ++++++
 model/brain.py   | 66 ++++++++++++++++++++++++++++++++++------
 model/cleanup.py | 79 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 9 deletions(-)
 create mode 100644 model/cleanup.py

diff --git a/main.py b/main.py
index f559b05..88659df 100644
--- a/main.py
+++ b/main.py
@@ -5,6 +5,8 @@ from dotenv import load_dotenv
 import os
 from model.train import train_on_message
 from model.brain import generate_response
+from model.cleanup import full_cleanup
+from reader.reader import read_books_forever
 from dashboard.dashboard import run_dashboard
 
 load_dotenv()
@@ -35,8 +37,16 @@ async def on_message(message):
 # Launch Flask in background
 threading.Thread(target=run_dashboard, daemon=True).start()
 
+
+async def background_cleanup_loop():
+    while True:
+        full_cleanup()
+        await asyncio.sleep(300)  # 5 minutes
+
+
 loop = asyncio.get_event_loop()
 loop.create_task(read_books_forever())  # Book reader task
+loop.create_task(background_cleanup_loop())
 
 # Launch Discord bot (blocking)
 client.run(TOKEN)
diff --git a/model/brain.py b/model/brain.py
index af78cd0..171fb3e 100644
--- a/model/brain.py
+++ b/model/brain.py
@@ -4,7 +4,7 @@ import random
 from model.tokenizer import Tokenizer
 import torch.nn.functional as F
 from model.memory import save_dream
-import time
+
 
 
 recent_dreams = []
@@ -14,17 +14,65 @@ VOCAB_SIZE = 10000  # Temporary cap, grows dynamically
 EMBED_DIM = 128
 
 
-class TinyTransformer(nn.Module):
-    def __init__(self):
+class MultiHeadSelfAttention(nn.Module):
+    def __init__(self, embed_dim, heads):
         super().__init__()
-        self.embed = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
-        self.ln1 = nn.LayerNorm(EMBED_DIM)
-        self.fc = nn.Linear(EMBED_DIM, VOCAB_SIZE)
+        assert embed_dim % heads == 0
+        self.heads = heads
+        self.head_dim = embed_dim // heads
+        self.scale = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
+
+        self.to_qkv = nn.Linear(embed_dim, embed_dim * 3)
+        self.out = nn.Linear(embed_dim, embed_dim)
 
     def forward(self, x):
-        x = self.embed(x)
-        x = self.ln1(x)
-        return self.fc(x)
+        B, T, C = x.shape
+        qkv = self.to_qkv(x).view(B, T, self.heads, 3 * self.head_dim)
+        q, k, v = (t.transpose(1, 2) for t in qkv.chunk(3, dim=-1))  # (B, heads, T, head_dim) so attention runs over T, not over heads
+
+        attn_scores = (q @ k.transpose(-2, -1)) / self.scale  # (B, heads, T, T)
+        attn_weights = torch.softmax(attn_scores, dim=-1)
+
+        out = attn_weights @ v
+        out = out.transpose(1, 2).contiguous().view(B, T, C)  # merge heads back to (B, T, C)
+        return self.out(out)
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, embed_dim, heads):
+        super().__init__()
+        self.attn = MultiHeadSelfAttention(embed_dim, heads)
+        self.norm1 = nn.LayerNorm(embed_dim)
+        self.ff = nn.Sequential(
+            nn.Linear(embed_dim, embed_dim * 4),
+            nn.ReLU(),
+            nn.Linear(embed_dim * 4, embed_dim)
+        )
+        self.norm2 = nn.LayerNorm(embed_dim)
+
+    def forward(self, x):
+        x = x + self.attn(self.norm1(x))
+        x = x + self.ff(self.norm2(x))
+        return x
+
+
+class TinyTransformer(nn.Module):
+    def __init__(self, vocab_size=VOCAB_SIZE, embed_dim=EMBED_DIM, depth=2, heads=4):
+        super().__init__()
+        self.token_embed = nn.Embedding(vocab_size, embed_dim)
+        self.pos_embed = nn.Parameter(torch.randn(1, 128, embed_dim))  # max sequence length = 128
+        self.blocks = nn.Sequential(*[TransformerBlock(embed_dim, heads) for _ in range(depth)])
+        self.norm = nn.LayerNorm(embed_dim)
+        self.head = nn.Linear(embed_dim, vocab_size)
+
+    def forward(self, x):
+        B, T = x.shape
+        tok = self.token_embed(x)
+        pos = self.pos_embed[:, :T, :]
+        x = tok + pos
+        x = self.blocks(x)
+        x = self.norm(x)
+        return self.head(x)
 
 
 model = TinyTransformer().to(DEVICE)
diff --git a/model/cleanup.py b/model/cleanup.py
new file mode 100644
index 0000000..9feb62e
--- /dev/null
+++ b/model/cleanup.py
@@ -0,0 +1,79 @@
+import re
+import json
+import os
+import time
+from model.tokenizer import VOCAB_PATH
+from model.memory import DREAM_LOG_PATH
+from context.context import CONTEXT_FILE
+
+CLEANUP_LOG = "data/logs/cleanup.log"
+
+
+def log(msg):
+    os.makedirs(os.path.dirname(CLEANUP_LOG), exist_ok=True)
+    with open(CLEANUP_LOG, "a", encoding="utf-8") as f:
+        f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} | {msg}\n")
+
+
+def cleanup_vocab():
+    if not os.path.exists(VOCAB_PATH):
+        return
+
+    with open(VOCAB_PATH, "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+
+    removed = []
+    for word in list(vocab.keys()):
+        if re.search(r"[^\w-]", word):
+            removed.append(word)
+            del vocab[word]
+        elif len(word) <= 2 and not word.isalpha():
+            removed.append(word)
+            del vocab[word]
+        elif "\ufffd" in word:  # unicode replacement char ("�") from bad decodes
+            removed.append(word)
+            del vocab[word]
+
+    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
+        json.dump(vocab, f, indent=2)
+
+    if removed:
+        log(f"Removed {len(removed)} malformed tokens: {removed[:5]}...")
+
+
+def cleanup_dreams():
+    if not os.path.exists(DREAM_LOG_PATH):
+        return
+
+    with open(DREAM_LOG_PATH, "r", encoding="utf-8") as f:
+        dreams = json.load(f)
+
+    filtered = [d for d in dreams if d["score"] >= 0.3][:100]
+
+    with open(DREAM_LOG_PATH, "w", encoding="utf-8") as f:
+        json.dump(filtered, f, indent=2)
+
+    if len(filtered) < len(dreams):
+        log(f"Removed {len(dreams) - len(filtered)} low-score dreams")
+
+
+def cleanup_context():
+    if not os.path.exists(CONTEXT_FILE):
+        return
+
+    with open(CONTEXT_FILE, "r", encoding="utf-8") as f:
+        context = json.load(f)
+
+    filtered = context[-100:]
+
+    with open(CONTEXT_FILE, "w", encoding="utf-8") as f:
+        json.dump(filtered, f, indent=2)
+
+    if len(filtered) < len(context):
+        log(f"Trimmed context memory from {len(context)} → {len(filtered)}")
+
+
+def full_cleanup():
+    cleanup_vocab()
+    cleanup_dreams()
+    cleanup_context()
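
A quick sanity check for the rewritten TinyTransformer, run from the repo root (a sketch; it assumes model/brain.py imports cleanly with the layout in this patch). Note that the attention is unmasked, so every position can attend to every other; if generation is meant to be autoregressive, a causal mask would normally be layered on top of this.

    # Shape check for the new attention path (sketch; assumes this repo layout).
    import torch
    from model.brain import TinyTransformer, DEVICE, VOCAB_SIZE

    model = TinyTransformer().to(DEVICE)
    tokens = torch.randint(0, VOCAB_SIZE, (2, 16), device=DEVICE)  # batch of 2, 16 tokens (under the 128-position cap)
    with torch.no_grad():
        logits = model(tokens)
    assert logits.shape == (2, 16, VOCAB_SIZE)  # one row of vocab logits per position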
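
One scheduling note: full_cleanup() is synchronous file I/O, so calling it directly inside background_cleanup_loop() blocks the Discord event loop for the duration of each pass. If that ever becomes noticeable, a variant along these lines (a sketch, assuming Python 3.9+ for asyncio.to_thread; not part of this patch) keeps the loop responsive:

    import asyncio
    from model.cleanup import full_cleanup

    async def background_cleanup_loop():
        while True:
            await asyncio.to_thread(full_cleanup)  # run the blocking cleanup off the event loop
            await asyncio.sleep(300)  # 5 minutes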