From a8adc0fb37ce05c87d1a0df25de61b6035c13b83 Mon Sep 17 00:00:00 2001
From: Dani
Date: Fri, 25 Apr 2025 22:49:13 -0400
Subject: [PATCH] Fixed a circular import

---
 main.py                        |   2 +-
 model/brain.py                 | 108 +++++++------------------------
 model/brain_architecture.py    |  63 ++++++++++++++++++++
 model/brain_state.py           |  13 ++++
 model/dream_replay.py          |   3 +-
 model/journal.py               |   2 +-
 model/rehearsal.py             |   2 +-
 model/{train.py => trainer.py} |  17 +----
 reader/reader.py               |   2 +-
 9 files changed, 109 insertions(+), 103 deletions(-)
 create mode 100644 model/brain_architecture.py
 create mode 100644 model/brain_state.py
 rename model/{train.py => trainer.py} (69%)

diff --git a/main.py b/main.py
index b7acfe4..76e11f1 100644
--- a/main.py
+++ b/main.py
@@ -3,7 +3,7 @@ import asyncio
 import threading
 from dotenv import load_dotenv
 import os
-from model.train import train_on_message
+from model.trainer import train_on_message
 from model.brain import generate_response
 from model.cleanup import full_cleanup
 from model.dream_replay import replay_dreams
diff --git a/model/brain.py b/model/brain.py
index 9a99b70..589aa82 100644
--- a/model/brain.py
+++ b/model/brain.py
@@ -1,93 +1,33 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
 import random
-from model.tokenizer import Tokenizer
+import torch
+import torch.nn.functional as F
 from model.memory import save_dream
-from model.train import train_on_message
-from model.journal import record_to_journal
-
+from model.brain_state import model, tokenizer, DEVICE
 
 recent_dreams = []
 
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-tokenizer = Tokenizer()
-VOCAB_SIZE = 10000  # Temporary cap, grows dynamically
-EMBED_DIM = 128
-
-
-class MultiHeadSelfAttention(nn.Module):
-    def __init__(self, embed_dim, heads):
-        super().__init__()
-        assert embed_dim % heads == 0
-        self.heads = heads
-        self.head_dim = embed_dim // heads
-        self.scale = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
-
-        self.to_qkv = nn.Linear(embed_dim, embed_dim * 3)
-        self.out = nn.Linear(embed_dim, embed_dim)
-
-    def forward(self, x):
-        B, T, C = x.shape
-        qkv = self.to_qkv(x).view(B, T, self.heads, 3 * self.head_dim)
-        q, k, v = qkv.chunk(3, dim=-1)
-
-        attn_scores = (q @ k.transpose(-2, -1)) / self.scale
-        attn_weights = torch.softmax(attn_scores, dim=-1)
-
-        out = attn_weights @ v
-        out = out.transpose(1, 2).contiguous().view(B, T, C)
-        return self.out(out)
-
-
-class TransformerBlock(nn.Module):
-    def __init__(self, embed_dim, heads):
-        super().__init__()
-        self.attn = MultiHeadSelfAttention(embed_dim, heads)
-        self.norm1 = nn.LayerNorm(embed_dim)
-        self.ff = nn.Sequential(
-            nn.Linear(embed_dim, embed_dim * 4),
-            nn.ReLU(),
-            nn.Linear(embed_dim * 4, embed_dim)
-        )
-        self.norm2 = nn.LayerNorm(embed_dim)
-
-    def forward(self, x):
-        x = x + self.attn(self.norm1(x))
-        x = x + self.ff(self.norm2(x))
-        return x
-
-
-class TinyTransformer(nn.Module):
-    def __init__(self, vocab_size=VOCAB_SIZE, embed_dim=256, depth=4, heads=8):
-        super().__init__()
-        self.token_embed = nn.Embedding(vocab_size, embed_dim)
-        self.pos_embed = nn.Parameter(torch.randn(1, 128, embed_dim))
-        self.blocks = nn.Sequential(*[TransformerBlock(embed_dim, heads) for _ in range(depth)])
-        self.norm = nn.LayerNorm(embed_dim)
-        self.head = nn.Linear(embed_dim, vocab_size)
-
-    def forward(self, x):
-        B, T = x.shape
-        tok = self.token_embed(x)
-        pos = self.pos_embed[:, :T, :]
-        x = tok + pos
-        x = self.blocks(x)
-        x = self.norm(x)
-        return self.head(x)
-
-
-model = TinyTransformer().to(DEVICE)
-optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
-loss_fn = nn.CrossEntropyLoss()
-
 
 def generate_response():
-    seed = torch.tensor([random.randint(0, tokenizer.next_id - 1)], device=DEVICE)
-    output = model(seed.unsqueeze(0))
+    from context.context import get_recent_context  # deferred import (as in daydream) to avoid an import cycle
+    model.eval()
+    # Pick a real known word to seed from context memory
+    context_texts = get_recent_context(5)
+    if context_texts:
+        start = random.choice(context_texts)
+        seed_tokens = tokenizer.tokenize(start)
+        if seed_tokens:
+            seed = torch.tensor([seed_tokens[-1]], device=DEVICE).unsqueeze(0)
+        else:
+            seed = torch.tensor([random.randint(0, tokenizer.next_id - 1)], device=DEVICE).unsqueeze(0)
+    else:
+        seed = torch.tensor([random.randint(0, tokenizer.next_id - 1)], device=DEVICE).unsqueeze(0)
+
+    output = model(seed)
     pred = torch.argmax(output, dim=-1).squeeze().tolist()
+
     if not isinstance(pred, list):
         pred = [pred]
+
     return tokenizer.detokenize(pred)
 
 
@@ -118,8 +57,10 @@ def daydream():
 
     if score > 0.45:
         save_dream(sentence, score)
+        from model.journal import record_to_journal
         record_to_journal(sentence)
+        from model.trainer import train_on_message
         train_on_message(sentence)
-        recent_dreams.append((score, sentence))
-        if len(recent_dreams) > 10:
-            recent_dreams.pop(0)
+
+    if len(recent_dreams) > 10:
+        recent_dreams.pop(0)
diff --git a/model/brain_architecture.py b/model/brain_architecture.py
new file mode 100644
index 0000000..974e4cd
--- /dev/null
+++ b/model/brain_architecture.py
@@ -0,0 +1,63 @@
+import torch
+import torch.nn as nn
+
+
+class MultiHeadSelfAttention(nn.Module):
+    def __init__(self, embed_dim, heads):
+        super().__init__()
+        assert embed_dim % heads == 0
+        self.heads = heads
+        self.head_dim = embed_dim // heads
+        self.scale = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
+
+        self.to_qkv = nn.Linear(embed_dim, embed_dim * 3)
+        self.out = nn.Linear(embed_dim, embed_dim)
+
+    def forward(self, x):
+        B, T, C = x.shape
+        qkv = self.to_qkv(x).view(B, T, self.heads, 3 * self.head_dim)
+        q, k, v = qkv.chunk(3, dim=-1)
+
+        attn_scores = (q @ k.transpose(-2, -1)) / self.scale
+        attn_weights = torch.softmax(attn_scores, dim=-1)
+
+        out = attn_weights @ v
+        out = out.transpose(1, 2).contiguous().view(B, T, C)
+        return self.out(out)
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, embed_dim, heads):
+        super().__init__()
+        self.attn = MultiHeadSelfAttention(embed_dim, heads)
+        self.norm1 = nn.LayerNorm(embed_dim)
+        self.ff = nn.Sequential(
+            nn.Linear(embed_dim, embed_dim * 4),
+            nn.ReLU(),
+            nn.Linear(embed_dim * 4, embed_dim)
+        )
+        self.norm2 = nn.LayerNorm(embed_dim)
+
+    def forward(self, x):
+        x = x + self.attn(self.norm1(x))
+        x = x + self.ff(self.norm2(x))
+        return x
+
+
+class TinyTransformer(nn.Module):
+    def __init__(self, vocab_size, embed_dim=256, depth=4, heads=8):
+        super().__init__()
+        self.token_embed = nn.Embedding(vocab_size, embed_dim)
+        self.pos_embed = nn.Parameter(torch.randn(1, 128, embed_dim))
+        self.blocks = nn.Sequential(*[TransformerBlock(embed_dim, heads) for _ in range(depth)])
+        self.norm = nn.LayerNorm(embed_dim)
+        self.head = nn.Linear(embed_dim, vocab_size)
+
+    def forward(self, x):
+        B, T = x.shape
+        tok = self.token_embed(x)
+        pos = self.pos_embed[:, :T, :]
+        x = tok + pos
+        x = self.blocks(x)
+        x = self.norm(x)
+        return self.head(x)
diff --git a/model/brain_state.py b/model/brain_state.py
new file mode 100644
index 0000000..f9553a8
--- /dev/null
+++ b/model/brain_state.py
@@ -0,0 +1,13 @@
+import torch
+import torch.nn as nn
+from model.brain_architecture import TinyTransformer
+from model.tokenizer import Tokenizer
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+tokenizer = Tokenizer()
+VOCAB_SIZE = 10000  # Expandable if needed
+
+model = TinyTransformer(vocab_size=VOCAB_SIZE).to(DEVICE)
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
+loss_fn = nn.CrossEntropyLoss()
diff --git a/model/dream_replay.py b/model/dream_replay.py
index 8bac8bc..34a3e76 100644
--- a/model/dream_replay.py
+++ b/model/dream_replay.py
@@ -1,7 +1,6 @@
 import random
-import torch
 from model.memory import load_dreams
-from model.train import train_on_message
+from model.trainer import train_on_message
 
 
 def replay_dreams():
diff --git a/model/journal.py b/model/journal.py
index 5495746..5e5c009 100644
--- a/model/journal.py
+++ b/model/journal.py
@@ -1,6 +1,6 @@
 import os
 import time
-from model.train import train_on_message
+from model.trainer import train_on_message
 import random
 
 JOURNAL_PATH = "data/memory/journal.txt"
diff --git a/model/rehearsal.py b/model/rehearsal.py
index d1f0a2b..088b0f1 100644
--- a/model/rehearsal.py
+++ b/model/rehearsal.py
@@ -1,6 +1,6 @@
 import torch
 from model.brain import model, tokenizer, DEVICE
-from model.train import train_on_message
+from model.trainer import train_on_message
 
 
 def simulate_conversation():
diff --git a/model/train.py b/model/trainer.py
similarity index 69%
rename from model/train.py
rename to model/trainer.py
index 23b3ac0..5dfe916 100644
--- a/model/train.py
+++ b/model/trainer.py
@@ -1,11 +1,7 @@
 import torch
-import torch.nn as nn
-import random
 import time
-from model.brain import model, tokenizer, DEVICE, optimizer, loss_fn, daydream
-from context.context import get_recent_context, add_to_context
-
-_last_thought = time.time()
+from model.brain_state import model, tokenizer, DEVICE, optimizer, loss_fn
+from context.context import add_to_context, get_recent_context
 
 
 LOSS_FILE = "data/logs/loss.log"
@@ -16,7 +12,6 @@ def log_loss(value: float):
 
 
 def train_on_message(text: str):
-    global _last_thought
     model.train()
     context_texts = get_recent_context(3)
     augmented_text = " ".join(context_texts + [text])
@@ -30,16 +25,10 @@ def train_on_message(text: str):
     output = model(input_tensor)
     loss = loss_fn(output.view(-1, output.size(-1)), target_tensor.view(-1))
 
-    log_loss(loss.item())
 
     optimizer.zero_grad()
     loss.backward()
     optimizer.step()
 
+    log_loss(loss.item())
     add_to_context(text)
-
-    now = time.time()
-    if now - _last_thought > 15:
-        for _ in range(3):
-            daydream()
-    _last_thought = now
diff --git a/reader/reader.py b/reader/reader.py
index 8b12448..be7dc2c 100644
--- a/reader/reader.py
+++ b/reader/reader.py
@@ -1,6 +1,6 @@
 import os
 import asyncio
-from model.train import train_on_message
+from model.trainer import train_on_message
 from reader.filter import is_valid_line
 
 BOOK_DIR = "data/books"
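
A quick way to sanity-check the fix after applying the patch is to import the modules in the same order main.py does and run one train/generate round trip. This is a minimal sketch, not part of the patch: the file name is hypothetical, and it assumes the repository layout above with context.context and model.tokenizer importing cleanly on their own.

    # smoke_test.py (hypothetical helper, not included in this patch)
    # The old cycle was model.brain <-> model.train. Shared state now lives in
    # model.brain_state, and model.brain only imports model.trainer lazily
    # inside daydream(), so these imports should no longer raise ImportError.
    from model.trainer import train_on_message
    from model.brain import generate_response

    train_on_message("hello circular imports")
    print(generate_response())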