# ─── .gitignore (diff hunk, kept verbatim for reference) ───────────────────────
# diff --git a/.gitignore b/.gitignore
# index ac2928d..3b2462f 100644
# --- a/.gitignore
# +++ b/.gitignore
# @@ -169,5 +169,7 @@ cython_debug/
#  #.idea/
#  .vscode/launch.json
# -/content/*
# -/memory/*
# \ No newline at end of file
# +/books/*
# +/memory/*
# +vocab.json
# +progress.json
#
# ─── body.py (new file) ────────────────────────────────────────────────────────
# diff --git a/body.py b/body.py
# new file mode 100644
# index 0000000..f0aeaf4
# --- /dev/null
# +++ b/body.py
# @@ -0,0 +1,126 @@
import os
import asyncio
import glob
import threading
import json
from collections import deque
import logging
import discord

from nervous_system import NervousSystem
import dashboard  # <-- import your new Flask app
import brain_map  # <-- import the blueprint to inject system

# Path for progress persistence
PROGRESS_PATH = 'progress.json'

# Mute logger
for noisy_logger in ["werkzeug", "flask", "flask.app"]:
    logging.getLogger(noisy_logger).setLevel(logging.CRITICAL)


def _load_processed_lines(path: str) -> int:
    """Return the ``processed_lines`` value stored in the JSON file at *path*.

    Returns 0 when the file does not exist, cannot be read, is not valid
    JSON, or does not hold a JSON object, so a damaged checkpoint restarts
    training from the beginning instead of aborting start-up.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return 0
    except (OSError, json.JSONDecodeError) as exc:
        print(f'Could not read {path} ({exc}); restarting from line 0')
        return 0
    if not isinstance(data, dict):
        return 0
    return data.get('processed_lines', 0)


def _count_nonblank_lines(pattern: str) -> int:
    """Count the non-blank lines of every file matching the glob *pattern*.

    Each file is opened in a ``with`` block so its handle is closed as soon
    as it has been counted.
    """
    count = 0
    for path in glob.glob(pattern):
        with open(path, encoding='utf-8') as f:
            count += sum(1 for line in f if line.strip())
    return count


# ─── Initialize Ruby & Discord ───────────────────────────────────────────────── # noqa: E501

intents = discord.Intents.default()
intents.message_content = True
client = discord.Client(intents=intents)
system = NervousSystem()
# Bounded log of recent exchanges; only the latest 100 entries are kept.
system.history = deque(maxlen=100)

# Load or resume vocab + embeddings
system.sensory.load_vocab('vocab.json')
system._resize_embeddings()
print('Loaded vocab size:', len(system.sensory.stoi))

# Resume progress
system.processed_lines = _load_processed_lines(PROGRESS_PATH)

# Compute total book lines
total = _count_nonblank_lines('books/*.txt')
system.total_lines = total

print(f'Resuming training at {system.processed_lines}/{system.total_lines} lines')

# Inject into Flask contexts
dashboard.system = system
brain_map.system = system

# ─── Book-training when idle ──────────────────────────────────────────────────── # noqa: E501
# One lock for every model access made from this module: book training,
# per-message training and reply generation all run in worker threads and
# touch the same model, optimizer and vocab.
_model_lock = threading.Lock()

# Strong references to fire-and-forget tasks; the event loop itself only
# keeps weak references to tasks.
_background_tasks = set()

# on_ready may be dispatched more than once (e.g. after a reconnect); this
# flag makes sure only one book-training task is ever started.
_book_training_started = False


def _train_locked(user_text: str, bot_text: str) -> None:
    """Run one ``system.train`` step while holding the model lock."""
    with _model_lock:
        system.train(user_text, bot_text)


def _generate_locked(prompt: str) -> str:
    """Return ``system.generate(prompt)`` while holding the model lock."""
    with _model_lock:
        return system.generate(prompt)


def _save_checkpoint(processed: int) -> None:
    """Write the vocab to ``vocab.json`` and *processed* to PROGRESS_PATH.

    The lock is held so the vocab is not being grown by a training step
    while it is serialized.
    """
    with _model_lock:
        system.sensory.save_vocab('vocab.json')
    with open(PROGRESS_PATH, 'w', encoding='utf-8') as pf:
        json.dump({'processed_lines': processed}, pf)


def _spawn(coro) -> asyncio.Task:
    """Schedule *coro* as a task and keep it referenced until it finishes."""
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return task


async def train_books_idle():
    """Train on every non-blank line of ``books/*.txt``, resuming if possible.

    The first ``system.processed_lines`` non-blank lines are skipped, then
    each remaining line is passed to ``system.train`` as both user and bot
    text in a worker thread. Progress is checkpointed every 200 lines, when
    ``system.total_lines`` is reached, and once more at the end.
    """
    await client.wait_until_ready()
    await asyncio.sleep(5)
    processed = 0
    skip = system.processed_lines

    # glob order is unspecified; sorting keeps the skip count pointing at the
    # same lines on every run.
    for path in sorted(glob.glob('books/*.txt')):
        with open(path, encoding='utf-8') as f:
            for raw in f:
                text = raw.strip()
                if not text:
                    continue
                if processed < skip:
                    processed += 1
                    continue

                await asyncio.to_thread(_train_locked, text, text)
                processed += 1
                system.processed_lines = processed

                if processed % 200 == 0 or processed == system.total_lines:
                    await asyncio.to_thread(_save_checkpoint, processed)

    # Final checkpoint
    await asyncio.to_thread(_save_checkpoint, system.processed_lines)


@client.event
async def on_ready():
    """Announce the login and start book training (first call only)."""
    global _book_training_started
    print(f'Ruby is online as {client.user}!')
    if _book_training_started:
        return
    _book_training_started = True
    _spawn(train_books_idle())


@client.event
async def on_message(message: discord.Message):
    """Reply to a message, record the exchange and train on it.

    Messages from the bot itself and messages without text are ignored.
    """
    if message.author == client.user or not message.content:
        return

    user_text = message.content.strip()
    if not user_text:
        return

    # Generation runs in a worker thread so the event loop stays responsive.
    reply = await asyncio.to_thread(_generate_locked, user_text)
    if not reply:
        # Nothing to say; sending an empty message would fail.
        return
    await message.channel.send(reply)

    system.history.append({'user': user_text, 'bot': reply})
    _spawn(asyncio.to_thread(_train_locked, user_text, reply))


# ─── Launch Dashboard & Bot ──────────────────────────────────────────────────── # noqa: E501


def run_dashboard():
    """Serve the Flask dashboard on port 5000 (blocking call)."""
    # NOTE(review): binds to every interface although the message printed
    # below advertises 127.0.0.1 — confirm that exposure is intended.
    dashboard.app.run(
        host='0.0.0.0', port=5000,
        debug=False, use_reloader=False
    )


threading.Thread(target=run_dashboard, daemon=True).start()
print('Dashboard available at http://127.0.0.1:5000')

token = os.getenv('DISCORD_TOKEN')
if not token:
    raise RuntimeError('Please set the DISCORD_TOKEN environment variable')
client.run(token)

# ─── brain.py (new file) ───────────────────────────────────────────────────────
# diff --git a/brain.py b/brain.py
# new file mode 100644
# index 0000000..fe48e63
# --- /dev/null
# +++ b/brain.py
# @@ -0,0 +1,36 @@
import torch
import torch.nn as nn
class Brain(nn.Module):
    """Minimal Transformer-based autoregressive model.

    Token embeddings plus a learned positional table feed a stack of
    ``nn.TransformerEncoderLayer`` blocks; a linear head maps every position
    to vocabulary logits.
    """

    def __init__(
        self,
        vocab_size: int,
        d_model: int = 256,
        nhead: int = 4,
        num_layers: int = 2,
        dim_feedforward: int = 512,
        max_seq_len: int = 128,
    ):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Parameter(torch.zeros(1, max_seq_len, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.max_seq_len = max_seq_len

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return next-token logits for every position of *x*.

        Args:
            x: token ids; dimension 1 is the sequence axis and must not be
                longer than ``max_seq_len``.

        Raises:
            ValueError: if the sequence is longer than ``max_seq_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len "
                f"{self.max_seq_len}"
            )
        # Causal mask: position i may only attend to positions <= i.
        # Without it the next-token targets are visible during training.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), device=x.device),
            diagonal=1,
        )
        hidden = self.token_emb(x) + self.pos_emb[:, :seq_len, :]
        hidden = self.transformer(hidden, mask=causal_mask)
        return self.fc_out(hidden)


# ─── brain_map.py (new file) ───────────────────────────────────────────────────
# diff --git a/brain_map.py b/brain_map.py
# new file mode 100644
# index 0000000..2e2a2bf
# --- /dev/null
# +++ b/brain_map.py
# @@ -0,0 +1,92 @@
import numpy as np
from flask import Blueprint, render_template, jsonify, request

bp = Blueprint(
    'brain_map',
    __name__,
    template_folder='templates',
    static_folder='static',
)

# Will be injected from body.py
system = None


@bp.route('/graph')
def graph():
    """Render the graph page."""
    return render_template('graph.html')


@bp.route('/data')
def data():
    """Return the token-similarity graph as JSON ``{"nodes", "edges"}``.

    Query parameters:
        min_degree: minimum number of edges a token needs to be listed
            (default 1).
        max_nodes: maximum number of nodes returned (default 200).

    Values that are not integers fall back to the defaults.
    """
    if system is None:
        return jsonify({"nodes": [], "edges": []})

    # 1) embeddings → cosine sims
    emb = system.brain.token_emb.weight.detach().cpu().numpy()
    N = emb.shape[0]
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    emb_norm = emb / (norms + 1e-8)
    sim = emb_norm.dot(emb_norm.T)

    # 2) filters — type=int yields the default instead of raising on junk
    min_degree = request.args.get('min_degree', 1, type=int)
    max_nodes = max(0, request.args.get('max_nodes', 200, type=int))

    # 3) valid tokens (snapshot of the vocab, restricted to rows that exist
    #    in the embedding matrix read above)
    items = [
        (tok, int(idx))
        for tok, idx in list(system.sensory.stoi.items())
        if 0 <= int(idx) < N
    ]

    # 4) build undirected unique pairs & degree counts: each token links to
    #    its most similar tokens until it has contributed 3 new pairs
    deg = {idx: 0 for _, idx in items}
    unique_pairs = set()
    for _, i in items:
        count = 0
        for raw_j in np.argsort(-sim[i]):
            j = int(raw_j)
            if j == i or j not in deg:
                continue
            pair = (min(i, j), max(i, j))
            if pair in unique_pairs:
                continue
            unique_pairs.add(pair)
            deg[i] += 1
            deg[j] += 1
            count += 1
            if count >= 3:
                break

    # 5) filter & cap nodes (highest degree first, ties by token id)
    filtered = [(tok, idx) for tok, idx in items if deg[idx] >= min_degree]
    filtered.sort(key=lambda item: (-deg[item[1]], item[1]))
    subset = filtered[:max_nodes]
    subset_ids = {idx for _, idx in subset}

    # 6) build nodes with HSL coloring; `or 1` avoids dividing by zero when
    #    every selected node has degree 0
    max_deg = max((deg[idx] for _, idx in subset), default=0) or 1
    nodes = []
    for tok, idx in subset:
        hue = int((1 - deg[idx] / max_deg) * 240)
        nodes.append({
            'id': idx,
            'label': tok,
            'color': {
                'background': f'hsl({hue},80%,40%)',
                'border': f'hsl({hue},60%,30%)',
                'highlight': {
                    'background': f'hsl({hue},100%,50%)',
                    'border': f'hsl({hue},80%,40%)'
                }
            }
        })

    # 7) build edges between nodes that survived the cap
    edges = [
        {'from': a, 'to': b}
        for (a, b) in unique_pairs
        if a in subset_ids and b in subset_ids
    ]

    return jsonify({'nodes': nodes, 'edges': edges})


# ─── dashboard.py (new file) ───────────────────────────────────────────────────
# diff --git a/dashboard.py b/dashboard.py
# new file mode 100644
# index 0000000..ed2f39e
# --- /dev/null
# +++ b/dashboard.py
# @@ -0,0 +1,36 @@
from flask import Flask, render_template, jsonify
import brain_map

app = Flask(
    __name__,
    template_folder='templates',
    static_folder='static',
)

# Register the brain_map blueprint
app.register_blueprint(brain_map.bp)

# Will be injected from body.py
system = None


@app.route('/')
def dashboard():
    """Render the dashboard page."""
    return render_template('dashboard.html')


@app.route('/progress')
def progress():
    """Return ``{"processed", "total"}`` line counts (zeros if unset)."""
    if system is None:
        return jsonify({'processed': 0, 'total': 0})
    return jsonify({
        'processed': getattr(system, 'processed_lines', 0),
        'total': getattr(system, 'total_lines', 0)
    })


@app.route('/interactions')
def interactions():
    """Return the recorded interaction history as a JSON list."""
    if system is None or not hasattr(system, 'history'):
        return jsonify([])
    return jsonify(list(system.history))


# ─── nervous_system.py (new file) ──────────────────────────────────────────────
# diff --git a/nervous_system.py b/nervous_system.py
# new file mode 100644
# index 0000000..165dcd2
# --- /dev/null
# +++ b/nervous_system.py
# @@ -0,0 +1,106 @@
import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F

from sensory import Sensory
from brain import Brain


class NervousSystem:
    """Wraps the Brain, handles token growth, generation and on-the-fly training."""  # noqa: E501

    def __init__(self, device: str = "cuda"):
        # Falls back to CPU whenever CUDA is unavailable.
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")  # noqa: E501
        self.sensory = Sensory()
        vocab_size = len(self.sensory.stoi)
        self.brain = Brain(vocab_size).to(self.device)

        self.optimizer = optim.Adam(self.brain.parameters(), lr=1e-4)
        # Targets equal to 0 are excluded from the loss.
        self.criterion = CrossEntropyLoss(ignore_index=0)
        self.meta_steps = 0

    def _resize_embeddings(self) -> None:
        """Resize the embedding table and output head to the vocab size.

        Does nothing when the sizes already match. Otherwise both layers are
        rebuilt, the overlapping rows are copied across, and the optimizer
        is rebuilt so it tracks the new parameters (moment estimates of the
        parameters that were not replaced are carried over).
        """
        new_size = len(self.sensory.stoi)
        old_emb = self.brain.token_emb
        old_out = self.brain.fc_out
        if (new_size == old_emb.num_embeddings
                and new_size == old_out.out_features):
            return

        # rebuild token embeddings
        new_emb = torch.nn.Embedding(
            new_size, old_emb.embedding_dim
        ).to(self.device)
        rows = min(new_size, old_emb.num_embeddings)
        with torch.no_grad():
            new_emb.weight[:rows] = old_emb.weight[:rows]
        self.brain.token_emb = new_emb

        # rebuild output head
        new_out = torch.nn.Linear(
            old_emb.embedding_dim, new_size
        ).to(self.device)
        rows = min(new_size, old_out.out_features)
        with torch.no_grad():
            new_out.weight[:rows] = old_out.weight[:rows]
            new_out.bias[:rows] = old_out.bias[:rows]
        self.brain.fc_out = new_out

        # The previous optimizer still points at the discarded tensors, so
        # the replacement layers would never receive updates.
        previous = self.optimizer
        self.optimizer = optim.Adam(
            self.brain.parameters(), lr=previous.param_groups[0]["lr"]
        )
        for param in self.brain.parameters():
            if param in previous.state:
                self.optimizer.state[param] = previous.state[param]

    def generate(self, prompt: str, max_len: int = 50,
                 temperature: float = 0.8, top_k: int = 50) -> str:
        """Sample up to *max_len* tokens continuing *prompt*.

        Args:
            prompt: text to continue; unknown words are not added to the
                vocab.
            max_len: maximum number of generated tokens.
            temperature: softmax temperature; values <= 0 pick the most
                likely token at every step.
            top_k: number of highest-scoring candidates to sample from,
                clamped to the vocabulary size.

        Returns:
            The decoded continuation, or ``""`` when the prompt holds no
            tokens. Generation stops early at the end-of-sequence token.
        """
        self.brain.eval()
        window = self.brain.max_seq_len
        raw_ids = self.sensory.encode(prompt, grow=False)[-window:]  # noqa: E501
        if not raw_ids:
            return ""
        out = torch.tensor(raw_ids, dtype=torch.long, device=self.device).unsqueeze(0)  # noqa: E501
        eos_id = self.sensory.stoi.get(Sensory.EOS)

        result = []
        with torch.no_grad():
            for _ in range(max_len):
                # Feed only the newest `window` tokens so the context never
                # outgrows the positional table.
                logits = self.brain(out[:, -window:])[:, -1, :]
                # top-k filtering
                k = max(1, min(top_k, logits.size(-1)))
                values, indices = torch.topk(logits, k)
                if temperature > 0:
                    # apply temperature
                    probs = F.softmax(values / temperature, dim=-1)
                    choice = torch.multinomial(probs, 1)
                else:
                    # topk returns scores in descending order: column 0 is
                    # the arg-max.
                    choice = torch.zeros(
                        (1, 1), dtype=torch.long, device=self.device
                    )
                # Kept 2-D so it can be concatenated onto `out`.
                next_tok = indices.gather(-1, choice)
                tok_id = next_tok.item()
                if tok_id == eos_id:
                    break
                result.append(tok_id)
                out = torch.cat([out, next_tok], dim=1)

        return self.sensory.decode(result)

    def train(self, user_text: str, bot_text: str) -> None:
        """Run one optimisation step on ``"{user_text} {bot_text}"``.

        New words from both texts are added to the vocab first. Sequences
        shorter than two tokens are skipped; longer ones are cut to
        ``max_seq_len + 1`` tokens so the inputs fit the model.
        """
        # 1) grow vocab on _train_ only
        for txt in (user_text, bot_text):
            _ = self.sensory.encode(txt, grow=True)

        # ensure the end-of-sequence token exists
        if Sensory.EOS not in self.sensory.stoi:
            idx = len(self.sensory.stoi)
            self.sensory.stoi[Sensory.EOS] = idx
            self.sensory.itos[idx] = Sensory.EOS
        self._resize_embeddings()

        # NOTE(review): no end-of-sequence marker is part of the training
        # text as visible here — confirm whether one was intended.
        combined = f"{user_text} {bot_text}"
        token_ids = self.sensory.encode(combined, grow=False)
        token_ids = token_ids[: self.brain.max_seq_len + 1]
        ids = torch.tensor(
            token_ids, dtype=torch.long, device=self.device
        ).unsqueeze(0)

        if ids.size(1) < 2:
            return

        inputs = ids[:, :-1]
        targets = ids[:, 1:]

        self.brain.train()
        logits = self.brain(inputs)
        loss = self.criterion(
            logits.reshape(-1, logits.size(-1)), targets.reshape(-1)
        )
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # a tiny meta-learning bump
        # NOTE(review): the rate is restored before any optimizer step runs,
        # so this has no effect on training as written.
        self.meta_steps += 1
        if self.meta_steps % 100 == 0:
            for g in self.optimizer.param_groups:
                old_lr = g["lr"]
                g["lr"] = old_lr * 1.1
                # Only synchronise when actually running on a CUDA device.
                if self.device.type == "cuda":
                    torch.cuda.synchronize(self.device)
                g["lr"] = old_lr


# ─── sensory.py (new file) ─────────────────────────────────────────────────────
# diff --git a/sensory.py b/sensory.py
# new file mode 100644
# index 0000000..5597211
# --- /dev/null
# +++ b/sensory.py
# @@ -0,0 +1,48 @@
import os
import json


class Sensory:
    """Dynamic whitespace tokenizer that can grow (or not) its vocab."""

    # NOTE(review): the special-token literals were all empty strings in the
    # text under review, which collapses them onto a single dict key. The
    # spellings below are a reconstruction — confirm against an existing
    # vocab.json.
    PAD = "<pad>"
    UNK = "<unk>"
    EOS = "<eos>"

    def __init__(self):
        self.stoi = {self.PAD: 0, self.UNK: 1}
        self.itos = {0: self.PAD, 1: self.UNK}

    def encode(self, text: str, grow: bool = True) -> list[int]:
        """Map whitespace-separated tokens of *text* to ids.

        With ``grow=True`` unseen tokens get the next free id; otherwise
        they map to the unknown-token id.
        """
        # Falls back to id 1 when the loaded vocab has no unknown entry.
        unk_id = self.stoi.get(self.UNK, 1)
        ids: list[int] = []
        for tok in text.strip().split():
            idx = self.stoi.get(tok)
            if idx is None:
                if grow:
                    idx = len(self.stoi)
                    self.stoi[tok] = idx
                    self.itos[idx] = tok
                else:
                    idx = unk_id
            ids.append(idx)
        return ids

    def decode(self, ids: list[int]) -> str:
        """Join the tokens for *ids* with spaces; unknown ids become UNK."""
        return " ".join(self.itos.get(i, self.UNK) for i in ids)

    def save_vocab(self, path: str = "vocab.json") -> None:
        """Dump stoi+itos to disk.

        The data is written to ``<path>.tmp`` and then moved over *path*,
        so an interrupted save does not leave a truncated vocab file.
        """
        data = {
            "stoi": dict(self.stoi),
            # JSON keys must be strings
            "itos": {str(k): v for k, v in dict(self.itos).items()}
        }
        tmp_path = f"{path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)

    def load_vocab(self, path: str = "vocab.json") -> None:
        """Load stoi+itos if it exists."""
        if not os.path.isfile(path):
            return
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        self.stoi = data["stoi"]
        # convert itos keys back to int
        self.itos = {int(k): v for k, v in data["itos"].items()}


# ─── templates/dashboard.html (new file) ───────────────────────────────────────
# diff --git a/templates/dashboard.html b/templates/dashboard.html
# new file mode 100644
# index 0000000..035e625
# --- /dev/null
# +++ b/templates/dashboard.html
# @@ -0,0 +1,72 @@
# + + + + + Ruby Dashboard + + + +

Ruby Dashboard

+ +
+ Progress: 0/0 +
+ +
+

Recent Interactions

+
Loading…
+
+ +
+ +
+ + + + diff --git a/templates/graph.html b/templates/graph.html new file mode 100644 index 0000000..5dd2c94 --- /dev/null +++ b/templates/graph.html @@ -0,0 +1,99 @@ + + + + + Ruby Brain Map + + + + +
+ + + + +
+
+ + +