From 7823ce1d5e8ecf0fd8c1e1265fe64fb35406feb9 Mon Sep 17 00:00:00 2001
From: Dani
Date: Tue, 22 Apr 2025 12:44:28 -0400
Subject: [PATCH] Update Ruby's reader: book progress tracking, vocab
 normalization, and dashboard status

---
 .gitignore   |   3 +-
 dashboard.py |  27 ++++++++++--
 main.py      |  15 +++----
 reader.py    |  35 ++++++++++++---
 tokenizer.py |  11 ++---
 trainer.py   | 119 ++++++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 174 insertions(+), 36 deletions(-)

diff --git a/.gitignore b/.gitignore
index ef465be..c912757 100644
--- a/.gitignore
+++ b/.gitignore
@@ -172,4 +172,5 @@ cython_debug/
 /logs/core_dreams.txt
 /logs/best_dream.txt
 /.vscode/launch.json
-/books
\ No newline at end of file
+/books
+/readstate.txt
\ No newline at end of file

diff --git a/dashboard.py b/dashboard.py
index d2f2448..3262f51 100644
--- a/dashboard.py
+++ b/dashboard.py
@@ -1,8 +1,8 @@
 from flask import Flask, render_template_string
-from datetime import datetime
 import os
 
 app = Flask(__name__)
+ruby_client = None  # This will be set externally
 
 
 def tail(filepath, num_lines=10):
@@ -31,6 +31,17 @@ def home():
     errors = [line.strip() for line in tail("logs/error.log", 15)]
     best_dream = get_best_dream()
 
+    # Handle book progress if Ruby has a reader
+    book = {
+        "book": "Not reading",
+        "line": 0,
+        "total": 0,
+        "percent": 0.0,
+        "last_sentence": ""
+    }
+    if ruby_client and hasattr(ruby_client, "reader"):
+        book = ruby_client.reader.progress()
+
     return render_template_string("""
@@ -47,6 +58,11 @@ def home():
         🌸 Ruby's Dashboard
         Vocabulary Size: {{ vocab_size }}
 
+
+        📖 Book Progress
+        {{ book.book }} – Line {{ book.line }} of {{ book.total }} ({{ book.percent }}%)
+        {{ book.last_sentence }}
+
         🏆 Highest Scoring Dream
         {{ best_dream }}
@@ -73,8 +89,11 @@ def home():
-    """, best_dream=best_dream, dreams=dreams[::-1], messages=messages[::-1], errors=errors[::-1], vocab_size=vocab_size)
+    """, best_dream=best_dream, dreams=dreams[::-1], messages=messages[::-1], errors=errors[::-1], vocab_size=vocab_size, book=book)
 
 
-def start_dashboard():
-    app.run(debug=False, host="0.0.0.0", port=5000)
+def start_dashboard_background():
+    import threading
+    thread = threading.Thread(target=lambda: app.run(debug=False, host="0.0.0.0", port=5000))
+    thread.daemon = True
+    thread.start()

diff --git a/main.py b/main.py
index 190d0ee..68895f5 100644
--- a/main.py
+++ b/main.py
@@ -2,10 +2,9 @@ import discord
 import asyncio
 import atexit
 import os
-import threading
 from dotenv import load_dotenv
 from datetime import datetime, timedelta
-from dashboard import start_dashboard
+import dashboard
 from tokenizer import Tokenizer
 from trainer import RubyTrainer
 from reader import BookReader
@@ -66,6 +65,8 @@
         print(f"[READY] Logged in as {self.user} (ID: {self.user.id})")
         await self.set_activity("you...")
         self.trainer.reinforce_core_memory()
+        # self.trainer.clean_vocab()
+        # self.trainer.rebuild_model_if_needed()
 
     async def idle_dream_loop(self):
         await self.wait_until_ready()
@@ -85,7 +86,7 @@
                 speak = random() < 0.5
                 thought = self.trainer.daydream(say_thought=speak)
 
-                if speak and thought and len(thought.split()) >=4:
+                if speak and thought and len(thought.split()) >= 4:
                     for guild in self.guilds:
                         for channel in guild.text_channels:
                             if channel.permissions_for(guild.me).send_messages:
@@ -125,15 +126,14 @@
     def train_on_message(self, message: discord.Message):
         text = message.content.strip()
         self.trainer.train_on_tokens_from_text(text)
-        token_tensor = torch.tensor(tokens, dtype=torch.long)
-        loss = train_on_tokens(self.model, tokens, self.optimizer, self.criterion, device="cpu")
-        print(f"[TRAIN] Tokens: {tokens} | Loss: {loss:.4f}")
 
 
 # Run Ruby
 client = None
 try:
     client = Ruby()
+    dashboard.ruby_client = client
+    dashboard.start_dashboard_background()
 
     def on_exit():
         if client:
@@ -142,8 +142,7 @@
            client.trainer.daydream(rounds=10)
 
    atexit.register(on_exit)
-    dashboard_thread = threading.Thread(target=start_dashboard, daemon=True)
-    dashboard_thread.start()
+    dashboard.start_dashboard_background()
    client.run(TOKEN)
 finally:
    if client is not None:

diff --git a/reader.py b/reader.py
index a5af528..bc036a8 100644
--- a/reader.py
+++ b/reader.py
@@ -2,14 +2,17 @@ import os
 import asyncio
 from datetime import datetime
 
+
 class BookReader:
-    def __init__(self, trainer, book_path, state_path="readstate.txt", log_path="logs/read.log", interval=180):
+    def __init__(self, trainer, book_path, state_path="readstate.txt", log_path="logs/read.log", interval=15):
         self.trainer = trainer
         self.book_path = book_path
         self.state_path = state_path
         self.log_path = log_path
-        self.interval = interval  # seconds between reading cycles
+        self.interval = interval
         self.current_line = 0
+        self.last_sentence = ""
+        self.total_lines = 0
 
         os.makedirs(os.path.dirname(self.log_path), exist_ok=True)
         if os.path.exists(self.state_path):
@@ -19,35 +22,53 @@
             except Exception:
                 self.current_line = 0
 
+        if os.path.exists(self.book_path):
+            with open(self.book_path, "r", encoding="utf-8", errors="ignore") as f:
+                self.total_lines = len(f.readlines())
+
     def _save_state(self):
         with open(self.state_path, "w", encoding="utf-8") as f:
             f.write(str(self.current_line))
 
     def _log_read(self, text: str, score: float, tag: str = "Book"):
         with open(self.log_path, "a", encoding="utf-8") as f:
-            f.write(f"[{datetime.utcnow().isoformat()}] ({tag}) {score:.2f} | {text.strip()}\\n")
+            f.write(f"[{datetime.utcnow().isoformat()}] ({tag}) {score:.2f} | {text.strip()}\n")
 
     async def start_reading(self):
         if not os.path.exists(self.book_path):
             print(f"[BOOK] File not found: {self.book_path}")
             return
 
-        with open(self.book_path, "r", encoding="utf-8") as f:
+        with open(self.book_path, "r", encoding="utf-8", errors="ignore") as f:
             lines = f.readlines()
+        self.total_lines = len(lines)
 
         print(f"[BOOK] Starting to read {self.book_path} from line {self.current_line}...")
 
-        while self.current_line < len(lines):
+        while self.current_line < self.total_lines:
             passage = lines[self.current_line].strip()
 
-            if len(passage.split()) >= 5:
+            if len(passage.split()) >= 5 and self._is_valid(passage):
                 score = self.trainer.score_sentence(passage)
                 if self.trainer.is_reinforceable(passage) and score >= 2.5:
                     self.trainer.train_on_tokens_from_text(passage)
                     self._log_read(passage, score)
+                    self.last_sentence = passage
 
             self.current_line += 1
             self._save_state()
             await asyncio.sleep(self.interval)
 
-        print("[BOOK] Finished reading the book.")
\ No newline at end of file
+        print("[BOOK] Finished reading the book.")
+
+    def _is_valid(self, text: str) -> bool:
+        return all(c.isprintable() or c.isspace() for c in text)
+
+    def progress(self) -> dict:
+        return {
+            "book": os.path.basename(self.book_path),
+            "line": self.current_line,
+            "total": self.total_lines,
+            "percent": round(100 * self.current_line / self.total_lines, 2) if self.total_lines else 0.0,
+            "last_sentence": self.last_sentence
+        }

diff --git a/tokenizer.py b/tokenizer.py
index 828c04c..4ba9782 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -1,4 +1,5 @@
 import os
+from trainer import normalize_for_vocab
 
 
 class Tokenizer:
@@ -13,21 +14,21 @@
             return
         with open(self.vocab_path, "r", encoding="utf-8") as f:
             for line in f:
-                token, idx = line.strip().split("\t")
-                self.vocab[token] = int(idx)
-                if token not in self.vocab:
+                token = line.strip()
+                if token and token not in self.vocab:
+                    idx = len(self.vocab)
                     self.vocab[token] = idx
                     self.inv_vocab[idx] = token
-        self.inv_vocab = {v: k for k, v in self.vocab.items()}
 
     def save_vocab(self):
         with open(self.vocab_path, "w", encoding="utf-8") as f:
             for token, idx in self.vocab.items():
-                f.write(f"{token}\t{idx}\n")
+                f.write(f"{token}\n")
 
     def tokenize(self, text):
         tokens = []
         for word in text.strip().split():
+            word = normalize_for_vocab(word)
             if word not in self.vocab:
                 self.vocab[word] = len(self.vocab)
                 self.inv_vocab[self.vocab[word]] = word

diff --git a/trainer.py b/trainer.py
index 0089168..cdc8dda 100644
--- a/trainer.py
+++ b/trainer.py
@@ -3,10 +3,34 @@ import torch.nn.functional as F
 from datetime import datetime
 from collections import Counter
 import os
+import re
+import string
 
 from model import MiniGPT
 
 # flake8: noqa E501
 
 
+def normalize_for_vocab(text: str) -> str:
+    # Replace em-dashes and smart quotes with standard forms
+    text = text.replace("—", " ").replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'")
+
+    # Remove parenthetical and bracket content
+    text = re.sub(r"\[(.*?)\]", "", text)
+    text = re.sub(r"\((.*?)\)", "", text)
+
+    # Remove trailing punctuation (commas, periods, question marks, etc.) per word
+    text = re.sub(r"(\w)[.,!?;:]+(?=\s|$)", r"\1", text)
+
+    # Remove quotes at start or end of lines
+    text = text.strip("\"'")
+
+    # Normalize hyphenated words by collapsing to a single word
+    text = re.sub(r"(\w)-(\w)", r"\1\2", text)
+
+    # Remove duplicate spaces and lowercase
+    text = re.sub(r"\s+", " ", text).strip().lower()
+
+    return text
+
+
 class RubyTrainer:
     def __init__(self, tokenizer, embed_dim=128, n_heads=4, n_layers=2, max_len=128):
@@ -34,7 +58,8 @@
         self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
 
     def train_on_tokens_from_text(self, text: str):
-        tokens = self.tokenizer.tokenize(text)
+        normalized = normalize_for_vocab(text)
+        tokens = self.tokenizer.tokenize(normalized)
         if not tokens:
             return
         tokens = [self.tokenizer.vocab[""]] + tokens + [self.tokenizer.vocab[""]]
@@ -54,44 +79,62 @@
         print(f"[TRAIN] Tokens: {tokens} | Loss: {loss.item():.4f}")
 
-    def generate_reply(self, prompt=None, max_length=20):
+    def generate_reply(self, prompt=None, max_length=20, temperature=1.3):
         self.model.eval()
         input_ids = torch.tensor([[self.tokenizer.vocab[""]]], device=self.device)
 
         with torch.no_grad():
             for _ in range(max_length):
+                max_id = self.model.token_embed.num_embeddings
+                input_ids = torch.clamp(input_ids, 0, max_id - 1)
                 output = self.model(input_ids)
                 logits = output[:, -1, :]
 
-                # Apply repeat penalty BEFORE sampling
+                # Apply repeat penalty
                 if input_ids.size(1) >= 2:
                     last_token = input_ids[0, -1].item()
-                    logits[0, last_token] *= 0.1  # Penalize repeating same token again
+                    logits[0, last_token] *= 0.1
+
+                # 🔥 Temperature sampling
+                probs = F.softmax(logits / temperature, dim=-1)
+                next_token = torch.multinomial(probs, 1)[0].view(1)
+
+                if next_token.item() >= self.model.token_embed.num_embeddings:
+                    print("[ERROR] Token index out of bounds. Rebuilding model...")
+                    self.rebuild_model_if_needed()
+                    return ""
 
-                next_token = torch.argmax(logits, dim=-1)
                 input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
 
                 if next_token.item() == self.tokenizer.vocab[""]:
                     break
 
         output = self.tokenizer.detokenize(input_ids.squeeze().tolist())
-        output = output.replace("", "").replace("", "").strip()
-        return output
+        return output.replace("", "").replace("", "").strip()
 
-    def self_rephrase(self, original: str, max_tokens=50):
+
+    def self_rephrase(self, original: str, max_tokens=50, temperature=1.3):
         self.model.eval()
         tokens = [self.tokenizer.vocab[""]] + self.tokenizer.tokenize(original)
         input_ids = torch.tensor(tokens, dtype=torch.long, device=self.device).unsqueeze(0)
 
         for _ in range(max_tokens):
             with torch.no_grad():
+                input_ids = torch.clamp(input_ids, 0, self.model.token_embed.num_embeddings - 1)
                 out = self.model(input_ids)
                 logits = out[:, -1, :] / 1.1
 
                 if input_ids.size(1) < 8:
                     logits[0, self.tokenizer.vocab[""]] = float("-inf")
 
-                probs = F.softmax(logits, dim=-1)
+                probs = F.softmax(logits / temperature, dim=-1)
                 next_token = torch.multinomial(probs, 1)[0].view(1, 1)
+
+                # ✅ Ensure next_token is valid
+                if next_token.item() >= self.model.token_embed.num_embeddings:
+                    print("[ERROR] Token index out of bounds in self_rephrase. Rebuilding model...")
+                    self.rebuild_model_if_needed()
+                    return ""
+
             input_ids = torch.cat([input_ids, next_token], dim=1)
 
             if next_token.item() == self.tokenizer.vocab[""]:
                 break
 
@@ -220,9 +263,20 @@
         if text.lower().count("i am") > len(text.split()) * 0.25:
             return False
 
-        # Reject if the first word is repeated 3+ times
+        # Reject if first word repeats 3 times ("you you you")
         if words[:3].count(words[0]) == 3:
-            return False  # "you you you" type
+            return False
+
+        # 🧠 NEW: Reject if starts with common book phrases
+        banned_starts = ("once upon", "chapter", "the end", "in which", "it was", "quick cried", "they are course")
+        lowered = text.lower()
+        if any(lowered.startswith(phrase) for phrase in banned_starts):
+            return False
+
+        # 🧠 NEW: Reject if too many capitalized words in a row (e.g., names, places from a book)
+        cap_sequence = sum(1 for word in words if word.istitle())
+        if cap_sequence > 5 and cap_sequence / len(words) > 0.4:
+            return False
 
         return True
 
@@ -249,3 +303,46 @@
             base_score -= 2
 
         return max(0.0, base_score)
+
+    def clean_vocab(self, min_occurrences: int = 1):
+        print("[CLEAN] Analyzing and cleaning vocabulary...")
+
+        # Count normalized forms
+        counts = Counter()
+        norm_to_original = {}
+
+        for word in self.tokenizer.vocab:
+            if word in ("", ""):
+                continue
+            normalized = normalize_for_vocab(word)
+            if normalized not in norm_to_original:
+                norm_to_original[normalized] = word
+            counts[normalized] += 1
+
+        # Rebuild new vocab
+        new_vocab = {"": 0, "": 1}
+        reverse = dict()
+
+        idx = 2
+        for norm, original in norm_to_original.items():
+            if counts[norm] >= min_occurrences:
+                new_vocab[original] = idx
+                reverse[norm] = original
+                idx += 1
+
+        old_size = len(self.tokenizer.vocab)
+        new_size = len(new_vocab)
+        print(f"[CLEAN] Vocabulary reduced: {old_size} → {new_size}")
+
+        # Replace tokenizer vocab
+        self.tokenizer.vocab = new_vocab
+        self.tokenizer.inv_vocab = {v: k for k, v in new_vocab.items()}
+
+        # Reinitialize the model to reflect the new vocab
+        self.rebuild_model_if_needed()
+
+        # Optionally: Save cleaned vocab
+        with open("tokenizer_vocab.txt", "w", encoding="utf-8") as f:
+            for token in new_vocab:
+                f.write(f"{token}\n")
+        print("[CLEAN] Vocab written to tokenizer_vocab.txt")
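
Note (not part of the diff above): a quick sanity check of the new normalize_for_vocab helper. The sample sentence is made up, and the expected output is hand-traced from the regexes in this patch, so treat it as an illustration rather than a test fixture:

    # Sketch only: exercises the normalize_for_vocab helper added in trainer.py.
    from trainer import normalize_for_vocab

    sample = 'The well-known "Mr. Sparrow" flew away!'
    print(normalize_for_vocab(sample))
    # Trailing punctuation is stripped per word, the hyphenated word is
    # collapsed, and everything is lowercased:
    # the wellknown "mr sparrow" flew away

The same helper is applied per word in Tokenizer.tokenize and to whole messages in train_on_tokens_from_text, so punctuation and casing variants of the same word should stop inflating the vocabulary.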