Reverted some changes due to the unicode cleaner being moved to the tokenizer.

commit 3a77b5db32 (parent 60ca746420)
Author: Dani
Date: 2025-04-27 15:38:58 -04:00
5 changed files with 16 additions and 16 deletions
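In short: cleaning now happens exactly once, inside Tokenizer.tokenize, so the trainer, the reader, and the conversation simulator below all drop their own clean_unicode calls. Below is a minimal, self-contained sketch of the resulting flow; this clean_unicode is a stand-in stub, not the repository's real table-driven implementation (part of which appears in the last file of this diff):

    import re

    def clean_unicode(text: str) -> str:
        # Stand-in stub for utils.unicleaner.clean_unicode.
        return text.replace('\u201c', '"').replace('\u201d', '"').replace('\u2014', '-')

    class Tokenizer:
        def tokenize(self, text):
            text = clean_unicode(text)  # the one place cleaning now happens
            return re.findall(r"\b\w+\b", text.lower())

    # Call sites (trainer, reader, simulator) now pass raw text straight through:
    print(Tokenizer().tokenize('\u201cHello\u201d \u2014 world'))  # -> ['hello', 'world']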

View File

@@ -2,25 +2,29 @@ import torch
 from model.brain import model, tokenizer, DEVICE
 from model.trainer import train_on_message
 from model.dynamic_expand import expand_model_if_needed
-from utils.unicleaner import clean_unicode


 def simulate_conversation():
     expand_model_if_needed()
     model.eval()

-    seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
-    seed = seed[:, -128:]
+    max_token_id = model.head.out_features - 1
+    if max_token_id < 1:
+        return  # Safeguard if model is still too small
+
+    seed = torch.randint(0, max_token_id + 1, (1, 5), device=DEVICE)
+    seed = seed[:, -128:]  # Clamp sequence length
+
     output = model(seed)
     preds = torch.argmax(output, dim=-1).squeeze().tolist()
     if isinstance(preds, int):
         preds = [preds]

+    # 🛡 Clamp predictions too
+    preds = [min(max(p, 0), max_token_id) for p in preds]
+
     text = tokenizer.detokenize(preds)
-
-    # 🧹 Clean the generated text too
-    text = clean_unicode(text)

     if text and len(text.split()) >= 3:
         train_on_message(text)
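Besides the revert, this hunk restores the id safeguard: both the random seed and the argmax predictions are clamped to the output head's width, which is the live vocabulary size under dynamic expansion. A small illustrative sketch (the Linear layer and the numbers are made up):

    import torch
    import torch.nn as nn

    # Why the guard keys off model.head.out_features: after a dynamic
    # expansion, the head's width is the authoritative vocab size, so any
    # id >= out_features would be out of range for the embedding and head.
    head = nn.Linear(64, 10)               # pretend the live vocab size is 10
    max_token_id = head.out_features - 1   # 9

    seed = torch.randint(0, max_token_id + 1, (1, 5))   # ids 0..9 only
    preds = [3, 12, -1]                                  # stale or out-of-range ids
    preds = [min(max(p, 0), max_token_id) for p in preds]
    print(seed.shape, preds)               # torch.Size([1, 5]) [3, 9, 0]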

View File

@@ -1,6 +1,7 @@
 import re
 import os
 import json
+from utils.unicleaner import clean_unicode

 VOCAB_PATH = "data/memory/vocab.json"
@@ -24,6 +25,7 @@ class Tokenizer:
         self.next_id = 4

     def tokenize(self, text):
+        text = clean_unicode(text)  # 🚨 Always clean incoming text
         words = re.findall(r"\b\w+\b", text.lower())
         tokens = []
         for word in words:

View File

@@ -4,7 +4,6 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
 from model.brainmap import update_brainmap
 from context.context import add_to_context, get_recent_context
-from utils.unicleaner import clean_unicode

 LOSS_FILE = "data/logs/loss.log"
 VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
@@ -36,9 +35,6 @@ def train_on_message(text: str, source: str = "user"):
     try:
         model.train()

-        # 🧹 Clean up the incoming text
-        text = clean_unicode(text)
-
         context_texts = get_recent_context(10)
         augmented_text = "<start> " + " ".join(context_texts + [text]) + " <end>"

View File

@@ -3,7 +3,6 @@ import asyncio
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
-from utils.unicleaner import clean_unicode
 import json

 BOOK_DIR = "data/books"
@@ -49,8 +48,7 @@ async def read_books_forever():
         if not line:
             if len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                cleaned_paragraph = clean_unicode(paragraph.strip())
-                train_on_message(cleaned_paragraph, source="book")
+                train_on_message(paragraph.strip(), source="book")
                 paragraph = ""
                 await asyncio.sleep(READ_DELAY)
                 set_next_action(READ_DELAY, "Reading")
@@ -62,7 +60,6 @@ async def read_books_forever():
     # train last paragraph if any
     if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-        cleaned_paragraph = clean_unicode(paragraph.strip())
-        train_on_message(cleaned_paragraph, source="book")
+        train_on_message(paragraph.strip(), source="book")
     await asyncio.sleep(READ_DELAY)
     set_next_action(READ_DELAY, "Reading")

View File

@@ -17,6 +17,7 @@ RE_DASHES = {
     '\u2014': '-',  # Em dash
 }

+
 def clean_unicode(text: str) -> str:
     # 1. Replace fancy quotes
     for bad, good in RE_QUOTES.items():
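Only step 1 of clean_unicode is visible above. For context, a minimal sketch of how the function plausibly continues; the RE_QUOTES keys, the dash loop, and the final NFKC normalization are assumptions, not the repository's actual code:

    import unicodedata

    RE_QUOTES = {'\u201c': '"', '\u201d': '"', '\u2018': "'", '\u2019': "'"}  # assumed keys
    RE_DASHES = {'\u2013': '-', '\u2014': '-'}  # En dash, Em dash (as in the diff)

    def clean_unicode(text: str) -> str:
        # 1. Replace fancy quotes (shown in the diff)
        for bad, good in RE_QUOTES.items():
            text = text.replace(bad, good)
        # 2. Replace dashes (assumed to mirror step 1)
        for bad, good in RE_DASHES.items():
            text = text.replace(bad, good)
        # 3. Fold anything else to a compatibility form (assumption)
        return unicodedata.normalize("NFKC", text)

    print(clean_unicode('\u201cHi\u201d \u2014 ok'))  # "Hi" - ok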