Added a Unicode cleaner to everything we ingest material from

parent 97b43f832b
commit 60ca746420
@@ -1,6 +1,7 @@
 import os
 import json
 from collections import defaultdict
+from utils.unicleaner import clean_unicode

 BRAINMAP_FILE = "data/memory/brainmap.json"
@@ -37,3 +38,22 @@ def update_brainmap(words):

 def get_brainmap():
     return brain_map
+
+
+def fix_brainmap(brainmap: dict) -> dict:
+    cleaned_brainmap = {}
+
+    for word, value in brainmap.items():
+        cleaned_word = clean_unicode(word.strip())
+
+        # Skip bad entries
+        if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
+            continue
+
+        # Merge duplicates (case-insensitive optional)
+        if cleaned_word in cleaned_brainmap:
+            cleaned_brainmap[cleaned_word] += value
+        else:
+            cleaned_brainmap[cleaned_word] = value
+
+    return cleaned_brainmap
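For context, fix_brainmap() rewrites an already-saved word map through the same cleaner that new input now passes through. Below is a minimal sketch of running it as a one-off maintenance pass over the saved brainmap file; the load/save wiring is an assumption for illustration (it is not part of this commit), and it assumes these hunks live in model.brainmap, the module the trainer imports update_brainmap from:

import json

from model.brainmap import BRAINMAP_FILE, fix_brainmap

def clean_saved_brainmap():
    # Load the existing map, push every key through clean_unicode via
    # fix_brainmap(), and write the merged result back out.
    with open(BRAINMAP_FILE, "r", encoding="utf-8") as f:
        brainmap = json.load(f)

    cleaned = fix_brainmap(brainmap)

    with open(BRAINMAP_FILE, "w", encoding="utf-8") as f:
        json.dump(cleaned, f, ensure_ascii=False, indent=2)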
@@ -23,7 +23,7 @@ def expand_model_if_needed():
     old_vocab_size = model.head.out_features

     if current_vocab_size <= old_vocab_size:
-        return
+        return False  # No expansion needed

     # print(f"[Expand] Expanding model from {old_vocab_size} -> {current_vocab_size}")
@@ -42,6 +42,8 @@ def expand_model_if_needed():
     model = new_model
     optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

     _last_expansion_time = time.time()

     # print("[Expand] Expansion complete.")
+
+    return True  # <<< tell trainer we expanded
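The new boolean return lets callers tell "the head grew" apart from "nothing to do". A minimal sketch of how a caller might react to it; the surrounding trainer logic here is illustrative only, not the project's actual handling:

from model.dynamic_expand import expand_model_if_needed

def prepare_for_training_step() -> bool:
    # expand_model_if_needed() now returns True when the vocab head was
    # rebuilt (and a fresh optimizer was created inside dynamic_expand),
    # and False when the current head is already large enough.
    expanded = expand_model_if_needed()
    if expanded:
        print("[Trainer] Vocabulary expanded before this step")
    return expanded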
@@ -2,6 +2,7 @@ import torch
 from model.brain import model, tokenizer, DEVICE
 from model.trainer import train_on_message
 from model.dynamic_expand import expand_model_if_needed
+from utils.unicleaner import clean_unicode


 def simulate_conversation():
@@ -9,7 +10,7 @@ def simulate_conversation():

     model.eval()
     seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
-    seed = seed[:, -128:]  # Safety clamp
+    seed = seed[:, -128:]
     output = model(seed)

     preds = torch.argmax(output, dim=-1).squeeze().tolist()
@@ -17,5 +18,9 @@ def simulate_conversation():
         preds = [preds]

     text = tokenizer.detokenize(preds)
+
+    # 🧹 Clean the generated text too
+    text = clean_unicode(text)
+
     if text and len(text.split()) >= 3:
         train_on_message(text)
@@ -4,6 +4,7 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
 from model.brainmap import update_brainmap
 from context.context import add_to_context, get_recent_context
+from utils.unicleaner import clean_unicode

 LOSS_FILE = "data/logs/loss.log"
 VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
@@ -34,9 +35,11 @@ def train_on_message(text: str, source: str = "user"):

     try:
         model.train()
-        context_texts = get_recent_context(10)

-        # Here's the important change:
+        # 🧹 Clean up the incoming text
+        text = clean_unicode(text)
+
+        context_texts = get_recent_context(10)
         augmented_text = "<start> " + " ".join(context_texts + [text]) + " <end>"

         tokens = tokenizer.tokenize(augmented_text)
@@ -65,10 +68,12 @@ def train_on_message(text: str, source: str = "user"):
         opt.zero_grad()
         loss.backward()
         opt.step()
+        scheduler.step()
+
         log_loss(loss.item())
         log_vocab_growth()
         add_to_context(text, source=source)
         update_brainmap(augmented_text.split())

     finally:
         expand_lock.release()
@@ -3,6 +3,7 @@ import asyncio
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
+from utils.unicleaner import clean_unicode
 import json

 BOOK_DIR = "data/books"
@@ -48,7 +49,8 @@ async def read_books_forever():

            if not line:
                if len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                    train_on_message(paragraph.strip(), source="book")
+                    cleaned_paragraph = clean_unicode(paragraph.strip())
+                    train_on_message(cleaned_paragraph, source="book")
                    paragraph = ""
                    await asyncio.sleep(READ_DELAY)
                    set_next_action(READ_DELAY, "Reading")
@@ -60,6 +62,7 @@ async def read_books_forever():

        # train last paragraph if any
        if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-            train_on_message(paragraph.strip(), source="book")
+            cleaned_paragraph = clean_unicode(paragraph.strip())
+            train_on_message(cleaned_paragraph, source="book")
            await asyncio.sleep(READ_DELAY)
            set_next_action(READ_DELAY, "Reading")
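Both book-reader call sites follow the same pattern: assemble a full paragraph, clean it once, then train on it. Below is a self-contained sketch of that pattern outside the async reader, with handle_paragraph() as a hypothetical stand-in for train_on_message(..., source="book") and an assumed minimum length:

from utils.unicleaner import clean_unicode

PARAGRAPH_MIN_LENGTH = 80  # assumption for illustration; the real constant lives in the reader module

def handle_paragraph(text: str) -> None:
    # Stand-in for train_on_message(text, source="book").
    print(f"train on: {text[:60]}...")

def feed_paragraphs(lines):
    paragraph = ""
    for line in lines:
        if not line.strip():
            # A blank line ends the paragraph: clean once, then hand it off.
            if len(paragraph) > PARAGRAPH_MIN_LENGTH:
                handle_paragraph(clean_unicode(paragraph.strip()))
            paragraph = ""
        else:
            paragraph += " " + line.strip()
    # Flush the final paragraph, mirroring the end-of-file branch above.
    if len(paragraph) > PARAGRAPH_MIN_LENGTH:
        handle_paragraph(clean_unicode(paragraph.strip()))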
utils/unicleaner.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import unicodedata
+import re
+
+# Precompiled regexes (fast)
+RE_SPACES = re.compile(r"\s+")
+RE_CONTROL_CHARS = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
+RE_QUOTES = {
+    '\u2018': "'",  # Left single quotation mark
+    '\u2019': "'",  # Right single quotation mark
+    '\u201C': '"',  # Left double quotation mark
+    '\u201D': '"',  # Right double quotation mark
+    '\u201E': '"',  # Double low-9 quotation mark
+    '\u201F': '"',  # Double high-reversed-9 quotation mark
+}
+RE_DASHES = {
+    '\u2013': '-',  # En dash
+    '\u2014': '-',  # Em dash
+}
+
+def clean_unicode(text: str) -> str:
+    # 1. Replace fancy quotes
+    for bad, good in RE_QUOTES.items():
+        text = text.replace(bad, good)
+
+    # 2. Replace fancy dashes
+    for bad, good in RE_DASHES.items():
+        text = text.replace(bad, good)
+
+    # 3. Remove BOMs and stray control characters
+    text = RE_CONTROL_CHARS.sub('', text)
+
+    # 4. Normalize Unicode (NFKC collapses fullwidth, fractions, weird numerics)
+    text = unicodedata.normalize('NFKC', text)
+
+    # 5. Collapse all whitespace to a single space
+    text = RE_SPACES.sub(' ', text)
+
+    # 6. Strip leading/trailing whitespace
+    return text.strip()
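A quick illustration of what clean_unicode() does to typical scraped text (curly quotes, an em dash, a non-breaking space, stray CR/LF); the expected output assumes the implementation above:

from utils.unicleaner import clean_unicode

sample = "\u201cSmart quotes\u201d \u2014 it\u2019s\u00a0a   test\r\n"
print(clean_unicode(sample))
# prints: "Smart quotes" - it's a test

One behaviour worth noting: the control-character pass runs before the whitespace collapse and also matches \n and \t, so embedded newlines are deleted outright rather than turned into spaces; callers cleaning multi-line text may want to join lines with spaces first.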