From 60ca74642037ef27398873139df3fc38dccd9a69 Mon Sep 17 00:00:00 2001
From: Dani
Date: Sun, 27 Apr 2025 15:34:06 -0400
Subject: [PATCH] Add a Unicode cleaner to every source we ingest text from

---
 model/brainmap.py       | 20 ++++++++++++++++++++
 model/dynamic_expand.py |  4 +++-
 model/rehearsal.py      |  7 ++++++-
 model/trainer.py        | 11 ++++++++---
 reader/reader.py        |  7 +++++--
 utils/unicleaner.py     | 39 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 81 insertions(+), 7 deletions(-)
 create mode 100644 utils/unicleaner.py

diff --git a/model/brainmap.py b/model/brainmap.py
index 739820a..99d34f3 100644
--- a/model/brainmap.py
+++ b/model/brainmap.py
@@ -1,6 +1,7 @@
 import os
 import json
 from collections import defaultdict
+from utils.unicleaner import clean_unicode
 
 BRAINMAP_FILE = "data/memory/brainmap.json"
 
@@ -37,3 +38,22 @@ def update_brainmap(words):
 
 def get_brainmap():
     return brain_map
+
+
+def fix_brainmap(brainmap: dict) -> dict:
+    cleaned_brainmap = {}
+
+    for word, value in brainmap.items():
+        cleaned_word = clean_unicode(word.strip())
+
+        # Skip empty or punctuation-only entries
+        if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
+            continue
+
+        # Merge entries that become duplicates after cleaning
+        if cleaned_word in cleaned_brainmap:
+            cleaned_brainmap[cleaned_word] += value
+        else:
+            cleaned_brainmap[cleaned_word] = value
+
+    return cleaned_brainmap

diff --git a/model/dynamic_expand.py b/model/dynamic_expand.py
index c90c3b9..0b2c58a 100644
--- a/model/dynamic_expand.py
+++ b/model/dynamic_expand.py
@@ -23,7 +23,7 @@ def expand_model_if_needed():
     old_vocab_size = model.head.out_features
 
     if current_vocab_size <= old_vocab_size:
-        return
+        return False  # No expansion needed
 
     # print(f"[Expand] Expanding model from {old_vocab_size} -> {current_vocab_size}")
 
@@ -42,6 +42,8 @@ def expand_model_if_needed():
     model = new_model
     optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
 
+    _last_expansion_time = time.time()
 
     # print("[Expand] Expansion complete.")
+    return True  # Tell the trainer we expanded
 

diff --git a/model/rehearsal.py b/model/rehearsal.py
index 0515ef5..17eddc3 100644
--- a/model/rehearsal.py
+++ b/model/rehearsal.py
@@ -2,6 +2,7 @@ import torch
 from model.brain import model, tokenizer, DEVICE
 from model.trainer import train_on_message
 from model.dynamic_expand import expand_model_if_needed
+from utils.unicleaner import clean_unicode
 
 
 def simulate_conversation():
@@ -9,7 +10,7 @@ def simulate_conversation():
     model.eval()
 
     seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
-    seed = seed[:, -128:]  # Safety clamp
+    seed = seed[:, -128:]
 
     output = model(seed)
     preds = torch.argmax(output, dim=-1).squeeze().tolist()
@@ -17,5 +18,9 @@ def simulate_conversation():
         preds = [preds]
 
     text = tokenizer.detokenize(preds)
+
+    # 🧹 Clean the generated text too
+    text = clean_unicode(text)
+
     if text and len(text.split()) >= 3:
         train_on_message(text)

diff --git a/model/trainer.py b/model/trainer.py
index 850677b..71ca696 100644
--- a/model/trainer.py
+++ b/model/trainer.py
@@ -4,6 +4,7 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
 from model.brainmap import update_brainmap
 from context.context import add_to_context, get_recent_context
+from utils.unicleaner import clean_unicode
 
 LOSS_FILE = "data/logs/loss.log"
 VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
@@ -34,9 +35,11 @@ def train_on_message(text: str, source: str = "user"):
     try:
         model.train()
 
-        context_texts = get_recent_context(10)
-        # Here's the important change:
+        # 🧹 Clean up the incoming text
+        text = clean_unicode(text)
+
+        context_texts = get_recent_context(10)
         augmented_text = " " + " ".join(context_texts + [text]) + " "
 
         tokens = tokenizer.tokenize(augmented_text)
 
@@ -65,10 +68,12 @@ def train_on_message(text: str, source: str = "user"):
         opt.zero_grad()
         loss.backward()
         opt.step()
+        scheduler.step()
 
         log_loss(loss.item())
         log_vocab_growth()
         add_to_context(text, source=source)
         update_brainmap(augmented_text.split())
+
     finally:
-        expand_lock.release()
\ No newline at end of file
+        expand_lock.release()

diff --git a/reader/reader.py b/reader/reader.py
index 9783ea0..89ebe44 100644
--- a/reader/reader.py
+++ b/reader/reader.py
@@ -3,6 +3,7 @@ import asyncio
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
+from utils.unicleaner import clean_unicode
 import json
 
 BOOK_DIR = "data/books"
@@ -48,7 +49,8 @@ async def read_books_forever():
 
             if not line:
                 if len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                    train_on_message(paragraph.strip(), source="book")
+                    cleaned_paragraph = clean_unicode(paragraph.strip())
+                    train_on_message(cleaned_paragraph, source="book")
                     paragraph = ""
                     await asyncio.sleep(READ_DELAY)
                     set_next_action(READ_DELAY, "Reading")
@@ -60,6 +62,7 @@ async def read_books_forever():
 
         # train last paragraph if any
         if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-            train_on_message(paragraph.strip(), source="book")
+            cleaned_paragraph = clean_unicode(paragraph.strip())
+            train_on_message(cleaned_paragraph, source="book")
             await asyncio.sleep(READ_DELAY)
             set_next_action(READ_DELAY, "Reading")

diff --git a/utils/unicleaner.py b/utils/unicleaner.py
new file mode 100644
index 0000000..1fab524
--- /dev/null
+++ b/utils/unicleaner.py
@@ -0,0 +1,39 @@
+import unicodedata
+import re
+
+# Precompiled regex patterns and replacement maps
+RE_SPACES = re.compile(r"\s+")
+RE_CONTROL_CHARS = re.compile(r"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F\uFEFF]")  # controls except \t \n \r, plus BOM
+QUOTE_MAP = {
+    '\u2018': "'",  # Left single quotation mark
+    '\u2019': "'",  # Right single quotation mark
+    '\u201C': '"',  # Left double quotation mark
+    '\u201D': '"',  # Right double quotation mark
+    '\u201E': '"',  # Double low-9 quotation mark
+    '\u201F': '"',  # Double high-reversed-9 quotation mark
+}
+DASH_MAP = {
+    '\u2013': '-',  # En dash
+    '\u2014': '-',  # Em dash
+}
+
+def clean_unicode(text: str) -> str:
+    # 1. Replace curly quotes with ASCII quotes
+    for bad, good in QUOTE_MAP.items():
+        text = text.replace(bad, good)
+
+    # 2. Replace en/em dashes with ASCII hyphens
+    for bad, good in DASH_MAP.items():
+        text = text.replace(bad, good)
+
+    # 3. Remove BOMs and stray control characters (\t, \n, \r survive for step 5)
+    text = RE_CONTROL_CHARS.sub('', text)
+
+    # 4. Normalize Unicode (NFKC collapses fullwidth forms, fractions, odd numerals)
+    text = unicodedata.normalize('NFKC', text)
+
+    # 5. Collapse all whitespace runs to a single space
+    text = RE_SPACES.sub(' ', text)
+
+    # 6. Strip leading/trailing whitespace
+    return text.strip()
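
A quick, illustrative sanity check of the new cleaner (not part of the patch; the input string is invented for the example):

    >>> from utils.unicleaner import clean_unicode
    >>> clean_unicode("\ufeff\u201cHello\u201d \u2013 it\u2019s  \uff13 AM")
    '"Hello" - it\'s 3 AM'

The curly quotes and en dash map to ASCII, RE_CONTROL_CHARS strips the BOM, NFKC folds the fullwidth \uff13 into a plain 3, and the double space collapses to a single one.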