Added a Unicode cleaner to every source we ingest material from

Dani 2025-04-27 15:34:06 -04:00
parent 97b43f832b
commit 60ca746420
6 changed files with 81 additions and 7 deletions
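
Every ingestion path touched below follows the same pattern: normalize text with clean_unicode before it reaches train_on_message. A minimal sketch of that pattern, for orientation only; the ingest wrapper is hypothetical and not part of this commit, while the imports match the diffs below:

from utils.unicleaner import clean_unicode
from model.trainer import train_on_message

def ingest(raw_text: str, source: str) -> None:
    # Hypothetical wrapper: strip and normalize before anything trains on the text.
    cleaned = clean_unicode(raw_text.strip())
    if cleaned:
        train_on_message(cleaned, source=source)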

View File

@@ -1,6 +1,7 @@
import os
import json
from collections import defaultdict
+from utils.unicleaner import clean_unicode

BRAINMAP_FILE = "data/memory/brainmap.json"
@@ -37,3 +38,22 @@ def update_brainmap(words):
def get_brainmap():
    return brain_map

+def fix_brainmap(brainmap: dict) -> dict:
+    cleaned_brainmap = {}
+    for word, value in brainmap.items():
+        cleaned_word = clean_unicode(word.strip())
+        # Skip bad entries
+        if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
+            continue
+        # Merge duplicates (case-insensitive optional)
+        if cleaned_word in cleaned_brainmap:
+            cleaned_brainmap[cleaned_word] += value
+        else:
+            cleaned_brainmap[cleaned_word] = value
+    return cleaned_brainmap
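
A quick, hedged illustration of what fix_brainmap does with a small input. The sample dict and its integer values are invented for the example (the real brainmap values may have a different shape); the import path follows the update_brainmap import already used by the trainer:

from model.brainmap import fix_brainmap

raw_map = {
    "don\u2019t": 2,   # curly apostrophe, normalizes to "don't"
    "don't": 3,        # straight apostrophe, merges with the entry above
    "...": 5,          # junk token, dropped
    " hello ": 1,      # stray whitespace, stripped
}
print(fix_brainmap(raw_map))
# {"don't": 5, 'hello': 1}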

View File

@@ -23,7 +23,7 @@ def expand_model_if_needed():
    old_vocab_size = model.head.out_features
    if current_vocab_size <= old_vocab_size:
-        return
+        return False  # No expansion needed
    # print(f"[Expand] Expanding model from {old_vocab_size} -> {current_vocab_size}")
@@ -42,6 +42,8 @@ def expand_model_if_needed():
    model = new_model
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    _last_expansion_time = time.time()
    # print("[Expand] Expansion complete.")
+    return True  # <<< tell trainer we expanded

View File

@@ -2,6 +2,7 @@ import torch
from model.brain import model, tokenizer, DEVICE
from model.trainer import train_on_message
from model.dynamic_expand import expand_model_if_needed
+from utils.unicleaner import clean_unicode

def simulate_conversation():
@@ -9,7 +10,7 @@ def simulate_conversation():
    model.eval()
    seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
-    seed = seed[:, -128:]  # Safety clamp
+    seed = seed[:, -128:]
    output = model(seed)
    preds = torch.argmax(output, dim=-1).squeeze().tolist()
@@ -17,5 +18,9 @@ def simulate_conversation():
        preds = [preds]
    text = tokenizer.detokenize(preds)

+    # 🧹 Clean the generated text too
+    text = clean_unicode(text)

    if text and len(text.split()) >= 3:
        train_on_message(text)

View File

@@ -4,6 +4,7 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
from model.brain_state import model, tokenizer, DEVICE, loss_fn
from model.brainmap import update_brainmap
from context.context import add_to_context, get_recent_context
+from utils.unicleaner import clean_unicode

LOSS_FILE = "data/logs/loss.log"
VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
@@ -34,9 +35,11 @@ def train_on_message(text: str, source: str = "user"):
    try:
        model.train()
-        context_texts = get_recent_context(10)
+        # Here's the important change:
+        # 🧹 Clean up the incoming text
+        text = clean_unicode(text)
+        context_texts = get_recent_context(10)
        augmented_text = "<start> " + " ".join(context_texts + [text]) + " <end>"
        tokens = tokenizer.tokenize(augmented_text)
@@ -65,10 +68,12 @@ def train_on_message(text: str, source: str = "user"):
        opt.zero_grad()
        loss.backward()
        opt.step()
        scheduler.step()
        log_loss(loss.item())
        log_vocab_growth()
        add_to_context(text, source=source)
        update_brainmap(augmented_text.split())
    finally:
        expand_lock.release()

View File

@@ -3,6 +3,7 @@ import asyncio
from model.trainer import train_on_message
from model.scheduler import set_next_action
from reader.filter import is_valid_line
+from utils.unicleaner import clean_unicode
import json

BOOK_DIR = "data/books"
@@ -48,7 +49,8 @@ async def read_books_forever():
            if not line:
                if len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                    train_on_message(paragraph.strip(), source="book")
+                    cleaned_paragraph = clean_unicode(paragraph.strip())
+                    train_on_message(cleaned_paragraph, source="book")
                    paragraph = ""
                    await asyncio.sleep(READ_DELAY)
                    set_next_action(READ_DELAY, "Reading")
@@ -60,6 +62,7 @@ async def read_books_forever():
        # train last paragraph if any
        if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-            train_on_message(paragraph.strip(), source="book")
+            cleaned_paragraph = clean_unicode(paragraph.strip())
+            train_on_message(cleaned_paragraph, source="book")
        await asyncio.sleep(READ_DELAY)
        set_next_action(READ_DELAY, "Reading")

utils/unicleaner.py (new file, 39 lines added)
View File

@@ -0,0 +1,39 @@
import unicodedata
import re

# Precompiled regexes (fast)
RE_SPACES = re.compile(r"\s+")
RE_CONTROL_CHARS = re.compile(r"[\u0000-\u001F\u007F-\u009F]")

RE_QUOTES = {
    '\u2018': "'",  # Left single quotation mark
    '\u2019': "'",  # Right single quotation mark
    '\u201C': '"',  # Left double quotation mark
    '\u201D': '"',  # Right double quotation mark
    '\u201E': '"',  # Double low-9 quotation mark
    '\u201F': '"',  # Double high-reversed-9 quotation mark
}

RE_DASHES = {
    '\u2013': '-',  # En dash
    '\u2014': '-',  # Em dash
}

def clean_unicode(text: str) -> str:
    # 1. Replace fancy quotes
    for bad, good in RE_QUOTES.items():
        text = text.replace(bad, good)

    # 2. Replace fancy dashes
    for bad, good in RE_DASHES.items():
        text = text.replace(bad, good)

    # 3. Remove BOMs and stray control characters
    text = RE_CONTROL_CHARS.sub('', text)

    # 4. Normalize Unicode (NFKC collapses fullwidth, fractions, weird numerics)
    text = unicodedata.normalize('NFKC', text)

    # 5. Collapse all whitespace to a single space
    text = RE_SPACES.sub(' ', text)

    # 6. Strip leading/trailing whitespace
    return text.strip()
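
A quick check of what clean_unicode produces for typical scraped text; the sample string is made up for the example:

from utils.unicleaner import clean_unicode

# Curly quotes, an em dash, an ideographic space, and messy runs of whitespace.
sample = "\u201cHello\u201d \u2014   it\u2019s   fine\u3000here"
print(clean_unicode(sample))
# "Hello" - it's fine here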