From 60ca74642037ef27398873139df3fc38dccd9a69 Mon Sep 17 00:00:00 2001
From: Dani
Date: Sun, 27 Apr 2025 15:34:06 -0400
Subject: [PATCH] Add a Unicode cleaner to every source we ingest text from

---
 model/brainmap.py       | 20 ++++++++++++++++++++
 model/dynamic_expand.py |  4 +++-
 model/rehearsal.py      |  7 ++++++-
 model/trainer.py        | 11 ++++++++---
 reader/reader.py        |  7 +++++--
 utils/unicleaner.py     | 39 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 81 insertions(+), 7 deletions(-)
 create mode 100644 utils/unicleaner.py

diff --git a/model/brainmap.py b/model/brainmap.py
index 739820a..99d34f3 100644
--- a/model/brainmap.py
+++ b/model/brainmap.py
@@ -1,6 +1,7 @@
 import os
 import json
 from collections import defaultdict
+from utils.unicleaner import clean_unicode
 
 BRAINMAP_FILE = "data/memory/brainmap.json"
 
@@ -37,3 +38,22 @@ def update_brainmap(words):
 
 def get_brainmap():
     return brain_map
+
+
+def fix_brainmap(brainmap: dict) -> dict:
+    cleaned_brainmap = {}
+
+    for word, value in brainmap.items():
+        cleaned_word = clean_unicode(word.strip())
+
+        # Skip empty or punctuation-only entries
+        if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
+            continue
+
+        # Merge entries that become duplicates after cleaning
+        if cleaned_word in cleaned_brainmap:
+            cleaned_brainmap[cleaned_word] += value
+        else:
+            cleaned_brainmap[cleaned_word] = value
+
+    return cleaned_brainmap

diff --git a/model/dynamic_expand.py b/model/dynamic_expand.py
index c90c3b9..0b2c58a 100644
--- a/model/dynamic_expand.py
+++ b/model/dynamic_expand.py
@@ -23,7 +23,7 @@ def expand_model_if_needed():
     old_vocab_size = model.head.out_features
 
     if current_vocab_size <= old_vocab_size:
-        return
+        return False  # No expansion needed
 
     # print(f"[Expand] Expanding model from {old_vocab_size} -> {current_vocab_size}")
 
@@ -42,6 +42,8 @@ def expand_model_if_needed():
     model = new_model
     optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
 
+    _last_expansion_time = time.time()
 
     # print("[Expand] Expansion complete.")
+    return True  # Tell the trainer we expanded
 

diff --git a/model/rehearsal.py b/model/rehearsal.py
index 0515ef5..17eddc3 100644
--- a/model/rehearsal.py
+++ b/model/rehearsal.py
@@ -2,6 +2,7 @@ import torch
 from model.brain import model, tokenizer, DEVICE
 from model.trainer import train_on_message
 from model.dynamic_expand import expand_model_if_needed
+from utils.unicleaner import clean_unicode
 
 
 def simulate_conversation():
@@ -9,7 +10,7 @@ def simulate_conversation():
     model.eval()
 
     seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
-    seed = seed[:, -128:]  # Safety clamp
+    seed = seed[:, -128:]
 
     output = model(seed)
     preds = torch.argmax(output, dim=-1).squeeze().tolist()
@@ -17,5 +18,9 @@ def simulate_conversation():
         preds = [preds]
 
     text = tokenizer.detokenize(preds)
+
+    # 🧹 Clean the generated text too
+    text = clean_unicode(text)
+
     if text and len(text.split()) >= 3:
         train_on_message(text)

diff --git a/model/trainer.py b/model/trainer.py
index 850677b..71ca696 100644
--- a/model/trainer.py
+++ b/model/trainer.py
@@ -4,6 +4,7 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
 from model.brainmap import update_brainmap
 from context.context import add_to_context, get_recent_context
+from utils.unicleaner import clean_unicode
 
 LOSS_FILE = "data/logs/loss.log"
 VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
@@ -34,9 +35,11 @@ def train_on_message(text: str, source: str = "user"):
     try:
         model.train()
 
-        context_texts = get_recent_context(10)
-        # Here's the important change:
+        # 🧹 Clean up the incoming text
+        text = clean_unicode(text)
+
+        context_texts = get_recent_context(10)
         augmented_text = " " + " ".join(context_texts + [text]) + " "
 
         tokens = tokenizer.tokenize(augmented_text)
 
@@ -65,10 +68,12 @@ def train_on_message(text: str, source: str = "user"):
         opt.zero_grad()
         loss.backward()
         opt.step()
+        scheduler.step()
 
         log_loss(loss.item())
         log_vocab_growth()
         add_to_context(text, source=source)
         update_brainmap(augmented_text.split())
+
     finally:
-        expand_lock.release()
\ No newline at end of file
+        expand_lock.release()

diff --git a/reader/reader.py b/reader/reader.py
index 9783ea0..89ebe44 100644
--- a/reader/reader.py
+++ b/reader/reader.py
@@ -3,6 +3,7 @@ import asyncio
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
+from utils.unicleaner import clean_unicode
 import json
 
 BOOK_DIR = "data/books"
@@ -48,7 +49,8 @@ async def read_books_forever():
 
             if not line:
                 if len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                    train_on_message(paragraph.strip(), source="book")
+                    cleaned_paragraph = clean_unicode(paragraph.strip())
+                    train_on_message(cleaned_paragraph, source="book")
                     paragraph = ""
                     await asyncio.sleep(READ_DELAY)
                     set_next_action(READ_DELAY, "Reading")
@@ -60,6 +62,7 @@ async def read_books_forever():
 
         # train last paragraph if any
         if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-            train_on_message(paragraph.strip(), source="book")
+            cleaned_paragraph = clean_unicode(paragraph.strip())
+            train_on_message(cleaned_paragraph, source="book")
             await asyncio.sleep(READ_DELAY)
             set_next_action(READ_DELAY, "Reading")

diff --git a/utils/unicleaner.py b/utils/unicleaner.py
new file mode 100644
index 0000000..1fab524
--- /dev/null
+++ b/utils/unicleaner.py
@@ -0,0 +1,39 @@
+import unicodedata
+import re
+
+# Precompiled regex patterns and replacement maps
+RE_SPACES = re.compile(r"\s+")
+RE_CONTROL_CHARS = re.compile(r"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F\uFEFF]")  # controls except \t \n \r, plus BOM
+QUOTE_MAP = {
+    '\u2018': "'",  # Left single quotation mark
+    '\u2019': "'",  # Right single quotation mark
+    '\u201C': '"',  # Left double quotation mark
+    '\u201D': '"',  # Right double quotation mark
+    '\u201E': '"',  # Double low-9 quotation mark
+    '\u201F': '"',  # Double high-reversed-9 quotation mark
+}
+DASH_MAP = {
+    '\u2013': '-',  # En dash
+    '\u2014': '-',  # Em dash
+}
+
+def clean_unicode(text: str) -> str:
+    # 1. Replace curly quotes with ASCII quotes
+    for bad, good in QUOTE_MAP.items():
+        text = text.replace(bad, good)
+
+    # 2. Replace en/em dashes with ASCII hyphens
+    for bad, good in DASH_MAP.items():
+        text = text.replace(bad, good)
+
+    # 3. Remove BOMs and stray control characters (\t, \n, \r survive for step 5)
+    text = RE_CONTROL_CHARS.sub('', text)
+
+    # 4. Normalize Unicode (NFKC collapses fullwidth forms, fractions, odd numerals)
+    text = unicodedata.normalize('NFKC', text)
+
+    # 5. Collapse all whitespace runs to a single space
+    text = RE_SPACES.sub(' ', text)
+
+    # 6. Strip leading/trailing whitespace
+    return text.strip()
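
A quick, illustrative sanity check of the new cleaner (not part of the patch; the input string is invented for the example):

    >>> from utils.unicleaner import clean_unicode
    >>> clean_unicode("\ufeff\u201cHello\u201d \u2013 it\u2019s  \uff13 AM")
    '"Hello" - it\'s 3 AM'

The curly quotes and en dash map to ASCII, RE_CONTROL_CHARS strips the BOM, NFKC folds the fullwidth \uff13 into a plain 3, and the double space collapses to a single one.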