Added a Unicode cleaner to every source we ingest material from

Dani 2025-04-27 15:34:06 -04:00
parent 97b43f832b
commit 60ca746420
6 changed files with 81 additions and 7 deletions
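
Every ingestion path touched below follows the same pattern: normalize text with clean_unicode before it reaches train_on_message. A minimal sketch of that pattern, for orientation only; the ingest wrapper is hypothetical and not part of this commit, while the imports match the diffs below:

from utils.unicleaner import clean_unicode
from model.trainer import train_on_message

def ingest(raw_text: str, source: str) -> None:
    # Hypothetical wrapper: strip and normalize before anything trains on the text.
    cleaned = clean_unicode(raw_text.strip())
    if cleaned:
        train_on_message(cleaned, source=source)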

View File

@@ -1,6 +1,7 @@
import os
import json
from collections import defaultdict
+from utils.unicleaner import clean_unicode

BRAINMAP_FILE = "data/memory/brainmap.json"
@@ -37,3 +38,22 @@ def update_brainmap(words):
def get_brainmap():
    return brain_map

+def fix_brainmap(brainmap: dict) -> dict:
+    cleaned_brainmap = {}
+    for word, value in brainmap.items():
+        cleaned_word = clean_unicode(word.strip())
+        # Skip bad entries
+        if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
+            continue
+        # Merge duplicates (case-insensitive optional)
+        if cleaned_word in cleaned_brainmap:
+            cleaned_brainmap[cleaned_word] += value
+        else:
+            cleaned_brainmap[cleaned_word] = value
+    return cleaned_brainmap
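
A quick, hedged illustration of what fix_brainmap does with a small input. The sample dict and its integer values are invented for the example (the real brainmap values may have a different shape); the import path follows the update_brainmap import already used by the trainer:

from model.brainmap import fix_brainmap

raw_map = {
    "don\u2019t": 2,   # curly apostrophe, normalizes to "don't"
    "don't": 3,        # straight apostrophe, merges with the entry above
    "...": 5,          # junk token, dropped
    " hello ": 1,      # stray whitespace, stripped
}
print(fix_brainmap(raw_map))
# {"don't": 5, 'hello': 1}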

View File

@@ -23,7 +23,7 @@ def expand_model_if_needed():
    old_vocab_size = model.head.out_features
    if current_vocab_size <= old_vocab_size:
-        return
+        return False  # No expansion needed
    # print(f"[Expand] Expanding model from {old_vocab_size} -> {current_vocab_size}")
@@ -42,6 +42,8 @@ def expand_model_if_needed():
    model = new_model
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    _last_expansion_time = time.time()
    # print("[Expand] Expansion complete.")
+    return True  # <<< tell trainer we expanded

View File

@@ -2,6 +2,7 @@ import torch
from model.brain import model, tokenizer, DEVICE
from model.trainer import train_on_message
from model.dynamic_expand import expand_model_if_needed
+from utils.unicleaner import clean_unicode

def simulate_conversation():
@@ -9,7 +10,7 @@ def simulate_conversation():
    model.eval()
    seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
-    seed = seed[:, -128:]  # Safety clamp
+    seed = seed[:, -128:]
    output = model(seed)
    preds = torch.argmax(output, dim=-1).squeeze().tolist()
@@ -17,5 +18,9 @@ def simulate_conversation():
        preds = [preds]
    text = tokenizer.detokenize(preds)

+    # 🧹 Clean the generated text too
+    text = clean_unicode(text)

    if text and len(text.split()) >= 3:
        train_on_message(text)

View File

@@ -4,6 +4,7 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
from model.brain_state import model, tokenizer, DEVICE, loss_fn
from model.brainmap import update_brainmap
from context.context import add_to_context, get_recent_context
+from utils.unicleaner import clean_unicode

LOSS_FILE = "data/logs/loss.log"
VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
@@ -34,9 +35,11 @@ def train_on_message(text: str, source: str = "user"):
    try:
        model.train()
-        context_texts = get_recent_context(10)
+        # Here's the important change:
+        # 🧹 Clean up the incoming text
+        text = clean_unicode(text)
+        context_texts = get_recent_context(10)
        augmented_text = "<start> " + " ".join(context_texts + [text]) + " <end>"
        tokens = tokenizer.tokenize(augmented_text)
@@ -65,10 +68,12 @@ def train_on_message(text: str, source: str = "user"):
        opt.zero_grad()
        loss.backward()
        opt.step()
        scheduler.step()
        log_loss(loss.item())
        log_vocab_growth()
        add_to_context(text, source=source)
        update_brainmap(augmented_text.split())
    finally:
        expand_lock.release()

View File

@@ -3,6 +3,7 @@ import asyncio
from model.trainer import train_on_message
from model.scheduler import set_next_action
from reader.filter import is_valid_line
+from utils.unicleaner import clean_unicode
import json

BOOK_DIR = "data/books"
@@ -48,7 +49,8 @@ async def read_books_forever():
            if not line:
                if len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                    train_on_message(paragraph.strip(), source="book")
+                    cleaned_paragraph = clean_unicode(paragraph.strip())
+                    train_on_message(cleaned_paragraph, source="book")
                    paragraph = ""
                    await asyncio.sleep(READ_DELAY)
                    set_next_action(READ_DELAY, "Reading")
@@ -60,6 +62,7 @@ async def read_books_forever():
        # train last paragraph if any
        if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-            train_on_message(paragraph.strip(), source="book")
+            cleaned_paragraph = clean_unicode(paragraph.strip())
+            train_on_message(cleaned_paragraph, source="book")
        await asyncio.sleep(READ_DELAY)
        set_next_action(READ_DELAY, "Reading")

utils/unicleaner.py (new file, 39 lines added)
View File

@@ -0,0 +1,39 @@
import unicodedata
import re

# Precompiled regexes (fast)
RE_SPACES = re.compile(r"\s+")
RE_CONTROL_CHARS = re.compile(r"[\u0000-\u001F\u007F-\u009F]")

RE_QUOTES = {
    '\u2018': "'",  # Left single quotation mark
    '\u2019': "'",  # Right single quotation mark
    '\u201C': '"',  # Left double quotation mark
    '\u201D': '"',  # Right double quotation mark
    '\u201E': '"',  # Double low-9 quotation mark
    '\u201F': '"',  # Double high-reversed-9 quotation mark
}

RE_DASHES = {
    '\u2013': '-',  # En dash
    '\u2014': '-',  # Em dash
}

def clean_unicode(text: str) -> str:
    # 1. Replace fancy quotes
    for bad, good in RE_QUOTES.items():
        text = text.replace(bad, good)

    # 2. Replace fancy dashes
    for bad, good in RE_DASHES.items():
        text = text.replace(bad, good)

    # 3. Remove BOMs and stray control characters
    text = RE_CONTROL_CHARS.sub('', text)

    # 4. Normalize Unicode (NFKC collapses fullwidth, fractions, weird numerics)
    text = unicodedata.normalize('NFKC', text)

    # 5. Collapse all whitespace to a single space
    text = RE_SPACES.sub(' ', text)

    # 6. Strip leading/trailing whitespace
    return text.strip()
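
A quick check of what clean_unicode produces for typical scraped text; the sample string is made up for the example:

from utils.unicleaner import clean_unicode

# Curly quotes, an em dash, an ideographic space, and messy runs of whitespace.
sample = "\u201cHello\u201d \u2014   it\u2019s   fine\u3000here"
print(clean_unicode(sample))
# "Hello" - it's fine here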