Added a unicode cleaner to everything we get material from
parent 97b43f832b
commit 60ca746420
@@ -1,6 +1,7 @@
 import os
 import json
 from collections import defaultdict
+from utils.unicleaner import clean_unicode

 BRAINMAP_FILE = "data/memory/brainmap.json"
@@ -37,3 +38,22 @@ def update_brainmap(words):

 def get_brainmap():
     return brain_map
+
+
+def fix_brainmap(brainmap: dict) -> dict:
+    cleaned_brainmap = {}
+
+    for word, value in brainmap.items():
+        cleaned_word = clean_unicode(word.strip())
+
+        # Skip bad entries
+        if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
+            continue
+
+        # Merge duplicates (case-insensitive optional)
+        if cleaned_word in cleaned_brainmap:
+            cleaned_brainmap[cleaned_word] += value
+        else:
+            cleaned_brainmap[cleaned_word] = value
+
+    return cleaned_brainmap
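fix_brainmap only returns a cleaned copy of whatever mapping it is given; it drops junk keys and merges entries whose keys collapse to the same cleaned word. A minimal sketch of a one-off cleanup pass over the stored map, assuming this first file is model/brainmap.py (the trainer below imports update_brainmap from there) and that the JSON at BRAINMAP_FILE maps each word to a value that supports += (a count or a list); the load/save code here is illustrative and not part of this commit:

import json

from model.brainmap import BRAINMAP_FILE, fix_brainmap

# Illustrative cleanup pass, not part of this commit.
with open(BRAINMAP_FILE, "r", encoding="utf-8") as f:
    raw_map = json.load(f)

cleaned = fix_brainmap(raw_map)  # drop junk keys, merge duplicates

with open(BRAINMAP_FILE, "w", encoding="utf-8") as f:
    json.dump(cleaned, f, ensure_ascii=False, indent=2)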
@@ -23,7 +23,7 @@ def expand_model_if_needed():
     old_vocab_size = model.head.out_features

     if current_vocab_size <= old_vocab_size:
-        return
+        return False  # No expansion needed

     # print(f"[Expand] Expanding model from {old_vocab_size} -> {current_vocab_size}")
@@ -42,6 +42,8 @@ def expand_model_if_needed():
     model = new_model
     optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

     _last_expansion_time = time.time()

     # print("[Expand] Expansion complete.")
+    return True  # <<< tell trainer we expanded
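With expand_model_if_needed() now returning False when nothing changed and True after an expansion, callers can react only when the model was actually rebuilt. A hedged sketch of that caller-side pattern (the reaction shown is illustrative; this commit does not include the trainer-side handling):

from model.dynamic_expand import expand_model_if_needed

if expand_model_if_needed():
    # Hypothetical reaction: refresh any local references to the model or
    # optimizer, since expansion replaced both objects.
    print("[Trainer] Vocab expanded; refreshing model references")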
@@ -2,6 +2,7 @@ import torch
 from model.brain import model, tokenizer, DEVICE
 from model.trainer import train_on_message
 from model.dynamic_expand import expand_model_if_needed
+from utils.unicleaner import clean_unicode


 def simulate_conversation():
@@ -9,7 +10,7 @@ def simulate_conversation():
     model.eval()
     seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
-    seed = seed[:, -128:]  # Safety clamp
+    seed = seed[:, -128:]
     output = model(seed)

     preds = torch.argmax(output, dim=-1).squeeze().tolist()
@@ -17,5 +18,9 @@ def simulate_conversation():
         preds = [preds]

     text = tokenizer.detokenize(preds)
+
+    # 🧹 Clean the generated text too
+    text = clean_unicode(text)
+
     if text and len(text.split()) >= 3:
         train_on_message(text)
@@ -4,6 +4,7 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
 from model.brainmap import update_brainmap
 from context.context import add_to_context, get_recent_context
+from utils.unicleaner import clean_unicode

 LOSS_FILE = "data/logs/loss.log"
 VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
@@ -34,9 +35,11 @@ def train_on_message(text: str, source: str = "user"):
     try:
         model.train()
-        context_texts = get_recent_context(10)
+
+        # Here's the important change:
+        # 🧹 Clean up the incoming text
+        text = clean_unicode(text)
+
+        context_texts = get_recent_context(10)
         augmented_text = "<start> " + " ".join(context_texts + [text]) + " <end>"

         tokens = tokenizer.tokenize(augmented_text)
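The cleaned text is what gets spliced into the <start> ... <end> training window together with the recent context. A small illustration of the string being built (the context values are made up, and building it outside the trainer is purely for demonstration):

from utils.unicleaner import clean_unicode

context_texts = ["earlier message one", "earlier message two"]  # stand-in for get_recent_context(10)
text = clean_unicode("It\u2019s  a \u201Ctest\u201D")
augmented_text = "<start> " + " ".join(context_texts + [text]) + " <end>"
print(augmented_text)
# <start> earlier message one earlier message two It's a "test" <end>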
@@ -65,10 +68,12 @@ def train_on_message(text: str, source: str = "user"):
         opt.zero_grad()
         loss.backward()
         opt.step()
         scheduler.step()

         log_loss(loss.item())
         log_vocab_growth()
         add_to_context(text, source=source)
         update_brainmap(augmented_text.split())

     finally:
         expand_lock.release()
@@ -3,6 +3,7 @@ import asyncio
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
+from utils.unicleaner import clean_unicode
 import json

 BOOK_DIR = "data/books"
@@ -48,7 +49,8 @@ async def read_books_forever():
             if not line:
                 if len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                    train_on_message(paragraph.strip(), source="book")
+                    cleaned_paragraph = clean_unicode(paragraph.strip())
+                    train_on_message(cleaned_paragraph, source="book")
                     paragraph = ""
                     await asyncio.sleep(READ_DELAY)
                     set_next_action(READ_DELAY, "Reading")
@@ -60,6 +62,7 @@ async def read_books_forever():

     # train last paragraph if any
     if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-        train_on_message(paragraph.strip(), source="book")
+        cleaned_paragraph = clean_unicode(paragraph.strip())
+        train_on_message(cleaned_paragraph, source="book")
         await asyncio.sleep(READ_DELAY)
         set_next_action(READ_DELAY, "Reading")
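Both the reader (here) and the trainer (above) now run clean_unicode, so book text is cleaned twice before it is tokenized. That is harmless as long as clean_unicode is idempotent, i.e. cleaning already-clean text changes nothing, which holds for the implementation added below; a quick sanity check, purely illustrative:

from utils.unicleaner import clean_unicode

once = clean_unicode("\u201CHello\u201D \u2013 world")
assert clean_unicode(once) == once  # second pass is a no-op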
utils/unicleaner.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import unicodedata
+import re
+
+# Precompiled regexes (fast)
+RE_SPACES = re.compile(r"\s+")
+RE_CONTROL_CHARS = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
+RE_QUOTES = {
+    '\u2018': "'",  # Left single quotation mark
+    '\u2019': "'",  # Right single quotation mark
+    '\u201C': '"',  # Left double quotation mark
+    '\u201D': '"',  # Right double quotation mark
+    '\u201E': '"',  # Double low-9 quotation mark
+    '\u201F': '"',  # Double high-reversed-9 quotation mark
+}
+RE_DASHES = {
+    '\u2013': '-',  # En dash
+    '\u2014': '-',  # Em dash
+}
+
+def clean_unicode(text: str) -> str:
+    # 1. Replace fancy quotes
+    for bad, good in RE_QUOTES.items():
+        text = text.replace(bad, good)
+
+    # 2. Replace fancy dashes
+    for bad, good in RE_DASHES.items():
+        text = text.replace(bad, good)
+
+    # 3. Remove BOMs and stray control characters
+    text = RE_CONTROL_CHARS.sub('', text)
+
+    # 4. Normalize Unicode (NFKC collapses fullwidth, fractions, weird numerics)
+    text = unicodedata.normalize('NFKC', text)
+
+    # 5. Collapse all whitespace to a single space
+    text = RE_SPACES.sub(' ', text)
+
+    # 6. Strip leading/trailing whitespace
+    return text.strip()
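For reference, this is roughly what the cleaner does to a messy input, following the replacement tables and normalization steps above (the sample string is made up):

from utils.unicleaner import clean_unicode

sample = "\u201CSmart quotes\u201D \u2014 it\u2019s\u00A0  here\u0000"
print(clean_unicode(sample))
# "Smart quotes" - it's here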