Added a Unicode cleaner to everything we ingest material from

parent 97b43f832b
commit 60ca746420
@@ -1,6 +1,7 @@
 import os
 import json
 from collections import defaultdict
+from utils.unicleaner import clean_unicode

 BRAINMAP_FILE = "data/memory/brainmap.json"
@@ -37,3 +38,22 @@ def update_brainmap(words):

 def get_brainmap():
     return brain_map
+
+
+def fix_brainmap(brainmap: dict) -> dict:
+    cleaned_brainmap = {}
+
+    for word, value in brainmap.items():
+        cleaned_word = clean_unicode(word.strip())
+
+        # Skip bad entries
+        if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
+            continue
+
+        # Merge duplicates (case-insensitive optional)
+        if cleaned_word in cleaned_brainmap:
+            cleaned_brainmap[cleaned_word] += value
+        else:
+            cleaned_brainmap[cleaned_word] = value
+
+    return cleaned_brainmap
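For context, fix_brainmap() rewrites an already-saved word map through the same cleaner that new input now passes through. Below is a minimal sketch of running it as a one-off maintenance pass over the saved brainmap file; the load/save wiring is an assumption for illustration (it is not part of this commit), and it assumes these hunks live in model.brainmap, the module the trainer imports update_brainmap from:

import json

from model.brainmap import BRAINMAP_FILE, fix_brainmap

def clean_saved_brainmap():
    # Load the existing map, push every key through clean_unicode via
    # fix_brainmap(), and write the merged result back out.
    with open(BRAINMAP_FILE, "r", encoding="utf-8") as f:
        brainmap = json.load(f)

    cleaned = fix_brainmap(brainmap)

    with open(BRAINMAP_FILE, "w", encoding="utf-8") as f:
        json.dump(cleaned, f, ensure_ascii=False, indent=2)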
@@ -23,7 +23,7 @@ def expand_model_if_needed():
     old_vocab_size = model.head.out_features

     if current_vocab_size <= old_vocab_size:
-        return
+        return False  # No expansion needed

     # print(f"[Expand] Expanding model from {old_vocab_size} -> {current_vocab_size}")
@@ -42,6 +42,8 @@ def expand_model_if_needed():
     model = new_model
     optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

     _last_expansion_time = time.time()

     # print("[Expand] Expansion complete.")
+
+    return True  # <<< tell trainer we expanded
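The new boolean return lets callers tell "the head grew" apart from "nothing to do". A minimal sketch of how a caller might react to it; the surrounding trainer logic here is illustrative only, not the project's actual handling:

from model.dynamic_expand import expand_model_if_needed

def prepare_for_training_step() -> bool:
    # expand_model_if_needed() now returns True when the vocab head was
    # rebuilt (and a fresh optimizer was created inside dynamic_expand),
    # and False when the current head is already large enough.
    expanded = expand_model_if_needed()
    if expanded:
        print("[Trainer] Vocabulary expanded before this step")
    return expanded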
@@ -2,6 +2,7 @@ import torch
 from model.brain import model, tokenizer, DEVICE
 from model.trainer import train_on_message
 from model.dynamic_expand import expand_model_if_needed
+from utils.unicleaner import clean_unicode


 def simulate_conversation():
@@ -9,7 +10,7 @@ def simulate_conversation():

     model.eval()
     seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
-    seed = seed[:, -128:]  # Safety clamp
+    seed = seed[:, -128:]
     output = model(seed)

     preds = torch.argmax(output, dim=-1).squeeze().tolist()
@@ -17,5 +18,9 @@ def simulate_conversation():
         preds = [preds]

     text = tokenizer.detokenize(preds)
+
+    # 🧹 Clean the generated text too
+    text = clean_unicode(text)
+
     if text and len(text.split()) >= 3:
         train_on_message(text)
@@ -4,6 +4,7 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
 from model.brainmap import update_brainmap
 from context.context import add_to_context, get_recent_context
+from utils.unicleaner import clean_unicode

 LOSS_FILE = "data/logs/loss.log"
 VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
@@ -34,9 +35,11 @@ def train_on_message(text: str, source: str = "user"):

     try:
         model.train()
-        context_texts = get_recent_context(10)

-        # Here's the important change:
+        # 🧹 Clean up the incoming text
+        text = clean_unicode(text)
+
+        context_texts = get_recent_context(10)
         augmented_text = "<start> " + " ".join(context_texts + [text]) + " <end>"

         tokens = tokenizer.tokenize(augmented_text)
@@ -65,10 +68,12 @@ def train_on_message(text: str, source: str = "user"):
         opt.zero_grad()
         loss.backward()
         opt.step()
+        scheduler.step()
+
         log_loss(loss.item())
         log_vocab_growth()
         add_to_context(text, source=source)
         update_brainmap(augmented_text.split())

     finally:
         expand_lock.release()
@@ -3,6 +3,7 @@ import asyncio
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
+from utils.unicleaner import clean_unicode
 import json

 BOOK_DIR = "data/books"
@@ -48,7 +49,8 @@ async def read_books_forever():

            if not line:
                if len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                    train_on_message(paragraph.strip(), source="book")
+                    cleaned_paragraph = clean_unicode(paragraph.strip())
+                    train_on_message(cleaned_paragraph, source="book")
                    paragraph = ""
                    await asyncio.sleep(READ_DELAY)
                    set_next_action(READ_DELAY, "Reading")
@@ -60,6 +62,7 @@ async def read_books_forever():

        # train last paragraph if any
        if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-            train_on_message(paragraph.strip(), source="book")
+            cleaned_paragraph = clean_unicode(paragraph.strip())
+            train_on_message(cleaned_paragraph, source="book")
            await asyncio.sleep(READ_DELAY)
            set_next_action(READ_DELAY, "Reading")
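Both book-reader call sites follow the same pattern: assemble a full paragraph, clean it once, then train on it. Below is a self-contained sketch of that pattern outside the async reader, with handle_paragraph() as a hypothetical stand-in for train_on_message(..., source="book") and an assumed minimum length:

from utils.unicleaner import clean_unicode

PARAGRAPH_MIN_LENGTH = 80  # assumption for illustration; the real constant lives in the reader module

def handle_paragraph(text: str) -> None:
    # Stand-in for train_on_message(text, source="book").
    print(f"train on: {text[:60]}...")

def feed_paragraphs(lines):
    paragraph = ""
    for line in lines:
        if not line.strip():
            # A blank line ends the paragraph: clean once, then hand it off.
            if len(paragraph) > PARAGRAPH_MIN_LENGTH:
                handle_paragraph(clean_unicode(paragraph.strip()))
            paragraph = ""
        else:
            paragraph += " " + line.strip()
    # Flush the final paragraph, mirroring the end-of-file branch above.
    if len(paragraph) > PARAGRAPH_MIN_LENGTH:
        handle_paragraph(clean_unicode(paragraph.strip()))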
utils/unicleaner.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import unicodedata
+import re
+
+# Precompiled regexes (fast)
+RE_SPACES = re.compile(r"\s+")
+RE_CONTROL_CHARS = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
+RE_QUOTES = {
+    '\u2018': "'",  # Left single quotation mark
+    '\u2019': "'",  # Right single quotation mark
+    '\u201C': '"',  # Left double quotation mark
+    '\u201D': '"',  # Right double quotation mark
+    '\u201E': '"',  # Double low-9 quotation mark
+    '\u201F': '"',  # Double high-reversed-9 quotation mark
+}
+RE_DASHES = {
+    '\u2013': '-',  # En dash
+    '\u2014': '-',  # Em dash
+}
+
+def clean_unicode(text: str) -> str:
+    # 1. Replace fancy quotes
+    for bad, good in RE_QUOTES.items():
+        text = text.replace(bad, good)
+
+    # 2. Replace fancy dashes
+    for bad, good in RE_DASHES.items():
+        text = text.replace(bad, good)
+
+    # 3. Remove BOMs and stray control characters
+    text = RE_CONTROL_CHARS.sub('', text)
+
+    # 4. Normalize Unicode (NFKC collapses fullwidth, fractions, weird numerics)
+    text = unicodedata.normalize('NFKC', text)
+
+    # 5. Collapse all whitespace to a single space
+    text = RE_SPACES.sub(' ', text)
+
+    # 6. Strip leading/trailing whitespace
+    return text.strip()
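A quick illustration of what clean_unicode() does to typical scraped text (curly quotes, an em dash, a non-breaking space, stray CR/LF); the expected output assumes the implementation above:

from utils.unicleaner import clean_unicode

sample = "\u201cSmart quotes\u201d \u2014 it\u2019s\u00a0a   test\r\n"
print(clean_unicode(sample))
# prints: "Smart quotes" - it's a test

One behaviour worth noting: the control-character pass runs before the whitespace collapse and also matches \n and \t, so embedded newlines are deleted outright rather than turned into spaces; callers cleaning multi-line text may want to join lines with spaces first.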