Reverted some changes due to the unicode cleaner being moved to the tokenizer.

commit 3a77b5db32 (parent 60ca746420)
Author: Dani
Date: 2025-04-27 15:38:58 -04:00
5 changed files with 16 additions and 16 deletions
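In short: cleaning now happens exactly once, inside Tokenizer.tokenize, so the trainer, the reader, and the conversation simulator below all drop their own clean_unicode calls. Below is a minimal, self-contained sketch of the resulting flow; this clean_unicode is a stand-in stub, not the repository's real table-driven implementation (part of which appears in the last file of this diff):

    import re

    def clean_unicode(text: str) -> str:
        # Stand-in stub for utils.unicleaner.clean_unicode.
        return text.replace('\u201c', '"').replace('\u201d', '"').replace('\u2014', '-')

    class Tokenizer:
        def tokenize(self, text):
            text = clean_unicode(text)  # the one place cleaning now happens
            return re.findall(r"\b\w+\b", text.lower())

    # Call sites (trainer, reader, simulator) now pass raw text straight through:
    print(Tokenizer().tokenize('\u201cHello\u201d \u2014 world'))  # -> ['hello', 'world']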

View File

@@ -2,25 +2,29 @@ import torch
 from model.brain import model, tokenizer, DEVICE
 from model.trainer import train_on_message
 from model.dynamic_expand import expand_model_if_needed
-from utils.unicleaner import clean_unicode


 def simulate_conversation():
     expand_model_if_needed()
     model.eval()

-    seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
-    seed = seed[:, -128:]
+    max_token_id = model.head.out_features - 1
+    if max_token_id < 1:
+        return  # Safeguard if model is still too small
+
+    seed = torch.randint(0, max_token_id + 1, (1, 5), device=DEVICE)
+    seed = seed[:, -128:]  # Clamp sequence length
+
     output = model(seed)
     preds = torch.argmax(output, dim=-1).squeeze().tolist()
     if isinstance(preds, int):
         preds = [preds]

+    # 🛡 Clamp predictions too
+    preds = [min(max(p, 0), max_token_id) for p in preds]
+
     text = tokenizer.detokenize(preds)
-
-    # 🧹 Clean the generated text too
-    text = clean_unicode(text)

     if text and len(text.split()) >= 3:
         train_on_message(text)
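Besides the revert, this hunk restores the id safeguard: both the random seed and the argmax predictions are clamped to the output head's width, which is the live vocabulary size under dynamic expansion. A small illustrative sketch (the Linear layer and the numbers are made up):

    import torch
    import torch.nn as nn

    # Why the guard keys off model.head.out_features: after a dynamic
    # expansion, the head's width is the authoritative vocab size, so any
    # id >= out_features would be out of range for the embedding and head.
    head = nn.Linear(64, 10)               # pretend the live vocab size is 10
    max_token_id = head.out_features - 1   # 9

    seed = torch.randint(0, max_token_id + 1, (1, 5))   # ids 0..9 only
    preds = [3, 12, -1]                                  # stale or out-of-range ids
    preds = [min(max(p, 0), max_token_id) for p in preds]
    print(seed.shape, preds)               # torch.Size([1, 5]) [3, 9, 0]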

View File

@@ -1,6 +1,7 @@
 import re
 import os
 import json
+from utils.unicleaner import clean_unicode

 VOCAB_PATH = "data/memory/vocab.json"
@@ -24,6 +25,7 @@ class Tokenizer:
         self.next_id = 4

     def tokenize(self, text):
+        text = clean_unicode(text)  # 🚨 Always clean incoming text
         words = re.findall(r"\b\w+\b", text.lower())
         tokens = []
         for word in words:

View File

@@ -4,7 +4,6 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
 from model.brainmap import update_brainmap
 from context.context import add_to_context, get_recent_context
-from utils.unicleaner import clean_unicode

 LOSS_FILE = "data/logs/loss.log"
 VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
@@ -36,9 +35,6 @@ def train_on_message(text: str, source: str = "user"):
     try:
         model.train()

-        # 🧹 Clean up the incoming text
-        text = clean_unicode(text)
-
         context_texts = get_recent_context(10)
         augmented_text = "<start> " + " ".join(context_texts + [text]) + " <end>"

View File

@@ -3,7 +3,6 @@ import asyncio
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
-from utils.unicleaner import clean_unicode
 import json

 BOOK_DIR = "data/books"
@@ -49,8 +48,7 @@ async def read_books_forever():
         if not line:
             if len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                cleaned_paragraph = clean_unicode(paragraph.strip())
-                train_on_message(cleaned_paragraph, source="book")
+                train_on_message(paragraph.strip(), source="book")
                 paragraph = ""
                 await asyncio.sleep(READ_DELAY)
                 set_next_action(READ_DELAY, "Reading")
@@ -62,7 +60,6 @@ async def read_books_forever():
     # train last paragraph if any
     if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-        cleaned_paragraph = clean_unicode(paragraph.strip())
-        train_on_message(cleaned_paragraph, source="book")
+        train_on_message(paragraph.strip(), source="book")
     await asyncio.sleep(READ_DELAY)
     set_next_action(READ_DELAY, "Reading")

View File

@@ -17,6 +17,7 @@ RE_DASHES = {
     '\u2014': '-',  # Em dash
 }

+
 def clean_unicode(text: str) -> str:
     # 1. Replace fancy quotes
     for bad, good in RE_QUOTES.items():
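Only step 1 of clean_unicode is visible above. For context, a minimal sketch of how the function plausibly continues; the RE_QUOTES keys, the dash loop, and the final NFKC normalization are assumptions, not the repository's actual code:

    import unicodedata

    RE_QUOTES = {'\u201c': '"', '\u201d': '"', '\u2018': "'", '\u2019': "'"}  # assumed keys
    RE_DASHES = {'\u2013': '-', '\u2014': '-'}  # En dash, Em dash (as in the diff)

    def clean_unicode(text: str) -> str:
        # 1. Replace fancy quotes (shown in the diff)
        for bad, good in RE_QUOTES.items():
            text = text.replace(bad, good)
        # 2. Replace dashes (assumed to mirror step 1)
        for bad, good in RE_DASHES.items():
            text = text.replace(bad, good)
        # 3. Fold anything else to a compatibility form (assumption)
        return unicodedata.normalize("NFKC", text)

    print(clean_unicode('\u201cHi\u201d \u2014 ok'))  # "Hi" - ok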