Reverted some changes due to the unicode cleaner being moved to the tokenizer.
parent 60ca746420
commit 3a77b5db32
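For context on the change: unicode cleanup now happens once inside Tokenizer.tokenize(), so callers such as the trainer and the book reader no longer clean text themselves. A minimal, self-contained sketch of that pattern, using simplified stand-ins rather than the project's actual modules:

import re

def clean_unicode(text: str) -> str:
    # Stand-in cleaner; the real one lives in utils/unicleaner.py.
    replacements = {'\u2018': "'", '\u2019': "'", '\u201c': '"', '\u201d': '"', '\u2014': '-'}
    for bad, good in replacements.items():
        text = text.replace(bad, good)
    return text

class Tokenizer:
    def tokenize(self, text):
        text = clean_unicode(text)  # cleaning now happens once, inside the tokenizer
        return re.findall(r"\b\w+\b", text.lower())

tokenizer = Tokenizer()

def train_on_message(text, source="user"):
    # Before this commit the trainer (and the book reader) called clean_unicode()
    # here themselves; now they pass raw text through and rely on tokenize().
    tokens = tokenizer.tokenize(text)
    print(source, tokens)

train_on_message("\u201cCurly quotes\u201d \u2014 and an em dash")
# prints: user ['curly', 'quotes', 'and', 'an', 'em', 'dash']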
@@ -2,25 +2,29 @@ import torch
from model.brain import model, tokenizer, DEVICE
from model.trainer import train_on_message
from model.dynamic_expand import expand_model_if_needed
from utils.unicleaner import clean_unicode


def simulate_conversation():
    expand_model_if_needed()

    model.eval()
    seed = torch.randint(0, tokenizer.next_id, (1, 5), device=DEVICE)
    seed = seed[:, -128:]

    max_token_id = model.head.out_features - 1
    if max_token_id < 1:
        return  # Safeguard if model is still too small

    seed = torch.randint(0, max_token_id + 1, (1, 5), device=DEVICE)
    seed = seed[:, -128:]  # Clamp sequence length

    output = model(seed)

    preds = torch.argmax(output, dim=-1).squeeze().tolist()
    if isinstance(preds, int):
        preds = [preds]

    # 🛡 Clamp predictions too
    preds = [min(max(p, 0), max_token_id) for p in preds]

    text = tokenizer.detokenize(preds)

    # 🧹 Clean the generated text too
    text = clean_unicode(text)

    if text and len(text.split()) >= 3:
        train_on_message(text)
@@ -1,6 +1,7 @@
import re
import os
import json
from utils.unicleaner import clean_unicode

VOCAB_PATH = "data/memory/vocab.json"

@@ -24,6 +25,7 @@ class Tokenizer:
        self.next_id = 4

    def tokenize(self, text):
        text = clean_unicode(text)  # 🚨 Always clean incoming text
        words = re.findall(r"\b\w+\b", text.lower())
        tokens = []
        for word in words:
@@ -4,7 +4,6 @@ from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, g
from model.brain_state import model, tokenizer, DEVICE, loss_fn
from model.brainmap import update_brainmap
from context.context import add_to_context, get_recent_context
from utils.unicleaner import clean_unicode

LOSS_FILE = "data/logs/loss.log"
VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"

@@ -36,9 +35,6 @@ def train_on_message(text: str, source: str = "user"):
    try:
        model.train()

        # 🧹 Clean up the incoming text
        text = clean_unicode(text)

        context_texts = get_recent_context(10)
        augmented_text = "<start> " + " ".join(context_texts + [text]) + " <end>"
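The train_on_message hunk above keeps the context augmentation step; a small runnable sketch of that construction, assuming get_recent_context() simply returns the last N stored messages (stand-in storage, not the project's context module):

from collections import deque

_context = deque(maxlen=100)   # stand-in for context.context storage

def add_to_context(text):
    _context.append(text)

def get_recent_context(n):
    return list(_context)[-n:]

def build_training_text(text):
    # Same construction as in train_on_message(): recent messages are joined
    # ahead of the new text and wrapped in <start>/<end> markers.
    context_texts = get_recent_context(10)
    return "<start> " + " ".join(context_texts + [text]) + " <end>"

add_to_context("hello there")
print(build_training_text("how are you"))
# prints: <start> hello there how are you <end>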
@@ -3,7 +3,6 @@ import asyncio
from model.trainer import train_on_message
from model.scheduler import set_next_action
from reader.filter import is_valid_line
from utils.unicleaner import clean_unicode
import json

BOOK_DIR = "data/books"

@@ -49,8 +48,7 @@ async def read_books_forever():

            if not line:
                if len(paragraph) > PARAGRAPH_MIN_LENGTH:
                    cleaned_paragraph = clean_unicode(paragraph.strip())
                    train_on_message(cleaned_paragraph, source="book")
                    train_on_message(paragraph.strip(), source="book")
                    paragraph = ""
                    await asyncio.sleep(READ_DELAY)
                    set_next_action(READ_DELAY, "Reading")

@@ -62,7 +60,6 @@ async def read_books_forever():

        # train last paragraph if any
        if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
            cleaned_paragraph = clean_unicode(paragraph.strip())
            train_on_message(cleaned_paragraph, source="book")
            train_on_message(paragraph.strip(), source="book")
            await asyncio.sleep(READ_DELAY)
            set_next_action(READ_DELAY, "Reading")
@@ -17,6 +17,7 @@ RE_DASHES = {
    '\u2014': '-',  # Em dash
}


def clean_unicode(text: str) -> str:
    # 1. Replace fancy quotes
    for bad, good in RE_QUOTES.items():
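The hunk above only shows the tail of RE_DASHES and the first step of clean_unicode(). A minimal sketch of how the cleaner plausibly fits together; the RE_QUOTES entries and the second step are assumptions, not the actual contents of utils/unicleaner.py:

RE_QUOTES = {
    '\u2018': "'",   # Left single quote (assumed entry)
    '\u2019': "'",   # Right single quote (assumed entry)
    '\u201c': '"',   # Left double quote (assumed entry)
    '\u201d': '"',   # Right double quote (assumed entry)
}

RE_DASHES = {
    '\u2013': '-',   # En dash (assumed entry)
    '\u2014': '-',   # Em dash (shown in the diff)
}


def clean_unicode(text: str) -> str:
    # 1. Replace fancy quotes
    for bad, good in RE_QUOTES.items():
        text = text.replace(bad, good)
    # 2. Replace dashes (assumed second step)
    for bad, good in RE_DASHES.items():
        text = text.replace(bad, good)
    return text


print(clean_unicode("\u201cIt\u2019s fine\u201d \u2014 mostly"))
# prints: "It's fine" - mostly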