Fixed an error in the tokenizer.

Updated brain.py to sample and score a tad more aggressively, and added a brainmap cleanup function to cleanup.py.
Dani 2025-04-27 17:03:26 -04:00
parent 4d4b39b4c7
commit 0674d51471
3 changed files with 61 additions and 10 deletions

brain.py

@@ -11,12 +11,20 @@ recent_dreams = []
 @torch.inference_mode()
-def generate_response(max_tokens: int = 50):
+def generate_response(max_tokens: int = 50, temperature: float = 1.0):
     model.eval()
     input_ids = torch.tensor([tokenizer.token_to_id("<start>")], device=DEVICE).unsqueeze(0)
     generated = []
+
+    forbidden_tokens = {
+        tokenizer.token_to_id("<unk>"),
+        tokenizer.token_to_id("<start>"),
+        tokenizer.token_to_id("<pad>"),
+        tokenizer.token_to_id("<end>"),
+        tokenizer.token_to_id("<sep>"),
+    }
 
     for _ in range(max_tokens):
         output = model(input_ids)
         if torch.isnan(output).any():
@@ -24,29 +32,38 @@ def generate_response(max_tokens: int = 50):
             return "..."
 
         next_token_logits = output[:, -1, :]
-        next_token = torch.argmax(next_token_logits, dim=-1)
+        probs = torch.softmax(next_token_logits / temperature, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1)
+
+        # Resample if forbidden token
+        while next_token.item() in forbidden_tokens:
+            next_token = torch.multinomial(probs, num_samples=1)
 
         token_id = next_token.item()
 
+        # If she outputs <end> token, stop generation
         if tokenizer.reverse_vocab.get(token_id, "") == "<end>":
             break
 
         generated.append(token_id)
-        next_token = next_token.unsqueeze(0)
-        input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
+        input_ids = torch.cat([input_ids, next_token], dim=1)
 
     return tokenizer.detokenize(generated)
 
 
 def score_sentence(sentence: str) -> float:
     words = sentence.strip().split()
+    unique = set(words)
     length = len(words)
-    diversity = len(set(words)) / (length + 1)
-    if length < 4:
+    unique_ratio = len(unique) / (length + 1)
+
+    if length < 5:
         return 0.0
-    return diversity * min(length, 20)
+
+    if unique_ratio < 0.5:
+        return 0.0
+
+    return unique_ratio * min(length / 20.0, 1.0)
 
 
 def daydream():
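
For reference, here is the new decoding step in isolation: temperature-scaled softmax followed by rejection sampling of special tokens. This is a minimal sketch; the vocabulary size, logits, and forbidden ids below are toy values, not the project's real ones.

import torch

# Toy stand-ins for illustration; the real loop uses the model's logits
# and the tokenizer's special-token ids.
vocab_size = 10
logits = torch.randn(1, vocab_size)   # plays the role of output[:, -1, :]
forbidden = {0, 1}                    # e.g. <pad>, <unk>
temperature = 0.8                     # <1.0 sharpens, >1.0 flattens the distribution

probs = torch.softmax(logits / temperature, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)  # shape (1, 1)

# Resample until an allowed token comes up, as in the while-loop above.
while next_token.item() in forbidden:
    next_token = torch.multinomial(probs, num_samples=1)

print(next_token.item())

One caveat of resampling in a loop: if the forbidden ids ever hold nearly all of the probability mass, the loop can spin for a long time. Zeroing those entries of probs before the multinomial draw would sidestep that.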
@@ -65,7 +82,7 @@ def daydream():
     sentence = tokenizer.detokenize(dream)
     score = score_sentence(sentence)
 
-    if score > 0.45:
+    if score > 0.5:
         save_dream(sentence, score)
         record_to_journal(sentence)
         train_on_message(sentence)
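
Worked numbers for the retuned scorer (the function body is copied from the diff above; the sample sentences are made up). Since score = unique_ratio * min(length / 20, 1), the new score > 0.5 gate in daydream() means even a fully unique sentence needs at least 11 words to be kept:

def score_sentence(sentence: str) -> float:
    words = sentence.strip().split()
    unique = set(words)
    length = len(words)
    unique_ratio = len(unique) / (length + 1)
    if length < 5:
        return 0.0
    if unique_ratio < 0.5:
        return 0.0
    return unique_ratio * min(length / 20.0, 1.0)

# 10 distinct words: (10/11) * (10/20) ≈ 0.45 -- rejected by the 0.5 gate
print(score_sentence("the quick brown fox jumps over one lazy sleeping dog"))
# 11 distinct words: (11/12) * (11/20) ≈ 0.504 -- barely accepted
print(score_sentence("the quick brown fox jumps over one lazy dog every day"))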

cleanup.py

@@ -5,6 +5,7 @@ import time
 from model.tokenizer import VOCAB_PATH
 from model.memory import DREAM_LOG_PATH
 from context.context import CONTEXT_FILE
+from model.brainmap import load_brainmap, save_brainmap
 
 CLEANUP_LOG = "data/logs/cleanup.log"
@@ -73,7 +74,34 @@ def cleanup_context():
     log(f"Trimmed context memory from {len(context)} → {len(filtered)}")
 
 
+def cleanup_brainmap(min_neighbors=2, min_strength=2):
+    load_brainmap()
+    from model.brainmap import brainmap  # after load
+
+    to_delete = []
+
+    for word, neighbors in brainmap.items():
+        # Remove weak neighbors
+        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
+        for n in weak_neighbors:
+            del neighbors[n]
+
+        # Mark lonely words
+        if len(neighbors) < min_neighbors:
+            to_delete.append(word)
+
+    for word in to_delete:
+        del brainmap[word]
+
+    save_brainmap()
+
+    if to_delete:
+        log(f"Pruned {len(to_delete)} weak brainmap words")
+
+
 def full_cleanup():
     cleanup_vocab()
     cleanup_dreams()
     cleanup_context()
+    cleanup_brainmap()
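
The same pruning pass run standalone on a toy map, so the effect is visible (the load/save plumbing and the module-global brainmap are replaced by a local dict here):

brainmap = {
    "cat":   {"dog": 5, "mouse": 3, "fish": 1},  # "fish" edge is weak (count < 2)
    "dog":   {"cat": 5, "mouse": 2},
    "mouse": {"cat": 3, "dog": 2},
    "blip":  {"zorp": 1},                        # every edge weak -> word is pruned
}
min_neighbors, min_strength = 2, 2

to_delete = []
for word, neighbors in brainmap.items():
    # Drop weak edges first...
    for n in [n for n, count in neighbors.items() if count < min_strength]:
        del neighbors[n]
    # ...then mark words left with too few neighbors.
    if len(neighbors) < min_neighbors:
        to_delete.append(word)

for word in to_delete:
    del brainmap[word]

print(brainmap)
# {'cat': {'dog': 5, 'mouse': 3}, 'dog': {'cat': 5, 'mouse': 2}, 'mouse': {'cat': 3, 'dog': 2}}

With the defaults, a word survives only if at least two of its edges have been seen at least twice each.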

model/tokenizer.py

@@ -25,7 +25,7 @@ class Tokenizer:
         self.next_id = 5
 
     def tokenize(self, text):
-        text = clean_unicode(text)  # 🚨 Always clean incoming text
+        text = clean_unicode(text)
         words = re.findall(r"\b\w+\b", text.lower())
         tokens = []
         for word in words:
@@ -41,3 +41,9 @@ class Tokenizer:
         if isinstance(tokens, int):
             tokens = [tokens]
         return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)
+
+    def token_to_id(self, token: str) -> int:
+        return self.vocab.get(token, self.vocab["<unk>"])
+
+    def id_to_token(self, idx: int) -> str:
+        return self.reverse_vocab.get(idx, "<unk>")
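
These two helpers are what brain.py's tokenizer.token_to_id("<start>") calls rely on, and both fail soft to <unk> instead of raising on unknown input. A minimal sketch with a stub vocab (the diff's self.next_id = 5 suggests ids 0-4 belong to the special tokens, but the exact mapping here is an assumption):

class Tokenizer:
    def __init__(self):
        # Hypothetical special-token ids; only the 0-4 range is implied by the diff.
        self.vocab = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3, "<sep>": 4}
        self.reverse_vocab = {v: k for k, v in self.vocab.items()}

    def token_to_id(self, token: str) -> int:
        return self.vocab.get(token, self.vocab["<unk>"])

    def id_to_token(self, idx: int) -> str:
        return self.reverse_vocab.get(idx, "<unk>")

t = Tokenizer()
print(t.token_to_id("<start>"))      # 2
print(t.token_to_id("nonexistent"))  # 1 -- falls back to <unk> instead of KeyError
print(t.id_to_token(999))            # "<unk>" -- unknown ids fail soft too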