From 0674d5147134919b4dd0e98277779b41063f7fb8 Mon Sep 17 00:00:00 2001
From: Dani
Date: Sun, 27 Apr 2025 17:03:26 -0400
Subject: [PATCH] Fixed an error in the tokenizer, updated brain.py to be a
 tad more aggressive, and added a cleanup function for the brainmap to
 cleanup.py

---
 model/brain.py     | 34 +++++++++++++++++++++++++---------
 model/cleanup.py   | 28 ++++++++++++++++++++++++++++
 model/tokenizer.py |  8 +++++++-
 3 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/model/brain.py b/model/brain.py
index b28c8de..de66d79 100644
--- a/model/brain.py
+++ b/model/brain.py
@@ -11,12 +11,19 @@
 recent_dreams = []
 
 
 @torch.inference_mode()
-def generate_response(max_tokens: int = 50):
+def generate_response(max_tokens: int = 50, temperature: float = 1.0):
     model.eval()
     input_ids = torch.tensor([tokenizer.token_to_id("<start>")], device=DEVICE).unsqueeze(0)
     generated = []
 
+    forbidden_tokens = {
+        tokenizer.token_to_id("<pad>"),
+        tokenizer.token_to_id("<unk>"),
+        tokenizer.token_to_id("<start>"),
+        tokenizer.token_to_id("<sep>"),
+    }
+
     for _ in range(max_tokens):
         output = model(input_ids)
         if torch.isnan(output).any():
@@ -24,29 +31,38 @@ def generate_response(max_tokens: int = 50):
             return "..."
 
         next_token_logits = output[:, -1, :]
-        next_token = torch.argmax(next_token_logits, dim=-1)
+        probs = torch.softmax(next_token_logits / temperature, dim=-1)
+
+        next_token = torch.multinomial(probs, num_samples=1)
+
+        # Resample if forbidden token
+        while next_token.item() in forbidden_tokens:
+            next_token = torch.multinomial(probs, num_samples=1)
 
         token_id = next_token.item()
 
-        # If she outputs <end> token, stop generation
         if tokenizer.reverse_vocab.get(token_id, "") == "<end>":
             break
 
         generated.append(token_id)
 
-        next_token = next_token.unsqueeze(0)
-        input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
+        input_ids = torch.cat([input_ids, next_token], dim=1)
 
     return tokenizer.detokenize(generated)
 
 
 def score_sentence(sentence: str) -> float:
     words = sentence.strip().split()
+    unique = set(words)
     length = len(words)
-    diversity = len(set(words)) / (length + 1)
-    if length < 4:
+    unique_ratio = len(unique) / (length + 1)
+
+    if length < 5:
         return 0.0
-    return diversity * min(length, 20)
+    if unique_ratio < 0.5:
+        return 0.0
+
+    return unique_ratio * min(length / 20.0, 1.0)
 
 
 def daydream():
@@ -65,7 +81,7 @@ def daydream():
         sentence = tokenizer.detokenize(dream)
         score = score_sentence(sentence)
 
-        if score > 0.45:
+        if score > 0.5:
             save_dream(sentence, score)
             record_to_journal(sentence)
             train_on_message(sentence)
diff --git a/model/cleanup.py b/model/cleanup.py
index 9feb62e..6effe6c 100644
--- a/model/cleanup.py
+++ b/model/cleanup.py
@@ -5,6 +5,7 @@
 import time
 from model.tokenizer import VOCAB_PATH
 from model.memory import DREAM_LOG_PATH
 from context.context import CONTEXT_FILE
+from model.brainmap import load_brainmap, save_brainmap
 
 CLEANUP_LOG = "data/logs/cleanup.log"
@@ -73,7 +74,34 @@ def cleanup_context():
     log(f"Trimmed context memory from {len(context)} → {len(filtered)}")
 
 
+def cleanup_brainmap(min_neighbors=2, min_strength=2):
+    load_brainmap()
+
+    from model.brainmap import brainmap  # imported after load so the module global is populated
+
+    to_delete = []
+
+    for word, neighbors in brainmap.items():
+        # Remove weak neighbors
+        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
+        for n in weak_neighbors:
+            del neighbors[n]
+
+        # Mark lonely words
+        if len(neighbors) < min_neighbors:
+            to_delete.append(word)
+
+    for word in to_delete:
+        del brainmap[word]
+
+    save_brainmap()
+
+    if to_delete:
+        log(f"Pruned {len(to_delete)} weak brainmap words")
+
+
 def full_cleanup():
     cleanup_vocab()
     cleanup_dreams()
     cleanup_context()
+    cleanup_brainmap()
diff --git a/model/tokenizer.py b/model/tokenizer.py
index 7d60093..a35b0c5 100644
--- a/model/tokenizer.py
+++ b/model/tokenizer.py
@@ -25,7 +25,7 @@ class Tokenizer:
         self.next_id = 5
 
     def tokenize(self, text):
-        text = clean_unicode(text)  # 🚨 Always clean incoming text
+        text = clean_unicode(text)
         words = re.findall(r"\b\w+\b", text.lower())
         tokens = []
         for word in words:
@@ -41,3 +41,9 @@
         if isinstance(tokens, int):
             tokens = [tokens]
         return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)
+
+    def token_to_id(self, token: str) -> int:
+        return self.vocab.get(token, self.vocab["<unk>"])
+
+    def id_to_token(self, idx: int) -> str:
+        return self.reverse_vocab.get(idx, "<unk>")
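
Note on the new sampler in generate_response: it draws from a
temperature-scaled softmax and re-draws whenever a forbidden special token
comes up. The same filtering can be expressed by masking the logits before
the softmax, which avoids the resampling loop entirely. A minimal
standalone sketch, not the patch's code; sample_token is a hypothetical
helper, and the forbidden set is assumed to hold plain int token ids as
built by forbidden_tokens above:

    import torch

    def sample_token(logits: torch.Tensor, forbidden: set, temperature: float = 1.0) -> int:
        """Sample one token id from [1, vocab] logits, never a forbidden id."""
        logits = logits.clone()
        # Logits set to -inf get probability 0 from the softmax,
        # so forbidden ids can never be drawn.
        logits[:, list(forbidden)] = float("-inf")
        probs = torch.softmax(logits / temperature, dim=-1)
        return torch.multinomial(probs, num_samples=1).item()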
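
Note on the retuned score_sentence: the score is now
unique_ratio * min(length / 20, 1), and the daydream gate rose from 0.45 to
0.5. Worked through, an all-unique 10-word dream scores
(10 / 11) * (10 / 20) ≈ 0.45 and is discarded, while an all-unique 12-word
dream scores (12 / 13) * (12 / 20) ≈ 0.55 and is saved; no dream shorter
than 11 words can reach 0.5 at all.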
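
Note on cleanup_brainmap: the code implies the brainmap is a nested dict of
co-occurrence counts, {word: {neighbor: count}}. A toy walk-through under
that assumption:

    brainmap = {
        "sun": {"sky": 5, "warm": 3, "xyzzy": 1},
        "xyzzy": {"sun": 1},
    }
    # min_strength=2 drops the count-1 edges sun->xyzzy and xyzzy->sun;
    # min_neighbors=2 then deletes the emptied "xyzzy" entry, leaving
    # {"sun": {"sky": 5, "warm": 3}}.

One design consequence: deleting a lonely word removes only its own entry,
so a pruned word can still appear as a strong neighbor inside surviving
entries until those edges themselves weaken below min_strength.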
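
Note on the new Tokenizer helpers: token_to_id and id_to_token fall back to
the <unk> entry instead of raising KeyError, which is what lets
generate_response look up special tokens safely. A hypothetical session,
assuming the constructor reserves the special tokens at ids 0-4 (hence
next_id = 5):

    tok = Tokenizer()
    ids = tok.tokenize("Hello, world!")  # lowercases and grows the vocab
    print(tok.detokenize(ids))           # -> "hello world"
    print(tok.token_to_id("<pad>"))      # a reserved id below 5
    print(tok.id_to_token(10_000))       # -> "<unk>" for unassigned ids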