Fixed an error in the tokenizer.

Updated brain.py to sample and score a tad more aggressively, and added a brainmap cleanup function to cleanup.py.
Dani 2025-04-27 17:03:26 -04:00
parent 4d4b39b4c7
commit 0674d51471
3 changed files with 61 additions and 10 deletions

brain.py

@@ -11,12 +11,20 @@ recent_dreams = []
 @torch.inference_mode()
-def generate_response(max_tokens: int = 50):
+def generate_response(max_tokens: int = 50, temperature: float = 1.0):
     model.eval()
     input_ids = torch.tensor([tokenizer.token_to_id("<start>")], device=DEVICE).unsqueeze(0)
     generated = []
+
+    forbidden_tokens = {
+        tokenizer.token_to_id("<unk>"),
+        tokenizer.token_to_id("<start>"),
+        tokenizer.token_to_id("<pad>"),
+        tokenizer.token_to_id("<end>"),
+        tokenizer.token_to_id("<sep>"),
+    }
 
     for _ in range(max_tokens):
         output = model(input_ids)
         if torch.isnan(output).any():
@@ -24,29 +32,38 @@ def generate_response(max_tokens: int = 50):
             return "..."
 
         next_token_logits = output[:, -1, :]
-        next_token = torch.argmax(next_token_logits, dim=-1)
+        probs = torch.softmax(next_token_logits / temperature, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1)
+
+        # Resample if forbidden token
+        while next_token.item() in forbidden_tokens:
+            next_token = torch.multinomial(probs, num_samples=1)
 
         token_id = next_token.item()
 
+        # If she outputs <end> token, stop generation
         if tokenizer.reverse_vocab.get(token_id, "") == "<end>":
             break
 
         generated.append(token_id)
-        next_token = next_token.unsqueeze(0)
-        input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
+        input_ids = torch.cat([input_ids, next_token], dim=1)
 
     return tokenizer.detokenize(generated)
 
 
 def score_sentence(sentence: str) -> float:
     words = sentence.strip().split()
+    unique = set(words)
     length = len(words)
-    diversity = len(set(words)) / (length + 1)
-    if length < 4:
+    unique_ratio = len(unique) / (length + 1)
+
+    if length < 5:
         return 0.0
-    return diversity * min(length, 20)
+
+    if unique_ratio < 0.5:
+        return 0.0
+
+    return unique_ratio * min(length / 20.0, 1.0)
 
 
 def daydream():
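
For reference, here is the new decoding step in isolation: temperature-scaled softmax followed by rejection sampling of special tokens. This is a minimal sketch; the vocabulary size, logits, and forbidden ids below are toy values, not the project's real ones.

import torch

# Toy stand-ins for illustration; the real loop uses the model's logits
# and the tokenizer's special-token ids.
vocab_size = 10
logits = torch.randn(1, vocab_size)   # plays the role of output[:, -1, :]
forbidden = {0, 1}                    # e.g. <pad>, <unk>
temperature = 0.8                     # <1.0 sharpens, >1.0 flattens the distribution

probs = torch.softmax(logits / temperature, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)  # shape (1, 1)

# Resample until an allowed token comes up, as in the while-loop above.
while next_token.item() in forbidden:
    next_token = torch.multinomial(probs, num_samples=1)

print(next_token.item())

One caveat of resampling in a loop: if the forbidden ids ever hold nearly all of the probability mass, the loop can spin for a long time. Zeroing those entries of probs before the multinomial draw would sidestep that.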
@@ -65,7 +82,7 @@ def daydream():
     sentence = tokenizer.detokenize(dream)
     score = score_sentence(sentence)
 
-    if score > 0.45:
+    if score > 0.5:
         save_dream(sentence, score)
         record_to_journal(sentence)
         train_on_message(sentence)
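
Worked numbers for the retuned scorer (the function body is copied from the diff above; the sample sentences are made up). Since score = unique_ratio * min(length / 20, 1), the new score > 0.5 gate in daydream() means even a fully unique sentence needs at least 11 words to be kept:

def score_sentence(sentence: str) -> float:
    words = sentence.strip().split()
    unique = set(words)
    length = len(words)
    unique_ratio = len(unique) / (length + 1)
    if length < 5:
        return 0.0
    if unique_ratio < 0.5:
        return 0.0
    return unique_ratio * min(length / 20.0, 1.0)

# 10 distinct words: (10/11) * (10/20) ≈ 0.45 -- rejected by the 0.5 gate
print(score_sentence("the quick brown fox jumps over one lazy sleeping dog"))
# 11 distinct words: (11/12) * (11/20) ≈ 0.504 -- barely accepted
print(score_sentence("the quick brown fox jumps over one lazy dog every day"))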

cleanup.py

@@ -5,6 +5,7 @@ import time
 from model.tokenizer import VOCAB_PATH
 from model.memory import DREAM_LOG_PATH
 from context.context import CONTEXT_FILE
+from model.brainmap import load_brainmap, save_brainmap
 
 CLEANUP_LOG = "data/logs/cleanup.log"
@@ -73,7 +74,34 @@ def cleanup_context():
     log(f"Trimmed context memory from {len(context)} → {len(filtered)}")
 
 
+def cleanup_brainmap(min_neighbors=2, min_strength=2):
+    load_brainmap()
+    from model.brainmap import brainmap  # after load
+
+    to_delete = []
+
+    for word, neighbors in brainmap.items():
+        # Remove weak neighbors
+        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
+        for n in weak_neighbors:
+            del neighbors[n]
+
+        # Mark lonely words
+        if len(neighbors) < min_neighbors:
+            to_delete.append(word)
+
+    for word in to_delete:
+        del brainmap[word]
+
+    save_brainmap()
+
+    if to_delete:
+        log(f"Pruned {len(to_delete)} weak brainmap words")
+
+
 def full_cleanup():
     cleanup_vocab()
     cleanup_dreams()
     cleanup_context()
+    cleanup_brainmap()
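
The same pruning pass run standalone on a toy map, so the effect is visible (the load/save plumbing and the module-global brainmap are replaced by a local dict here):

brainmap = {
    "cat":   {"dog": 5, "mouse": 3, "fish": 1},  # "fish" edge is weak (count < 2)
    "dog":   {"cat": 5, "mouse": 2},
    "mouse": {"cat": 3, "dog": 2},
    "blip":  {"zorp": 1},                        # every edge weak -> word is pruned
}
min_neighbors, min_strength = 2, 2

to_delete = []
for word, neighbors in brainmap.items():
    # Drop weak edges first...
    for n in [n for n, count in neighbors.items() if count < min_strength]:
        del neighbors[n]
    # ...then mark words left with too few neighbors.
    if len(neighbors) < min_neighbors:
        to_delete.append(word)

for word in to_delete:
    del brainmap[word]

print(brainmap)
# {'cat': {'dog': 5, 'mouse': 3}, 'dog': {'cat': 5, 'mouse': 2}, 'mouse': {'cat': 3, 'dog': 2}}

With the defaults, a word survives only if at least two of its edges have been seen at least twice each.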

model/tokenizer.py

@@ -25,7 +25,7 @@ class Tokenizer:
         self.next_id = 5
 
     def tokenize(self, text):
-        text = clean_unicode(text)  # 🚨 Always clean incoming text
+        text = clean_unicode(text)
         words = re.findall(r"\b\w+\b", text.lower())
         tokens = []
         for word in words:
@@ -41,3 +41,9 @@ class Tokenizer:
         if isinstance(tokens, int):
             tokens = [tokens]
         return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)
+
+    def token_to_id(self, token: str) -> int:
+        return self.vocab.get(token, self.vocab["<unk>"])
+
+    def id_to_token(self, idx: int) -> str:
+        return self.reverse_vocab.get(idx, "<unk>")
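
These two helpers are what brain.py's tokenizer.token_to_id("<start>") calls rely on, and both fail soft to <unk> instead of raising on unknown input. A minimal sketch with a stub vocab (the diff's self.next_id = 5 suggests ids 0-4 belong to the special tokens, but the exact mapping here is an assumption):

class Tokenizer:
    def __init__(self):
        # Hypothetical special-token ids; only the 0-4 range is implied by the diff.
        self.vocab = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3, "<sep>": 4}
        self.reverse_vocab = {v: k for k, v in self.vocab.items()}

    def token_to_id(self, token: str) -> int:
        return self.vocab.get(token, self.vocab["<unk>"])

    def id_to_token(self, idx: int) -> str:
        return self.reverse_vocab.get(idx, "<unk>")

t = Tokenizer()
print(t.token_to_id("<start>"))      # 2
print(t.token_to_id("nonexistent"))  # 1 -- falls back to <unk> instead of KeyError
print(t.id_to_token(999))            # "<unk>" -- unknown ids fail soft too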