Fixed an error in the tokenizer.
Updated brain.py to be a tad more aggressive, and added a cleanup function for the brainmap to cleanup.py.
parent 4d4b39b4c7
commit 0674d51471
brain.py
@@ -11,12 +11,20 @@ recent_dreams = []
 @torch.inference_mode()
-def generate_response(max_tokens: int = 50):
+def generate_response(max_tokens: int = 50, temperature: float = 1.0):
     model.eval()
     input_ids = torch.tensor([tokenizer.token_to_id("<start>")], device=DEVICE).unsqueeze(0)
 
     generated = []
 
+    forbidden_tokens = {
+        tokenizer.token_to_id("<unk>"),
+        tokenizer.token_to_id("<start>"),
+        tokenizer.token_to_id("<pad>"),
+        tokenizer.token_to_id("<end>"),
+        tokenizer.token_to_id("<sep>"),
+    }
+
     for _ in range(max_tokens):
         output = model(input_ids)
         if torch.isnan(output).any():
@@ -24,29 +32,38 @@ def generate_response(max_tokens: int = 50):
             return "..."
 
         next_token_logits = output[:, -1, :]
-        next_token = torch.argmax(next_token_logits, dim=-1)
+        probs = torch.softmax(next_token_logits / temperature, dim=-1)
+
+        next_token = torch.multinomial(probs, num_samples=1)
+
+        # Resample if forbidden token
+        while next_token.item() in forbidden_tokens:
+            next_token = torch.multinomial(probs, num_samples=1)
 
         token_id = next_token.item()
 
         # If she outputs <end> token, stop generation
         if tokenizer.reverse_vocab.get(token_id, "") == "<end>":
            break
 
         generated.append(token_id)
 
-        next_token = next_token.unsqueeze(0)
-        input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
+        input_ids = torch.cat([input_ids, next_token], dim=1)
 
     return tokenizer.detokenize(generated)
 
 
 def score_sentence(sentence: str) -> float:
     words = sentence.strip().split()
+    unique = set(words)
     length = len(words)
-    diversity = len(set(words)) / (length + 1)
-    if length < 4:
+    unique_ratio = len(unique) / (length + 1)
+
+    if length < 5:
         return 0.0
-    return diversity * min(length, 20)
+    if unique_ratio < 0.5:
+        return 0.0
+
+    return unique_ratio * min(length / 20.0, 1.0)
 
 
 def daydream():
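For reference, a standalone sketch of the new sampling step (illustrative only; `logits`, `temperature`, and `forbidden_ids` are stand-in names, not objects from this repo). Instead of redrawing in a loop until a non-forbidden token comes up, the same effect can be had by zeroing the forbidden probabilities before the multinomial draw:

import torch

logits = torch.randn(1, 1000)        # stand-in for the model output at the last position
temperature = 0.8
forbidden_ids = [0, 1, 2, 3, 4]      # e.g. the <unk>/<start>/<pad>/<end>/<sep> ids

probs = torch.softmax(logits / temperature, dim=-1)
probs[:, forbidden_ids] = 0.0                          # mask instead of resampling in a loop
probs = probs / probs.sum(dim=-1, keepdim=True)        # renormalize
next_token = torch.multinomial(probs, num_samples=1)   # shape (1, 1), ready for torch.cat along dim=1

Either approach works; masking just avoids the unlikely case where the resampling loop redraws many times because a forbidden token dominates the distribution.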
@@ -65,7 +82,7 @@ def daydream():
     sentence = tokenizer.detokenize(dream)
     score = score_sentence(sentence)
 
-    if score > 0.45:
+    if score > 0.5:
         save_dream(sentence, score)
         record_to_journal(sentence)
         train_on_message(sentence)
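A quick worked check of how much stricter the new gate is (this re-states the new score_sentence inline so the numbers can be run standalone; it is not imported from the repo):

def score_sentence(sentence: str) -> float:
    words = sentence.strip().split()
    length = len(words)
    unique_ratio = len(set(words)) / (length + 1)
    if length < 5 or unique_ratio < 0.5:
        return 0.0
    return unique_ratio * min(length / 20.0, 1.0)

print(score_sentence("the cat sat on the mat today"))
# 7 words, 6 unique -> (6/8) * (7/20) = 0.2625, below the new 0.5 daydream threshold
print(score_sentence(" ".join(str(i) for i in range(25))))
# 25 unique "words" -> (25/26) * 1.0 = 0.96, kept

Under the old scoring (diversity * min(length, 20) against a 0.45 threshold), almost any sentence of four or more words passed; with the new ratio-based score only longer, mostly-unique sentences get saved and trained on.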
cleanup.py
@@ -5,6 +5,7 @@ import time
 from model.tokenizer import VOCAB_PATH
 from model.memory import DREAM_LOG_PATH
 from context.context import CONTEXT_FILE
+from model.brainmap import load_brainmap, save_brainmap
 
 CLEANUP_LOG = "data/logs/cleanup.log"
 
@@ -73,7 +74,34 @@ def cleanup_context():
     log(f"Trimmed context memory from {len(context)} → {len(filtered)}")
 
 
+def cleanup_brainmap(min_neighbors=2, min_strength=2):
+    load_brainmap()
+
+    from model.brainmap import brainmap  # after load
+
+    to_delete = []
+
+    for word, neighbors in brainmap.items():
+        # Remove weak neighbors
+        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
+        for n in weak_neighbors:
+            del neighbors[n]
+
+        # Mark lonely words
+        if len(neighbors) < min_neighbors:
+            to_delete.append(word)
+
+    for word in to_delete:
+        del brainmap[word]
+
+    save_brainmap()
+
+    if to_delete:
+        log(f"Pruned {len(to_delete)} weak brainmap words")
+
+
 def full_cleanup():
     cleanup_vocab()
     cleanup_dreams()
     cleanup_context()
+    cleanup_brainmap()
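The pruning rule in cleanup_brainmap, shown on a toy map (made-up data, default thresholds): links seen fewer than min_strength times are dropped, then any word left with fewer than min_neighbors links is removed entirely.

brainmap = {
    "sun":  {"moon": 5, "sky": 3, "typo": 1},
    "moon": {"sun": 5},
    "typo": {"sun": 1},
}
min_neighbors, min_strength = 2, 2

to_delete = []
for word, neighbors in brainmap.items():
    for n in [n for n, count in neighbors.items() if count < min_strength]:
        del neighbors[n]                    # drop weak links
    if len(neighbors) < min_neighbors:
        to_delete.append(word)              # mark lonely words
for word in to_delete:
    del brainmap[word]

print(brainmap)   # {'sun': {'moon': 5, 'sky': 3}}; "moon" and "typo" are pruned

As in the committed version, a pruned word can still appear as a neighbor under surviving entries; only its own top-level entry goes away.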
tokenizer.py
@@ -25,7 +25,7 @@ class Tokenizer:
         self.next_id = 5
 
     def tokenize(self, text):
-        text = clean_unicode(text)  # 🚨 Always clean incoming text
+        text = clean_unicode(text)
         words = re.findall(r"\b\w+\b", text.lower())
         tokens = []
         for word in words:
@@ -41,3 +41,9 @@ class Tokenizer:
         if isinstance(tokens, int):
             tokens = [tokens]
         return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)
+
+    def token_to_id(self, token: str) -> int:
+        return self.vocab.get(token, self.vocab["<unk>"])
+
+    def id_to_token(self, idx: int) -> str:
+        return self.reverse_vocab.get(idx, "<unk>")
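A self-contained sketch of what the two new helpers guarantee. The dict below is a stand-in; the exact id assignments of the special tokens are an assumption (the real Tokenizer builds vocab/reverse_vocab in __init__, with next_id starting at 5), not something taken from the repo.

# hypothetical vocab with the five specials in ids 0-4
vocab = {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3, "<sep>": 4, "hello": 5}
reverse_vocab = {v: k for k, v in vocab.items()}

def token_to_id(token: str) -> int:
    return vocab.get(token, vocab["<unk>"])    # unknown words map to <unk>'s id

def id_to_token(idx: int) -> str:
    return reverse_vocab.get(idx, "<unk>")     # unknown ids map back to "<unk>"

print(token_to_id("hello"))        # 5
print(token_to_id("never_seen"))   # 3 (the <unk> id in this stand-in vocab)
print(id_to_token(9999))           # "<unk>"

This fallback behavior is why generate_response can call tokenizer.token_to_id("<start>") and friends without guarding against a KeyError: missing entries always resolve to <unk>.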