Fixed an error in the tokenizer.
Updated brain.py to be a tad more aggressive, and added a brainmap cleanup function to cleanup.py.
parent 4d4b39b4c7
commit 0674d51471
brain.py

@@ -11,12 +11,20 @@ recent_dreams = []
 @torch.inference_mode()
-def generate_response(max_tokens: int = 50):
+def generate_response(max_tokens: int = 50, temperature: float = 1.0):
     model.eval()
     input_ids = torch.tensor([tokenizer.token_to_id("<start>")], device=DEVICE).unsqueeze(0)
 
     generated = []
 
+    forbidden_tokens = {
+        tokenizer.token_to_id("<unk>"),
+        tokenizer.token_to_id("<start>"),
+        tokenizer.token_to_id("<pad>"),
+        tokenizer.token_to_id("<end>"),
+        tokenizer.token_to_id("<sep>"),
+    }
+
     for _ in range(max_tokens):
         output = model(input_ids)
         if torch.isnan(output).any():
@@ -24,29 +32,38 @@ def generate_response(max_tokens: int = 50):
             return "..."
 
         next_token_logits = output[:, -1, :]
-        next_token = torch.argmax(next_token_logits, dim=-1)
+        probs = torch.softmax(next_token_logits / temperature, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1)
+
+        # Resample if forbidden token
+        while next_token.item() in forbidden_tokens:
+            next_token = torch.multinomial(probs, num_samples=1)
 
         token_id = next_token.item()
 
-        # If she outputs <end> token, stop generation
         if tokenizer.reverse_vocab.get(token_id, "") == "<end>":
             break
 
         generated.append(token_id)
 
-        next_token = next_token.unsqueeze(0)
-        input_ids = torch.cat([input_ids, next_token], dim=1)
+        input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
 
     return tokenizer.detokenize(generated)
 
 
 def score_sentence(sentence: str) -> float:
     words = sentence.strip().split()
+    unique = set(words)
     length = len(words)
-    diversity = len(set(words)) / (length + 1)
-    if length < 4:
+    unique_ratio = len(unique) / (length + 1)
+
+    if length < 5:
         return 0.0
-    return diversity * min(length, 20)
+    if unique_ratio < 0.5:
+        return 0.0
+
+    return unique_ratio * min(length / 20.0, 1.0)
 
 
 def daydream():
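The switch from torch.argmax to temperature-scaled multinomial sampling is where generate_response gets "more aggressive": instead of always taking the single most likely token, it samples from the softmax distribution (temperature above 1.0 flattens it, below 1.0 sharpens it, and the default 1.0 leaves it unchanged) and simply redraws whenever a special token comes up. Here is a self-contained sketch of that sampling step, with a toy logits tensor and forbidden-id set standing in for the real model and tokenizer:

```python
import torch

def sample_next_token(logits: torch.Tensor, temperature: float, forbidden: set) -> int:
    """Temperature sampling with a redraw loop for forbidden token ids.

    `logits` is a 1-D tensor of vocabulary scores; `forbidden` holds the ids of
    special tokens (<pad>, <unk>, ...) that should never be emitted.
    """
    probs = torch.softmax(logits / temperature, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)
    # Keep drawing until an allowed token comes up (same loop as in brain.py).
    while next_token.item() in forbidden:
        next_token = torch.multinomial(probs, num_samples=1)
    return next_token.item()

# Toy example: 6-token vocabulary, ids 0-2 are "special" and therefore forbidden.
logits = torch.tensor([5.0, 4.0, 3.0, 2.0, 1.0, 0.5])
print(sample_next_token(logits, temperature=1.0, forbidden={0, 1, 2}))
```

The redraw loop terminates as long as at least one allowed token keeps nonzero probability, which softmax gives it in practice. One shape detail worth double-checking in the new loop itself: torch.multinomial on the [1, vocab] probability tensor already returns a [1, 1] tensor, so torch.cat([input_ids, next_token.unsqueeze(0)], dim=1) appears to build a 3-D tensor and would fail; concatenating next_token directly (it is already 2-D) looks like the intended behavior.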
@@ -65,7 +82,7 @@ def daydream():
     sentence = tokenizer.detokenize(dream)
     score = score_sentence(sentence)
 
-    if score > 0.45:
+    if score > 0.5:
         save_dream(sentence, score)
         record_to_journal(sentence)
         train_on_message(sentence)
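score_sentence now returns a value in [0, 1): the unique-word ratio times a length factor capped at 1.0, instead of the old diversity × min(length, 20) score that could reach almost 20. Read together with the 0.5 cutoff in daydream, only sentences of roughly eleven or more mostly-distinct words get saved, journaled, and trained on. A quick worked check, with the new function copied inline for illustration:

```python
def score_sentence(sentence: str) -> float:
    # Same logic as the new brain.py version, copied here for illustration.
    words = sentence.strip().split()
    unique = set(words)
    length = len(words)
    unique_ratio = len(unique) / (length + 1)
    if length < 5:
        return 0.0
    if unique_ratio < 0.5:
        return 0.0
    return unique_ratio * min(length / 20.0, 1.0)

print(score_sentence("the cat sat on the warm windowsill today"))  # 8 words, 7 unique -> ~0.31, under the 0.5 cutoff
print(score_sentence("no no no no no no"))                         # repetitive -> 0.0
```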
cleanup.py

@@ -5,6 +5,7 @@ import time
 from model.tokenizer import VOCAB_PATH
 from model.memory import DREAM_LOG_PATH
 from context.context import CONTEXT_FILE
+from model.brainmap import load_brainmap, save_brainmap
 
 CLEANUP_LOG = "data/logs/cleanup.log"
 
@@ -73,7 +74,34 @@ def cleanup_context():
     log(f"Trimmed context memory from {len(context)} → {len(filtered)}")
 
 
+def cleanup_brainmap(min_neighbors=2, min_strength=2):
+    load_brainmap()
+
+    from model.brainmap import brainmap  # after load
+
+    to_delete = []
+
+    for word, neighbors in brainmap.items():
+        # Remove weak neighbors
+        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
+        for n in weak_neighbors:
+            del neighbors[n]
+
+        # Mark lonely words
+        if len(neighbors) < min_neighbors:
+            to_delete.append(word)
+
+    for word in to_delete:
+        del brainmap[word]
+
+    save_brainmap()
+
+    if to_delete:
+        log(f"Pruned {len(to_delete)} weak brainmap words")
+
+
 def full_cleanup():
     cleanup_vocab()
     cleanup_dreams()
     cleanup_context()
+    cleanup_brainmap()
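cleanup_brainmap treats the brainmap as a dict of word → {neighbor: co-occurrence count} (that shape is implied by neighbors.items()): it first drops neighbors seen fewer than min_strength times, then removes any word left with fewer than min_neighbors neighbors. A small rehearsal of the same pruning rules on a toy dict, without touching the real model.brainmap module:

```python
def prune_brainmap(brainmap: dict, min_neighbors: int = 2, min_strength: int = 2) -> dict:
    """Same pruning rules as cleanup_brainmap, applied to a plain dict for illustration."""
    to_delete = []
    for word, neighbors in brainmap.items():
        # Drop neighbors that co-occurred fewer than min_strength times.
        for n in [n for n, count in neighbors.items() if count < min_strength]:
            del neighbors[n]
        # Words left with too few neighbors are marked for removal.
        if len(neighbors) < min_neighbors:
            to_delete.append(word)
    for word in to_delete:
        del brainmap[word]
    return brainmap

toy = {
    "sun": {"sky": 5, "warm": 3, "xyzzy": 1},  # keeps sky/warm, loses xyzzy
    "xyzzy": {"sun": 1},                       # weak and lonely -> removed
    "moon": {"sky": 1, "night": 1},            # all neighbors weak -> removed
}
print(prune_brainmap(toy))
# {'sun': {'sky': 5, 'warm': 3}}
```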
tokenizer.py

@@ -25,7 +25,7 @@ class Tokenizer:
         self.next_id = 5
 
     def tokenize(self, text):
-        text = clean_unicode(text)  # 🚨 Always clean incoming text
+        text = clean_unicode(text)
         words = re.findall(r"\b\w+\b", text.lower())
         tokens = []
         for word in words:
@@ -41,3 +41,9 @@ class Tokenizer:
         if isinstance(tokens, int):
             tokens = [tokens]
         return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)
+
+    def token_to_id(self, token: str) -> int:
+        return self.vocab.get(token, self.vocab["<unk>"])
+
+    def id_to_token(self, idx: int) -> str:
+        return self.reverse_vocab.get(idx, "<unk>")
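The new token_to_id and id_to_token helpers are what brain.py's generate_response leans on for the <start> seed and the forbidden-token set, and both fall back to <unk> rather than raising on unknown input. A usage sketch, assuming the constructor registers the five special tokens (the visible self.next_id = 5 suggests ids 0-4 are reserved for them):

```python
tok = Tokenizer()                       # assumes special tokens are set up in __init__
start_id = tok.token_to_id("<start>")   # known token -> its id
oov_id = tok.token_to_id("giraffe")     # unseen token -> falls back to the <unk> id
print(tok.id_to_token(start_id))        # "<start>"
print(tok.id_to_token(99999))           # unknown id -> "<unk>"
```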