Updated brain.py to be a tad more aggressive, and added a cleanup function for the brainmap to cleanup.py
50 lines
1.4 KiB
Python
import re
import os
import json

from utils.unicleaner import clean_unicode

VOCAB_PATH = "data/memory/vocab.json"


def load_vocab():
    if os.path.exists(VOCAB_PATH):
        with open(VOCAB_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    return {}


def save_vocab(vocab):
    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)


class Tokenizer:
    def __init__(self):
        self.vocab = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3, "<sep>": 4}
        self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<start>", 3: "<end>", 4: "<sep>"}
        self.next_id = 5

    def tokenize(self, text):
        text = clean_unicode(text)
        words = re.findall(r"\b\w+\b", text.lower())
        tokens = []
        for word in words:
            if word not in self.vocab:
                self.vocab[word] = self.next_id
                self.reverse_vocab[self.next_id] = word
                self.next_id += 1
            tokens.append(self.vocab[word])
        save_vocab(self.vocab)
        return tokens

    def detokenize(self, tokens):
        if isinstance(tokens, int):
            tokens = [tokens]
        return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)

    def token_to_id(self, token: str) -> int:
        return self.vocab.get(token, self.vocab["<unk>"])

    def id_to_token(self, idx: int) -> str:
        return self.reverse_vocab.get(idx, "<unk>")
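For context, a minimal usage sketch of the tokenizer above (illustrative only, not part of the commit). The import path "tokenizer" is assumed; adjust it to wherever this file actually lives in the repo, and note that tokenize() writes the vocab to data/memory/vocab.json on every call, so that directory needs to exist.

from tokenizer import Tokenizer  # assumed module name for this file

tok = Tokenizer()
ids = tok.tokenize("Hello, world! Hello again.")
# New words get ids starting at 5, so ids == [5, 6, 5, 7]
print(tok.detokenize(ids))       # -> "hello world hello again"
print(tok.token_to_id("hello"))  # -> 5
print(tok.id_to_token(99))       # unknown id falls back to "<unk>"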