import json
import os
import re

VOCAB_PATH = "data/memory/vocab.json"


def load_vocab():
    """Load the persisted word-to-id vocabulary, or return an empty one."""
    if os.path.exists(VOCAB_PATH):
        with open(VOCAB_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    return {}


def save_vocab(vocab):
    """Persist the vocabulary as JSON, creating the directory if needed."""
    os.makedirs(os.path.dirname(VOCAB_PATH), exist_ok=True)
    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)


class Tokenizer:
    def __init__(self):
        self.vocab = load_vocab()
        # Inverse mapping (id -> word) for detokenization.
        self.reverse_vocab = {v: k for k, v in self.vocab.items()}
        # Next unused id; ids start at 1, leaving 0 free (e.g. for padding).
        self.next_id = max(self.vocab.values(), default=0) + 1

    def tokenize(self, text):
        """Lowercase the text, split on word boundaries, and map each word
        to an id, growing and persisting the vocabulary as new words appear."""
        words = re.findall(r"\b\w+\b", text.lower())
        tokens = []
        for word in words:
            if word not in self.vocab:
                self.vocab[word] = self.next_id
                self.reverse_vocab[self.next_id] = word
                self.next_id += 1
            tokens.append(self.vocab[word])
        save_vocab(self.vocab)
        return tokens

    def detokenize(self, tokens):
        """Map ids back to words; ids missing from the vocabulary are
        skipped rather than rendered as empty strings (which would leave
        stray double spaces in the output)."""
        return " ".join(self.reverse_vocab[t] for t in tokens if t in self.reverse_vocab)
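

if __name__ == "__main__":
    # Minimal usage sketch (an assumed entry point, not part of the original
    # module): round-trips a sentence through the persistent vocabulary.
    tok = Tokenizer()
    ids = tok.tokenize("Hello world, hello again")
    print(ids)                  # e.g. [1, 2, 1, 3] on a fresh vocabulary
    print(tok.detokenize(ids))  # -> "hello world hello again"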