import re
import os
import json

from utils.unicleaner import clean_unicode

VOCAB_PATH = "memory/vocab.json"


def load_vocab():
    if os.path.exists(VOCAB_PATH):
        with open(VOCAB_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    return {}


def save_vocab(vocab):
    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)


class Tokenizer:
    def __init__(self):
        # Special tokens reserve ids 0-4. The original names were lost
        # (likely angle-bracketed and stripped in transit); the names below
        # are assumed placeholders following the usual convention.
        self.vocab = {"<pad>": 0, "<unk>": 1, "<bos>": 2, "<eos>": 3, "<sep>": 4}
        self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<bos>", 3: "<eos>", 4: "<sep>"}
        self.next_id = 5

    def tokenize(self, text):
        text = clean_unicode(text)
        words = re.findall(r"\b\w+\b", text.lower())
        tokens = []
        for word in words:
            # Filter out:
            # - non-ASCII words
            # - single-letter tokens (except meaningful ones like 'a' and 'i')
            # - hyphenated gibberish like '--shire' or '1531--a'
            if not word.isascii():
                continue
            if len(word) == 1 and word not in {"a", "i"}:
                continue
            if re.fullmatch(r"[-_]+", word) or re.search(r"--+", word):
                continue
            if word not in self.vocab:
                self.vocab[word] = self.next_id
                self.reverse_vocab[self.next_id] = word
                self.next_id += 1
            tokens.append(self.vocab[word])
        save_vocab(self.vocab)
        return tokens

    def detokenize(self, tokens):
        if isinstance(tokens, int):
            tokens = [tokens]
        return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)

    def token_to_id(self, token: str) -> int:
        return self.vocab.get(token, self.vocab["<unk>"])

    def id_to_token(self, idx: int) -> str:
        return self.reverse_vocab.get(idx, "<unk>")
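

# --- Illustrative usage sketch (assumption: not part of the original module) ---
# Demonstrates the tokenize()/detokenize() round trip. The vocab is persisted
# to VOCAB_PATH on every tokenize() call, so the "memory" directory must exist.
if __name__ == "__main__":
    os.makedirs("memory", exist_ok=True)

    tok = Tokenizer()
    ids = tok.tokenize("Hello, world -- this is a small test.")

    print(ids)                       # [5, 6, 7, 8, 9, 10, 11]: ids assigned after the 5 specials
    print(tok.detokenize(ids))       # "hello world this is a small test" (lowercased, punctuation dropped)
    print(tok.token_to_id("hello"))  # 5
    print(tok.id_to_token(999))      # "<unk>" for any id not in the vocab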