"""Whitespace tokenizer with a dynamically growable vocabulary."""

import json
import os

# Special tokens.
# NOTE(review): the original literals were all garbled to empty strings, which
# made the stoi dict collapse to a single entry (duplicate "" keys) while itos
# kept three — conventional names are restored here; confirm against callers
# and any previously saved vocab.json files.
PAD = "<pad>"
UNK = "<unk>"
EOS = "<eos>"


class Sensory:
    """Dynamic whitespace tokenizer that can grow (or not) its vocab."""

    def __init__(self) -> None:
        # Ensure <pad>, <unk>, AND <eos> are present from the start, with
        # stoi and itos kept as exact inverses of each other.
        self.stoi: dict[str, int] = {PAD: 0, UNK: 1, EOS: 2}
        self.itos: dict[int, str] = {0: PAD, 1: UNK, 2: EOS}

    def encode(self, text: str, grow: bool = True) -> list[int]:
        """Map the whitespace-separated tokens of *text* to integer ids.

        Args:
            text: Input string; tokenized by ``str.split`` after stripping.
            grow: When true, unseen tokens are appended to the vocab;
                when false, they map to the ``<unk>`` id instead.

        Returns:
            One id per token, in order.
        """
        ids: list[int] = []
        for tok in text.strip().split():
            idx = self.stoi.get(tok)  # single lookup instead of `in` + index
            if idx is None:
                if grow:
                    idx = len(self.stoi)
                    self.stoi[tok] = idx
                    self.itos[idx] = tok
                else:
                    idx = self.stoi[UNK]
            ids.append(idx)
        return ids

    def decode(self, ids: list[int]) -> str:
        """Join ids back into a space-separated string, stopping at <eos>.

        Ids absent from the vocab render as the ``<unk>`` token rather
        than raising.
        """
        eos = self.stoi[EOS]  # hoisted out of the loop
        out: list[str] = []
        for i in ids:
            if i == eos:
                break
            out.append(self.itos.get(i, UNK))
        return " ".join(out)

    def save_vocab(self, path: str = "vocab.json") -> None:
        """Write both vocab maps to *path* as JSON.

        ``itos`` keys are stringified because JSON object keys must be
        strings; ``load_vocab`` converts them back to ints.
        """
        data = {
            "stoi": self.stoi,
            "itos": {str(k): v for k, v in self.itos.items()},
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def load_vocab(self, path: str = "vocab.json") -> None:
        """Restore vocab maps from *path*; silently no-op if the file is absent.

        The silent return preserves the original best-effort contract:
        callers may probe for a vocab file that does not exist yet.
        """
        if not os.path.isfile(path):
            return
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        self.stoi = data["stoi"]
        self.itos = {int(k): v for k, v in data["itos"].items()}
        # Re-add ANY special token lost from an older vocab file, so that
        # encode()/decode() never KeyError on the specials they rely on.
        # (The original only repaired one of the three.)
        for special in (PAD, UNK, EOS):
            if special not in self.stoi:
                idx = len(self.stoi)
                self.stoi[special] = idx
                self.itos[idx] = special