import json
import os


class Sensory:
    """Dynamic whitespace tokenizer that can grow (or not) its vocab."""

    # NOTE(review): in the original, both special tokens were empty strings,
    # so the dict literal {"": 0, "": 1} collapsed to a single duplicate key
    # and id 0 was unreachable via stoi — presumably "<pad>"/"<unk>" markup
    # was stripped at some point. Restored as named sentinels; confirm the
    # intended token spellings against any existing vocab.json on disk.
    PAD = "<pad>"
    UNK = "<unk>"

    def __init__(self) -> None:
        # token -> id and id -> token; kept in sync as the vocab grows.
        self.stoi: dict[str, int] = {self.PAD: 0, self.UNK: 1}
        self.itos: dict[int, str] = {0: self.PAD, 1: self.UNK}

    def encode(self, text: str, grow: bool = True) -> list[int]:
        """Split *text* on whitespace and map each token to an id.

        Unseen tokens are appended to the vocab when ``grow`` is True;
        otherwise they map to the UNK id.
        """
        ids: list[int] = []
        for tok in text.strip().split():
            idx = self.stoi.get(tok)  # single lookup instead of `in` + `[]`
            if idx is None:
                if grow:
                    idx = len(self.stoi)
                    self.stoi[tok] = idx
                    self.itos[idx] = tok
                else:
                    idx = self.stoi[self.UNK]
            ids.append(idx)
        return ids

    def decode(self, ids: list[int]) -> str:
        """Join ids back into a space-separated string; unknown ids
        render as the UNK token rather than vanishing silently."""
        return " ".join(self.itos.get(i, self.UNK) for i in ids)

    def save_vocab(self, path: str = "vocab.json") -> None:
        """Dump stoi+itos to disk as UTF-8 JSON."""
        data = {
            "stoi": self.stoi,
            # JSON keys must be strings
            "itos": {str(k): v for k, v in self.itos.items()},
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def load_vocab(self, path: str = "vocab.json") -> None:
        """Load stoi+itos from *path* if it exists; silently no-op otherwise."""
        if not os.path.isfile(path):
            return
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        self.stoi = data["stoi"]
        # convert itos keys back to int (JSON stringifies all mapping keys)
        self.itos = {int(k): v for k, v in data["itos"].items()}