Ruby/sensory.py
Dani 48585acb6f Dashboard updated
Added life log, persona, and plugins manager.
Changed it so that any new .json files aren't uploaded.
2025-05-05 13:23:38 -04:00

53 lines
1.7 KiB
Python

import json
import os
class Sensory:
    """Dynamic whitespace tokenizer that can grow (or not) its vocab.

    The vocabulary is held in two mirrored dictionaries:

    * ``stoi`` maps a token string to its integer id.
    * ``itos`` maps an integer id back to its token string.

    Three reserved tokens (``<pad>``, ``<unk>``, ``<eos>``) are always
    present: they are created in ``__init__`` and re-added by
    ``load_vocab`` if the loaded file lacks any of them.
    """

    # Reserved tokens, in the order their ids are assigned in a fresh vocab.
    _SPECIAL_TOKENS = ("<pad>", "<unk>", "<eos>")

    def __init__(self):
        # ensure <pad>, <unk>, AND <eos> are present from the start
        self.stoi: dict[str, int] = {
            tok: idx for idx, tok in enumerate(self._SPECIAL_TOKENS)
        }
        self.itos: dict[int, str] = {idx: tok for tok, idx in self.stoi.items()}

    def _add_token(self, tok: str) -> int:
        """Register *tok* under a free id and return that id.

        The candidate id starts at ``len(self.stoi)`` (the id a contiguous
        vocab would hand out next) and is advanced past any id that
        ``itos`` already holds, so a vocab with gaps in its id range never
        gets an existing entry overwritten.
        """
        idx = len(self.stoi)
        while idx in self.itos:
            idx += 1
        self.stoi[tok] = idx
        self.itos[idx] = tok
        return idx

    def encode(self, text: str, grow: bool = True) -> list[int]:
        """Convert *text* into a list of token ids.

        The text is stripped and split on whitespace. Known tokens map to
        their existing id. For an unknown token:

        * ``grow=True``  -- the token is added to the vocab and its new id
          is used.
        * ``grow=False`` -- the id of ``<unk>`` is used and the vocab is
          left unchanged.

        Returns an empty list for empty or whitespace-only text.
        """
        ids: list[int] = []
        for tok in text.strip().split():
            idx = self.stoi.get(tok)
            if idx is None:
                idx = self._add_token(tok) if grow else self.stoi["<unk>"]
            ids.append(idx)
        return ids

    def decode(self, ids: list[int]) -> str:
        """Convert token ids back into a space-joined string.

        Decoding stops at the first ``<eos>`` id (which is not included in
        the output). Ids that are not in the vocab are rendered as the
        literal string ``"<unk>"``.
        """
        # Looked up once; ``None`` never equals an id, so a vocab without
        # <eos> simply decodes every id.
        eos_id = self.stoi.get("<eos>")
        out = []
        for i in ids:
            if i == eos_id:
                break
            out.append(self.itos.get(i, "<unk>"))
        return " ".join(out)

    def save_vocab(self, path: str = "vocab.json") -> None:
        """Write the vocab to *path* as JSON with ``stoi`` and ``itos`` keys.

        ``itos`` keys are written as strings because JSON object keys must
        be strings; ``load_vocab`` converts them back to ``int``.
        """
        data = {"stoi": self.stoi, "itos": {str(k): v for k, v in self.itos.items()}}
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def load_vocab(self, path: str = "vocab.json") -> None:
        """Replace the vocab with the one stored at *path*.

        Does nothing when *path* is not an existing file. After loading,
        any reserved token missing from the file is re-added under a free
        id, so ``encode(grow=False)`` and ``decode`` keep working with
        vocab files that lack some of the reserved tokens.

        Raises:
            KeyError: if the file has no ``"stoi"`` or ``"itos"`` entry.
                The current vocab is left untouched in that case.
        """
        if not os.path.isfile(path):
            return
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        # Parse both tables before assigning either, so a malformed file
        # cannot leave the instance with mismatched stoi/itos.
        stoi = data["stoi"]
        itos = {int(k): v for k, v in data["itos"].items()}
        self.stoi = stoi
        self.itos = itos
        # if somehow a reserved token got lost, re-add it
        for tok in self._SPECIAL_TOKENS:
            if tok not in self.stoi:
                self._add_token(tok)