Added life log, persona, and plugins manager. Changed it so that any new .json files aren't uploaded.
53 lines
1.7 KiB
Python
53 lines
1.7 KiB
Python
import json
|
|
import os
|
|
|
|
|
|
class Sensory:
    """Dynamic whitespace tokenizer that can grow (or not) its vocab.

    Text is split on whitespace and every distinct token is mapped to an
    integer id. Two dictionaries are kept in sync:

    * ``stoi`` maps token string -> id
    * ``itos`` maps id -> token string

    The special tokens ``<pad>``, ``<unk>`` and ``<eos>`` are always present
    after construction and after ``load_vocab``.
    """

    def __init__(self):
        # ensure <pad>, <unk>, AND <eos> are present from the start
        self.stoi: dict[str, int] = {"<pad>": 0, "<unk>": 1, "<eos>": 2}
        self.itos: dict[int, str] = {0: "<pad>", 1: "<unk>", 2: "<eos>"}

    def _next_id(self) -> int:
        """Return an id that is not yet a key of ``itos``.

        For a contiguous vocabulary this is simply ``len(self.stoi)``. When a
        loaded vocabulary has gaps in its ids, that value may already be
        taken, so probe upward instead of overwriting an existing entry.
        """
        idx = len(self.stoi)
        while idx in self.itos:
            idx += 1
        return idx

    def _add_token(self, tok: str) -> int:
        """Register *tok* under a fresh id in both mappings and return the id."""
        idx = self._next_id()
        self.stoi[tok] = idx
        self.itos[idx] = tok
        return idx

    def encode(self, text: str, grow: bool = True) -> list[int]:
        """Convert *text* into a list of token ids.

        Args:
            text: Input string; tokens are separated by any whitespace.
            grow: When true, unseen tokens are added to the vocabulary and
                receive a new id. When false, unseen tokens map to the id of
                ``<unk>`` and the vocabulary is left untouched.

        Returns:
            One id per whitespace-separated token, in order. Empty or
            whitespace-only text yields an empty list.
        """
        ids: list[int] = []
        # str.split() with no arguments already ignores leading/trailing
        # whitespace, so no separate strip() is required.
        for tok in text.split():
            idx = self.stoi.get(tok)
            if idx is None:
                idx = self._add_token(tok) if grow else self.stoi["<unk>"]
            ids.append(idx)
        return ids

    def decode(self, ids: list[int]) -> str:
        """Convert token ids back into a space-joined string.

        Decoding stops at the first ``<eos>`` id (which is not included in the
        output). Ids that are not in the vocabulary are rendered as
        ``"<unk>"``.
        """
        # Look the id up once; .get() keeps decode usable even if the entry
        # was removed from stoi by hand (no id will match None).
        eos_id = self.stoi.get("<eos>")
        out = []
        for i in ids:
            if i == eos_id:
                break
            out.append(self.itos.get(i, "<unk>"))
        return " ".join(out)

    def save_vocab(self, path: str = "vocab.json") -> None:
        """Write both mappings to *path* as UTF-8 encoded JSON.

        JSON object keys must be strings, so the integer keys of ``itos`` are
        stringified here and converted back in ``load_vocab``.
        """
        data = {"stoi": self.stoi, "itos": {str(k): v for k, v in self.itos.items()}}
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def load_vocab(self, path: str = "vocab.json") -> None:
        """Replace the vocabulary with the one stored at *path*.

        If *path* is not an existing file the call is a no-op and the current
        vocabulary is kept. Any special token missing from the loaded data is
        re-added under a fresh, non-colliding id.

        Raises:
            KeyError: If the JSON document lacks the ``"stoi"`` or ``"itos"`` key.
            ValueError: If the file is not valid JSON or an ``itos`` key is
                not an integer string.
        """
        if not os.path.isfile(path):
            return
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        # Parse both tables before touching self, so a malformed file cannot
        # leave the tokenizer with a new stoi but the old itos.
        stoi = data["stoi"]
        itos = {int(k): v for k, v in data["itos"].items()}
        self.stoi = stoi
        self.itos = itos

        # If somehow a special token got lost, re-add it. <eos> goes first so
        # that, when it is the only one missing from a contiguous vocab, it
        # still receives id len(stoi).
        for tok in ("<eos>", "<pad>", "<unk>"):
            if tok not in self.stoi:
                self._add_token(tok)