Ruby/model/tokenizer.py

import re
import os
import json

from utils.unicleaner import clean_unicode

VOCAB_PATH = "data/memory/vocab.json"


def load_vocab():
    """Return the persisted vocab mapping, or an empty dict if none exists."""
    if os.path.exists(VOCAB_PATH):
        with open(VOCAB_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    return {}


def save_vocab(vocab):
    """Write the vocab mapping to disk, creating the parent directory if needed."""
    os.makedirs(os.path.dirname(VOCAB_PATH), exist_ok=True)
    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)


class Tokenizer:
    def __init__(self):
        self.vocab = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3}
        self.vocab.update(load_vocab())  # restore any persisted vocabulary
        # Rebuild the reverse map from the vocab so the two never drift apart.
        self.reverse_vocab = {idx: word for word, idx in self.vocab.items()}
        self.next_id = max(self.vocab.values()) + 1

    def tokenize(self, text):
        text = clean_unicode(text)  # 🚨 always clean incoming text
        words = re.findall(r"\b\w+\b", text.lower())
        tokens = []
        for word in words:
            if word not in self.vocab:
                # Grow the vocabulary on the fly: each new word gets the next id.
                self.vocab[word] = self.next_id
                self.reverse_vocab[self.next_id] = word
                self.next_id += 1
            tokens.append(self.vocab[word])
        save_vocab(self.vocab)  # persist newly learned words after every call
        return tokens
    def detokenize(self, tokens):
        # Accept a single id as well as an iterable of ids.
        if isinstance(tokens, int):
            tokens = [tokens]
        return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)