# Ruby/model/tokenizer.py

import re
import os
import json

# Location of the JSON file holding the word -> token-id vocabulary.
# The path is relative, so it is resolved against the process's current
# working directory; load_vocab() reads it and save_vocab() writes it.
VOCAB_PATH = "data/memory/vocab.json"


def load_vocab():
    """Read the vocabulary stored at ``VOCAB_PATH``.

    Returns:
        The object decoded from the JSON file, or an empty dict when the
        file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        OSError: If the file exists but cannot be opened or read.
    """
    # Open directly instead of checking os.path.exists() first: the file
    # could disappear between the check and the open, so the missing-file
    # case is handled where it actually occurs.
    try:
        with open(VOCAB_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}


def save_vocab(vocab):
    """Write ``vocab`` to ``VOCAB_PATH`` as indented JSON.

    The parent directory is created when missing, and the data is written
    to a temporary sibling file that then replaces the real file, so a
    failed or interrupted dump never leaves a truncated vocabulary behind.

    Args:
        vocab: JSON-serialisable object to persist.

    Raises:
        TypeError: If ``vocab`` contains values ``json`` cannot serialise.
        OSError: If the directory or file cannot be created or written.
    """
    directory = os.path.dirname(VOCAB_PATH)
    if directory:
        # Without this the very first save fails when the data directory
        # has not been created yet.
        os.makedirs(directory, exist_ok=True)

    tmp_path = VOCAB_PATH + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(vocab, f, indent=2)
    except Exception:
        # The real file is still untouched at this point; just drop the
        # partial temporary file and let the caller see the error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    # Swap the fully written file into place in a single step.
    os.replace(tmp_path, VOCAB_PATH)


class Tokenizer:
    """Word-level tokenizer whose vocabulary grows on demand and is persisted.

    Attributes are populated by ``__init__``:
      * ``vocab`` maps a lower-cased word to its integer token id.
      * ``reverse_vocab`` maps a token id back to its word.
      * ``next_id`` is the id that will be given to the next unseen word.
    """

    # Compiled once for all instances; matches each run of word characters.
    _WORD_RE = re.compile(r"\b\w+\b")

    def __init__(self):
        """Load the stored vocabulary and derive the reverse map and next id."""
        self.vocab = load_vocab()
        self.reverse_vocab = {v: k for k, v in self.vocab.items()}
        # Ids continue after the largest stored one; an empty vocabulary
        # starts numbering at 1.
        self.next_id = max(self.vocab.values(), default=0) + 1

    def tokenize(self, text: str) -> list:
        """Convert ``text`` into a list of token ids.

        The text is lower-cased and split into runs of word characters.
        Words not yet in the vocabulary are assigned the next free id, and
        the vocabulary is saved only if at least one new word was added.

        Args:
            text: The string to tokenize.

        Returns:
            One token id per word, in order of appearance.
        """
        tokens = []
        vocab_grew = False
        for word in self._WORD_RE.findall(text.lower()):
            if word not in self.vocab:
                self.vocab[word] = self.next_id
                self.reverse_vocab[self.next_id] = word
                self.next_id += 1
                vocab_grew = True
            tokens.append(self.vocab[word])
        # Rewriting the whole vocabulary file is pointless when nothing
        # changed, so only persist after an actual addition.
        if vocab_grew:
            save_vocab(self.vocab)
        return tokens

    def detokenize(self, tokens) -> str:
        """Join the words for ``tokens`` with single spaces.

        Args:
            tokens: Iterable of token ids.

        Returns:
            The space-separated words; ids missing from ``reverse_vocab``
            are rendered as ``"<unk>"``.
        """
        return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)