# Ruby/ego/tokenizer.py

import re
import os
import json
from utils.unicleaner import clean_unicode

# Location of the persisted vocabulary JSON file used by load_vocab() and
# save_vocab(). The path is relative, so it is resolved against the process's
# current working directory.
VOCAB_PATH = "memory/vocab.json"


def load_vocab(path=None):
    """Load the persisted vocabulary mapping from disk.

    Args:
        path: JSON file to read. Defaults to ``VOCAB_PATH`` when omitted.

    Returns:
        The decoded JSON content of the file, or an empty dict when the file
        does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON.
    """
    if path is None:
        path = VOCAB_PATH
    # EAFP: open directly instead of checking os.path.exists() first, so the
    # file cannot vanish between the check and the open.
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # Nothing has been saved yet (or the parent is not a directory):
        # report an empty vocabulary, as the existence check used to do.
        return {}


def save_vocab(vocab, path=None):
    """Write the vocabulary mapping to disk as indented JSON.

    Args:
        vocab: JSON-serialisable mapping to persist.
        path: Destination file. Defaults to ``VOCAB_PATH`` when omitted.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If ``vocab`` contains values ``json.dump`` cannot encode.
    """
    if path is None:
        path = VOCAB_PATH
    # open(..., "w") does not create missing directories, so make sure the
    # parent exists; without this the first save fails on a fresh checkout.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)


class Tokenizer:
    """Word-level tokenizer whose vocabulary grows as new words are seen.

    IDs 0-4 are reserved for the special tokens ``<pad>``, ``<unk>``,
    ``<start>``, ``<end>`` and ``<sep>``. Every other word receives the next
    free integer ID the first time it is tokenized.
    """

    # Reserved tokens and their fixed IDs.
    _SPECIAL_TOKENS = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3, "<sep>": 4}
    # The only one-character words that are kept as tokens.
    _KEPT_SINGLE_CHARS = frozenset({"a", "i"})
    # Compiled once instead of on every tokenize() call.
    _WORD_RE = re.compile(r"\b\w+\b")

    def __init__(self, vocab=None):
        """Create a tokenizer, restoring a previously built vocabulary.

        Args:
            vocab: Optional word -> ID mapping to start from. When omitted,
                the mapping returned by ``load_vocab()`` is used, so IDs
                assigned in earlier runs stay stable instead of being
                reassigned and the saved file overwritten. Pass ``{}`` to
                start with only the special tokens.

        Raises:
            Whatever ``load_vocab()`` raises when ``vocab`` is omitted and
            the persisted file cannot be read.
        """
        if vocab is None:
            vocab = load_vocab()

        # word -> ID. Special tokens always keep their reserved IDs, even if
        # the supplied mapping disagrees.
        self.vocab = dict(self._SPECIAL_TOKENS)
        for word, idx in vocab.items():
            self.vocab.setdefault(word, int(idx))
        # ID -> word, the inverse of self.vocab.
        self.reverse_vocab = {idx: word for word, idx in self.vocab.items()}
        # Next unused ID; one past the largest so restored IDs never collide.
        self.next_id = max(self.vocab.values()) + 1

    @staticmethod
    def _is_usable(word):
        """Return True when ``word`` should become a token."""
        # Words containing non-ASCII characters are dropped.
        if not word.isascii():
            return False
        # Single characters are dropped, except the real words 'a' and 'i'.
        if len(word) == 1 and word not in Tokenizer._KEPT_SINGLE_CHARS:
            return False
        # Words made only of underscores are dropped. Hyphens never reach
        # this point: the \w+ word pattern already splits on them.
        return bool(word.strip("_"))

    def _id_for(self, word):
        """Return the ID of ``word``, assigning the next free ID if new."""
        idx = self.vocab.get(word)
        if idx is None:
            idx = self.next_id
            self.vocab[word] = idx
            self.reverse_vocab[idx] = word
            self.next_id += 1
        return idx

    def tokenize(self, text):
        """Convert ``text`` into a list of token IDs, growing the vocabulary.

        The text is passed through ``clean_unicode``, lower-cased and split
        into ``\\w+`` words; words rejected by ``_is_usable`` are skipped.
        The vocabulary is persisted with ``save_vocab`` only when this call
        added at least one new word.

        Args:
            text: Input string.

        Returns:
            List of integer token IDs, in order of appearance.
        """
        words = self._WORD_RE.findall(clean_unicode(text).lower())

        size_before = len(self.vocab)
        tokens = []
        for word in words:
            if self._is_usable(word):
                tokens.append(self._id_for(word))

        # Skip the disk write when nothing changed.
        if len(self.vocab) > size_before:
            save_vocab(self.vocab)
        return tokens

    def detokenize(self, tokens):
        """Join the words for ``tokens`` with single spaces.

        Args:
            tokens: A single int ID or an iterable of IDs.

        Returns:
            Space-separated words; unknown IDs are rendered as ``<unk>``.
        """
        if isinstance(tokens, int):
            tokens = [tokens]
        return " ".join(self.reverse_vocab.get(t, "<unk>") for t in tokens)

    def token_to_id(self, token: str) -> int:
        """Return the ID of ``token``, or the ``<unk>`` ID if it is unknown."""
        return self.vocab.get(token, self.vocab["<unk>"])

    def id_to_token(self, idx: int) -> str:
        """Return the word for ``idx``, or ``<unk>`` if the ID is unknown."""
        return self.reverse_vocab.get(idx, "<unk>")