Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
"""
|
|
NOVA Tokenizer - SentencePiece-based tokenization
|
|
"""
|
|
|
|
import sentencepiece as spm
|
|
from typing import List, Union, Optional
|
|
import os
|
|
|
|
|
|
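
# A minimal sketch (not part of this module) of producing the .model file this
# class loads, using SentencePiece's standard trainer; the corpus path, model
# prefix, and vocabulary size are placeholder assumptions, not project settings:
#
#   spm.SentencePieceTrainer.Train(
#       input="data/corpus.txt",        # hypothetical plain-text corpus
#       model_prefix="nova_tokenizer",  # writes nova_tokenizer.model / .vocab
#       vocab_size=32000,
#       model_type="bpe",               # or "unigram"
#       pad_id=3, bos_id=1, eos_id=2, unk_id=0,
#   )
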

class NovaTokenizer:
    """
    SentencePiece tokenizer for NOVA.

    Supports both BPE and Unigram models with special tokens.
    """

    def __init__(
        self,
        model_path: str,
        add_bos: bool = True,
        add_eos: bool = True,
    ):
        """
        Args:
            model_path: Path to the SentencePiece model file (.model)
            add_bos: Whether to add a BOS token by default
            add_eos: Whether to add an EOS token by default
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Tokenizer model not found: {model_path}")

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

        self.add_bos = add_bos
        self.add_eos = add_eos

        # Special token IDs (SentencePiece returns -1 for any special token
        # the model was trained without)
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()
        self.unk_id = self.sp.unk_id()

        # Vocabulary info
        self.vocab_size = self.sp.vocab_size()
    def encode(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """
        Encode text to token IDs.

        Args:
            text: Single string or list of strings
            add_bos: Override the default BOS behavior
            add_eos: Override the default EOS behavior

        Returns:
            Token IDs (a single list, or a list of lists for batched input)
        """
        add_bos = self.add_bos if add_bos is None else add_bos
        add_eos = self.add_eos if add_eos is None else add_eos

        if isinstance(text, str):
            ids = self.sp.Encode(text)
            if add_bos:
                ids = [self.bos_id] + ids
            if add_eos:
                ids = ids + [self.eos_id]
            return ids
        else:
            return [self.encode(t, add_bos, add_eos) for t in text]
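
    # Illustrative encode() behavior; the piece IDs depend entirely on the
    # trained model, and the bos_id=1 / eos_id=2 layout shown is an assumption:
    #   tok.encode("hi")                -> [1, ...pieces..., 2]
    #   tok.encode("hi", add_eos=False) -> [1, ...pieces...]
    #   tok.encode(["a", "b"])          -> [[...], [...]]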
    def decode(
        self,
        ids: Union[List[int], List[List[int]]],
        skip_special_tokens: bool = True,
    ) -> Union[str, List[str]]:
        """
        Decode token IDs to text.

        Args:
            ids: Single list of IDs or list of lists
            skip_special_tokens: Whether to remove special tokens

        Returns:
            Decoded text (a single string or a list of strings)
        """
        if not ids:
            # Guard against empty input before indexing ids[0] below
            return ""

        if isinstance(ids[0], list):
            return [self.decode(i, skip_special_tokens) for i in ids]

        if skip_special_tokens:
            # Remove BOS, EOS, and PAD tokens
            ids = [i for i in ids if i not in (self.bos_id, self.eos_id, self.pad_id)]

        return self.sp.Decode(ids)
    def encode_batch(
        self,
        texts: List[str],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> List[List[int]]:
        """Encode a batch of texts."""
        return self.encode(texts, add_bos, add_eos)

    def decode_batch(
        self,
        ids_list: List[List[int]],
        skip_special_tokens: bool = True,
    ) -> List[str]:
        """Decode a batch of token ID lists."""
        return self.decode(ids_list, skip_special_tokens)

    def __len__(self) -> int:
        """Return the vocabulary size."""
        return self.vocab_size
    def __call__(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """Shorthand for encode()."""
        return self.encode(text, add_bos, add_eos)

    def get_piece(self, token_id: int) -> str:
        """Get the string piece for a token ID."""
        return self.sp.IdToPiece(token_id)

    def get_id(self, piece: str) -> int:
        """Get the token ID for a string piece."""
        return self.sp.PieceToId(piece)
    @property
    def bos_token(self) -> str:
        """BOS token string ("" if the model defines no BOS token)."""
        return self.sp.IdToPiece(self.bos_id) if self.bos_id >= 0 else ""

    @property
    def eos_token(self) -> str:
        """EOS token string ("" if the model defines no EOS token)."""
        return self.sp.IdToPiece(self.eos_id) if self.eos_id >= 0 else ""

    @property
    def pad_token(self) -> str:
        """PAD token string ("" if the model defines no PAD token)."""
        return self.sp.IdToPiece(self.pad_id) if self.pad_id >= 0 else ""

    @property
    def unk_token(self) -> str:
        """UNK token string ("" if the model defines no UNK token)."""
        return self.sp.IdToPiece(self.unk_id) if self.unk_id >= 0 else ""
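
if __name__ == "__main__":
    # Minimal usage sketch; the model path is a placeholder assumption, not a
    # file shipped with the repo (see the training sketch near the imports).
    tokenizer = NovaTokenizer("nova_tokenizer.model")

    # Single-string round trip (BOS/EOS are added by default)
    ids = tokenizer.encode("Hello, NOVA!")
    print(ids)
    print(tokenizer.decode(ids))

    # Batched encode/decode
    batch = tokenizer.encode_batch(["first line", "second line"])
    print(tokenizer.decode_batch(batch))

    # Vocabulary introspection
    print(len(tokenizer), repr(tokenizer.bos_token), repr(tokenizer.eos_token))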