"""NOVA Tokenizer - SentencePiece-based tokenization."""

import os
from typing import List, Optional, Union

import sentencepiece as spm


class NovaTokenizer:
    """SentencePiece tokenizer for NOVA.

    Supports both BPE and Unigram models with special tokens.
    """

    def __init__(
        self,
        model_path: str,
        add_bos: bool = True,
        add_eos: bool = True,
    ):
        """
        Args:
            model_path: Path to SentencePiece model file (.model)
            add_bos: Whether to add BOS token by default
            add_eos: Whether to add EOS token by default

        Raises:
            FileNotFoundError: If model_path does not exist.
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Tokenizer model not found: {model_path}")

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

        self.add_bos = add_bos
        self.add_eos = add_eos

        # Special token IDs (SentencePiece returns -1 when the model was
        # trained without the corresponding token)
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()
        self.unk_id = self.sp.unk_id()

        # Precomputed set of *defined* special ids for O(1) membership
        # tests when stripping special tokens during decode. Ids of -1
        # (undefined tokens) are excluded; they can never appear in
        # tokenizer output anyway.
        self._special_ids = frozenset(
            i for i in (self.bos_id, self.eos_id, self.pad_id) if i >= 0
        )

        # Vocabulary info
        self.vocab_size = self.sp.vocab_size()

    def encode(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """Encode text to token IDs.

        Args:
            text: Single string or list of strings
            add_bos: Override default BOS behavior (None = use default)
            add_eos: Override default EOS behavior (None = use default)

        Returns:
            Token IDs (single list for a string, list of lists for a batch)
        """
        add_bos = self.add_bos if add_bos is None else add_bos
        add_eos = self.add_eos if add_eos is None else add_eos

        if isinstance(text, str):
            ids = self.sp.Encode(text)
            if add_bos:
                ids = [self.bos_id] + ids
            if add_eos:
                ids = ids + [self.eos_id]
            return ids
        # Batch input: encode each string independently.
        return [self.encode(t, add_bos, add_eos) for t in text]

    def decode(
        self,
        ids: Union[List[int], List[List[int]]],
        skip_special_tokens: bool = True,
    ) -> Union[str, List[str]]:
        """Decode token IDs to text.

        Args:
            ids: Single list of IDs or list of lists
            skip_special_tokens: Whether to remove BOS/EOS/PAD tokens

        Returns:
            Decoded text (single string or list of strings)
        """
        # Guard empty input: indexing ids[0] below would raise IndexError.
        # An empty sequence decodes to the empty string.
        if not ids:
            return ""
        if isinstance(ids[0], list):
            return [self.decode(i, skip_special_tokens) for i in ids]
        if skip_special_tokens:
            # Remove BOS, EOS, PAD tokens (set built once in __init__)
            ids = [i for i in ids if i not in self._special_ids]
        return self.sp.Decode(ids)

    def encode_batch(
        self,
        texts: List[str],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> List[List[int]]:
        """Encode batch of texts."""
        return self.encode(texts, add_bos, add_eos)

    def decode_batch(
        self,
        ids_list: List[List[int]],
        skip_special_tokens: bool = True,
    ) -> List[str]:
        """Decode batch of token ID lists."""
        return self.decode(ids_list, skip_special_tokens)

    def __len__(self) -> int:
        """Return vocabulary size."""
        return self.vocab_size

    def __call__(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """Shorthand for encode."""
        return self.encode(text, add_bos, add_eos)

    def get_piece(self, token_id: int) -> str:
        """Get string piece for token ID."""
        return self.sp.IdToPiece(token_id)

    def get_id(self, piece: str) -> int:
        """Get token ID for string piece."""
        return self.sp.PieceToId(piece)

    @property
    def bos_token(self) -> str:
        """BOS token string ("" if the model defines no BOS token)."""
        return self.sp.IdToPiece(self.bos_id) if self.bos_id >= 0 else ""

    @property
    def eos_token(self) -> str:
        """EOS token string ("" if the model defines no EOS token)."""
        return self.sp.IdToPiece(self.eos_id) if self.eos_id >= 0 else ""

    @property
    def pad_token(self) -> str:
        """PAD token string ("" if the model defines no PAD token)."""
        return self.sp.IdToPiece(self.pad_id) if self.pad_id >= 0 else ""

    @property
    def unk_token(self) -> str:
        """UNK token string ("" if the model defines no UNK token)."""
        return self.sp.IdToPiece(self.unk_id) if self.unk_id >= 0 else ""