Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
"""
|
|
NOVA Tokenizer - SentencePiece-based tokenization
|
|
"""
|
|
|
|
import sentencepiece as spm
|
|
from typing import List, Union, Optional
|
|
import os
|
|
|
|
|
|
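
# A minimal sketch (not part of this module) of producing the .model file this
# class loads, using SentencePiece's standard trainer; the corpus path, model
# prefix, and vocabulary size are placeholder assumptions, not project settings:
#
#   spm.SentencePieceTrainer.Train(
#       input="data/corpus.txt",        # hypothetical plain-text corpus
#       model_prefix="nova_tokenizer",  # writes nova_tokenizer.model / .vocab
#       vocab_size=32000,
#       model_type="bpe",               # or "unigram"
#       pad_id=3, bos_id=1, eos_id=2, unk_id=0,
#   )
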

class NovaTokenizer:
    """
    SentencePiece tokenizer for NOVA.

    Supports both BPE and Unigram models with special tokens.
    """

    def __init__(
        self,
        model_path: str,
        add_bos: bool = True,
        add_eos: bool = True,
    ):
        """
        Args:
            model_path: Path to the SentencePiece model file (.model)
            add_bos: Whether to add a BOS token by default
            add_eos: Whether to add an EOS token by default
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Tokenizer model not found: {model_path}")

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

        self.add_bos = add_bos
        self.add_eos = add_eos

        # Special token IDs (SentencePiece returns -1 for any special token
        # the model was trained without)
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()
        self.unk_id = self.sp.unk_id()

        # Vocabulary info
        self.vocab_size = self.sp.vocab_size()
    def encode(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """
        Encode text to token IDs.

        Args:
            text: Single string or list of strings
            add_bos: Override the default BOS behavior
            add_eos: Override the default EOS behavior

        Returns:
            Token IDs (a single list, or a list of lists for batched input)
        """
        add_bos = self.add_bos if add_bos is None else add_bos
        add_eos = self.add_eos if add_eos is None else add_eos

        if isinstance(text, str):
            ids = self.sp.Encode(text)
            if add_bos:
                ids = [self.bos_id] + ids
            if add_eos:
                ids = ids + [self.eos_id]
            return ids
        else:
            return [self.encode(t, add_bos, add_eos) for t in text]
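
    # Illustrative encode() behavior; the piece IDs depend entirely on the
    # trained model, and the bos_id=1 / eos_id=2 layout shown is an assumption:
    #   tok.encode("hi")                -> [1, ...pieces..., 2]
    #   tok.encode("hi", add_eos=False) -> [1, ...pieces...]
    #   tok.encode(["a", "b"])          -> [[...], [...]]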
    def decode(
        self,
        ids: Union[List[int], List[List[int]]],
        skip_special_tokens: bool = True,
    ) -> Union[str, List[str]]:
        """
        Decode token IDs to text.

        Args:
            ids: Single list of IDs or list of lists
            skip_special_tokens: Whether to remove special tokens

        Returns:
            Decoded text (a single string or a list of strings)
        """
        if not ids:
            # Guard against empty input before indexing ids[0] below
            return ""

        if isinstance(ids[0], list):
            return [self.decode(i, skip_special_tokens) for i in ids]

        if skip_special_tokens:
            # Remove BOS, EOS, and PAD tokens
            ids = [i for i in ids if i not in (self.bos_id, self.eos_id, self.pad_id)]

        return self.sp.Decode(ids)
    def encode_batch(
        self,
        texts: List[str],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> List[List[int]]:
        """Encode a batch of texts."""
        return self.encode(texts, add_bos, add_eos)

    def decode_batch(
        self,
        ids_list: List[List[int]],
        skip_special_tokens: bool = True,
    ) -> List[str]:
        """Decode a batch of token ID lists."""
        return self.decode(ids_list, skip_special_tokens)

    def __len__(self) -> int:
        """Return the vocabulary size."""
        return self.vocab_size
    def __call__(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """Shorthand for encode()."""
        return self.encode(text, add_bos, add_eos)

    def get_piece(self, token_id: int) -> str:
        """Get the string piece for a token ID."""
        return self.sp.IdToPiece(token_id)

    def get_id(self, piece: str) -> int:
        """Get the token ID for a string piece."""
        return self.sp.PieceToId(piece)
    @property
    def bos_token(self) -> str:
        """BOS token string ("" if the model defines no BOS token)."""
        return self.sp.IdToPiece(self.bos_id) if self.bos_id >= 0 else ""

    @property
    def eos_token(self) -> str:
        """EOS token string ("" if the model defines no EOS token)."""
        return self.sp.IdToPiece(self.eos_id) if self.eos_id >= 0 else ""

    @property
    def pad_token(self) -> str:
        """PAD token string ("" if the model defines no PAD token)."""
        return self.sp.IdToPiece(self.pad_id) if self.pad_id >= 0 else ""

    @property
    def unk_token(self) -> str:
        """UNK token string ("" if the model defines no UNK token)."""
        return self.sp.IdToPiece(self.unk_id) if self.unk_id >= 0 else ""
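
if __name__ == "__main__":
    # Minimal usage sketch; the model path is a placeholder assumption, not a
    # file shipped with the repo (see the training sketch near the imports).
    tokenizer = NovaTokenizer("nova_tokenizer.model")

    # Single-string round trip (BOS/EOS are added by default)
    ids = tokenizer.encode("Hello, NOVA!")
    print(ids)
    print(tokenizer.decode(ids))

    # Batched encode/decode
    batch = tokenizer.encode_batch(["first line", "second line"])
    print(tokenizer.decode_batch(batch))

    # Vocabulary introspection
    print(len(tokenizer), repr(tokenizer.bos_token), repr(tokenizer.eos_token))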