Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
nova_tokenizer/__init__.py (new file, 11 lines)
@@ -0,0 +1,11 @@
"""
NOVA Tokenizer - SentencePiece-based tokenization
"""

from .tokenizer import NovaTokenizer
from .trainer import train_tokenizer

__all__ = [
    'NovaTokenizer',
    'train_tokenizer',
]
nova_tokenizer/tokenizer.py (new file, 157 lines)
@@ -0,0 +1,157 @@
"""
NOVA Tokenizer - SentencePiece-based tokenization
"""

import sentencepiece as spm
from typing import List, Union, Optional
import os


class NovaTokenizer:
    """
    SentencePiece tokenizer for NOVA

    Supports both BPE and Unigram models with special tokens
    """

    def __init__(
        self,
        model_path: str,
        add_bos: bool = True,
        add_eos: bool = True,
    ):
        """
        Args:
            model_path: Path to SentencePiece model file (.model)
            add_bos: Whether to add BOS token by default
            add_eos: Whether to add EOS token by default
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Tokenizer model not found: {model_path}")

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

        self.add_bos = add_bos
        self.add_eos = add_eos

        # Special token IDs
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()
        self.unk_id = self.sp.unk_id()

        # Vocabulary info
        self.vocab_size = self.sp.vocab_size()

    def encode(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """
        Encode text to token IDs

        Args:
            text: Single string or list of strings
            add_bos: Override default BOS behavior
            add_eos: Override default EOS behavior

        Returns:
            Token IDs (single list or list of lists)
        """
        add_bos = self.add_bos if add_bos is None else add_bos
        add_eos = self.add_eos if add_eos is None else add_eos

        if isinstance(text, str):
            ids = self.sp.Encode(text)
            if add_bos:
                ids = [self.bos_id] + ids
            if add_eos:
                ids = ids + [self.eos_id]
            return ids
        else:
            return [self.encode(t, add_bos, add_eos) for t in text]

    def decode(
        self,
        ids: Union[List[int], List[List[int]]],
        skip_special_tokens: bool = True,
    ) -> Union[str, List[str]]:
        """
        Decode token IDs to text

        Args:
            ids: Single list of IDs or list of lists
            skip_special_tokens: Whether to remove special tokens

        Returns:
            Decoded text (single string or list of strings)
        """
        if isinstance(ids[0], list):
            return [self.decode(i, skip_special_tokens) for i in ids]

        if skip_special_tokens:
            # Remove BOS, EOS, PAD tokens
            ids = [i for i in ids if i not in [self.bos_id, self.eos_id, self.pad_id]]

        return self.sp.Decode(ids)

    def encode_batch(
        self,
        texts: List[str],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> List[List[int]]:
        """Encode batch of texts"""
        return self.encode(texts, add_bos, add_eos)

    def decode_batch(
        self,
        ids_list: List[List[int]],
        skip_special_tokens: bool = True,
    ) -> List[str]:
        """Decode batch of token ID lists"""
        return self.decode(ids_list, skip_special_tokens)

    def __len__(self) -> int:
        """Return vocabulary size"""
        return self.vocab_size

    def __call__(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """Shorthand for encode"""
        return self.encode(text, add_bos, add_eos)

    def get_piece(self, token_id: int) -> str:
        """Get string piece for token ID"""
        return self.sp.IdToPiece(token_id)

    def get_id(self, piece: str) -> int:
        """Get token ID for string piece"""
        return self.sp.PieceToId(piece)

    @property
    def bos_token(self) -> str:
        """BOS token string"""
        return self.sp.IdToPiece(self.bos_id) if self.bos_id >= 0 else ""

    @property
    def eos_token(self) -> str:
        """EOS token string"""
        return self.sp.IdToPiece(self.eos_id) if self.eos_id >= 0 else ""

    @property
    def pad_token(self) -> str:
        """PAD token string"""
        return self.sp.IdToPiece(self.pad_id) if self.pad_id >= 0 else ""

    @property
    def unk_token(self) -> str:
        """UNK token string"""
        return self.sp.IdToPiece(self.unk_id) if self.unk_id >= 0 else ""
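For reference, a minimal usage sketch of NovaTokenizer (not part of the commit). It assumes a trained SentencePiece model already exists; the path "tokenizer.model" is a hypothetical placeholder.

    # Usage sketch: "tokenizer.model" is a placeholder path to a trained model.
    from nova_tokenizer import NovaTokenizer

    tok = NovaTokenizer("tokenizer.model", add_bos=True, add_eos=True)

    ids = tok.encode("Hello, NOVA!")                 # [bos_id, ..., eos_id]
    text = tok.decode(ids)                           # special tokens stripped by default
    batch = tok.encode_batch(["Hi there", "How are you?"])  # list of ID lists

    print(len(tok))                   # vocabulary size
    print(tok.bos_token, tok.eos_token)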
nova_tokenizer/trainer.py (new file, 152 lines)
@@ -0,0 +1,152 @@
"""
SentencePiece tokenizer trainer
"""

import sentencepiece as spm
from pathlib import Path
from typing import List, Optional
import tempfile


def train_tokenizer(
    input_files: List[str],
    model_prefix: str,
    vocab_size: int = 32000,
    model_type: str = "bpe",  # or "unigram"
    character_coverage: float = 0.9995,
    num_threads: int = 4,
    user_defined_symbols: Optional[List[str]] = None,
    max_sentence_length: int = 16384,
    shuffle_input_sentence: bool = True,
    seed_sentencepiece_size: int = 1000000,
    **kwargs
) -> str:
    """
    Train a SentencePiece tokenizer

    Args:
        input_files: List of text file paths for training
        model_prefix: Output model path prefix (will create .model and .vocab files)
        vocab_size: Target vocabulary size
        model_type: 'bpe' or 'unigram'
        character_coverage: Character coverage (0.9995 for multilingual, 1.0 for single language)
        num_threads: Number of threads for training
        user_defined_symbols: Optional list of user-defined symbols to add
        max_sentence_length: Maximum sentence length
        shuffle_input_sentence: Whether to shuffle input sentences
        seed_sentencepiece_size: Number of sentences to use for initial seed
        **kwargs: Additional arguments to pass to SentencePiece trainer

    Returns:
        Path to trained model file
    """
    # Validate input files
    for f in input_files:
        if not Path(f).exists():
            raise FileNotFoundError(f"Input file not found: {f}")

    # Prepare training arguments
    train_args = {
        'input': ','.join(input_files),
        'model_prefix': model_prefix,
        'vocab_size': vocab_size,
        'model_type': model_type,
        'character_coverage': character_coverage,
        'num_threads': num_threads,
        'max_sentence_length': max_sentence_length,
        'shuffle_input_sentence': shuffle_input_sentence,
        'seed_sentencepiece_size': seed_sentencepiece_size,

        # Special tokens
        'pad_id': 0,
        'unk_id': 1,
        'bos_id': 2,
        'eos_id': 3,
        'pad_piece': '<pad>',
        'unk_piece': '<unk>',
        'bos_piece': '<s>',
        'eos_piece': '</s>',

        # User-defined symbols (e.g., for special control tokens)
        'user_defined_symbols': user_defined_symbols or [],

        # Normalization
        'normalization_rule_name': 'nmt_nfkc_cf',  # Standard normalization
        'remove_extra_whitespaces': True,
        'split_by_unicode_script': True,
        'split_by_whitespace': True,
        'split_by_number': True,
        'split_digits': True,
        'byte_fallback': True,  # Handle unknown bytes
    }

    # Add any additional kwargs
    train_args.update(kwargs)

    # Train the model
    print(f"Training {model_type.upper()} tokenizer with vocab size {vocab_size}...")
    print(f"Input files: {len(input_files)} file(s)")
    print(f"Output: {model_prefix}.model")

    # Join list-valued options (e.g. user_defined_symbols) into comma-separated
    # strings, which the SentencePiece trainer accepts; str(list) would not parse.
    spm.SentencePieceTrainer.Train(**{k: ','.join(v) if isinstance(v, list) else v
                                      for k, v in train_args.items()})

    model_path = f"{model_prefix}.model"

    # Verify the model was created
    if not Path(model_path).exists():
        raise RuntimeError(f"Model training failed - {model_path} not created")

    # Print vocab info
    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)
    print("✓ Tokenizer trained successfully!")
    print(f"  Vocabulary size: {sp.vocab_size()}")
    print(f"  BOS token: {sp.IdToPiece(sp.bos_id())} (ID: {sp.bos_id()})")
    print(f"  EOS token: {sp.IdToPiece(sp.eos_id())} (ID: {sp.eos_id()})")
    print(f"  PAD token: {sp.IdToPiece(sp.pad_id())} (ID: {sp.pad_id()})")
    print(f"  UNK token: {sp.IdToPiece(sp.unk_id())} (ID: {sp.unk_id()})")

    return model_path


def train_from_text(
    texts: List[str],
    model_prefix: str,
    vocab_size: int = 32000,
    model_type: str = "bpe",
    **kwargs
) -> str:
    """
    Train tokenizer directly from list of texts (without needing files)

    Args:
        texts: List of text strings
        model_prefix: Output model path prefix
        vocab_size: Target vocabulary size
        model_type: 'bpe' or 'unigram'
        **kwargs: Additional arguments

    Returns:
        Path to trained model file
    """
    # Write texts to temporary file
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as f:
        for text in texts:
            f.write(text.strip() + '\n')
        temp_file = f.name

    try:
        # Train using the temporary file
        model_path = train_tokenizer(
            input_files=[temp_file],
            model_prefix=model_prefix,
            vocab_size=vocab_size,
            model_type=model_type,
            **kwargs
        )
    finally:
        # Clean up temp file
        Path(temp_file).unlink(missing_ok=True)

    return model_path
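For reference, a minimal training sketch (not part of the commit). The corpus path "data/corpus.txt", the output prefix "nova_bpe", and the control tokens shown are hypothetical placeholders.

    # Usage sketch: paths, prefix, and control tokens below are placeholders.
    from nova_tokenizer import NovaTokenizer
    from nova_tokenizer.trainer import train_tokenizer

    model_path = train_tokenizer(
        input_files=["data/corpus.txt"],   # one or more plain-text files
        model_prefix="nova_bpe",           # writes nova_bpe.model and nova_bpe.vocab
        vocab_size=32000,
        model_type="bpe",
        user_defined_symbols=["<|user|>", "<|assistant|>"],  # example control tokens
    )

    tok = NovaTokenizer(model_path)
    print(tok.encode("Hello, NOVA!"))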