Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
nova_tokenizer/__init__.py (new file, 11 lines)
@@ -0,0 +1,11 @@
"""
NOVA Tokenizer - SentencePiece-based tokenization
"""

from .tokenizer import NovaTokenizer
from .trainer import train_tokenizer

__all__ = [
    'NovaTokenizer',
    'train_tokenizer',
]
nova_tokenizer/tokenizer.py (new file, 157 lines)
@@ -0,0 +1,157 @@
"""
NOVA Tokenizer - SentencePiece-based tokenization
"""

import sentencepiece as spm
from typing import List, Union, Optional
import os


class NovaTokenizer:
    """
    SentencePiece tokenizer for NOVA

    Supports both BPE and Unigram models with special tokens
    """

    def __init__(
        self,
        model_path: str,
        add_bos: bool = True,
        add_eos: bool = True,
    ):
        """
        Args:
            model_path: Path to SentencePiece model file (.model)
            add_bos: Whether to add BOS token by default
            add_eos: Whether to add EOS token by default
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Tokenizer model not found: {model_path}")

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

        self.add_bos = add_bos
        self.add_eos = add_eos

        # Special token IDs
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()
        self.unk_id = self.sp.unk_id()

        # Vocabulary info
        self.vocab_size = self.sp.vocab_size()

    def encode(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """
        Encode text to token IDs

        Args:
            text: Single string or list of strings
            add_bos: Override default BOS behavior
            add_eos: Override default EOS behavior

        Returns:
            Token IDs (single list or list of lists)
        """
        add_bos = self.add_bos if add_bos is None else add_bos
        add_eos = self.add_eos if add_eos is None else add_eos

        if isinstance(text, str):
            ids = self.sp.Encode(text)
            if add_bos:
                ids = [self.bos_id] + ids
            if add_eos:
                ids = ids + [self.eos_id]
            return ids
        else:
            return [self.encode(t, add_bos, add_eos) for t in text]

    def decode(
        self,
        ids: Union[List[int], List[List[int]]],
        skip_special_tokens: bool = True,
    ) -> Union[str, List[str]]:
        """
        Decode token IDs to text

        Args:
            ids: Single list of IDs or list of lists
            skip_special_tokens: Whether to remove special tokens

        Returns:
            Decoded text (single string or list of strings)
        """
        if isinstance(ids[0], list):
            return [self.decode(i, skip_special_tokens) for i in ids]

        if skip_special_tokens:
            # Remove BOS, EOS, PAD tokens
            ids = [i for i in ids if i not in [self.bos_id, self.eos_id, self.pad_id]]

        return self.sp.Decode(ids)

    def encode_batch(
        self,
        texts: List[str],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> List[List[int]]:
        """Encode batch of texts"""
        return self.encode(texts, add_bos, add_eos)

    def decode_batch(
        self,
        ids_list: List[List[int]],
        skip_special_tokens: bool = True,
    ) -> List[str]:
        """Decode batch of token ID lists"""
        return self.decode(ids_list, skip_special_tokens)

    def __len__(self) -> int:
        """Return vocabulary size"""
        return self.vocab_size

    def __call__(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """Shorthand for encode"""
        return self.encode(text, add_bos, add_eos)

    def get_piece(self, token_id: int) -> str:
        """Get string piece for token ID"""
        return self.sp.IdToPiece(token_id)

    def get_id(self, piece: str) -> int:
        """Get token ID for string piece"""
        return self.sp.PieceToId(piece)

    @property
    def bos_token(self) -> str:
        """BOS token string"""
        return self.sp.IdToPiece(self.bos_id) if self.bos_id >= 0 else ""

    @property
    def eos_token(self) -> str:
        """EOS token string"""
        return self.sp.IdToPiece(self.eos_id) if self.eos_id >= 0 else ""

    @property
    def pad_token(self) -> str:
        """PAD token string"""
        return self.sp.IdToPiece(self.pad_id) if self.pad_id >= 0 else ""

    @property
    def unk_token(self) -> str:
        """UNK token string"""
        return self.sp.IdToPiece(self.unk_id) if self.unk_id >= 0 else ""
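For reference, a minimal usage sketch of NovaTokenizer (not part of the commit). It assumes a trained SentencePiece model already exists; the path "tokenizer.model" is a hypothetical placeholder.

    # Usage sketch: "tokenizer.model" is a placeholder path to a trained model.
    from nova_tokenizer import NovaTokenizer

    tok = NovaTokenizer("tokenizer.model", add_bos=True, add_eos=True)

    ids = tok.encode("Hello, NOVA!")                 # [bos_id, ..., eos_id]
    text = tok.decode(ids)                           # special tokens stripped by default
    batch = tok.encode_batch(["Hi there", "How are you?"])  # list of ID lists

    print(len(tok))                   # vocabulary size
    print(tok.bos_token, tok.eos_token)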
nova_tokenizer/trainer.py (new file, 152 lines)
@@ -0,0 +1,152 @@
"""
SentencePiece tokenizer trainer
"""

import sentencepiece as spm
from pathlib import Path
from typing import List, Optional
import tempfile


def train_tokenizer(
    input_files: List[str],
    model_prefix: str,
    vocab_size: int = 32000,
    model_type: str = "bpe",  # or "unigram"
    character_coverage: float = 0.9995,
    num_threads: int = 4,
    user_defined_symbols: Optional[List[str]] = None,
    max_sentence_length: int = 16384,
    shuffle_input_sentence: bool = True,
    seed_sentencepiece_size: int = 1000000,
    **kwargs
) -> str:
    """
    Train a SentencePiece tokenizer

    Args:
        input_files: List of text file paths for training
        model_prefix: Output model path prefix (will create .model and .vocab files)
        vocab_size: Target vocabulary size
        model_type: 'bpe' or 'unigram'
        character_coverage: Character coverage (0.9995 for multilingual, 1.0 for single language)
        num_threads: Number of threads for training
        user_defined_symbols: Optional list of user-defined symbols to add
        max_sentence_length: Maximum sentence length
        shuffle_input_sentence: Whether to shuffle input sentences
        seed_sentencepiece_size: Number of sentences to use for initial seed
        **kwargs: Additional arguments to pass to SentencePiece trainer

    Returns:
        Path to trained model file
    """
    # Validate input files
    for f in input_files:
        if not Path(f).exists():
            raise FileNotFoundError(f"Input file not found: {f}")

    # Prepare training arguments
    train_args = {
        'input': ','.join(input_files),
        'model_prefix': model_prefix,
        'vocab_size': vocab_size,
        'model_type': model_type,
        'character_coverage': character_coverage,
        'num_threads': num_threads,
        'max_sentence_length': max_sentence_length,
        'shuffle_input_sentence': shuffle_input_sentence,
        'seed_sentencepiece_size': seed_sentencepiece_size,

        # Special tokens
        'pad_id': 0,
        'unk_id': 1,
        'bos_id': 2,
        'eos_id': 3,
        'pad_piece': '<pad>',
        'unk_piece': '<unk>',
        'bos_piece': '<s>',
        'eos_piece': '</s>',

        # User-defined symbols (e.g., for special control tokens)
        'user_defined_symbols': user_defined_symbols or [],

        # Normalization
        'normalization_rule_name': 'nmt_nfkc_cf',  # Standard normalization
        'remove_extra_whitespaces': True,
        'split_by_unicode_script': True,
        'split_by_whitespace': True,
        'split_by_number': True,
        'split_digits': True,
        'byte_fallback': True,  # Handle unknown bytes
    }

    # Add any additional kwargs
    train_args.update(kwargs)

    # Train the model
    print(f"Training {model_type.upper()} tokenizer with vocab size {vocab_size}...")
    print(f"Input files: {len(input_files)} file(s)")
    print(f"Output: {model_prefix}.model")

    # Join list-valued options (e.g. user_defined_symbols) into comma-separated
    # strings, which the SentencePiece trainer accepts; str(list) would not parse.
    spm.SentencePieceTrainer.Train(**{k: ','.join(v) if isinstance(v, list) else v
                                      for k, v in train_args.items()})

    model_path = f"{model_prefix}.model"

    # Verify the model was created
    if not Path(model_path).exists():
        raise RuntimeError(f"Model training failed - {model_path} not created")

    # Print vocab info
    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)
    print("✓ Tokenizer trained successfully!")
    print(f"  Vocabulary size: {sp.vocab_size()}")
    print(f"  BOS token: {sp.IdToPiece(sp.bos_id())} (ID: {sp.bos_id()})")
    print(f"  EOS token: {sp.IdToPiece(sp.eos_id())} (ID: {sp.eos_id()})")
    print(f"  PAD token: {sp.IdToPiece(sp.pad_id())} (ID: {sp.pad_id()})")
    print(f"  UNK token: {sp.IdToPiece(sp.unk_id())} (ID: {sp.unk_id()})")

    return model_path


def train_from_text(
    texts: List[str],
    model_prefix: str,
    vocab_size: int = 32000,
    model_type: str = "bpe",
    **kwargs
) -> str:
    """
    Train tokenizer directly from list of texts (without needing files)

    Args:
        texts: List of text strings
        model_prefix: Output model path prefix
        vocab_size: Target vocabulary size
        model_type: 'bpe' or 'unigram'
        **kwargs: Additional arguments

    Returns:
        Path to trained model file
    """
    # Write texts to temporary file
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as f:
        for text in texts:
            f.write(text.strip() + '\n')
        temp_file = f.name

    try:
        # Train using the temporary file
        model_path = train_tokenizer(
            input_files=[temp_file],
            model_prefix=model_prefix,
            vocab_size=vocab_size,
            model_type=model_type,
            **kwargs
        )
    finally:
        # Clean up temp file
        Path(temp_file).unlink(missing_ok=True)

    return model_path
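For reference, a minimal training sketch (not part of the commit). The corpus path "data/corpus.txt", the output prefix "nova_bpe", and the control tokens shown are hypothetical placeholders.

    # Usage sketch: paths, prefix, and control tokens below are placeholders.
    from nova_tokenizer import NovaTokenizer
    from nova_tokenizer.trainer import train_tokenizer

    model_path = train_tokenizer(
        input_files=["data/corpus.txt"],   # one or more plain-text files
        model_prefix="nova_bpe",           # writes nova_bpe.model and nova_bpe.vocab
        vocab_size=32000,
        model_type="bpe",
        user_defined_symbols=["<|user|>", "<|assistant|>"],  # example control tokens
    )

    tok = NovaTokenizer(model_path)
    print(tok.encode("Hello, NOVA!"))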