Initial commit: NOVA - Neuro-Optimizing Versatile Agent

Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache; see the RMSNorm sketch after this list)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)
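
As a point of reference for the architecture bullet above, here is a minimal, illustrative RMSNorm sketch (a generic rendering of the technique, not necessarily this repository's exact implementation):

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Root-mean-square normalization: rescale by RMS, no mean-centering."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))  # learned per-channel gain

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x / sqrt(mean(x^2) + eps), then scale by the learned gain
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x * rms)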

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
commit a7f091aa45
2025-10-12 20:56:37 -04:00
50 changed files with 6437 additions and 0 deletions

nova_tokenizer/tokenizer.py (new file, 157 lines)

@@ -0,0 +1,157 @@
"""
NOVA Tokenizer - SentencePiece-based tokenization
"""
import os
from typing import List, Optional, Union

import sentencepiece as spm


class NovaTokenizer:
"""
    SentencePiece tokenizer for NOVA.

    Supports both BPE and Unigram models with special tokens.
    """

    def __init__(
self,
model_path: str,
add_bos: bool = True,
add_eos: bool = True,
):
"""
Args:
model_path: Path to SentencePiece model file (.model)
add_bos: Whether to add BOS token by default
add_eos: Whether to add EOS token by default
"""
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Tokenizer model not found: {model_path}")

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

        self.add_bos = add_bos
        self.add_eos = add_eos

        # Special token IDs (SentencePiece returns -1 for tokens the model does not define)
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()
        self.unk_id = self.sp.unk_id()

        # Vocabulary info
        self.vocab_size = self.sp.vocab_size()

    def encode(
self,
text: Union[str, List[str]],
add_bos: Optional[bool] = None,
add_eos: Optional[bool] = None,
) -> Union[List[int], List[List[int]]]:
"""
Encode text to token IDs
Args:
text: Single string or list of strings
add_bos: Override default BOS behavior
add_eos: Override default EOS behavior
Returns:
Token IDs (single list or list of lists)
"""
add_bos = self.add_bos if add_bos is None else add_bos
add_eos = self.add_eos if add_eos is None else add_eos
if isinstance(text, str):
ids = self.sp.Encode(text)
if add_bos:
ids = [self.bos_id] + ids
if add_eos:
ids = ids + [self.eos_id]
return ids
else:
            return [self.encode(t, add_bos, add_eos) for t in text]

    def decode(
self,
ids: Union[List[int], List[List[int]]],
skip_special_tokens: bool = True,
) -> Union[str, List[str]]:
"""
Decode token IDs to text
Args:
ids: Single list of IDs or list of lists
skip_special_tokens: Whether to remove special tokens
Returns:
Decoded text (single string or list of strings)
"""
if isinstance(ids[0], list):
return [self.decode(i, skip_special_tokens) for i in ids]
if skip_special_tokens:
# Remove BOS, EOS, PAD tokens
ids = [i for i in ids if i not in [self.bos_id, self.eos_id, self.pad_id]]
return self.sp.Decode(ids)
def encode_batch(
        self,
        texts: List[str],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> List[List[int]]:
        """Encode batch of texts"""
        return self.encode(texts, add_bos, add_eos)

    def decode_batch(
        self,
        ids_list: List[List[int]],
        skip_special_tokens: bool = True,
    ) -> List[str]:
        """Decode batch of token ID lists"""
        return self.decode(ids_list, skip_special_tokens)

    def __len__(self) -> int:
        """Return vocabulary size"""
        return self.vocab_size

    def __call__(
        self,
        text: Union[str, List[str]],
        add_bos: Optional[bool] = None,
        add_eos: Optional[bool] = None,
    ) -> Union[List[int], List[List[int]]]:
        """Shorthand for encode"""
        return self.encode(text, add_bos, add_eos)

    def get_piece(self, token_id: int) -> str:
        """Get string piece for token ID"""
        return self.sp.IdToPiece(token_id)

    def get_id(self, piece: str) -> int:
        """Get token ID for string piece"""
        return self.sp.PieceToId(piece)

    @property
    def bos_token(self) -> str:
        """BOS token string"""
        return self.sp.IdToPiece(self.bos_id) if self.bos_id >= 0 else ""

    @property
    def eos_token(self) -> str:
        """EOS token string"""
        return self.sp.IdToPiece(self.eos_id) if self.eos_id >= 0 else ""

    @property
    def pad_token(self) -> str:
        """PAD token string"""
        return self.sp.IdToPiece(self.pad_id) if self.pad_id >= 0 else ""

    @property
    def unk_token(self) -> str:
        """UNK token string"""
        return self.sp.IdToPiece(self.unk_id) if self.unk_id >= 0 else ""