Initial commit: NOVA - Neuro-Optimizing Versatile Agent

Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache; see the RMSNorm sketch after this list)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)
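
A minimal sketch of the RMSNorm block named above (standard formulation; NOVA's own layer may differ in details such as eps or dtype handling):

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Root-mean-square layer norm: scale by 1/RMS(x), with no mean-centering."""
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))  # learnable per-feature gain

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * rms * self.weight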

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)
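
The trait labels above suggest a simple personality-matrix record; a hypothetical sketch (field names and numbers are illustrative, not NOVA's actual schema):

from dataclasses import dataclass, field

@dataclass
class Persona:
    """Hypothetical persona record; trait values in [0.0, 1.0] are illustrative only."""
    name: str
    traits: dict = field(default_factory=dict)

girlfriend_supportive = Persona(
    name="girlfriend_supportive",
    traits={"warmth": 0.6, "empathy": 0.6, "humor": 0.5, "playfulness": 0.5},  # balanced default
)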

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
commit a7f091aa45
Date: 2025-10-12 20:56:37 -04:00
50 changed files with 6437 additions and 0 deletions

nova_tokenizer/trainer.py (new file, 152 lines)

"""
SentencePiece tokenizer trainer
"""
import sentencepiece as spm
from pathlib import Path
from typing import List, Optional
import tempfile


def train_tokenizer(
input_files: List[str],
model_prefix: str,
vocab_size: int = 32000,
model_type: str = "bpe", # or "unigram"
character_coverage: float = 0.9995,
num_threads: int = 4,
user_defined_symbols: Optional[List[str]] = None,
max_sentence_length: int = 16384,
shuffle_input_sentence: bool = True,
seed_sentencepiece_size: int = 1000000,
**kwargs
) -> str:
"""
Train a SentencePiece tokenizer
Args:
input_files: List of text file paths for training
model_prefix: Output model path prefix (will create .model and .vocab files)
vocab_size: Target vocabulary size
model_type: 'bpe' or 'unigram'
character_coverage: Character coverage (0.9995 for multilingual, 1.0 for single language)
num_threads: Number of threads for training
user_defined_symbols: Optional list of user-defined symbols to add
max_sentence_length: Maximum sentence length
shuffle_input_sentence: Whether to shuffle input sentences
seed_sentencepiece_size: Number of sentences to use for initial seed
**kwargs: Additional arguments to pass to SentencePiece trainer
Returns:
Path to trained model file
"""
# Validate input files
for f in input_files:
if not Path(f).exists():
raise FileNotFoundError(f"Input file not found: {f}")
# Prepare training arguments
train_args = {
'input': ','.join(input_files),
'model_prefix': model_prefix,
'vocab_size': vocab_size,
'model_type': model_type,
'character_coverage': character_coverage,
'num_threads': num_threads,
'max_sentence_length': max_sentence_length,
'shuffle_input_sentence': shuffle_input_sentence,
'seed_sentencepiece_size': seed_sentencepiece_size,
# Special tokens
'pad_id': 0,
'unk_id': 1,
'bos_id': 2,
'eos_id': 3,
'pad_piece': '<pad>',
'unk_piece': '<unk>',
'bos_piece': '<s>',
'eos_piece': '</s>',
# User-defined symbols (e.g., for special control tokens)
        'user_defined_symbols': ','.join(user_defined_symbols or []),  # SentencePiece expects a comma-separated string
# Normalization
        'normalization_rule_name': 'nmt_nfkc_cf',  # NFKC normalization with case folding (lowercases input)
'remove_extra_whitespaces': True,
'split_by_unicode_script': True,
'split_by_whitespace': True,
'split_by_number': True,
'split_digits': True,
'byte_fallback': True, # Handle unknown bytes
}
# Add any additional kwargs
train_args.update(kwargs)
# Train the model
print(f"Training {model_type.upper()} tokenizer with vocab size {vocab_size}...")
print(f"Input files: {len(input_files)} file(s)")
print(f"Output: {model_prefix}.model")
    # All values are scalars or comma-joined strings, so they can be passed directly
    spm.SentencePieceTrainer.Train(**train_args)
model_path = f"{model_prefix}.model"
# Verify the model was created
if not Path(model_path).exists():
raise RuntimeError(f"Model training failed - {model_path} not created")
# Print vocab info
sp = spm.SentencePieceProcessor()
sp.Load(model_path)
print(f"✓ Tokenizer trained successfully!")
print(f" Vocabulary size: {sp.vocab_size()}")
print(f" BOS token: {sp.IdToPiece(sp.bos_id())} (ID: {sp.bos_id()})")
print(f" EOS token: {sp.IdToPiece(sp.eos_id())} (ID: {sp.eos_id()})")
print(f" PAD token: {sp.IdToPiece(sp.pad_id())} (ID: {sp.pad_id()})")
print(f" UNK token: {sp.IdToPiece(sp.unk_id())} (ID: {sp.unk_id()})")
    return model_path


def train_from_text(
texts: List[str],
model_prefix: str,
vocab_size: int = 32000,
model_type: str = "bpe",
**kwargs
) -> str:
"""
Train tokenizer directly from list of texts (without needing files)
Args:
texts: List of text strings
model_prefix: Output model path prefix
vocab_size: Target vocabulary size
model_type: 'bpe' or 'unigram'
**kwargs: Additional arguments
Returns:
Path to trained model file
"""
# Write texts to temporary file
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as f:
for text in texts:
f.write(text.strip() + '\n')
temp_file = f.name
try:
# Train using the temporary file
model_path = train_tokenizer(
input_files=[temp_file],
model_prefix=model_prefix,
vocab_size=vocab_size,
model_type=model_type,
**kwargs
)
finally:
# Clean up temp file
Path(temp_file).unlink(missing_ok=True)
return model_path
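
A minimal usage sketch for the trainer above (corpus path and output prefix are hypothetical):

import sentencepiece as spm
from nova_tokenizer.trainer import train_tokenizer

# Train a BPE tokenizer from an on-disk corpus (path is hypothetical)
model_path = train_tokenizer(
    input_files=["data/corpus.txt"],
    model_prefix="tokenizer/nova_bpe",
    vocab_size=32000,
    model_type="bpe",
)

# Load the trained model and tokenize a sample string
sp = spm.SentencePieceProcessor()
sp.Load(model_path)
print(sp.EncodeAsPieces("Hello, NOVA!"))
print(sp.EncodeAsIds("Hello, NOVA!"))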