Initial commit: NOVA - Neuro-Optimizing Versatile Agent

Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-12 20:56:37 -04:00
commit a7f091aa45
50 changed files with 6437 additions and 0 deletions

3
tests/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
"""
NOVA Tests
"""

141
tests/test_core.py Normal file
View File

@@ -0,0 +1,141 @@
"""
Tests for NOVA core transformer
"""
import pytest
import torch
from nova_core import NovaTransformer, ModelConfig, MODEL_125M
def test_model_config():
    """Check that ModelConfig keeps the values handed to its constructor."""
    overrides = {
        "vocab_size": 1000,
        "hidden_size": 256,
        "num_hidden_layers": 4,
        "num_attention_heads": 4,
    }
    cfg = ModelConfig(**overrides)

    # Only these three fields are verified; the head count is just supplied.
    assert cfg.vocab_size == 1000
    assert cfg.hidden_size == 256
    assert cfg.num_hidden_layers == 4
def test_model_creation():
    """Build a small NovaTransformer and check it exposes its configuration."""
    small_cfg = ModelConfig(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=512,
        max_position_embeddings=512,
    )

    net = NovaTransformer(small_cfg)

    assert net is not None
    # The model is expected to hold on to the config it was built from.
    assert net.config == small_cfg
    assert net.vocab_size == 1000
def test_model_forward():
    """Run one forward pass on random token ids and check the logits shape."""
    small_cfg = ModelConfig(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=512,
        max_position_embeddings=512,
    )
    net = NovaTransformer(small_cfg)
    net.eval()

    # Dummy batch: 2 sequences of 10 token ids drawn from [0, 1000).
    batch_size, seq_len = 2, 10
    token_ids = torch.randint(0, 1000, (batch_size, seq_len))

    with torch.no_grad():
        result = net(input_ids=token_ids)

    assert 'logits' in result
    expected_shape = (batch_size, seq_len, 1000)
    assert result['logits'].shape == expected_shape
def test_model_generation():
    """Sample 10 new tokens from a 5-token prompt and check the output length."""
    small_cfg = ModelConfig(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=512,
        max_position_embeddings=512,
    )
    net = NovaTransformer(small_cfg)
    net.eval()

    # Random prompt: a single sequence of 5 token ids.
    prompt_ids = torch.randint(0, 1000, (1, 5))

    generation_kwargs = {
        "input_ids": prompt_ids,
        "max_new_tokens": 10,
        "temperature": 1.0,
        "do_sample": True,
    }
    with torch.no_grad():
        generated = net.generate(**generation_kwargs)

    # NOTE(review): this assumes generate() always emits exactly max_new_tokens
    # tokens. Sampling is unseeded, so if generate() can stop early (e.g. on an
    # end-of-sequence token) this check could fail intermittently -- confirm.
    assert generated.shape[1] == 15  # 5 input + 10 generated
def test_kv_cache():
    """Check that a cached forward pass can be continued with one new token."""
    cache_cfg = ModelConfig(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        use_cache=True,
    )
    net = NovaTransformer(cache_cfg)
    net.eval()

    prompt_ids = torch.randint(0, 1000, (1, 5))

    with torch.no_grad():
        # Step 1: run the prompt and collect the cache it produces.
        first_pass = net(input_ids=prompt_ids, use_cache=True)
        cache = first_pass['past_key_values']

        assert cache is not None
        # One cache entry is expected per transformer layer.
        assert len(cache) == cache_cfg.num_hidden_layers

        # Step 2: feed a single new token together with the cache.
        next_token = torch.randint(0, 1000, (1, 1))
        second_pass = net(
            input_ids=next_token,
            past_key_values=cache,
            use_cache=True,
        )

        # Logits should cover only the newly supplied position.
        assert second_pass['logits'].shape[1] == 1  # Only new token
def test_param_count():
    """Check that the MODEL_125M preset builds a model of roughly 125M parameters."""
    net = NovaTransformer(MODEL_125M)
    total_params = net.get_num_params(non_embedding=False)

    # Accept anything strictly between 100M and 150M.
    lower_bound, upper_bound = 100_000_000, 150_000_000
    assert lower_bound < total_params < upper_bound
if __name__ == "__main__":
    # pytest.main() returns the exit code instead of exiting the process.
    # Propagate it so that running this file directly reports test failures
    # to the calling shell rather than always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))

131
tests/test_persona.py Normal file
View File

@@ -0,0 +1,131 @@
"""
Tests for NOVA persona system
"""
import pytest
from nova_chat import Persona, PersonalityMatrix, PersonaLoader
def test_personality_matrix():
    """Check PersonalityMatrix attribute storage and its dict conversion."""
    traits = PersonalityMatrix(warmth=0.8, humor=0.6, empathy=0.9)

    # Each trait passed in should be readable back as an attribute.
    expected_traits = (("warmth", 0.8), ("humor", 0.6), ("empathy", 0.9))
    for trait_name, trait_value in expected_traits:
        assert getattr(traits, trait_name) == trait_value

    # The dict form must carry the warmth value under the same key.
    as_dict = traits.to_dict()
    assert 'warmth' in as_dict
    assert as_dict['warmth'] == 0.8
def test_persona_creation():
    """Check that Persona keeps the name, pronouns and disclosure flag given."""
    fields = {
        "name": "TestNOVA",
        "pronouns": "she/her",
        "always_disclose": False,
    }
    subject = Persona(**fields)

    assert subject.name == "TestNOVA"
    assert subject.pronouns == "she/her"
    assert subject.always_disclose is False
def test_persona_generation_params():
    """Check that a persona yields the expected generation parameter keys."""
    # Personality under test: high warmth, low formality, fairly creative.
    traits = PersonalityMatrix(warmth=0.9, formality=0.1, creativity=0.8)
    subject = Persona(personality=traits)

    gen_params = subject.get_generation_params()

    for required_key in ('temperature', 'top_p', 'max_new_tokens'):
        assert required_key in gen_params

    # Whatever the personality does to it, temperature must stay positive.
    assert gen_params['temperature'] > 0
def test_predefined_personas():
    """Check name and disclosure flag on the three built-in personas."""
    presets = (
        PersonaLoader.create_girlfriend_gentle(),
        PersonaLoader.create_girlfriend_playful(),
        PersonaLoader.create_girlfriend_supportive(),
    )

    for preset in presets:
        assert preset.name == "NOVA"

    # None of the built-in personas should have disclosure switched on.
    for preset in presets:
        assert preset.always_disclose is False
def test_persona_system_prompt():
    """Check that the formatted system prompt contains the configured text."""
    subject = Persona(
        system_prompt="You are a helpful assistant.",
        always_disclose=False,
    )

    rendered = subject.format_system_prompt()
    assert "helpful assistant" in rendered.lower()

    # NOTE(review): the intent is that no disclosure is added when the flag is
    # False, but this assertion only re-checks the flag passed to the
    # constructor; it does not inspect the rendered prompt.
    assert subject.always_disclose is False
def test_persona_serialization():
    """Round-trip a Persona through to_dict/from_dict and compare fields."""
    source = Persona(
        name="TestPersona",
        pronouns="they/them",
        description="Test description",
        always_disclose=True,
        disclosure_text="I am an AI assistant.",
    )

    # Serialize to a dict, then rebuild a persona from that dict.
    restored = Persona.from_dict(source.to_dict())

    compared_fields = ("name", "pronouns", "always_disclose", "disclosure_text")
    for field_name in compared_fields:
        assert getattr(restored, field_name) == getattr(source, field_name)
def test_personality_trait_ranges():
    """Check generation parameters stay in range for extreme trait values."""
    # warmth at its maximum, formality at its minimum, creativity mid-scale
    extreme_traits = PersonalityMatrix(warmth=1.0, formality=0.0, creativity=0.5)
    subject = Persona(personality=extreme_traits)

    gen_params = subject.get_generation_params()

    temperature = gen_params['temperature']
    top_p = gen_params['top_p']
    assert 0.1 <= temperature <= 2.0
    assert 0.5 <= top_p <= 1.0
    assert gen_params['max_new_tokens'] > 0
if __name__ == "__main__":
    # pytest.main() returns the exit code instead of exiting the process.
    # Propagate it so that running this file directly reports test failures
    # to the calling shell rather than always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))

105
tests/test_tokenizer.py Normal file
View File

@@ -0,0 +1,105 @@
"""
Tests for NOVA tokenizer
"""
import pytest
import tempfile
from pathlib import Path
from nova_tokenizer import train_tokenizer, NovaTokenizer
def test_tokenizer_training():
    """Train a small BPE tokenizer and check that a model file is produced.

    The training corpus is written inside the temporary directory, so it is
    removed together with the tokenizer output even when training or one of
    the assertions fails.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        # 100 short, slightly varying sentences, one per line.
        corpus_path = workdir / "corpus.txt"
        corpus_path.write_text(
            "".join(
                f"This is sentence number {i}. Hello world!\n" for i in range(100)
            ),
            encoding="utf-8",
        )

        model_path = train_tokenizer(
            input_files=[str(corpus_path)],
            model_prefix=str(workdir / "test_tokenizer"),
            vocab_size=500,
            model_type='bpe',
        )

        # train_tokenizer is expected to return the path of the model file.
        assert Path(model_path).exists()
        assert model_path.endswith('.model')
def test_tokenizer_encode_decode():
    """Train a tiny tokenizer and check an encode/decode round trip.

    The training corpus is written inside the temporary directory, so it is
    removed together with the tokenizer output even when training or one of
    the assertions fails.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        corpus_path = workdir / "corpus.txt"
        corpus_path.write_text("hello world " * 100, encoding="utf-8")

        model_path = train_tokenizer(
            input_files=[str(corpus_path)],
            model_prefix=str(workdir / "test_tok"),
            vocab_size=100,
        )
        tokenizer = NovaTokenizer(model_path)

        text = "hello world"
        ids = tokenizer.encode(text, add_bos=False, add_eos=False)
        assert isinstance(ids, list)
        assert len(ids) > 0

        decoded = tokenizer.decode(ids, skip_special_tokens=True)
        # The round trip may not be exact, so only a substring is checked.
        assert "hello" in decoded.lower()
def test_tokenizer_batch():
    """Train a tiny tokenizer and check batch encode/decode result sizes.

    The training corpus is written inside the temporary directory, so it is
    removed together with the tokenizer output even when training or one of
    the assertions fails.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        corpus_path = workdir / "corpus.txt"
        corpus_path.write_text("test " * 100, encoding="utf-8")

        model_path = train_tokenizer(
            input_files=[str(corpus_path)],
            model_prefix=str(workdir / "batch_tok"),
            vocab_size=100,
        )
        tokenizer = NovaTokenizer(model_path)

        # Encoding a batch should give one id list per input text.
        texts = ["hello", "world", "test"]
        batch_ids = tokenizer.encode_batch(texts, add_bos=False, add_eos=False)
        assert len(batch_ids) == 3
        assert all(isinstance(ids, list) for ids in batch_ids)

        # Decoding the batch should give back one string per id list.
        decoded = tokenizer.decode_batch(batch_ids)
        assert len(decoded) == 3
if __name__ == "__main__":
    # pytest.main() returns the exit code instead of exiting the process.
    # Propagate it so that running this file directly reports test failures
    # to the calling shell rather than always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))