Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
3
tests/__init__.py
Normal file
3
tests/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
NOVA Tests
|
||||
"""
|
141
tests/test_core.py
Normal file
141
tests/test_core.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
Tests for NOVA core transformer
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from nova_core import NovaTransformer, ModelConfig, MODEL_125M
|
||||
|
||||
|
||||
def test_model_config():
    """Check that ModelConfig exposes the values passed to its constructor."""
    requested = {
        "vocab_size": 1000,
        "hidden_size": 256,
        "num_hidden_layers": 4,
        "num_attention_heads": 4,
    }
    cfg = ModelConfig(**requested)

    # num_attention_heads is passed in but deliberately not read back here.
    for field in ("vocab_size", "hidden_size", "num_hidden_layers"):
        assert getattr(cfg, field) == requested[field]
|
||||
|
||||
|
||||
def test_model_creation():
    """Build a small NovaTransformer and check it keeps its configuration."""
    vocab = 1000
    tiny_cfg = ModelConfig(
        vocab_size=vocab,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=512,
        max_position_embeddings=512,
    )

    net = NovaTransformer(tiny_cfg)

    assert net is not None
    # The model is expected to expose both the config object and vocab size.
    assert net.config == tiny_cfg
    assert net.vocab_size == vocab
|
||||
|
||||
|
||||
def test_model_forward():
    """Run one no-grad forward pass and check the shape of the logits."""
    n_batch, n_tokens = 2, 10

    cfg = ModelConfig(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=512,
        max_position_embeddings=512,
    )
    net = NovaTransformer(cfg)
    net.eval()

    # Random token ids drawn from the configured vocabulary range.
    token_ids = torch.randint(0, 1000, (n_batch, n_tokens))

    with torch.no_grad():
        result = net(input_ids=token_ids)

    assert 'logits' in result
    # One score per vocabulary entry for every position of every sequence.
    assert result['logits'].shape == (n_batch, n_tokens, 1000)
|
||||
|
||||
|
||||
def test_model_generation():
    """Sample a continuation and check the length of the returned sequence.

    The global torch RNG is seeded first so that the random weights, the
    random prompt and the sampling draws are the same on every run; without
    that, a failure of this test could not be reproduced.
    """
    torch.manual_seed(0)

    prompt_len = 5
    max_new_tokens = 10

    config = ModelConfig(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=512,
        max_position_embeddings=512,
    )

    model = NovaTransformer(config)
    model.eval()

    # Dummy prompt: a single sequence of random token ids.
    input_ids = torch.randint(0, config.vocab_size, (1, prompt_len))

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            temperature=1.0,
            do_sample=True,
        )

    # The result is expected to hold the prompt plus every generated token.
    assert output_ids.shape[1] == prompt_len + max_new_tokens
|
||||
|
||||
|
||||
def test_kv_cache():
    """Feed a prompt, then one extra token, re-using the returned KV cache."""
    cfg = ModelConfig(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        use_cache=True,
    )
    net = NovaTransformer(cfg)
    net.eval()

    prompt_ids = torch.randint(0, 1000, (1, 5))

    with torch.no_grad():
        # Step 1: process the prompt and ask for the cache back.
        first_pass = net(input_ids=prompt_ids, use_cache=True)
        cache = first_pass['past_key_values']

        assert cache is not None
        # One cache entry is expected per transformer layer.
        assert len(cache) == cfg.num_hidden_layers

        # Step 2: feed a single new token together with the cache.
        next_id = torch.randint(0, 1000, (1, 1))
        second_pass = net(
            input_ids=next_id,
            past_key_values=cache,
            use_cache=True,
        )

        # Logits should cover only the newly supplied position.
        step_logits = second_pass['logits']
        assert step_logits.shape[1] == 1
|
||||
|
||||
|
||||
def test_param_count():
    """Check that the MODEL_125M preset yields roughly 125M parameters."""
    net = NovaTransformer(MODEL_125M)

    # Count every parameter, embeddings included.
    total = net.get_num_params(non_embedding=False)

    lower_bound, upper_bound = 100_000_000, 150_000_000
    assert lower_bound < total < upper_bound
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file as a script reports
    # failure to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
131
tests/test_persona.py
Normal file
131
tests/test_persona.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""
|
||||
Tests for NOVA persona system
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from nova_chat import Persona, PersonalityMatrix, PersonaLoader
|
||||
|
||||
|
||||
def test_personality_matrix():
    """Check trait storage on PersonalityMatrix and its dict conversion."""
    traits = {"warmth": 0.8, "humor": 0.6, "empathy": 0.9}
    pm = PersonalityMatrix(**traits)

    # Each trait should be readable back as an attribute, unchanged.
    for trait_name, trait_value in traits.items():
        assert getattr(pm, trait_name) == trait_value

    # The dict form should carry the same value under the trait's name.
    as_dict = pm.to_dict()
    assert 'warmth' in as_dict
    assert as_dict['warmth'] == 0.8
|
||||
|
||||
|
||||
def test_persona_creation():
    """Check that a Persona keeps the name, pronouns and disclosure flag."""
    nova = Persona(name="TestNOVA", pronouns="she/her", always_disclose=False)

    observed = (nova.name, nova.pronouns)
    assert observed[0] == "TestNOVA"
    assert observed[1] == "she/her"
    # Identity check: the flag must be the real False, not merely falsy.
    assert nova.always_disclose is False
|
||||
|
||||
|
||||
def test_persona_generation_params():
    """Check that a persona produces the expected generation parameters."""
    # Warm, informal, fairly creative personality.
    traits = PersonalityMatrix(warmth=0.9, formality=0.1, creativity=0.8)
    nova = Persona(personality=traits)

    gen_params = nova.get_generation_params()

    for required_key in ('temperature', 'top_p', 'max_new_tokens'):
        assert required_key in gen_params

    # Whatever the personality does to it, temperature must stay positive.
    assert gen_params['temperature'] > 0
|
||||
|
||||
|
||||
def test_predefined_personas():
    """Check the name and disclosure flag of the three built-in personas."""
    presets = [
        PersonaLoader.create_girlfriend_gentle(),
        PersonaLoader.create_girlfriend_playful(),
        PersonaLoader.create_girlfriend_supportive(),
    ]

    for preset in presets:
        assert preset.name == "NOVA"

    # None of the presets should disclose by default.
    for preset in presets:
        assert preset.always_disclose is False
|
||||
|
||||
|
||||
def test_persona_system_prompt():
    """Check system prompt formatting when disclosure is switched off.

    The persona is given a distinctive ``disclosure_text`` so the test can
    verify that the text is absent from the formatted prompt. Previously
    the test only re-checked the ``always_disclose`` flag it had just
    passed to the constructor, which said nothing about the prompt.
    """
    disclosure = "DISCLOSURE-MARKER: I am an AI."
    persona = Persona(
        system_prompt="You are a helpful assistant.",
        always_disclose=False,
        disclosure_text=disclosure,
    )

    prompt = persona.format_system_prompt()

    # The configured system prompt must survive formatting.
    assert "helpful assistant" in prompt.lower()

    # Should not include disclosure when set to False.
    assert persona.always_disclose is False
    assert disclosure not in prompt
|
||||
|
||||
|
||||
def test_persona_serialization():
    """Round-trip a Persona through to_dict / from_dict and compare fields."""
    source = Persona(
        name="TestPersona",
        pronouns="they/them",
        description="Test description",
        always_disclose=True,
        disclosure_text="I am an AI assistant.",
    )

    # Serialise and immediately rebuild.
    restored = Persona.from_dict(source.to_dict())

    # These four fields must come back unchanged.
    for attr in ("name", "pronouns", "always_disclose", "disclosure_text"):
        assert getattr(restored, attr) == getattr(source, attr)
|
||||
|
||||
|
||||
def test_personality_trait_ranges():
    """Check generation parameters stay in range for extreme trait values."""
    extremes = PersonalityMatrix(
        warmth=1.0,      # upper limit
        formality=0.0,   # lower limit
        creativity=0.5,  # midpoint
    )
    gen_params = Persona(personality=extremes).get_generation_params()

    temperature = gen_params['temperature']
    top_p = gen_params['top_p']

    # Bounds the sampling parameters are expected to respect.
    assert 0.1 <= temperature <= 2.0
    assert 0.5 <= top_p <= 1.0
    assert gen_params['max_new_tokens'] > 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file as a script reports
    # failure to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
105
tests/test_tokenizer.py
Normal file
105
tests/test_tokenizer.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
Tests for NOVA tokenizer
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from nova_tokenizer import train_tokenizer, NovaTokenizer
|
||||
|
||||
|
||||
def test_tokenizer_training():
    """Train a small BPE tokenizer and check that a model file is produced.

    The training corpus is written inside the same temporary directory as
    the tokenizer output, so both are removed even when training or an
    assertion fails. The earlier version created a ``delete=False`` temp
    file that was only unlinked on the success path.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        # Create the training corpus: 100 short, slightly varying sentences.
        corpus = workdir / "corpus.txt"
        corpus.write_text(
            "".join(
                f"This is sentence number {i}. Hello world!\n"
                for i in range(100)
            ),
            encoding="utf-8",
        )

        output_prefix = str(workdir / "test_tokenizer")

        # Train
        model_path = train_tokenizer(
            input_files=[str(corpus)],
            model_prefix=output_prefix,
            vocab_size=500,
            model_type='bpe',
        )

        # Checked inside the block: the directory vanishes on exit.
        assert Path(model_path).exists()
        assert model_path.endswith('.model')
|
||||
|
||||
|
||||
def test_tokenizer_encode_decode():
    """Train a tiny tokenizer, then encode and decode a short string.

    The training corpus lives inside the temporary directory so it is
    cleaned up even when training or an assertion fails, instead of being
    left behind as a ``delete=False`` temp file.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        # Create and train a tiny tokenizer.
        corpus = workdir / "corpus.txt"
        corpus.write_text("hello world " * 100, encoding="utf-8")

        output_prefix = str(workdir / "test_tok")

        model_path = train_tokenizer(
            input_files=[str(corpus)],
            model_prefix=output_prefix,
            vocab_size=100,
        )

        # Load tokenizer
        tokenizer = NovaTokenizer(model_path)

        # Encode without BOS/EOS markers.
        text = "hello world"
        ids = tokenizer.encode(text, add_bos=False, add_eos=False)

        assert isinstance(ids, list)
        assert len(ids) > 0

        decoded = tokenizer.decode(ids, skip_special_tokens=True)
        # May not be exact due to tokenization, but should be similar.
        assert "hello" in decoded.lower()
|
||||
|
||||
|
||||
def test_tokenizer_batch():
    """Train a tiny tokenizer and exercise batch encode / batch decode.

    The training corpus lives inside the temporary directory so it is
    cleaned up even when training or an assertion fails, instead of being
    left behind as a ``delete=False`` temp file.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        # Quick throwaway corpus for a dummy tokenizer.
        corpus = workdir / "corpus.txt"
        corpus.write_text("test " * 100, encoding="utf-8")

        output_prefix = str(workdir / "batch_tok")

        model_path = train_tokenizer(
            input_files=[str(corpus)],
            model_prefix=output_prefix,
            vocab_size=100,
        )

        tokenizer = NovaTokenizer(model_path)

        # Batch encode
        texts = ["hello", "world", "test"]
        batch_ids = tokenizer.encode_batch(texts, add_bos=False, add_eos=False)

        assert len(batch_ids) == 3
        assert all(isinstance(ids, list) for ids in batch_ids)

        # Batch decode: one decoded string is expected per input.
        decoded = tokenizer.decode_batch(batch_ids)

        assert len(decoded) == 3
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file as a script reports
    # failure to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
Reference in New Issue
Block a user