Initial commit: NOVA - Neuro-Optimizing Versatile Agent

Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-12 20:56:37 -04:00
commit a7f091aa45
50 changed files with 6437 additions and 0 deletions
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -0,0 +1,3 @@
+"""
+NOVA Tests
+"""
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -0,0 +1,141 @@
+"""
+Tests for NOVA core transformer
+"""
+
+import pytest
+import torch
+from nova_core import NovaTransformer, ModelConfig, MODEL_125M
+
+
+def test_model_config():
+    """Check that ModelConfig exposes the values passed to its constructor."""
+    settings = {
+        "vocab_size": 1000,
+        "hidden_size": 256,
+        "num_hidden_layers": 4,
+        "num_attention_heads": 4,
+    }
+    cfg = ModelConfig(**settings)
+
+    # Only these three fields are verified; num_attention_heads is just passed in.
+    for field_name in ("vocab_size", "hidden_size", "num_hidden_layers"):
+        assert getattr(cfg, field_name) == settings[field_name]
+
+
+def test_model_creation():
+    """Build a small NovaTransformer and check it keeps its configuration."""
+    vocab = 1000
+    small_cfg = ModelConfig(
+        vocab_size=vocab,
+        hidden_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=512,
+        max_position_embeddings=512,
+    )
+
+    net = NovaTransformer(small_cfg)
+
+    assert net is not None
+    assert net.config == small_cfg
+    assert net.vocab_size == vocab
+
+
+def test_model_forward():
+    """Run one forward pass and check the shape of the returned logits."""
+    vocab = 1000
+    cfg = ModelConfig(
+        vocab_size=vocab,
+        hidden_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=512,
+        max_position_embeddings=512,
+    )
+
+    net = NovaTransformer(cfg)
+    net.eval()
+
+    # Random token ids laid out as (batch_size, seq_len)
+    dims = (2, 10)
+    tokens = torch.randint(0, vocab, dims)
+
+    # Inference only, so gradient tracking is switched off
+    with torch.no_grad():
+        result = net(input_ids=tokens)
+
+    assert 'logits' in result
+    assert result['logits'].shape == (*dims, vocab)
+
+
+def test_model_generation():
+    """Sample tokens from a small model and check the output sequence length."""
+    cfg = ModelConfig(
+        vocab_size=1000,
+        hidden_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=512,
+        max_position_embeddings=512,
+    )
+
+    net = NovaTransformer(cfg)
+    net.eval()
+
+    # Single random prompt
+    prompt_len, new_tokens = 5, 10
+    prompt = torch.randint(0, 1000, (1, prompt_len))
+
+    sampling = {
+        "input_ids": prompt,
+        "max_new_tokens": new_tokens,
+        "temperature": 1.0,
+        "do_sample": True,
+    }
+    with torch.no_grad():
+        generated = net.generate(**sampling)
+
+    # 5 input tokens + 10 generated tokens
+    assert generated.shape[1] == prompt_len + new_tokens
+
+
+def test_kv_cache():
+    """Run a cached forward pass, then feed one more token using that cache."""
+    cfg = ModelConfig(
+        vocab_size=1000,
+        hidden_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        use_cache=True,
+    )
+
+    net = NovaTransformer(cfg)
+    net.eval()
+
+    prompt = torch.randint(0, 1000, (1, 5))
+
+    with torch.no_grad():
+        # Initial pass asks the model to return its cache
+        first = net(input_ids=prompt, use_cache=True)
+        cache = first['past_key_values']
+
+        assert cache is not None
+        assert len(cache) == cfg.num_hidden_layers
+
+        # Follow-up pass supplies a single new token plus the earlier cache
+        next_token = torch.randint(0, 1000, (1, 1))
+        second = net(input_ids=next_token, past_key_values=cache, use_cache=True)
+
+        # Logits should cover only the newly supplied token
+        assert second['logits'].shape[1] == 1
+
+
+def test_param_count():
+    """Check the MODEL_125M preset reports a parameter count near 125M."""
+    net = NovaTransformer(MODEL_125M)
+    total = net.get_num_params(non_embedding=False)
+
+    # Accept anything strictly between 100M and 150M
+    lower, upper = 100_000_000, 150_000_000
+    assert lower < total < upper
+
+
+if __name__ == "__main__":
+    # pytest.main returns the run's exit code; propagate it so that running
+    # this file directly exits non-zero when a test fails.
+    raise SystemExit(pytest.main([__file__, "-v"]))
--- a/tests/test_persona.py
+++ b/tests/test_persona.py
@@ -0,0 +1,131 @@
+"""
+Tests for NOVA persona system
+"""
+
+import pytest
+from nova_chat import Persona, PersonalityMatrix, PersonaLoader
+
+
+def test_personality_matrix():
+    """Check PersonalityMatrix keeps its trait values and exports them as a dict."""
+    traits = PersonalityMatrix(warmth=0.8, humor=0.6, empathy=0.9)
+
+    assert traits.warmth == 0.8
+    assert traits.humor == 0.6
+    assert traits.empathy == 0.9
+
+    # Dictionary export should carry the same warmth value
+    exported = traits.to_dict()
+    assert 'warmth' in exported
+    assert exported['warmth'] == 0.8
+
+
+def test_persona_creation():
+    """Check that a Persona exposes the fields it was constructed with."""
+    fields = {
+        "name": "TestNOVA",
+        "pronouns": "she/her",
+        "always_disclose": False,
+    }
+    persona = Persona(**fields)
+
+    assert persona.name == "TestNOVA"
+    assert persona.pronouns == "she/her"
+    assert persona.always_disclose is False
+
+
+def test_persona_generation_params():
+    """Check that a persona yields the expected generation parameter keys."""
+    # Warm, informal, fairly creative personality
+    personality = PersonalityMatrix(
+        warmth=0.9,
+        formality=0.1,
+        creativity=0.8,
+    )
+    persona = Persona(personality=personality)
+
+    params = persona.get_generation_params()
+
+    for key in ('temperature', 'top_p', 'max_new_tokens'):
+        assert key in params
+
+    # Whatever adjustment is applied, temperature must stay positive
+    assert params['temperature'] > 0
+
+
+def test_predefined_personas():
+    """Check the three predefined personas' names and disclosure defaults."""
+    personas = [
+        PersonaLoader.create_girlfriend_gentle(),
+        PersonaLoader.create_girlfriend_playful(),
+        PersonaLoader.create_girlfriend_supportive(),
+    ]
+
+    for persona in personas:
+        assert persona.name == "NOVA"
+
+    # None of them should have AI disclosure enabled by default
+    for persona in personas:
+        assert persona.always_disclose is False
+
+
+def test_persona_system_prompt():
+    """Check system prompt formatting when AI disclosure is turned off.
+
+    The persona is given an explicit ``disclosure_text`` so the test can
+    verify that text is absent from the formatted prompt, instead of only
+    re-reading the ``always_disclose`` flag it has just set.
+    """
+    disclosure = "I am an AI assistant."
+    persona = Persona(
+        system_prompt="You are a helpful assistant.",
+        always_disclose=False,
+        disclosure_text=disclosure,
+    )
+
+    prompt = persona.format_system_prompt()
+
+    assert "helpful assistant" in prompt.lower()
+
+    # Should not include disclosure when set to False
+    assert persona.always_disclose is False
+    assert disclosure not in prompt
+
+
+def test_persona_serialization():
+    """Round-trip a Persona through to_dict/from_dict and compare key fields."""
+    source = Persona(
+        name="TestPersona",
+        pronouns="they/them",
+        description="Test description",
+        always_disclose=True,
+        disclosure_text="I am an AI assistant.",
+    )
+
+    # Serialize to a dict, then rebuild from it
+    restored = Persona.from_dict(source.to_dict())
+
+    for attr in ("name", "pronouns", "always_disclose", "disclosure_text"):
+        assert getattr(restored, attr) == getattr(source, attr)
+
+
+def test_personality_trait_ranges():
+    """Check generation parameters stay in range for extreme trait values."""
+    extremes = PersonalityMatrix(
+        warmth=1.0,  # upper bound
+        formality=0.0,  # lower bound
+        creativity=0.5,  # midpoint
+    )
+    persona = Persona(personality=extremes)
+
+    params = persona.get_generation_params()
+
+    # Each generation parameter must fall inside its accepted interval
+    temperature = params['temperature']
+    top_p = params['top_p']
+    assert 0.1 <= temperature <= 2.0
+    assert 0.5 <= top_p <= 1.0
+    assert params['max_new_tokens'] > 0
+
+
+if __name__ == "__main__":
+    # pytest.main returns the run's exit code; propagate it so that running
+    # this file directly exits non-zero when a test fails.
+    raise SystemExit(pytest.main([__file__, "-v"]))
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,105 @@
+"""
+Tests for NOVA tokenizer
+"""
+
+import pytest
+import tempfile
+from pathlib import Path
+from nova_tokenizer import train_tokenizer, NovaTokenizer
+
+
+def test_tokenizer_training():
+    """Train a tokenizer on a small generated corpus and check the model file.
+
+    The training corpus is written inside the temporary directory, so it is
+    removed together with the trained model even when training or an
+    assertion fails.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        workdir = Path(tmpdir)
+
+        # Training corpus: 100 short numbered sentences, one per line
+        corpus = workdir / "corpus.txt"
+        corpus.write_text(
+            "".join(
+                f"This is sentence number {i}. Hello world!\n" for i in range(100)
+            ),
+            encoding="utf-8",
+        )
+
+        output_prefix = str(workdir / "test_tokenizer")
+
+        # Train
+        model_path = train_tokenizer(
+            input_files=[str(corpus)],
+            model_prefix=output_prefix,
+            vocab_size=500,
+            model_type='bpe',
+        )
+
+        assert Path(model_path).exists()
+        assert model_path.endswith('.model')
+
+
+def test_tokenizer_encode_decode():
+    """Train a tiny tokenizer and check an encode/decode round trip.
+
+    The training corpus is written inside the temporary directory, so it is
+    removed together with the trained model even when training or an
+    assertion fails.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        workdir = Path(tmpdir)
+
+        # Training corpus: the same two words repeated
+        corpus = workdir / "corpus.txt"
+        corpus.write_text("hello world " * 100, encoding="utf-8")
+
+        output_prefix = str(workdir / "test_tok")
+
+        model_path = train_tokenizer(
+            input_files=[str(corpus)],
+            model_prefix=output_prefix,
+            vocab_size=100,
+        )
+
+        # Load tokenizer
+        tokenizer = NovaTokenizer(model_path)
+
+        # Test encode/decode
+        text = "hello world"
+        ids = tokenizer.encode(text, add_bos=False, add_eos=False)
+
+        assert isinstance(ids, list)
+        assert len(ids) > 0
+
+        decoded = tokenizer.decode(ids, skip_special_tokens=True)
+        # May not be exact due to tokenization, but should be similar
+        assert "hello" in decoded.lower()
+
+
+def test_tokenizer_batch():
+    """Train a tiny tokenizer and check batch encoding and decoding.
+
+    The training corpus is written inside the temporary directory, so it is
+    removed together with the trained model even when training or an
+    assertion fails.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        workdir = Path(tmpdir)
+
+        # Training corpus: a single word repeated
+        corpus = workdir / "corpus.txt"
+        corpus.write_text("test " * 100, encoding="utf-8")
+
+        output_prefix = str(workdir / "batch_tok")
+
+        model_path = train_tokenizer(
+            input_files=[str(corpus)],
+            model_prefix=output_prefix,
+            vocab_size=100,
+        )
+
+        tokenizer = NovaTokenizer(model_path)
+
+        # Batch encode
+        texts = ["hello", "world", "test"]
+        batch_ids = tokenizer.encode_batch(texts, add_bos=False, add_eos=False)
+
+        assert len(batch_ids) == 3
+        assert all(isinstance(ids, list) for ids in batch_ids)
+
+        # Batch decode
+        decoded = tokenizer.decode_batch(batch_ids)
+
+        assert len(decoded) == 3
+
+
+if __name__ == "__main__":
+    # pytest.main returns the run's exit code; propagate it so that running
+    # this file directly exits non-zero when a test fails.
+    raise SystemExit(pytest.main([__file__, "-v"]))