"""
Tests for NOVA core transformer
"""

import pytest
import torch
from nova_core import NovaTransformer, ModelConfig, MODEL_125M


def test_model_config():
    """Test that ModelConfig stores the values passed to its constructor.

    Every keyword given to the constructor is read back and compared,
    including ``num_attention_heads``.
    """
    config = ModelConfig(
        vocab_size=1000,
        hidden_size=256,
        num_hidden_layers=4,
        num_attention_heads=4,
    )

    assert config.vocab_size == 1000
    assert config.hidden_size == 256
    assert config.num_hidden_layers == 4
    # Assumes the attribute carries the same name as the keyword, as it does
    # for the three fields above.
    assert config.num_attention_heads == 4


def test_model_creation():
    """Build a small model and check the attributes it exposes."""
    settings = dict(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=512,
        max_position_embeddings=512,
    )
    tiny_config = ModelConfig(**settings)

    tiny_model = NovaTransformer(tiny_config)

    # The model must exist, compare equal on its config, and report the
    # vocabulary size it was configured with.
    assert tiny_model is not None
    assert tiny_model.config == tiny_config
    assert tiny_model.vocab_size == 1000


def test_model_forward():
    """Run one forward pass in eval mode and check the logits shape."""
    vocab = 1000
    n_batch, n_tokens = 2, 10

    net = NovaTransformer(
        ModelConfig(
            vocab_size=vocab,
            hidden_size=128,
            num_hidden_layers=2,
            num_attention_heads=4,
            intermediate_size=512,
            max_position_embeddings=512,
        )
    )
    net.eval()

    # Random token ids drawn from [0, vocab)
    token_ids = torch.randint(0, vocab, (n_batch, n_tokens))

    # Gradients are not needed for a shape check
    with torch.no_grad():
        result = net(input_ids=token_ids)

    # One logit per vocabulary entry for every position of every sequence
    assert 'logits' in result
    assert result['logits'].shape == (n_batch, n_tokens, vocab)


def test_model_generation():
    """Test sampled text generation on a small model.

    The global torch RNG is seeded first so that every torch random draw in
    this test (the prompt, the sampling steps, and any randomness used while
    building the model) is identical on each run. Without the seed a failing
    run could not be reproduced.

    Checks that ``generate`` returns one row per prompt and that each row is
    the prompt length plus ``max_new_tokens`` long.
    """
    torch.manual_seed(0)

    config = ModelConfig(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=512,
        max_position_embeddings=512,
    )

    model = NovaTransformer(config)
    model.eval()

    # Create dummy input
    prompt_len = 5
    max_new_tokens = 10
    input_ids = torch.randint(0, 1000, (1, prompt_len))

    # Generate
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            temperature=1.0,
            do_sample=True,
        )

    # Batch dimension is unchanged
    assert output_ids.shape[0] == input_ids.shape[0]
    # Prompt tokens followed by the newly generated ones: 5 + 10
    assert output_ids.shape[1] == prompt_len + max_new_tokens


def test_kv_cache():
    """Run two forward passes, feeding the cache from the first into the second."""
    cfg = ModelConfig(
        vocab_size=1000,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        use_cache=True,
    )

    net = NovaTransformer(cfg)
    net.eval()

    prompt_ids = torch.randint(0, 1000, (1, 5))

    with torch.no_grad():
        # Step 1: process the prompt and ask for the cache back
        first_pass = net(input_ids=prompt_ids, use_cache=True)
        cache = first_pass['past_key_values']

        # The cache must hold one entry per hidden layer
        assert cache is not None
        assert len(cache) == cfg.num_hidden_layers

        # Step 2: feed a single new token together with the cache
        next_token = torch.randint(0, 1000, (1, 1))
        second_pass = net(
            input_ids=next_token,
            past_key_values=cache,
            use_cache=True,
        )

        # Logits are produced for the new token only
        assert second_pass['logits'].shape[1] == 1


def test_param_count():
    """Check that the MODEL_125M preset reports roughly 125M parameters."""
    lower_bound = 100_000_000
    upper_bound = 150_000_000

    preset_model = NovaTransformer(MODEL_125M)

    # non_embedding=False presumably keeps embedding parameters in the total;
    # confirm against get_num_params.
    total_params = preset_model.get_num_params(non_embedding=False)

    # Accept anything strictly between 100M and 150M
    assert lower_bound < total_params < upper_bound


if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode
    pytest_args = [__file__, "-v"]
    pytest.main(pytest_args)