"""
|
|
Tests for NOVA tokenizer
|
|
"""
|
|
|
|
import pytest
|
|
import tempfile
|
|
from pathlib import Path
|
|
from nova_tokenizer import train_tokenizer, NovaTokenizer
|
|
|
|
|
|
def test_tokenizer_training():
|
|
"""Test training a tokenizer"""
|
|
# Create temporary training file
|
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
|
for i in range(100):
|
|
f.write(f"This is sentence number {i}. Hello world!\n")
|
|
temp_file = f.name
|
|
|
|
# Create temporary output
|
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
output_prefix = str(Path(tmpdir) / "test_tokenizer")
|
|
|
|
# Train
|
|
model_path = train_tokenizer(
|
|
input_files=[temp_file],
|
|
model_prefix=output_prefix,
|
|
vocab_size=500,
|
|
model_type='bpe',
|
|
)
|
|
|
|
assert Path(model_path).exists()
|
|
assert model_path.endswith('.model')
|
|
|
|
# Clean up
|
|
Path(temp_file).unlink()
|
|
|
|
|
|
def test_tokenizer_encode_decode():
|
|
"""Test encoding and decoding"""
|
|
# Create and train a tiny tokenizer
|
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
|
f.write("hello world " * 100)
|
|
temp_file = f.name
|
|
|
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
output_prefix = str(Path(tmpdir) / "test_tok")
|
|
|
|
model_path = train_tokenizer(
|
|
input_files=[temp_file],
|
|
model_prefix=output_prefix,
|
|
vocab_size=100,
|
|
)
|
|
|
|
# Load tokenizer
|
|
tokenizer = NovaTokenizer(model_path)
|
|
|
|
# Test encode/decode
|
|
text = "hello world"
|
|
ids = tokenizer.encode(text, add_bos=False, add_eos=False)
|
|
|
|
assert isinstance(ids, list)
|
|
assert len(ids) > 0
|
|
|
|
decoded = tokenizer.decode(ids, skip_special_tokens=True)
|
|
# May not be exact due to tokenization, but should be similar
|
|
assert "hello" in decoded.lower()
|
|
|
|
Path(temp_file).unlink()
|
|
|
|
|
|
def test_tokenizer_batch():
|
|
"""Test batch encoding"""
|
|
# Quick test with dummy tokenizer
|
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
|
f.write("test " * 100)
|
|
temp_file = f.name
|
|
|
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
output_prefix = str(Path(tmpdir) / "batch_tok")
|
|
|
|
model_path = train_tokenizer(
|
|
input_files=[temp_file],
|
|
model_prefix=output_prefix,
|
|
vocab_size=100,
|
|
)
|
|
|
|
tokenizer = NovaTokenizer(model_path)
|
|
|
|
# Batch encode
|
|
texts = ["hello", "world", "test"]
|
|
batch_ids = tokenizer.encode_batch(texts, add_bos=False, add_eos=False)
|
|
|
|
assert len(batch_ids) == 3
|
|
assert all(isinstance(ids, list) for ids in batch_ids)
|
|
|
|
# Batch decode
|
|
decoded = tokenizer.decode_batch(batch_ids)
|
|
|
|
assert len(decoded) == 3
|
|
|
|
Path(temp_file).unlink()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
pytest.main([__file__, "-v"])
|