"""Tests for the NOVA tokenizer.

Each test trains a tiny throw-away tokenizer on a synthetic corpus. The corpus
and the trained model both live inside one ``tempfile.TemporaryDirectory`` so
that everything is removed even when training or an assertion fails.
"""

import tempfile
from pathlib import Path

import pytest

from nova_tokenizer import NovaTokenizer, train_tokenizer


def _write_corpus(directory, text):
    """Write ``text`` to ``corpus.txt`` inside ``directory``.

    Args:
        directory: Existing directory that will hold the corpus file.
        text: Training text to write.

    Returns:
        The path of the written corpus file as a string.
    """
    corpus_path = Path(directory) / "corpus.txt"
    # Explicit encoding so the corpus bytes do not depend on the locale.
    corpus_path.write_text(text, encoding="utf-8")
    return str(corpus_path)


def test_tokenizer_training():
    """Training writes a ``.model`` file and returns its path."""
    corpus_text = "".join(
        f"This is sentence number {i}. Hello world!\n" for i in range(100)
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        corpus_file = _write_corpus(tmpdir, corpus_text)
        output_prefix = str(Path(tmpdir) / "test_tokenizer")

        model_path = train_tokenizer(
            input_files=[corpus_file],
            model_prefix=output_prefix,
            vocab_size=500,
            model_type='bpe',
        )

        assert Path(model_path).exists()
        assert model_path.endswith('.model')


def test_tokenizer_encode_decode():
    """Encoding yields a non-empty id list that decodes back to similar text."""
    with tempfile.TemporaryDirectory() as tmpdir:
        corpus_file = _write_corpus(tmpdir, "hello world " * 100)
        output_prefix = str(Path(tmpdir) / "test_tok")

        model_path = train_tokenizer(
            input_files=[corpus_file],
            model_prefix=output_prefix,
            vocab_size=100,
        )

        tokenizer = NovaTokenizer(model_path)

        text = "hello world"
        ids = tokenizer.encode(text, add_bos=False, add_eos=False)

        assert isinstance(ids, list)
        assert len(ids) > 0

        decoded = tokenizer.decode(ids, skip_special_tokens=True)
        # The round trip may not be exact due to tokenization, so only check
        # that the recognisable content survives.
        assert "hello" in decoded.lower()


def test_tokenizer_batch():
    """Batch encode/decode return one result per input text."""
    with tempfile.TemporaryDirectory() as tmpdir:
        corpus_file = _write_corpus(tmpdir, "test " * 100)
        output_prefix = str(Path(tmpdir) / "batch_tok")

        model_path = train_tokenizer(
            input_files=[corpus_file],
            model_prefix=output_prefix,
            vocab_size=100,
        )

        tokenizer = NovaTokenizer(model_path)

        texts = ["hello", "world", "test"]
        batch_ids = tokenizer.encode_batch(texts, add_bos=False, add_eos=False)

        assert len(batch_ids) == 3
        assert all(isinstance(ids, list) for ids in batch_ids)

        decoded = tokenizer.decode_batch(batch_ids)
        assert len(decoded) == 3


if __name__ == "__main__":
    # Propagate pytest's exit status so a failing run is visible to the shell.
    raise SystemExit(pytest.main([__file__, "-v"]))