NOVA/tests/test_tokenizer.py
Dani a7f091aa45 Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache; RMSNorm sketched after this list)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)
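
RMSNorm, one of the components named in the list above, replaces LayerNorm's mean-centering with a simple root-mean-square rescaling followed by a learned gain. A minimal PyTorch sketch of the standard formulation (illustrative only, not necessarily NOVA's exact implementation):

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Standard RMSNorm: rescale activations by their inverse root mean square."""
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize over the feature dimension, then apply the learned gain.
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x * rms)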

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-12 20:56:37 -04:00


"""
Tests for NOVA tokenizer
"""
import pytest
import tempfile
from pathlib import Path
from nova_tokenizer import train_tokenizer, NovaTokenizer
def test_tokenizer_training():
"""Test training a tokenizer"""
# Create temporary training file
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
for i in range(100):
f.write(f"This is sentence number {i}. Hello world!\n")
temp_file = f.name
# Create temporary output
with tempfile.TemporaryDirectory() as tmpdir:
output_prefix = str(Path(tmpdir) / "test_tokenizer")
# Train
model_path = train_tokenizer(
input_files=[temp_file],
model_prefix=output_prefix,
vocab_size=500,
model_type='bpe',
)
assert Path(model_path).exists()
assert model_path.endswith('.model')
# Clean up
Path(temp_file).unlink()
def test_tokenizer_encode_decode():
"""Test encoding and decoding"""
# Create and train a tiny tokenizer
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
f.write("hello world " * 100)
temp_file = f.name
with tempfile.TemporaryDirectory() as tmpdir:
output_prefix = str(Path(tmpdir) / "test_tok")
model_path = train_tokenizer(
input_files=[temp_file],
model_prefix=output_prefix,
vocab_size=100,
)
# Load tokenizer
tokenizer = NovaTokenizer(model_path)
# Test encode/decode
text = "hello world"
ids = tokenizer.encode(text, add_bos=False, add_eos=False)
assert isinstance(ids, list)
assert len(ids) > 0
decoded = tokenizer.decode(ids, skip_special_tokens=True)
# May not be exact due to tokenization, but should be similar
assert "hello" in decoded.lower()
Path(temp_file).unlink()
def test_tokenizer_batch():
"""Test batch encoding"""
# Quick test with dummy tokenizer
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
f.write("test " * 100)
temp_file = f.name
with tempfile.TemporaryDirectory() as tmpdir:
output_prefix = str(Path(tmpdir) / "batch_tok")
model_path = train_tokenizer(
input_files=[temp_file],
model_prefix=output_prefix,
vocab_size=100,
)
tokenizer = NovaTokenizer(model_path)
# Batch encode
texts = ["hello", "world", "test"]
batch_ids = tokenizer.encode_batch(texts, add_bos=False, add_eos=False)
assert len(batch_ids) == 3
assert all(isinstance(ids, list) for ids in batch_ids)
# Batch decode
decoded = tokenizer.decode_batch(batch_ids)
assert len(decoded) == 3
Path(temp_file).unlink()
if __name__ == "__main__":
pytest.main([__file__, "-v"])
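
Taken together, the tests read as a minimal recipe for the tokenizer workflow. A condensed sketch using only the calls exercised above (the corpus path, model prefix, and vocab size are illustrative, not fixed values from the project):

from nova_tokenizer import train_tokenizer, NovaTokenizer

# Train a small BPE model on a plain-text corpus.
model_path = train_tokenizer(
    input_files=["corpus.txt"],
    model_prefix="nova_tok",
    vocab_size=500,
    model_type='bpe',
)

# Load the trained model and round-trip a string.
tokenizer = NovaTokenizer(model_path)
ids = tokenizer.encode("hello world", add_bos=False, add_eos=False)
print(tokenizer.decode(ids, skip_special_tokens=True))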