"""
|
|
Tests for NOVA tokenizer
|
|
"""
|
|
|
|
import pytest
|
|
import tempfile
|
|
from pathlib import Path
|
|
from nova_tokenizer import train_tokenizer, NovaTokenizer
|
|
|
|
|
|
def test_tokenizer_training():
|
|
"""Test training a tokenizer"""
|
|
# Create temporary training file
|
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
|
for i in range(100):
|
|
f.write(f"This is sentence number {i}. Hello world!\n")
|
|
temp_file = f.name
|
|
|
|
# Create temporary output
|
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
output_prefix = str(Path(tmpdir) / "test_tokenizer")
|
|
|
|
# Train
|
|
model_path = train_tokenizer(
|
|
input_files=[temp_file],
|
|
model_prefix=output_prefix,
|
|
vocab_size=500,
|
|
model_type='bpe',
|
|
)
|
|
|
|
assert Path(model_path).exists()
|
|
assert model_path.endswith('.model')
|
|
|
|
# Clean up
|
|
Path(temp_file).unlink()
|
|
|
|
|
|
def test_tokenizer_encode_decode():
|
|
"""Test encoding and decoding"""
|
|
# Create and train a tiny tokenizer
|
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
|
f.write("hello world " * 100)
|
|
temp_file = f.name
|
|
|
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
output_prefix = str(Path(tmpdir) / "test_tok")
|
|
|
|
model_path = train_tokenizer(
|
|
input_files=[temp_file],
|
|
model_prefix=output_prefix,
|
|
vocab_size=100,
|
|
)
|
|
|
|
# Load tokenizer
|
|
tokenizer = NovaTokenizer(model_path)
|
|
|
|
# Test encode/decode
|
|
text = "hello world"
|
|
ids = tokenizer.encode(text, add_bos=False, add_eos=False)
|
|
|
|
assert isinstance(ids, list)
|
|
assert len(ids) > 0
|
|
|
|
decoded = tokenizer.decode(ids, skip_special_tokens=True)
|
|
# May not be exact due to tokenization, but should be similar
|
|
assert "hello" in decoded.lower()
|
|
|
|
Path(temp_file).unlink()
|
|
|
|
|
|
def test_tokenizer_batch():
|
|
"""Test batch encoding"""
|
|
# Quick test with dummy tokenizer
|
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
|
f.write("test " * 100)
|
|
temp_file = f.name
|
|
|
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
output_prefix = str(Path(tmpdir) / "batch_tok")
|
|
|
|
model_path = train_tokenizer(
|
|
input_files=[temp_file],
|
|
model_prefix=output_prefix,
|
|
vocab_size=100,
|
|
)
|
|
|
|
tokenizer = NovaTokenizer(model_path)
|
|
|
|
# Batch encode
|
|
texts = ["hello", "world", "test"]
|
|
batch_ids = tokenizer.encode_batch(texts, add_bos=False, add_eos=False)
|
|
|
|
assert len(batch_ids) == 3
|
|
assert all(isinstance(ids, list) for ids in batch_ids)
|
|
|
|
# Batch decode
|
|
decoded = tokenizer.decode_batch(batch_ids)
|
|
|
|
assert len(decoded) == 3
|
|
|
|
Path(temp_file).unlink()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
pytest.main([__file__, "-v"])
|