Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
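For orientation, a minimal sketch of standing up the core model described above. The names NovaTransformer, ModelConfig, and the field names all appear in the source file below; the 125M-class values and the "silu"/"rmsnorm" strings are assumed defaults for illustration, not values taken from the repo.

    from nova_core import NovaTransformer, ModelConfig

    config = ModelConfig(
        vocab_size=32000,                 # assumed SentencePiece vocab size
        hidden_size=768,                  # assumed ~125M-class width
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        max_position_embeddings=2048,
        rope_theta=10000.0,               # RoPE base frequency
        hidden_act="silu",                # SwiGLU gate activation (assumed value)
        norm_type="rmsnorm",              # RMSNorm (assumed value)
    )
    model = NovaTransformer(config)       # local-first: runs on CPU or GPU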
244 lines
7.2 KiB
Python
"""
|
|
Fitness evaluator for NOVA-EVO
|
|
"""
|
|
|
|
import torch
|
|
import time
|
|
from typing import Dict
|
|
from pathlib import Path
|
|
|
|
from .config import Individual, EvolutionConfig
|
|
from nova_core import NovaTransformer, ModelConfig
|
|
from nova_train import NovaTrainer, TrainingConfig
|
|
|
|
|
|
class FitnessEvaluator:
    """
    Evaluates fitness of individuals by training and measuring metrics

    Metrics:
    - Loss/perplexity (quality of learning)
    - Latency (inference speed)
    - Memory usage (approximate parameter footprint)
    - Chat quality (optional, if eval set available)
    """

    def __init__(
        self,
        base_model_config: ModelConfig,
        evo_config: EvolutionConfig,
        train_dataset,
        eval_dataset=None,
        device: str = "auto",
    ):
        """
        Args:
            base_model_config: Base model configuration
            evo_config: Evolution configuration
            train_dataset: Training dataset for fitness eval
            eval_dataset: Optional evaluation dataset
            device: Device for training ("auto", "cpu", or "cuda")
        """
        self.base_model_config = base_model_config
        self.evo_config = evo_config
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.device = device

    def evaluate(self, individual: Individual) -> Dict[str, float]:
        """
        Evaluate fitness of an individual

        Args:
            individual: Individual to evaluate

        Returns:
            Dictionary of metrics
        """
        # Create model with individual's architecture choices
        model_config = self._create_model_config(individual)
        model = NovaTransformer(model_config)

        # Create training config with individual's hyperparameters
        train_config = self._create_training_config(individual)

        # Train for eval_steps
        train_loader = self._create_dataloader(
            self.train_dataset,
            batch_size=individual.batch_size
        )

        # Quick training
        loss = self._quick_train(model, train_config, train_loader)

        # Measure latency
        latency_ms = self._measure_latency(model)

        # Measure memory
        memory_mb = self._measure_memory(model)

        # Calculate perplexity (guard against exp() overflow when training diverges)
        perplexity = torch.exp(torch.tensor(loss)).item() if loss < 100 else float('inf')

        return {
            'loss': loss,
            'perplexity': perplexity,
            'latency_ms': latency_ms,
            'memory_mb': memory_mb,
            'quality_score': 0.0,  # TODO: Implement chat quality eval
        }

    def _create_model_config(self, individual: Individual) -> ModelConfig:
        """Create model config from individual's genes"""
        config = ModelConfig(
            vocab_size=self.base_model_config.vocab_size,
            hidden_size=self.base_model_config.hidden_size,
            num_hidden_layers=self.base_model_config.num_hidden_layers,
            num_attention_heads=self.base_model_config.num_attention_heads,
            intermediate_size=self.base_model_config.intermediate_size,
            max_position_embeddings=self.base_model_config.max_position_embeddings,
            # Individual's choices
            rope_theta=individual.rope_theta,
            hidden_act=individual.hidden_act,
            norm_type=individual.norm_type,
        )
        return config

    def _create_training_config(self, individual: Individual) -> TrainingConfig:
        """Create training config from individual's hyperparameters"""
        config = TrainingConfig(
            learning_rate=individual.learning_rate,
            batch_size=individual.batch_size,
            warmup_steps=individual.warmup_steps,
            weight_decay=individual.weight_decay,
            num_epochs=1,  # Just one pass for eval
            save_steps=999999,  # Don't save during eval
            device=self.device,
        )
        return config

    def _create_dataloader(self, dataset, batch_size: int):
        """Create dataloader for training"""
        from torch.utils.data import DataLoader

        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=0,
        )

    def _quick_train(
        self,
        model: NovaTransformer,
        train_config: TrainingConfig,
        train_loader
    ) -> float:
        """
        Quick training for evaluation

        Returns:
            Mean loss over the evaluated batches
        """
        # Limit to eval_steps
        limited_loader = []
        for i, batch in enumerate(train_loader):
            if i >= self.evo_config.eval_steps:
                break
            limited_loader.append(batch)

        if not limited_loader:
            return float('inf')

        # Simple training loop; resolve "auto" to CUDA when available
        if self.device == "auto":
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            device = torch.device(self.device)
        model.to(device)
        model.train()

        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=train_config.learning_rate,
            weight_decay=train_config.weight_decay,
        )

        total_loss = 0.0
        num_batches = 0

        for batch in limited_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch.get('labels', input_ids).to(device)

            outputs = model(input_ids=input_ids)
            logits = outputs['logits']

            # Calculate loss (shift logits/labels for next-token prediction)
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            loss = torch.nn.functional.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100
            )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        return total_loss / num_batches if num_batches > 0 else float('inf')

    @torch.no_grad()
    def _measure_latency(self, model: NovaTransformer) -> float:
        """
        Measure average inference latency in milliseconds

        Args:
            model: Model to measure

        Returns:
            Average latency in ms
        """
        device = next(model.parameters()).device
        model.eval()

        # Dummy input
        input_ids = torch.randint(0, model.config.vocab_size, (1, 128), device=device)

        # Warmup
        for _ in range(3):
            _ = model(input_ids=input_ids)

        # Wait for warmup kernels to finish so they don't count toward the timing
        if device.type == 'cuda':
            torch.cuda.synchronize()

        # Measure
        num_runs = 10
        start = time.time()

        for _ in range(num_runs):
            _ = model(input_ids=input_ids)

        if device.type == 'cuda':
            torch.cuda.synchronize()

        elapsed = (time.time() - start) / num_runs
        return elapsed * 1000  # Convert to ms

    def _measure_memory(self, model: NovaTransformer) -> float:
        """
        Approximate memory footprint of the model in MB

        Estimated from parameter count (4 bytes per float32 parameter)
        rather than measured peak RAM/VRAM.

        Args:
            model: Model to measure

        Returns:
            Approximate memory in MB
        """
        # Count parameters
        num_params = sum(p.numel() for p in model.parameters())

        # Approximate memory (4 bytes per float32 parameter)
        memory_mb = (num_params * 4) / (1024 ** 2)

        return memory_mb
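For context, a minimal usage sketch of the evaluator above. Only the constructor signature, evaluate(), and the Individual/EvolutionConfig field names read in this file come from the source; the nova_evo package path, the default values, and the toy dataset are assumptions for illustration.

    import torch
    from torch.utils.data import Dataset

    from nova_evo.config import Individual, EvolutionConfig  # assumed package path
    from nova_evo.fitness import FitnessEvaluator            # assumed package path
    from nova_core import ModelConfig


    class ToyDataset(Dataset):
        """Stand-in dataset: random token ids shaped like real training batches."""

        def __len__(self):
            return 64

        def __getitem__(self, idx):
            return {'input_ids': torch.randint(0, 32000, (128,))}


    evaluator = FitnessEvaluator(
        base_model_config=ModelConfig(vocab_size=32000),  # assumed: other fields use defaults
        evo_config=EvolutionConfig(eval_steps=20),        # eval_steps is read by _quick_train
        train_dataset=ToyDataset(),
        device="auto",
    )

    individual = Individual(                              # gene fields the evaluator reads
        learning_rate=3e-4, batch_size=8, warmup_steps=100, weight_decay=0.1,
        rope_theta=10000.0, hidden_act="silu", norm_type="rmsnorm",
    )

    metrics = evaluator.evaluate(individual)
    print(metrics['loss'], metrics['perplexity'], metrics['latency_ms'], metrics['memory_mb'])

In the real NOVA-EVO loop, Individual instances would come from the evolutionary search rather than being constructed by hand.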