Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with: Core Features: - Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache) - SentencePiece tokenizer (BPE/Unigram) - Training pipeline (AMP, gradient checkpointing, DDP) - Persona system with personality matrix (NO AI disclosure by default) - Genetic evolution (NOVA-EVO) for hyperparameter optimization - Legal-only data pipeline with license tracking - Chat interface (CLI + REST API) - Conversation memory (SQLite) Model Sizes: - 125M, 350M, 1.3B, 3B parameters - Local-first, runs on CPU or GPU - Python 3.10.6+, PyTorch 2.0+ Personas: - girlfriend_gentle (high warmth, high empathy) - girlfriend_playful (high humor, high playfulness) - girlfriend_supportive (balanced, default) Documentation: - Complete README with quickstart - Model card with ethical considerations - Privacy documentation (local-first, zero telemetry) - Data licenses and attribution - Contributing guide Infrastructure: - GitHub Actions CI/CD - Comprehensive test suite - Quickstart script - CLI tool License: Apache 2.0 🤖 Generated with Claude Code https://claude.com/claude-code Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
13
nova_evo/__init__.py
Normal file
13
nova_evo/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
"""
|
||||
NOVA-EVO - Genetic algorithm for architecture and hyperparameter optimization
|
||||
"""
|
||||
|
||||
from .evolution import EvolutionEngine
|
||||
from .fitness import FitnessEvaluator
|
||||
from .config import EvolutionConfig
|
||||
|
||||
__all__ = [
|
||||
'EvolutionEngine',
|
||||
'FitnessEvaluator',
|
||||
'EvolutionConfig',
|
||||
]
|
117
nova_evo/config.py
Normal file
117
nova_evo/config.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""
|
||||
Evolution configuration for NOVA-EVO
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvolutionConfig:
|
||||
"""Configuration for genetic algorithm evolution"""
|
||||
|
||||
# Population settings
|
||||
population_size: int = 20
|
||||
num_generations: int = 10
|
||||
elite_ratio: float = 0.2 # Top performers to keep
|
||||
mutation_rate: float = 0.3
|
||||
|
||||
# Search space - hyperparameters
|
||||
search_learning_rate: bool = True
|
||||
lr_min: float = 1e-5
|
||||
lr_max: float = 1e-3
|
||||
|
||||
search_batch_size: bool = True
|
||||
batch_size_options: List[int] = field(default_factory=lambda: [4, 8, 16, 32])
|
||||
|
||||
search_warmup_steps: bool = True
|
||||
warmup_min: int = 100
|
||||
warmup_max: int = 2000
|
||||
|
||||
search_weight_decay: bool = True
|
||||
wd_min: float = 0.0
|
||||
wd_max: float = 0.3
|
||||
|
||||
# Search space - architecture toggles
|
||||
search_rope_theta: bool = True
|
||||
rope_theta_options: List[float] = field(default_factory=lambda: [1000.0, 10000.0, 100000.0])
|
||||
|
||||
search_activation: bool = True
|
||||
activation_options: List[str] = field(default_factory=lambda: ['swiglu', 'geglu', 'gelu'])
|
||||
|
||||
search_norm: bool = True
|
||||
norm_options: List[str] = field(default_factory=lambda: ['rmsnorm', 'layernorm'])
|
||||
|
||||
# Fitness evaluation
|
||||
eval_steps: int = 100 # How many steps to train for evaluation
|
||||
eval_dataset_size: int = 1000 # Number of samples for evaluation
|
||||
|
||||
# Multi-objective weights
|
||||
loss_weight: float = 0.5
|
||||
latency_weight: float = 0.2
|
||||
memory_weight: float = 0.2
|
||||
quality_weight: float = 0.1 # Chat quality (if eval set available)
|
||||
|
||||
# Compute budgets
|
||||
max_eval_time_seconds: float = 300.0 # Max time per individual eval
|
||||
max_total_time_hours: float = 24.0 # Max total evolution time
|
||||
|
||||
# Checkpointing
|
||||
save_dir: str = "nova_evo/hall_of_fame"
|
||||
checkpoint_every_n_generations: int = 5
|
||||
|
||||
# Reproducibility
|
||||
seed: int = 42
|
||||
|
||||
|
||||
@dataclass
|
||||
class Individual:
|
||||
"""Single individual in evolution population"""
|
||||
|
||||
# Hyperparameters
|
||||
learning_rate: float = 3e-4
|
||||
batch_size: int = 8
|
||||
warmup_steps: int = 1000
|
||||
weight_decay: float = 0.1
|
||||
|
||||
# Architecture choices
|
||||
rope_theta: float = 10000.0
|
||||
hidden_act: str = "swiglu"
|
||||
norm_type: str = "rmsnorm"
|
||||
|
||||
# Fitness scores
|
||||
loss: Optional[float] = None
|
||||
perplexity: Optional[float] = None
|
||||
latency_ms: Optional[float] = None
|
||||
memory_mb: Optional[float] = None
|
||||
quality_score: Optional[float] = None
|
||||
fitness: Optional[float] = None
|
||||
|
||||
# Metadata
|
||||
generation: int = 0
|
||||
parent_ids: List[int] = field(default_factory=list)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert to dictionary"""
|
||||
return {
|
||||
'learning_rate': self.learning_rate,
|
||||
'batch_size': self.batch_size,
|
||||
'warmup_steps': self.warmup_steps,
|
||||
'weight_decay': self.weight_decay,
|
||||
'rope_theta': self.rope_theta,
|
||||
'hidden_act': self.hidden_act,
|
||||
'norm_type': self.norm_type,
|
||||
'loss': self.loss,
|
||||
'perplexity': self.perplexity,
|
||||
'latency_ms': self.latency_ms,
|
||||
'memory_mb': self.memory_mb,
|
||||
'quality_score': self.quality_score,
|
||||
'fitness': self.fitness,
|
||||
'generation': self.generation,
|
||||
'parent_ids': self.parent_ids,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: Dict[str, Any]) -> 'Individual':
|
||||
"""Create from dictionary"""
|
||||
return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
|
318
nova_evo/evolution.py
Normal file
318
nova_evo/evolution.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""
|
||||
NOVA-EVO: Genetic algorithm for hyperparameter and architecture search
|
||||
"""
|
||||
|
||||
import random
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Optional
|
||||
import time
|
||||
from tqdm import tqdm
|
||||
import copy
|
||||
|
||||
from .config import EvolutionConfig, Individual
|
||||
from .fitness import FitnessEvaluator
|
||||
|
||||
|
||||
class EvolutionEngine:
|
||||
"""
|
||||
Genetic algorithm engine for evolving NOVA configurations
|
||||
|
||||
Features:
|
||||
- Multi-objective fitness (loss, latency, memory, quality)
|
||||
- Elitism with Pareto selection
|
||||
- Mutation and crossover
|
||||
- Hall of Fame for best individuals
|
||||
- Rollback on regression
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: EvolutionConfig,
|
||||
fitness_evaluator: FitnessEvaluator,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
config: Evolution configuration
|
||||
fitness_evaluator: Fitness evaluation engine
|
||||
"""
|
||||
self.config = config
|
||||
self.evaluator = fitness_evaluator
|
||||
|
||||
# Population
|
||||
self.population: List[Individual] = []
|
||||
self.generation = 0
|
||||
|
||||
# Hall of Fame - best individuals
|
||||
self.hall_of_fame: List[Individual] = []
|
||||
self.max_hof_size = 10
|
||||
|
||||
# Tracking
|
||||
self.evolution_history = []
|
||||
self.start_time = None
|
||||
|
||||
# Setup
|
||||
Path(config.save_dir).mkdir(parents=True, exist_ok=True)
|
||||
random.seed(config.seed)
|
||||
|
||||
def initialize_population(self) -> List[Individual]:
|
||||
"""Create initial random population"""
|
||||
print(f"Initializing population of {self.config.population_size}...")
|
||||
|
||||
population = []
|
||||
|
||||
for i in range(self.config.population_size):
|
||||
individual = Individual(
|
||||
learning_rate=random.uniform(self.config.lr_min, self.config.lr_max) if self.config.search_learning_rate else 3e-4,
|
||||
batch_size=random.choice(self.config.batch_size_options) if self.config.search_batch_size else 8,
|
||||
warmup_steps=random.randint(self.config.warmup_min, self.config.warmup_max) if self.config.search_warmup_steps else 1000,
|
||||
weight_decay=random.uniform(self.config.wd_min, self.config.wd_max) if self.config.search_weight_decay else 0.1,
|
||||
rope_theta=random.choice(self.config.rope_theta_options) if self.config.search_rope_theta else 10000.0,
|
||||
hidden_act=random.choice(self.config.activation_options) if self.config.search_activation else "swiglu",
|
||||
norm_type=random.choice(self.config.norm_options) if self.config.search_norm else "rmsnorm",
|
||||
generation=0,
|
||||
)
|
||||
population.append(individual)
|
||||
|
||||
return population
|
||||
|
||||
def evaluate_population(self, population: List[Individual]) -> List[Individual]:
|
||||
"""Evaluate fitness for all individuals in population"""
|
||||
print(f"\nEvaluating {len(population)} individuals...")
|
||||
|
||||
for idx, individual in enumerate(tqdm(population, desc="Evaluating")):
|
||||
# Skip if already evaluated
|
||||
if individual.fitness is not None:
|
||||
continue
|
||||
|
||||
# Evaluate
|
||||
metrics = self.evaluator.evaluate(individual)
|
||||
|
||||
# Store metrics
|
||||
individual.loss = metrics['loss']
|
||||
individual.perplexity = metrics.get('perplexity')
|
||||
individual.latency_ms = metrics.get('latency_ms')
|
||||
individual.memory_mb = metrics.get('memory_mb')
|
||||
individual.quality_score = metrics.get('quality_score', 0.0)
|
||||
|
||||
# Calculate multi-objective fitness
|
||||
individual.fitness = self._calculate_fitness(individual)
|
||||
|
||||
return population
|
||||
|
||||
def _calculate_fitness(self, individual: Individual) -> float:
|
||||
"""
|
||||
Calculate multi-objective fitness score
|
||||
|
||||
Lower is better (we're minimizing)
|
||||
"""
|
||||
fitness = 0.0
|
||||
|
||||
# Loss component (lower is better)
|
||||
if individual.loss is not None:
|
||||
fitness += individual.loss * self.config.loss_weight
|
||||
|
||||
# Latency component (lower is better, normalized)
|
||||
if individual.latency_ms is not None:
|
||||
normalized_latency = individual.latency_ms / 1000.0 # Normalize to seconds
|
||||
fitness += normalized_latency * self.config.latency_weight
|
||||
|
||||
# Memory component (lower is better, normalized)
|
||||
if individual.memory_mb is not None:
|
||||
normalized_memory = individual.memory_mb / 1000.0 # Normalize to GB
|
||||
fitness += normalized_memory * self.config.memory_weight
|
||||
|
||||
# Quality component (higher is better, so negate)
|
||||
if individual.quality_score is not None:
|
||||
fitness -= individual.quality_score * self.config.quality_weight
|
||||
|
||||
return fitness
|
||||
|
||||
def select_parents(self, population: List[Individual]) -> List[Individual]:
|
||||
"""
|
||||
Select parents for next generation using elitism
|
||||
|
||||
Args:
|
||||
population: Current population (should be evaluated)
|
||||
|
||||
Returns:
|
||||
Elite individuals to keep
|
||||
"""
|
||||
# Sort by fitness (lower is better)
|
||||
sorted_pop = sorted(population, key=lambda x: x.fitness if x.fitness is not None else float('inf'))
|
||||
|
||||
# Select top performers
|
||||
num_elite = max(1, int(len(population) * self.config.elite_ratio))
|
||||
elite = sorted_pop[:num_elite]
|
||||
|
||||
return elite
|
||||
|
||||
def crossover(self, parent1: Individual, parent2: Individual) -> Individual:
|
||||
"""
|
||||
Create offspring by combining two parents
|
||||
|
||||
Uses uniform crossover - randomly picks from each parent
|
||||
"""
|
||||
child = Individual(
|
||||
learning_rate=random.choice([parent1.learning_rate, parent2.learning_rate]),
|
||||
batch_size=random.choice([parent1.batch_size, parent2.batch_size]),
|
||||
warmup_steps=random.choice([parent1.warmup_steps, parent2.warmup_steps]),
|
||||
weight_decay=random.choice([parent1.weight_decay, parent2.weight_decay]),
|
||||
rope_theta=random.choice([parent1.rope_theta, parent2.rope_theta]),
|
||||
hidden_act=random.choice([parent1.hidden_act, parent2.hidden_act]),
|
||||
norm_type=random.choice([parent1.norm_type, parent2.norm_type]),
|
||||
generation=self.generation + 1,
|
||||
parent_ids=[id(parent1), id(parent2)],
|
||||
)
|
||||
|
||||
return child
|
||||
|
||||
def mutate(self, individual: Individual) -> Individual:
|
||||
"""
|
||||
Mutate an individual with random changes
|
||||
|
||||
Args:
|
||||
individual: Individual to mutate
|
||||
|
||||
Returns:
|
||||
Mutated copy
|
||||
"""
|
||||
mutated = copy.deepcopy(individual)
|
||||
mutated.generation = self.generation + 1
|
||||
|
||||
# Mutate each gene with some probability
|
||||
if random.random() < self.config.mutation_rate:
|
||||
mutated.learning_rate = random.uniform(self.config.lr_min, self.config.lr_max)
|
||||
|
||||
if random.random() < self.config.mutation_rate:
|
||||
mutated.batch_size = random.choice(self.config.batch_size_options)
|
||||
|
||||
if random.random() < self.config.mutation_rate:
|
||||
mutated.warmup_steps = random.randint(self.config.warmup_min, self.config.warmup_max)
|
||||
|
||||
if random.random() < self.config.mutation_rate:
|
||||
mutated.weight_decay = random.uniform(self.config.wd_min, self.config.wd_max)
|
||||
|
||||
if random.random() < self.config.mutation_rate:
|
||||
mutated.rope_theta = random.choice(self.config.rope_theta_options)
|
||||
|
||||
if random.random() < self.config.mutation_rate:
|
||||
mutated.hidden_act = random.choice(self.config.activation_options)
|
||||
|
||||
if random.random() < self.config.mutation_rate:
|
||||
mutated.norm_type = random.choice(self.config.norm_options)
|
||||
|
||||
# Reset fitness (needs re-evaluation)
|
||||
mutated.fitness = None
|
||||
mutated.loss = None
|
||||
|
||||
return mutated
|
||||
|
||||
def create_next_generation(self, parents: List[Individual]) -> List[Individual]:
|
||||
"""Create next generation from parents"""
|
||||
next_gen = []
|
||||
|
||||
# Keep elite unchanged
|
||||
next_gen.extend(copy.deepcopy(parents))
|
||||
|
||||
# Fill rest with offspring
|
||||
while len(next_gen) < self.config.population_size:
|
||||
# Select two random parents
|
||||
parent1, parent2 = random.sample(parents, 2)
|
||||
|
||||
# Crossover
|
||||
child = self.crossover(parent1, parent2)
|
||||
|
||||
# Mutate
|
||||
child = self.mutate(child)
|
||||
|
||||
next_gen.append(child)
|
||||
|
||||
return next_gen
|
||||
|
||||
def update_hall_of_fame(self, population: List[Individual]):
|
||||
"""Update hall of fame with best individuals"""
|
||||
# Add current best to hall of fame
|
||||
for ind in population:
|
||||
if ind.fitness is not None:
|
||||
self.hall_of_fame.append(copy.deepcopy(ind))
|
||||
|
||||
# Sort by fitness
|
||||
self.hall_of_fame.sort(key=lambda x: x.fitness if x.fitness is not None else float('inf'))
|
||||
|
||||
# Keep only top N
|
||||
self.hall_of_fame = self.hall_of_fame[:self.max_hof_size]
|
||||
|
||||
def save_checkpoint(self):
|
||||
"""Save evolution state"""
|
||||
checkpoint_path = Path(self.config.save_dir) / f"generation_{self.generation}.json"
|
||||
|
||||
checkpoint = {
|
||||
'generation': self.generation,
|
||||
'population': [ind.to_dict() for ind in self.population],
|
||||
'hall_of_fame': [ind.to_dict() for ind in self.hall_of_fame],
|
||||
'config': self.config.__dict__,
|
||||
}
|
||||
|
||||
with open(checkpoint_path, 'w') as f:
|
||||
json.dump(checkpoint, f, indent=2)
|
||||
|
||||
print(f" Checkpoint saved: {checkpoint_path}")
|
||||
|
||||
def run(self):
|
||||
"""Run the evolution process"""
|
||||
print("=" * 60)
|
||||
print("NOVA-EVO: Genetic Algorithm Evolution")
|
||||
print("=" * 60)
|
||||
|
||||
self.start_time = time.time()
|
||||
|
||||
# Initialize population
|
||||
self.population = self.initialize_population()
|
||||
|
||||
# Evolution loop
|
||||
for gen in range(self.config.num_generations):
|
||||
self.generation = gen
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Generation {gen + 1}/{self.config.num_generations}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
# Evaluate
|
||||
self.population = self.evaluate_population(self.population)
|
||||
|
||||
# Select parents
|
||||
parents = self.select_parents(self.population)
|
||||
|
||||
# Update hall of fame
|
||||
self.update_hall_of_fame(self.population)
|
||||
|
||||
# Report best individual
|
||||
best = self.hall_of_fame[0] if self.hall_of_fame else None
|
||||
if best:
|
||||
print(f"\n🏆 Best individual so far:")
|
||||
print(f" Fitness: {best.fitness:.4f}")
|
||||
print(f" Loss: {best.loss:.4f}")
|
||||
print(f" LR: {best.learning_rate:.2e}, BS: {best.batch_size}")
|
||||
print(f" Activation: {best.hidden_act}, Norm: {best.norm_type}")
|
||||
|
||||
# Checkpoint
|
||||
if (gen + 1) % self.config.checkpoint_every_n_generations == 0:
|
||||
self.save_checkpoint()
|
||||
|
||||
# Create next generation
|
||||
if gen < self.config.num_generations - 1:
|
||||
self.population = self.create_next_generation(parents)
|
||||
|
||||
# Final checkpoint
|
||||
self.save_checkpoint()
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Evolution Complete!")
|
||||
print("=" * 60)
|
||||
print(f"Total time: {(time.time() - self.start_time) / 3600:.2f} hours")
|
||||
print(f"\nTop 3 individuals:")
|
||||
for i, ind in enumerate(self.hall_of_fame[:3]):
|
||||
print(f"\n{i+1}. Fitness: {ind.fitness:.4f}")
|
||||
print(f" Loss: {ind.loss:.4f}, LR: {ind.learning_rate:.2e}")
|
||||
print(f" Batch size: {ind.batch_size}, Warmup: {ind.warmup_steps}")
|
||||
print(f" Activation: {ind.hidden_act}, Norm: {ind.norm_type}")
|
243
nova_evo/fitness.py
Normal file
243
nova_evo/fitness.py
Normal file
@@ -0,0 +1,243 @@
|
||||
"""
|
||||
Fitness evaluator for NOVA-EVO
|
||||
"""
|
||||
|
||||
import torch
|
||||
import time
|
||||
from typing import Dict
|
||||
from pathlib import Path
|
||||
|
||||
from .config import Individual, EvolutionConfig
|
||||
from nova_core import NovaTransformer, ModelConfig
|
||||
from nova_train import NovaTrainer, TrainingConfig
|
||||
|
||||
|
||||
class FitnessEvaluator:
|
||||
"""
|
||||
Evaluates fitness of individuals by training and measuring metrics
|
||||
|
||||
Metrics:
|
||||
- Loss/perplexity (quality of learning)
|
||||
- Latency (inference speed)
|
||||
- Memory usage (peak RAM/VRAM)
|
||||
- Chat quality (optional, if eval set available)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
base_model_config: ModelConfig,
|
||||
evo_config: EvolutionConfig,
|
||||
train_dataset,
|
||||
eval_dataset=None,
|
||||
device: str = "auto",
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
base_model_config: Base model configuration
|
||||
evo_config: Evolution configuration
|
||||
train_dataset: Training dataset for fitness eval
|
||||
eval_dataset: Optional evaluation dataset
|
||||
device: Device for training
|
||||
"""
|
||||
self.base_model_config = base_model_config
|
||||
self.evo_config = evo_config
|
||||
self.train_dataset = train_dataset
|
||||
self.eval_dataset = eval_dataset
|
||||
self.device = device
|
||||
|
||||
def evaluate(self, individual: Individual) -> Dict[str, float]:
|
||||
"""
|
||||
Evaluate fitness of an individual
|
||||
|
||||
Args:
|
||||
individual: Individual to evaluate
|
||||
|
||||
Returns:
|
||||
Dictionary of metrics
|
||||
"""
|
||||
# Create model with individual's architecture choices
|
||||
model_config = self._create_model_config(individual)
|
||||
model = NovaTransformer(model_config)
|
||||
|
||||
# Create training config with individual's hyperparameters
|
||||
train_config = self._create_training_config(individual)
|
||||
|
||||
# Train for eval_steps
|
||||
train_loader = self._create_dataloader(
|
||||
self.train_dataset,
|
||||
batch_size=individual.batch_size
|
||||
)
|
||||
|
||||
# Quick training
|
||||
loss = self._quick_train(model, train_config, train_loader)
|
||||
|
||||
# Measure latency
|
||||
latency_ms = self._measure_latency(model)
|
||||
|
||||
# Measure memory
|
||||
memory_mb = self._measure_memory(model)
|
||||
|
||||
# Calculate perplexity
|
||||
perplexity = torch.exp(torch.tensor(loss)).item() if loss < 100 else float('inf')
|
||||
|
||||
return {
|
||||
'loss': loss,
|
||||
'perplexity': perplexity,
|
||||
'latency_ms': latency_ms,
|
||||
'memory_mb': memory_mb,
|
||||
'quality_score': 0.0, # TODO: Implement chat quality eval
|
||||
}
|
||||
|
||||
def _create_model_config(self, individual: Individual) -> ModelConfig:
|
||||
"""Create model config from individual's genes"""
|
||||
config = ModelConfig(
|
||||
vocab_size=self.base_model_config.vocab_size,
|
||||
hidden_size=self.base_model_config.hidden_size,
|
||||
num_hidden_layers=self.base_model_config.num_hidden_layers,
|
||||
num_attention_heads=self.base_model_config.num_attention_heads,
|
||||
intermediate_size=self.base_model_config.intermediate_size,
|
||||
max_position_embeddings=self.base_model_config.max_position_embeddings,
|
||||
# Individual's choices
|
||||
rope_theta=individual.rope_theta,
|
||||
hidden_act=individual.hidden_act,
|
||||
norm_type=individual.norm_type,
|
||||
)
|
||||
return config
|
||||
|
||||
def _create_training_config(self, individual: Individual) -> TrainingConfig:
|
||||
"""Create training config from individual's hyperparameters"""
|
||||
config = TrainingConfig(
|
||||
learning_rate=individual.learning_rate,
|
||||
batch_size=individual.batch_size,
|
||||
warmup_steps=individual.warmup_steps,
|
||||
weight_decay=individual.weight_decay,
|
||||
num_epochs=1, # Just one pass for eval
|
||||
save_steps=999999, # Don't save during eval
|
||||
device=self.device,
|
||||
)
|
||||
return config
|
||||
|
||||
def _create_dataloader(self, dataset, batch_size: int):
|
||||
"""Create dataloader for training"""
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
return DataLoader(
|
||||
dataset,
|
||||
batch_size=batch_size,
|
||||
shuffle=True,
|
||||
num_workers=0,
|
||||
)
|
||||
|
||||
def _quick_train(
|
||||
self,
|
||||
model: NovaTransformer,
|
||||
train_config: TrainingConfig,
|
||||
train_loader
|
||||
) -> float:
|
||||
"""
|
||||
Quick training for evaluation
|
||||
|
||||
Returns:
|
||||
Final loss
|
||||
"""
|
||||
# Limit to eval_steps
|
||||
limited_loader = []
|
||||
for i, batch in enumerate(train_loader):
|
||||
if i >= self.evo_config.eval_steps:
|
||||
break
|
||||
limited_loader.append(batch)
|
||||
|
||||
if not limited_loader:
|
||||
return float('inf')
|
||||
|
||||
# Simple training loop
|
||||
device = torch.device(self.device if self.device != "auto" else "cuda" if torch.cuda.is_available() else "cpu")
|
||||
model.to(device)
|
||||
model.train()
|
||||
|
||||
optimizer = torch.optim.AdamW(
|
||||
model.parameters(),
|
||||
lr=train_config.learning_rate,
|
||||
weight_decay=train_config.weight_decay,
|
||||
)
|
||||
|
||||
total_loss = 0.0
|
||||
num_batches = 0
|
||||
|
||||
for batch in limited_loader:
|
||||
input_ids = batch['input_ids'].to(device)
|
||||
labels = batch.get('labels', input_ids).to(device)
|
||||
|
||||
outputs = model(input_ids=input_ids)
|
||||
logits = outputs['logits']
|
||||
|
||||
# Calculate loss
|
||||
shift_logits = logits[..., :-1, :].contiguous()
|
||||
shift_labels = labels[..., 1:].contiguous()
|
||||
|
||||
loss = torch.nn.functional.cross_entropy(
|
||||
shift_logits.view(-1, shift_logits.size(-1)),
|
||||
shift_labels.view(-1),
|
||||
ignore_index=-100
|
||||
)
|
||||
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
total_loss += loss.item()
|
||||
num_batches += 1
|
||||
|
||||
return total_loss / num_batches if num_batches > 0 else float('inf')
|
||||
|
||||
@torch.no_grad()
|
||||
def _measure_latency(self, model: NovaTransformer) -> float:
|
||||
"""
|
||||
Measure average inference latency in milliseconds
|
||||
|
||||
Args:
|
||||
model: Model to measure
|
||||
|
||||
Returns:
|
||||
Average latency in ms
|
||||
"""
|
||||
device = next(model.parameters()).device
|
||||
model.eval()
|
||||
|
||||
# Dummy input
|
||||
input_ids = torch.randint(0, model.config.vocab_size, (1, 128), device=device)
|
||||
|
||||
# Warmup
|
||||
for _ in range(3):
|
||||
_ = model(input_ids=input_ids)
|
||||
|
||||
# Measure
|
||||
num_runs = 10
|
||||
start = time.time()
|
||||
|
||||
for _ in range(num_runs):
|
||||
_ = model(input_ids=input_ids)
|
||||
|
||||
if device.type == 'cuda':
|
||||
torch.cuda.synchronize()
|
||||
|
||||
elapsed = (time.time() - start) / num_runs
|
||||
return elapsed * 1000 # Convert to ms
|
||||
|
||||
def _measure_memory(self, model: NovaTransformer) -> float:
|
||||
"""
|
||||
Measure peak memory usage in MB
|
||||
|
||||
Args:
|
||||
model: Model to measure
|
||||
|
||||
Returns:
|
||||
Peak memory in MB
|
||||
"""
|
||||
# Count parameters
|
||||
num_params = sum(p.numel() for p in model.parameters())
|
||||
|
||||
# Approximate memory (4 bytes per float32 parameter)
|
||||
memory_mb = (num_params * 4) / (1024 ** 2)
|
||||
|
||||
return memory_mb
|
Reference in New Issue
Block a user