Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
75 lines · 2.0 KiB · Python
"""
|
|
Training configuration
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
|
|
class TrainingConfig:
|
|
"""Configuration for training NOVA models"""
|
|
|
|
# Model
|
|
model_name: str = "nova-125m"
|
|
model_config_path: Optional[str] = None
|
|
|
|
# Data
|
|
train_data_path: str = "data/train"
|
|
val_data_path: str = "data/val"
|
|
max_seq_length: int = 2048
|
|
|
|
# Training hyperparameters
|
|
num_epochs: int = 10
|
|
batch_size: int = 8
|
|
gradient_accumulation_steps: int = 4
|
|
learning_rate: float = 3e-4
|
|
weight_decay: float = 0.1
|
|
max_grad_norm: float = 1.0
|
|
warmup_steps: int = 1000
|
|
lr_scheduler: str = "cosine" # or "linear", "constant"
|
|
|
|
# Optimization
|
|
optimizer: str = "adamw" # or "lion", "adafactor"
|
|
adam_beta1: float = 0.9
|
|
adam_beta2: float = 0.95
|
|
adam_epsilon: float = 1e-8
|
|
|
|
# Mixed precision and efficiency
|
|
use_amp: bool = True # Automatic Mixed Precision
|
|
gradient_checkpointing: bool = False
|
|
use_ddp: bool = False # Distributed Data Parallel
|
|
|
|
# Checkpointing
|
|
save_dir: str = "checkpoints"
|
|
save_steps: int = 1000
|
|
save_total_limit: int = 5
|
|
resume_from_checkpoint: Optional[str] = None
|
|
|
|
# Evaluation
|
|
eval_steps: int = 500
|
|
eval_strategy: str = "steps" # or "epoch"
|
|
logging_steps: int = 100
|
|
|
|
# Early stopping
|
|
early_stopping: bool = False
|
|
early_stopping_patience: int = 3
|
|
early_stopping_threshold: float = 0.001
|
|
|
|
# Reproducibility
|
|
seed: int = 42
|
|
|
|
# Device
|
|
device: str = "auto" # "auto", "cpu", "cuda", "cuda:0", etc.
|
|
|
|
# Logging
|
|
log_to_wandb: bool = False
|
|
wandb_project: Optional[str] = None
|
|
wandb_run_name: Optional[str] = None
|
|
|
|
def __post_init__(self):
|
|
"""Validate configuration"""
|
|
assert self.batch_size > 0, "batch_size must be positive"
|
|
assert self.learning_rate > 0, "learning_rate must be positive"
|
|
assert self.gradient_accumulation_steps > 0, "gradient_accumulation_steps must be positive"
|
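For reference, a minimal usage sketch of this config. The import path, the device-resolution convention for "auto", and the effective-batch-size calculation are illustrative assumptions, not part of the file above:

    # Illustrative only; assumes the TrainingConfig defined above.
    import torch

    from config import TrainingConfig  # hypothetical import path; adjust to the repo layout

    config = TrainingConfig(
        model_name="nova-125m",
        batch_size=4,
        gradient_accumulation_steps=8,
        learning_rate=1e-4,
        use_amp=True,
    )

    # Resolve device="auto" to a concrete torch device (assumed convention).
    device = torch.device(
        ("cuda" if torch.cuda.is_available() else "cpu")
        if config.device == "auto"
        else config.device
    )

    # Effective batch size = per-step batch * accumulation steps (4 * 8 = 32 here).
    effective_batch = config.batch_size * config.gradient_accumulation_steps
    print(f"device={device}, effective batch size={effective_batch}")

    # __post_init__ validation rejects invalid values at construction time.
    try:
        TrainingConfig(batch_size=0)
    except AssertionError as exc:
        print(f"rejected: {exc}")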
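The warmup_steps and lr_scheduler fields together imply a warmup-then-decay schedule. A minimal sketch of the "cosine" variant, assuming linear warmup to the base learning rate over warmup_steps followed by cosine decay to zero; the exact shape used by NOVA's trainer may differ:

    import math

    def lr_at_step(step: int, total_steps: int, base_lr: float, warmup_steps: int) -> float:
        """LR at a given optimizer step: linear warmup, then cosine decay to 0."""
        if step < warmup_steps:
            return base_lr * step / max(1, warmup_steps)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

    # With the defaults above (learning_rate=3e-4, warmup_steps=1000):
    # lr_at_step(500, 10_000, 3e-4, 1000)  -> 1.5e-4 (halfway through warmup)
    # lr_at_step(5500, 10_000, 3e-4, 1000) -> 1.5e-4 (halfway through cosine decay)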