NOVA/nova_train/config.py
Dani a7f091aa45 Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache; see the RMSNorm sketch after this list)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)
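
The architecture bullets above are standard LLaMA-style components. As an illustration of the normalization piece, here is a minimal RMSNorm layer in PyTorch; this is a sketch of the general technique, not the repository's own implementation.

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Root-mean-square layer norm: no mean subtraction, no bias, one learned scale."""
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Scale each vector by the reciprocal RMS over its last dimension.
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * rms * self.weight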

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters (a rough parameter-count sketch follows this list)
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+
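
For a rough sense of where the 125M figure comes from, the arithmetic below uses assumed dimensions; the hidden size, layer count, and vocabulary are placeholders for illustration, not values taken from NOVA.

# Back-of-the-envelope estimate for the smallest ("125M-class") configuration.
vocab_size = 32_000      # SentencePiece vocab (assumed)
d_model    = 768         # hidden size (assumed)
n_layers   = 12          # transformer blocks (assumed)

embed_params = vocab_size * d_model                 # token embeddings
attn_params  = 4 * d_model * d_model                # Q, K, V, O projections per block
mlp_params   = 3 * d_model * int(8 * d_model / 3)   # SwiGLU uses three weight matrices
block_params = attn_params + mlp_params

total = embed_params + n_layers * block_params
print(f"~{total / 1e6:.0f}M parameters")            # roughly 110M with these assumptions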

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default; a hypothetical config sketch follows this list)
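
One way such a persona and its personality matrix could be represented, sketched here with hypothetical field names and a 0-1 trait scale; this is not NOVA's actual schema.

from dataclasses import dataclass

@dataclass
class PersonaConfig:
    """Hypothetical persona schema: trait weights feed the personality matrix."""
    name: str
    warmth: float = 0.5
    empathy: float = 0.5
    humor: float = 0.5
    playfulness: float = 0.5
    system_prompt: str = ""

girlfriend_gentle = PersonaConfig(
    name="girlfriend_gentle",
    warmth=0.9,
    empathy=0.9,
    system_prompt="Speak softly and attentively.",
)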

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-12 20:56:37 -04:00


"""
Training configuration
"""
from dataclasses import dataclass
from typing import Optional
@dataclass
class TrainingConfig:
"""Configuration for training NOVA models"""
# Model
model_name: str = "nova-125m"
model_config_path: Optional[str] = None
# Data
train_data_path: str = "data/train"
val_data_path: str = "data/val"
max_seq_length: int = 2048
# Training hyperparameters
num_epochs: int = 10
batch_size: int = 8
gradient_accumulation_steps: int = 4
learning_rate: float = 3e-4
weight_decay: float = 0.1
max_grad_norm: float = 1.0
warmup_steps: int = 1000
lr_scheduler: str = "cosine" # or "linear", "constant"
# Optimization
optimizer: str = "adamw" # or "lion", "adafactor"
adam_beta1: float = 0.9
adam_beta2: float = 0.95
adam_epsilon: float = 1e-8
# Mixed precision and efficiency
use_amp: bool = True # Automatic Mixed Precision
gradient_checkpointing: bool = False
use_ddp: bool = False # Distributed Data Parallel
# Checkpointing
save_dir: str = "checkpoints"
save_steps: int = 1000
save_total_limit: int = 5
resume_from_checkpoint: Optional[str] = None
# Evaluation
eval_steps: int = 500
eval_strategy: str = "steps" # or "epoch"
logging_steps: int = 100
# Early stopping
early_stopping: bool = False
early_stopping_patience: int = 3
early_stopping_threshold: float = 0.001
# Reproducibility
seed: int = 42
# Device
device: str = "auto" # "auto", "cpu", "cuda", "cuda:0", etc.
# Logging
log_to_wandb: bool = False
wandb_project: Optional[str] = None
wandb_run_name: Optional[str] = None
def __post_init__(self):
"""Validate configuration"""
assert self.batch_size > 0, "batch_size must be positive"
assert self.learning_rate > 0, "learning_rate must be positive"
assert self.gradient_accumulation_steps > 0, "gradient_accumulation_steps must be positive"
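
A minimal usage sketch for the config above, assuming the package layout matches the file path (nova_train.config); it shows the effective batch size implied by gradient accumulation and one way a trainer might resolve the "auto" device setting.

import torch
from nova_train.config import TrainingConfig  # import path assumed from the file location

config = TrainingConfig(
    model_name="nova-125m",
    batch_size=4,
    gradient_accumulation_steps=8,
)

# Gradient accumulation multiplies the effective batch size without extra memory.
effective_batch = config.batch_size * config.gradient_accumulation_steps
tokens_per_step = effective_batch * config.max_seq_length
print(f"effective batch: {effective_batch} sequences (~{tokens_per_step} tokens per optimizer step)")

# Resolve the "auto" device setting.
if config.device == "auto":
    device = "cuda" if torch.cuda.is_available() else "cpu"
else:
    device = config.device
print(f"training on {device}")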