Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
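The four model sizes above map onto the preset ModelConfig objects defined at the bottom of the file below. A minimal sketch of how one of those presets might be picked up and tweaked (the nova.config import path is an assumption for illustration; only the config objects themselves come from the file):

import dataclasses

from nova.config import MODEL_1_3B  # assumed import path; MODEL_1_3B is defined in the file below

# Presets are plain dataclasses, so their fields can be read directly.
config = MODEL_1_3B
print(config.hidden_size, config.num_hidden_layers)  # 2048 24
print(config.num_key_value_heads)                    # 8 (grouped-query attention)

# A derived variant, e.g. with a longer context window; dataclasses.replace
# re-runs the __post_init__ validation automatically.
long_context = dataclasses.replace(config, max_position_embeddings=4096)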
"""
|
|
Model configuration for NOVA transformer
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
|
|
class ModelConfig:
|
|
"""Configuration for NOVA transformer model"""
|
|
|
|
# Model architecture
|
|
vocab_size: int = 32000
|
|
hidden_size: int = 768
|
|
num_hidden_layers: int = 12
|
|
num_attention_heads: int = 12
|
|
intermediate_size: int = 3072
|
|
max_position_embeddings: int = 2048
|
|
|
|
# Activation and normalization
|
|
hidden_act: str = "swiglu" # or "gelu"
|
|
norm_type: str = "rmsnorm" # or "layernorm"
|
|
rms_norm_eps: float = 1e-6
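
    # Reference (standard definitions; the implementing modules are not shown in
    # this file): RMSNorm computes x / sqrt(mean(x**2, dim=-1) + eps) * weight,
    # with rms_norm_eps as the eps term, and the SwiGLU feed-forward computes
    # (silu(x @ W_gate) * (x @ W_up)) @ W_down over intermediate_size hidden units.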

    # Positional encoding
    rope_theta: float = 10000.0
    use_rope: bool = True
    use_alibi: bool = False  # Alternative to RoPE

    # Attention
    attention_dropout: float = 0.0
    hidden_dropout: float = 0.1
    num_key_value_heads: Optional[int] = None  # For grouped-query attention (GQA)
    use_flash_attention: bool = False  # Auto-detected at runtime

    # Training
    initializer_range: float = 0.02
    use_cache: bool = True  # KV-cache for inference

    # Efficiency
    gradient_checkpointing: bool = False
    tie_word_embeddings: bool = False

    def __post_init__(self):
        """Validate and set derived values"""
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads

        assert self.hidden_size % self.num_attention_heads == 0, \
            f"hidden_size ({self.hidden_size}) must be divisible by num_attention_heads ({self.num_attention_heads})"

        assert self.num_attention_heads % self.num_key_value_heads == 0, \
            f"num_attention_heads ({self.num_attention_heads}) must be divisible by num_key_value_heads ({self.num_key_value_heads})"

# Predefined model sizes
MODEL_125M = ModelConfig(
    vocab_size=32000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    max_position_embeddings=2048,
)

MODEL_350M = ModelConfig(
    vocab_size=32000,
    hidden_size=1024,
    num_hidden_layers=24,
    num_attention_heads=16,
    intermediate_size=4096,
    max_position_embeddings=2048,
)

MODEL_1_3B = ModelConfig(
    vocab_size=32000,
    hidden_size=2048,
    num_hidden_layers=24,
    num_attention_heads=32,
    intermediate_size=8192,
    max_position_embeddings=2048,
    num_key_value_heads=8,  # GQA for efficiency
)

MODEL_3B = ModelConfig(
    vocab_size=32000,
    hidden_size=2560,
    num_hidden_layers=32,
    num_attention_heads=32,
    intermediate_size=10240,
    max_position_embeddings=4096,
    num_key_value_heads=8,  # GQA for efficiency
)
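

# Illustrative usage sketch (not part of the original file): print the attention
# geometry derived from each preset's fields.
if __name__ == "__main__":
    presets = {"125M": MODEL_125M, "350M": MODEL_350M, "1.3B": MODEL_1_3B, "3B": MODEL_3B}
    for name, cfg in presets.items():
        head_dim = cfg.hidden_size // cfg.num_attention_heads
        gqa_groups = cfg.num_attention_heads // cfg.num_key_value_heads
        print(
            f"{name}: {cfg.num_hidden_layers} layers, hidden {cfg.hidden_size}, "
            f"head_dim {head_dim}, {cfg.num_attention_heads} heads / "
            f"{cfg.num_key_value_heads} KV heads ({gqa_groups}x sharing), "
            f"context {cfg.max_position_embeddings}"
        )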