Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
nova_core/activations.py (new file, 114 lines added)
"""
Activation functions for NOVA
"""

import torch
import torch.nn as nn
import torch.nn.functional as F


class SwiGLU(nn.Module):
    """
    SwiGLU activation function from Shazeer (2020).
    Used in PaLM and other modern LLMs.

    SwiGLU(x, W, V, b, c) = Swish(xW + b) ⊗ (xV + c)
    where Swish(x) = x * sigmoid(x)
    """

    def __init__(self, hidden_size: int, intermediate_size: int, bias: bool = False):
        """
        Args:
            hidden_size: Input dimension
            intermediate_size: Hidden dimension (usually 4 * hidden_size)
            bias: Whether to use bias in linear layers
        """
        super().__init__()
        # Gate projection
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
        # Up projection
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
        # Down projection
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply SwiGLU activation.

        Args:
            x: Input tensor [..., hidden_size]

        Returns:
            Output tensor [..., hidden_size]
        """
        # Swish activation: x * sigmoid(x)
        gate = F.silu(self.gate_proj(x))
        # Element-wise multiplication with up projection
        up = self.up_proj(x)
        # Down projection
        return self.down_proj(gate * up)


class GeGLU(nn.Module):
    """
    GeGLU activation function - variant of SwiGLU using GELU.

    GeGLU(x, W, V) = GELU(xW) ⊗ (xV)
    """

    def __init__(self, hidden_size: int, intermediate_size: int, bias: bool = False):
        """
        Args:
            hidden_size: Input dimension
            intermediate_size: Hidden dimension
            bias: Whether to use bias in linear layers
        """
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply GeGLU activation"""
        gate = F.gelu(self.gate_proj(x), approximate="tanh")
        up = self.up_proj(x)
        return self.down_proj(gate * up)


class MLP(nn.Module):
    """
    Standard MLP with configurable activation
    """

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str = "swiglu",
        bias: bool = False
    ):
        """
        Args:
            hidden_size: Input/output dimension
            intermediate_size: Hidden dimension
            hidden_act: Activation function ('swiglu', 'geglu', or 'gelu')
            bias: Whether to use bias
        """
        super().__init__()

        if hidden_act.lower() == "swiglu":
            self.mlp = SwiGLU(hidden_size, intermediate_size, bias)
        elif hidden_act.lower() == "geglu":
            self.mlp = GeGLU(hidden_size, intermediate_size, bias)
        elif hidden_act.lower() == "gelu":
            # Standard GELU MLP
            self.mlp = nn.Sequential(
                nn.Linear(hidden_size, intermediate_size, bias=bias),
                nn.GELU(approximate="tanh"),
                nn.Linear(intermediate_size, hidden_size, bias=bias)
            )
        else:
            raise ValueError(f"Unknown activation: {hidden_act}")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass through MLP"""
        return self.mlp(x)
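For illustration, a minimal usage sketch of the module above. The import path assumes the package layout introduced in this commit, and the dimensions are arbitrary example values, not NOVA's actual model configs:

# Example usage sketch (assumed values; real configs come from NOVA's model sizes)
import torch

from nova_core.activations import MLP, SwiGLU

hidden_size = 512          # example input/output dimension
intermediate_size = 2048   # example hidden dimension

mlp = MLP(hidden_size, intermediate_size, hidden_act="swiglu")
x = torch.randn(2, 16, hidden_size)   # [batch, seq_len, hidden_size]
y = mlp(x)
print(y.shape)                        # torch.Size([2, 16, 512]) — shape is preserved

# The gated variants can also be used directly
swiglu = SwiGLU(hidden_size, intermediate_size)
assert swiglu(x).shape == x.shape

Note that all three activation choices keep the input and output dimensions equal to hidden_size; only the intermediate width differs between configurations.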