Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
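Of the architectural components listed above, RMSNorm is the simplest to sketch: it rescales each hidden vector by its root mean square and applies a learned gain, with no mean subtraction and no bias. The snippet below is a minimal, self-contained sketch of that standard technique; NOVA's own implementation sits behind get_norm_layer in nova_core/normalization.py and is not shown in this commit, so the class name and the eps default here are illustrative assumptions.

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Root-mean-square norm: rescale by the RMS of the features, then apply a learned gain."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize by the root mean square over the feature (last) dimension
        inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x * inv_rms)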
nova_core/layers.py (new file, 98 lines added)
@@ -0,0 +1,98 @@
"""
Transformer block layers
"""

import torch
import torch.nn as nn
from typing import Optional, Tuple

from .attention import MultiHeadAttention
from .activations import MLP
from .normalization import get_norm_layer


class TransformerBlock(nn.Module):
    """
    Single transformer decoder block with:
    - Multi-head attention with RoPE
    - Feed-forward network (MLP)
    - Pre-normalization (norm before attention/FFN)
    - Residual connections
    """

    def __init__(self, config, layer_idx: int):
        """
        Args:
            config: ModelConfig instance
            layer_idx: Layer index for identification
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

        # Attention
        self.self_attn = MultiHeadAttention(config)
        self.attn_norm = get_norm_layer(
            config.norm_type,
            config.hidden_size,
            config.rms_norm_eps
        )

        # Feed-forward
        self.mlp = MLP(
            hidden_size=config.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act
        )
        self.mlp_norm = get_norm_layer(
            config.norm_type,
            config.hidden_size,
            config.rms_norm_eps
        )

        # Dropout
        self.dropout = nn.Dropout(config.hidden_dropout)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """
        Args:
            hidden_states: [batch, seq_len, hidden_size]
            attention_mask: Optional attention mask
            position_embeddings: Optional (cos, sin) for RoPE
            past_key_value: Optional cached key/value
            use_cache: Whether to return key/value cache

        Returns:
            (hidden_states, past_key_value if use_cache else None)
        """
        residual = hidden_states

        # Pre-norm for attention
        hidden_states = self.attn_norm(hidden_states)

        # Self-attention with KV-cache
        attn_output, past_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            past_key_value=past_key_value,
            use_cache=use_cache,
        )

        # Residual connection
        hidden_states = residual + self.dropout(attn_output)

        # Feed-forward with pre-norm
        residual = hidden_states
        hidden_states = self.mlp_norm(hidden_states)
        mlp_output = self.mlp(hidden_states)
        hidden_states = residual + self.dropout(mlp_output)

        return hidden_states, past_key_value
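To make the data flow of this block concrete, here is a self-contained toy analogue of the same pre-norm residual pattern built only from stock PyTorch modules (nn.LayerNorm, nn.MultiheadAttention, a GELU MLP). It deliberately omits the NOVA-specific pieces (RoPE position embeddings, the KV-cache, and the configurable get_norm_layer/MLP modules), so it illustrates the block structure rather than reproducing nova_core; every name in it is illustrative.

import torch
import torch.nn as nn

class ToyPreNormBlock(nn.Module):
    """Pre-norm decoder block: norm -> attention -> residual, then norm -> MLP -> residual."""

    def __init__(self, hidden_size: int = 64, num_heads: int = 4, dropout: float = 0.1):
        super().__init__()
        self.attn_norm = nn.LayerNorm(hidden_size)
        self.self_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.mlp_norm = nn.LayerNorm(hidden_size)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.GELU(),
            nn.Linear(4 * hidden_size, hidden_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Attention sub-layer: normalize first, then add the residual back
        residual = x
        h = self.attn_norm(x)
        # Causal mask: each position may attend only to itself and earlier positions
        seq_len = x.size(1)
        causal = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
        attn_out, _ = self.self_attn(h, h, h, attn_mask=causal)
        x = residual + self.dropout(attn_out)

        # Feed-forward sub-layer with the same pre-norm + residual pattern
        residual = x
        h = self.mlp_norm(x)
        x = residual + self.dropout(self.mlp(h))
        return x

# Shape check: [batch, seq_len, hidden_size] in, same shape out
block = ToyPreNormBlock()
out = block(torch.randn(2, 16, 64))
print(out.shape)  # torch.Size([2, 16, 64])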