Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with:

Core Features:
- Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache)
- SentencePiece tokenizer (BPE/Unigram)
- Training pipeline (AMP, gradient checkpointing, DDP)
- Persona system with personality matrix (NO AI disclosure by default)
- Genetic evolution (NOVA-EVO) for hyperparameter optimization
- Legal-only data pipeline with license tracking
- Chat interface (CLI + REST API)
- Conversation memory (SQLite)

Model Sizes:
- 125M, 350M, 1.3B, 3B parameters
- Local-first, runs on CPU or GPU
- Python 3.10.6+, PyTorch 2.0+

Personas:
- girlfriend_gentle (high warmth, high empathy)
- girlfriend_playful (high humor, high playfulness)
- girlfriend_supportive (balanced, default)

Documentation:
- Complete README with quickstart
- Model card with ethical considerations
- Privacy documentation (local-first, zero telemetry)
- Data licenses and attribution
- Contributing guide

Infrastructure:
- GitHub Actions CI/CD
- Comprehensive test suite
- Quickstart script
- CLI tool

License: Apache 2.0

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
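Of the core blocks listed above, the diff in this commit shows the RoPE implementation. For the other two named building blocks, RMSNorm and SwiGLU, the following is a textbook-style PyTorch sketch for reference only; the class names, bias-free linear layers, and shapes are illustrative assumptions, not necessarily nova_core's actual code.

import torch
import torch.nn as nn
import torch.nn.functional as F


class RMSNorm(nn.Module):
    """Root-mean-square LayerNorm (Zhang & Sennrich, 2019): no mean centering."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize by the RMS over the last dimension, then apply a learned gain
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * rms * self.weight


class SwiGLU(nn.Module):
    """SwiGLU feed-forward block (Shazeer, 2020): SiLU-gated linear unit."""

    def __init__(self, dim: int, hidden_dim: int):
        super().__init__()
        self.w_gate = nn.Linear(dim, hidden_dim, bias=False)
        self.w_up = nn.Linear(dim, hidden_dim, bias=False)
        self.w_down = nn.Linear(hidden_dim, dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # silu(x W_gate) ⊙ (x W_up), projected back to the model dimension
        return self.w_down(F.silu(self.w_gate(x)) * self.w_up(x))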
nova_core/rope.py: 155 lines (Normal file)
@@ -0,0 +1,155 @@
"""
Rotary Position Embedding (RoPE) implementation
"""

import torch
import torch.nn as nn
from typing import Optional, Tuple


class RotaryPositionalEmbedding(nn.Module):
    """
    Rotary Position Embedding (RoPE) from Su et al. (2021)
    https://arxiv.org/abs/2104.09864
    """

    def __init__(self, dim: int, max_seq_len: int = 2048, theta: float = 10000.0):
        """
        Args:
            dim: Dimension of the embeddings (should be head_dim)
            max_seq_len: Maximum sequence length
            theta: Base for the geometric progression (default 10000.0)
        """
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.theta = theta

        # Precompute inverse frequencies
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Precompute cos/sin cache
        self._update_cos_sin_cache(max_seq_len)

    def _update_cos_sin_cache(self, seq_len: int):
        """Precompute cos and sin for positions up to seq_len"""
        position = torch.arange(seq_len, device=self.inv_freq.device).unsqueeze(1)
        freqs = position * self.inv_freq.unsqueeze(0)  # [seq_len, dim/2]

        # Duplicate frequencies to cover the full head dimension [seq_len, dim]
        emb = torch.cat([freqs, freqs], dim=-1)

        self.register_buffer("cos_cached", emb.cos(), persistent=False)
        self.register_buffer("sin_cached", emb.sin(), persistent=False)
        self.cached_seq_len = seq_len

    def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
        """Rotates half the hidden dims of the input"""
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat([-x2, x1], dim=-1)

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        position_ids: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Apply rotary position embeddings to query and key tensors

        Args:
            q: Query tensor [batch, num_heads, seq_len, head_dim]
            k: Key tensor [batch, num_heads, seq_len, head_dim]
            position_ids: Optional position IDs [batch, seq_len]

        Returns:
            Tuple of rotated query and key tensors
        """
        seq_len = q.shape[2]

        # Update cache if needed
        if seq_len > self.cached_seq_len:
            self._update_cos_sin_cache(seq_len)

        # Get cos/sin for current positions
        if position_ids is not None:
            # For generation with KV-cache
            cos = self.cos_cached[position_ids].unsqueeze(1)
            sin = self.sin_cached[position_ids].unsqueeze(1)
        else:
            # For training or initial forward pass
            cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0)
            sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0)

        # Apply rotation
        q_embed = (q * cos) + (self.rotate_half(q) * sin)
        k_embed = (k * cos) + (self.rotate_half(k) * sin)

        return q_embed, k_embed


class ALiBiPositionalBias(nn.Module):
    """
    Attention with Linear Biases (ALiBi) from Press et al. (2021)
    https://arxiv.org/abs/2108.12409
    Alternative to RoPE
    """

    def __init__(self, num_heads: int, max_seq_len: int = 2048):
        """
        Args:
            num_heads: Number of attention heads
            max_seq_len: Maximum sequence length
        """
        super().__init__()
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

        # Compute slopes for each head
        slopes = self._get_slopes(num_heads)
        self.register_buffer("slopes", slopes, persistent=False)

        # Precompute bias matrix
        alibi = self._get_alibi_bias(max_seq_len, slopes)
        self.register_buffer("alibi_bias", alibi, persistent=False)

    def _get_slopes(self, num_heads: int) -> torch.Tensor:
        """Compute per-head slopes: a geometric sequence starting at 2^(-2^-(log2(n)-3))"""
        def get_slopes_power_of_2(n):
            start = 2 ** (-(2 ** -(torch.log2(torch.tensor(float(n))) - 3)))
            ratio = start
            # Geometric sequence: start, start*ratio, start*ratio^2, ...
            return start * torch.pow(ratio, torch.arange(n))

        # Handle non-power-of-2 number of heads
        if (num_heads & (num_heads - 1)) == 0:
            return get_slopes_power_of_2(num_heads)
        else:
            closest_power_of_2 = 2 ** torch.floor(torch.log2(torch.tensor(float(num_heads))))
            slopes_a = get_slopes_power_of_2(int(closest_power_of_2))
            slopes_b = self._get_slopes(int(2 * closest_power_of_2))[0::2][:num_heads - int(closest_power_of_2)]
            return torch.cat([slopes_a, slopes_b])

    def _get_alibi_bias(self, seq_len: int, slopes: torch.Tensor) -> torch.Tensor:
        """Precompute ALiBi bias matrix"""
        # Relative position matrix: rel_pos[i, j] = j - i
        pos = torch.arange(seq_len).unsqueeze(0)
        rel_pos = pos - pos.T  # [seq_len, seq_len]

        # Apply per-head slopes -> [num_heads, seq_len, seq_len]
        alibi = rel_pos.unsqueeze(0) * slopes.unsqueeze(-1).unsqueeze(-1)

        return alibi

    def forward(self, attention_scores: torch.Tensor, seq_len: int) -> torch.Tensor:
        """
        Add ALiBi bias to attention scores

        Args:
            attention_scores: [batch, num_heads, seq_len, seq_len]
            seq_len: Current sequence length

        Returns:
            Biased attention scores
        """
        return attention_scores + self.alibi_bias[:, :seq_len, :seq_len]
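For orientation, here is a minimal usage sketch of the two modules defined in rope.py above. The batch size, head count, sequence length, and head dimension are arbitrary assumptions, and the surrounding attention computation is reduced to a single scaled dot product.

import torch
from nova_core.rope import RotaryPositionalEmbedding, ALiBiPositionalBias

batch, num_heads, seq_len, head_dim = 2, 8, 16, 64  # illustrative shapes

q = torch.randn(batch, num_heads, seq_len, head_dim)
k = torch.randn(batch, num_heads, seq_len, head_dim)

# RoPE: rotate queries and keys instead of adding absolute position embeddings
rope = RotaryPositionalEmbedding(dim=head_dim, max_seq_len=2048)
q_rot, k_rot = rope(q, k)  # full-sequence forward pass
scores = q_rot @ k_rot.transpose(-2, -1) / head_dim ** 0.5

# During incremental decoding with a KV-cache, pass explicit position ids
pos_ids = torch.full((batch, 1), seq_len, dtype=torch.long)  # next position
q_step, k_step = rope(q[:, :, -1:, :], k[:, :, -1:, :], position_ids=pos_ids)

# ALiBi: add a linear distance penalty to raw attention scores (alternative to RoPE)
alibi = ALiBiPositionalBias(num_heads=num_heads, max_seq_len=2048)
scores_alibi = alibi(q @ k.transpose(-2, -1) / head_dim ** 0.5, seq_len=seq_len)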