# Lyra 125M Model Configuration
# GPT-style decoder-only transformer

model:
  name: "lyra-125M"
  architecture: "gpt"

  # Model dimensions
  vocab_size: 50257   # Will be updated after tokenizer training
  n_positions: 1024   # Context window
  n_embd: 768         # Embedding dimension
  n_layer: 12         # Number of transformer layers
  n_head: 12          # Number of attention heads
  n_inner: 3072       # FFN inner dimension (4 * n_embd)

  # Regularization
  embd_pdrop: 0.1
  resid_pdrop: 0.1
  attn_pdrop: 0.1

  # Activation
  activation: "gelu"

# Total parameters: ~125M
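
# Back-of-the-envelope check of the ~125M figure (a rough sketch, assuming
# GPT-2-style tied input/output embeddings and ignoring biases and LayerNorm
# weights):
#   token embeddings:    vocab_size * n_embd  = 50257 * 768 ≈ 38.6M
#   position embeddings: n_positions * n_embd = 1024 * 768  ≈ 0.8M
#   per layer:           4 * n_embd^2 (attention QKV + output projection)
#                        + 2 * n_embd * n_inner (FFN up + down projections)
#                        = 4 * 768^2 + 2 * 768 * 3072 ≈ 7.1M
#   all layers:          12 * 7.1M ≈ 84.9M
#   total:               38.6M + 0.8M + 84.9M ≈ 124.3M, i.e. "~125M"
# Note: per-head dimension is n_embd / n_head = 768 / 12 = 64.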