# Lyra 250M Model Configuration
# GPT-style decoder-only transformer

model:
  name: "lyra-250M"
  architecture: "gpt"

  # Model dimensions
  vocab_size: 50257
  n_positions: 2048   # Context window (2x GPT-2's 1024)
  n_embd: 1024        # Embedding dimension
  n_layer: 16         # Number of transformer layers
  n_head: 16          # Number of attention heads
  n_inner: 4096       # FFN inner dimension (4 * n_embd)

  # Regularization
  embd_pdrop: 0.1
  resid_pdrop: 0.1
  attn_pdrop: 0.1

  # Activation
  activation: "gelu"

# Total parameters: ~250M
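
# Approximate parameter breakdown supporting the ~250M figure above
# (a sketch assuming GPT-2-style weights with tied input/output
# embeddings; biases and layer norms included but not itemized):
#   token embeddings:     50257 * 1024        ~  51.5M
#   position embeddings:  2048 * 1024         ~   2.1M
#   attention per layer:  4 * 1024^2          ~   4.2M
#   FFN per layer:        2 * 1024 * 4096     ~   8.4M
#   all 16 layers:        16 * (4.2M + 8.4M)  ~ 201.6M
#   total:                                    ~ 255M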