# Training Configuration for Lyra

training:
  # Model selection
  model_config: "configs/model_125M.yaml"  # Start with 125M

  # Data
  train_data_path: "data/processed/train.bin"
  val_data_path: "data/processed/val.bin"

  # Training hyperparameters
  batch_size: 8  # Adjust based on VRAM
  gradient_accumulation_steps: 4
  effective_batch_size: 32  # batch_size * grad_accum_steps

  max_steps: 100000
  warmup_steps: 2000
  eval_interval: 1000
  save_interval: 5000

  # Optimization
  learning_rate: 6.0e-4
  weight_decay: 0.1
  beta1: 0.9
  beta2: 0.95
  grad_clip: 1.0

  # Learning rate schedule
  lr_scheduler: "cosine"
  min_lr: 6.0e-5  # 10% of max lr

  # Mixed precision
  use_amp: true
  amp_dtype: "bfloat16"  # bfloat16 or float16

  # Optimization techniques
  gradient_checkpointing: true
  compile_model: false  # PyTorch 2.0 compilation (can cause issues)

  # Logging
  log_interval: 10
  wandb_project: "lyra-training"
  wandb_run_name: null  # Auto-generated if null

  # Checkpointing
  checkpoint_dir: "models/checkpoints"
  save_optimizer_state: true
  keep_last_n_checkpoints: 3

  # Hardware
  device: "cuda"
  num_workers: 4
  pin_memory: true
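
A minimal sketch of how this config might be consumed from Python, assuming PyYAML is available; the file path, the load_training_config helper, and the lr_at_step function below are illustrative assumptions, not part of the Lyra codebase. It checks the documented relation effective_batch_size = batch_size * gradient_accumulation_steps and shows one plausible reading of the cosine-with-warmup schedule implied by lr_scheduler, warmup_steps, and min_lr.

import math
import yaml


def load_training_config(path="configs/training_config.yaml"):
    # Hypothetical loader: path and function name are assumptions for illustration.
    with open(path) as f:
        cfg = yaml.safe_load(f)["training"]
    # effective_batch_size is documented as batch_size * grad_accum_steps;
    # verify the three values stay consistent after manual edits.
    expected = cfg["batch_size"] * cfg["gradient_accumulation_steps"]
    assert cfg["effective_batch_size"] == expected, (
        f"effective_batch_size should be {expected}"
    )
    return cfg


def lr_at_step(step, cfg):
    # One common interpretation of a cosine schedule with linear warmup,
    # decaying from learning_rate down to min_lr over max_steps.
    max_lr, min_lr = cfg["learning_rate"], cfg["min_lr"]
    warmup, total = cfg["warmup_steps"], cfg["max_steps"]
    if step < warmup:
        return max_lr * step / warmup
    progress = (step - warmup) / (total - warmup)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))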