# Training Configuration for Lyra

training:
  # Model selection
  model_config: "configs/model_125M.yaml"  # Start with 125M

  # Data
  train_data_path: "data/processed/train.bin"
  val_data_path: "data/processed/val.bin"

  # Training hyperparameters
  batch_size: 8                     # Per-step micro-batch; adjust based on VRAM
  gradient_accumulation_steps: 4
  effective_batch_size: 32          # batch_size * gradient_accumulation_steps
  max_steps: 100000
  warmup_steps: 2000
  eval_interval: 1000
  save_interval: 5000

  # Optimization
  learning_rate: 6.0e-4
  weight_decay: 0.1
  beta1: 0.9
  beta2: 0.95
  grad_clip: 1.0

  # Learning rate schedule
  lr_scheduler: "cosine"
  min_lr: 6.0e-5  # 10% of max lr

  # Mixed precision
  use_amp: true
  amp_dtype: "bfloat16"  # "bfloat16" or "float16"

  # Optimization techniques
  gradient_checkpointing: true
  compile_model: false  # torch.compile (PyTorch 2.x); can cause issues

  # Logging
  log_interval: 10
  wandb_project: "lyra-training"
  wandb_run_name: null  # Auto-generated if null

  # Checkpointing
  checkpoint_dir: "models/checkpoints"
  save_optimizer_state: true
  keep_last_n_checkpoints: 3

  # Hardware
  device: "cuda"
  num_workers: 4
  pin_memory: true
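
# Schedule note (a sketch of the usual cosine-with-warmup convention; whether
# the trainer implements it exactly this way is an assumption, not confirmed
# by this file): linear warmup from 0 to learning_rate over warmup_steps,
# then cosine decay down to min_lr, i.e. for t > warmup_steps:
#
#   progress = (t - warmup_steps) / (max_steps - warmup_steps)
#   lr(t)    = min_lr + 0.5 * (learning_rate - min_lr) * (1 + cos(pi * progress))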
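
# Usage sketch (assumptions: this file lives at configs/train.yaml and is read
# with PyYAML; both the path and the loading code below are hypothetical, not
# part of this repo's confirmed interface):
#
#   import yaml
#
#   with open("configs/train.yaml") as f:
#       cfg = yaml.safe_load(f)["training"]
#
#   # The derived values above should stay self-consistent:
#   assert cfg["batch_size"] * cfg["gradient_accumulation_steps"] == cfg["effective_batch_size"]
#   assert abs(cfg["min_lr"] - 0.1 * cfg["learning_rate"]) < 1e-9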