# config.py
# Training/model hyperparameters as plain module-level constants.
# NOTE(review): the original file had been collapsed onto one line starting
# with "#", which made every assignment part of a comment — the module
# defined nothing. Restored to one assignment per line.

VOCAB_SIZE = 50000      # tokenizer vocabulary size — presumably; confirm against tokenizer
CONTEXT_SIZE = 128      # sequence length in tokens per example — TODO confirm in trainer
EMBED_DIM = 256         # model embedding width
NUM_HEADS = 8           # attention heads (EMBED_DIM / NUM_HEADS = 32 per head, assuming a transformer)
NUM_LAYERS = 6          # number of model layers
BATCH_SIZE = 16         # examples per training batch
LEARNING_RATE = 3e-4    # optimizer step size
DEVICE = "cuda"  # fallback handled in trainer
MAX_TOKENS = 100_000  # Used to cap input corpus size