Added another learning source for Nora. Also added the requirements.

2025-06-09 14:25:11 -04:00
parent da23742671
commit 5d53ba7cb8
14 changed files with 1070 additions and 78 deletions


@@ -37,6 +37,13 @@ def train(
device = config.device
model.to(device)
# ─── ensure optimizer state is on the same device ───
# (this moves any loaded CPU buffers for Adam/AdamW into CUDA)
for state in optimizer.state.values():
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.to(device)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.stoi["<pad>"])
scaler = GradScaler()
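
For context, the added block handles the common case where an optimizer state dict was saved or loaded on the CPU while the model has been moved to CUDA: AdamW keeps per-parameter buffers (exp_avg, exp_avg_sq) inside optimizer.state, and they must sit on the same device as the parameters before the first optimizer.step(). A minimal, self-contained sketch of that pattern, assuming a hypothetical checkpoint file checkpoint.pt with "model" and "optimizer" keys (names not from this repository):

import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"

model = nn.Linear(16, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Loading with map_location="cpu" leaves the AdamW buffers on the CPU
# even after the model itself is moved to CUDA below.
ckpt = torch.load("checkpoint.pt", map_location="cpu")  # hypothetical path and keys
model.load_state_dict(ckpt["model"])
optimizer.load_state_dict(ckpt["optimizer"])

model.to(device)

# Same fix as in the diff: move every tensor held in the optimizer state
# onto the target device so optimizer.step() sees matching devices.
for state in optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to(device)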