Created NORA. She has been designed from scratch. At this point, I have determined the best hyperparameters for her training. The next step is to help her communicate on Discord and see how she handles it.
commit 16289fc942
.gitignore (vendored, Normal file, 198 lines added)
@@ -0,0 +1,198 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
checkpoints/nora_step_*.pt
data/books
*.json

LICENSE (Normal file, 21 lines added)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 [fullname]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md (Normal file, 33 lines added)
@@ -0,0 +1,33 @@
# Nora: Train a Transformer LM from Scratch

> A minimal, from-scratch language model. No pretrained weights—just public-domain books and your GPU (or CPU).

## Overview

Nora is a character-level Transformer language model written entirely in PyTorch. It learns from whatever plain-text `.txt` files you place in `data/books/`. Over time, you can extend Nora's codebase (e.g., add reinforcement-learning loops, self-improvement modules, etc.) toward more advanced AI, if you wish.

## Why "Nora"?

- A simple, human-like female name.
- Short, easy to pronounce.
- As the project scales, "Nora" could theoretically be extended with modules to approach more general intelligence.

## Requirements

- **Python 3.10.6** (Windows 11 or any OS)
- **CUDA-capable GPU** (if you want to train faster; otherwise CPU)
- **PyTorch** (install with `pip install torch torchvision`)
- **tqdm** (`pip install tqdm`)
- **Other Python packages**: `numpy`, `typing`

## Folder Structure

- nora/
- ├── config.py
- ├── tokenizer.py
- ├── data_loader.py
- ├── model.py
- ├── train.py
- ├── utils.py
- ├── main.py
- └── README.md

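The committed README stops at the folder structure. As a rough quickstart sketch (assuming the repo root is the working directory and `data/books/` already holds at least one `.txt` file), training can be launched through `main.py`, whose flags all come from `config.py`:

```python
# Sketch only: the programmatic equivalent of running `python main.py --tiny`.
# The --tiny flag (defined in config.py) shrinks the model for a quick smoke test.
import sys

from main import main

sys.argv = ["main.py", "--tiny"]  # argparse in config.py reads these flags
main()
```

Dropping `--tiny` trains with the full defaults (d_model=512, 6 layers, 8 heads).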
config.py (Normal file, 146 lines added)
@@ -0,0 +1,146 @@
"""
config.py

Define hyperparameters, file paths, and other settings via argparse.
"""

import argparse
import torch


def get_config():
    parser = argparse.ArgumentParser(description="Nora: Train a Transformer from scratch")

    # Data & paths
    parser.add_argument(
        "--data_dir",
        type=str,
        default="data/books",
        help="Path to folder containing .txt files (public-domain books).",
    )
    parser.add_argument(
        "--vocab_path",
        type=str,
        default="data/vocab.json",
        help="Where to save/load the tokenizer vocabulary.",
    )
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        default="checkpoints",
        help="Directory to save model checkpoints.",
    )

    # Model hyperparameters
    parser.add_argument(
        "--d_model",
        type=int,
        default=512,
        help="Transformer embedding size (d_model).",
    )
    parser.add_argument(
        "--nhead",
        type=int,
        default=8,
        help="Number of attention heads.",
    )
    parser.add_argument(
        "--num_layers",
        type=int,
        default=6,
        help="Number of Transformer encoder layers.",
    )
    parser.add_argument(
        "--dim_feedforward",
        type=int,
        default=2048,
        help="Inner feedforward dimension.",
    )
    parser.add_argument(
        "--dropout",
        type=float,
        default=0.1,
        help="Dropout rate in Transformer.",
    )

    # Training hyperparameters
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Batch size per training step.",
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=128,
        help="Sequence length (context window) in tokens.",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=10,
        help="Number of training epochs.",
    )
    parser.add_argument(
        "--lr",
        type=float,
        default=1e-4,
        help="Learning rate.",
    )
    parser.add_argument(
        "--warmup_steps",
        type=int,
        default=1000,
        help="Linear learning rate warmup steps.",
    )
    parser.add_argument(
        "--max_grad_norm",
        type=float,
        default=1.0,
        help="Gradient clipping norm.",
    )

    # Logging & checkpointing
    parser.add_argument(
        "--log_interval",
        type=int,
        default=100,
        help="Print training loss every N steps.",
    )
    parser.add_argument(
        "--save_interval",
        type=int,
        default=1000,
        help="Save a checkpoint every N steps.",
    )

    # Device selection
    parser.add_argument(
        "--device",
        type=str,
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device to train on ('cuda' or 'cpu').",
    )

    # Scaling options (for Pi vs GPU)
    parser.add_argument(
        "--tiny",
        action="store_true",
        help="If set, override model sizes to be tiny (for Pi 3B or very low-compute).",
    )

    args = parser.parse_args()

    # If --tiny is set, override some hyperparameters to very small values:
    if args.tiny:
        args.d_model = 64
        args.nhead = 2
        args.num_layers = 2
        args.dim_feedforward = 256
        args.batch_size = 8
        args.seq_length = 32
        args.lr = 1e-3
        args.epochs = 5

    return args

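For reference, a small usage sketch (assumed usage, not part of the commit) of how `get_config()` behaves with and without the `--tiny` override:

```python
# Sketch: parse the defaults, then the tiny override used for low-compute machines.
import sys

from config import get_config

sys.argv = ["main.py"]
args = get_config()
print(args.d_model, args.nhead, args.num_layers)   # 512 8 6

sys.argv = ["main.py", "--tiny"]
args = get_config()
print(args.d_model, args.nhead, args.num_layers)   # 64 2 2
```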
data_loader.py (Normal file, 63 lines added)
@@ -0,0 +1,63 @@
"""
data_loader.py

Loads all .txt files from data_dir, concatenates them, tokenizes them,
and creates a Dataset of (input_seq, target_seq) for language modeling.
"""

import os
import torch
from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    def __init__(self, data_dir: str, tokenizer, seq_length: int):
        """
        - data_dir: folder of .txt public-domain books.
        - tokenizer: instance of CharTokenizer (from tokenizer.py).
        - seq_length: context length in tokens.
        """
        super().__init__()
        self.seq_length = seq_length
        self.tokenizer = tokenizer

        # Read and concatenate all text files into one long string
        texts = []
        for root, _, files in os.walk(data_dir):
            for fname in files:
                if not fname.lower().endswith(".txt"):
                    continue
                path = os.path.join(root, fname)
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    texts.append(f.read())
        full_text = "\n".join(texts)
        token_ids = self.tokenizer.encode(full_text)

        # Prepare input-target pairs
        self.examples = []
        stride = 32
        for i in range(0, len(token_ids) - seq_length, stride):
            inp = token_ids[i : i + seq_length]
            targ = token_ids[i + 1 : i + seq_length + 1]
            self.examples.append((inp, targ))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        inp, targ = self.examples[idx]
        return torch.tensor(inp, dtype=torch.long), torch.tensor(targ, dtype=torch.long)


def get_dataloader(
    data_dir: str, tokenizer, seq_length: int, batch_size: int, shuffle: bool = True
) -> DataLoader:
    dataset = TextDataset(data_dir, tokenizer, seq_length)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=True,
        num_workers=8,
        pin_memory=True,
    )

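As a sanity check, here is a sketch (assuming a built vocabulary and a populated `data/books/` folder) of what one batch from `get_dataloader` looks like; the targets are simply the inputs shifted by one character:

```python
# Sketch: pull one (input, target) batch and inspect its shapes.
from data_loader import get_dataloader
from tokenizer import CharTokenizer

if __name__ == "__main__":  # guard needed because the DataLoader spawns worker processes
    tok = CharTokenizer(vocab_path="data/vocab.json", data_dir="data/books")
    loader = get_dataloader("data/books", tok, seq_length=128, batch_size=32)
    inputs, targets = next(iter(loader))
    print(inputs.shape, targets.shape)          # torch.Size([32, 128]) torch.Size([32, 128])
    print(tok.decode(inputs[0, :40].tolist()))  # first 40 characters of one training window
```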
main.py (Normal file, 89 lines added)
@@ -0,0 +1,89 @@
"""
main.py

Orchestrates the entire Nora project:
- Parses arguments
- Builds or loads tokenizer
- Constructs dataset & dataloader
- Instantiates the model
- Sets up optimizer, scheduler
- Calls train()
"""

import os
import torch
import logging
from config import get_config
from tokenizer import CharTokenizer
from data_loader import get_dataloader
from model import NoraTransformerLM
from train import train
from utils import setup_logging, load_checkpoint, save_checkpoint

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True


def main():
    args = get_config()

    # 1) Logging setup
    log_file = os.path.join(args.checkpoint_dir, "train.log")
    setup_logging(log_file)

    logging.info(f"[main] Using device: {args.device}")
    logging.info(f"[main] Config: {args}")

    # 2) Tokenizer: if vocab exists, load; else build from data_dir
    tokenizer = CharTokenizer(vocab_path=args.vocab_path, data_dir=args.data_dir)

    # 3) DataLoader
    dataloader = get_dataloader(
        data_dir=args.data_dir,
        tokenizer=tokenizer,
        seq_length=args.seq_length,
        batch_size=args.batch_size,
        shuffle=True,
    )

    # 4) Model instantiation
    model = NoraTransformerLM(
        vocab_size=tokenizer.vocab_size(),
        d_model=args.d_model,
        nhead=args.nhead,
        num_layers=args.num_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        max_seq_len=args.seq_length,
    )

    # 5) Optimizer & scheduler (linear warmup + decay)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)

    def lr_lambda(current_step):
        # Linear warmup for first warmup_steps, then decay with 1/sqrt(step)
        if current_step < args.warmup_steps:
            return float(current_step) / float(max(1, args.warmup_steps))
        return (args.warmup_steps ** 0.5) * float(current_step ** -0.5)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    # 6) Check for existing checkpoint to resume
    start_step = 0
    ckpts = sorted(os.listdir(args.checkpoint_dir)) if os.path.isdir(args.checkpoint_dir) else []
    ckpts = [f for f in ckpts if f.startswith("nora_step_") and f.endswith(".pt")]
    if ckpts:
        latest_ckpt = os.path.join(args.checkpoint_dir, ckpts[-1])
        logging.info(f"[main] Found existing checkpoint: {latest_ckpt}; resuming from it.")
        start_step = load_checkpoint(latest_ckpt, model, optimizer)

    # 7) Begin training
    try:
        train(model, dataloader, optimizer, scheduler, tokenizer, args, start_step=start_step)
    except Exception as e:
        logging.exception("[main] Exception during training")
        raise e


if __name__ == "__main__":
    main()

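The scheduler in step 5 applies linear warmup followed by inverse-square-root decay. A standalone sketch of the multiplier that `lr_lambda` applies to the base learning rate, using the default `warmup_steps=1000` from `config.py`:

```python
# Sketch: the lr_lambda multiplier from main.py, evaluated at a few steps.
warmup_steps = 1000

def lr_lambda(step: int) -> float:
    if step < warmup_steps:
        return step / max(1, warmup_steps)         # linear warmup to 1.0
    return (warmup_steps ** 0.5) * step ** -0.5    # then 1/sqrt(step) decay

for s in (100, 1000, 4000, 16000):
    print(s, round(lr_lambda(s), 3))  # 0.1, 1.0, 0.5, 0.25
```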
model.py (Normal file, 100 lines added)
@@ -0,0 +1,100 @@
"""
model.py

Defines a Transformer-based language model from scratch, using PyTorch's nn.Transformer.
No pretrained weights—everything is initialized randomly.
"""

import torch
import torch.nn as nn
import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_len: int = 10_000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape: (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch_size, seq_length, d_model)
        returns x + positional encodings for the first seq_length positions.
        """
        x = x + self.pe[:, : x.size(1), :]
        return x


class NoraTransformerLM(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        d_model: int = 512,
        nhead: int = 8,
        num_layers: int = 6,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        max_seq_len: int = 512,
    ):
        super().__init__()
        self.model_type = "TransformerLM"
        self.d_model = d_model
        self.vocab_size = vocab_size

        # Token embedding + positional encoding
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len=max_seq_len)

        # Transformer encoder layers
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation="gelu",
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers, num_layers=num_layers
        )

        # Final linear layer to project to vocabulary
        self.fc_out = nn.Linear(d_model, vocab_size)

        # Initialization
        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.token_embed.weight, mean=0, std=self.d_model ** -0.5)
        nn.init.zeros_(self.fc_out.bias)
        nn.init.normal_(self.fc_out.weight, mean=0, std=self.d_model ** -0.5)

    def forward(self, src: torch.Tensor) -> torch.Tensor:
        """
        src: (batch_size, seq_length), token IDs
        returns: logits (batch_size, seq_length, vocab_size)
        """

        # Embed tokens and add positional encoding
        x = self.token_embed(src) * math.sqrt(self.d_model)  # (B, S, D)
        x = self.pos_encoder(x)  # (B, S, D)
        # PyTorch Transformer expects (S, B, D)
        x = x.permute(1, 0, 2)  # (seq_length, batch_size, d_model)

        # Create a causal mask so each position can only attend to previous positions
        seq_len = x.size(0)
        mask = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1).bool()

        # Pass through Transformer encoder
        x = self.transformer_encoder(x, mask=mask)  # (seq_length, batch_size, d_model)

        # Back to (B, S, D)
        x = x.permute(1, 0, 2)  # (batch_size, seq_length, d_model)
        logits = self.fc_out(x)  # (batch_size, seq_length, vocab_size)
        return logits

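A quick shape check (a sketch with made-up tiny dimensions, not part of the commit) confirms the contract of `forward`: integer token IDs in, per-position vocabulary logits out:

```python
# Sketch: random token IDs -> logits of shape (batch_size, seq_length, vocab_size).
import torch

from model import NoraTransformerLM

model = NoraTransformerLM(vocab_size=100, d_model=64, nhead=2,
                          num_layers=2, dim_feedforward=256, max_seq_len=32)
tokens = torch.randint(0, 100, (4, 32))  # (batch_size=4, seq_length=32)
logits = model(tokens)
print(logits.shape)                      # torch.Size([4, 32, 100])
```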
tokenizer.py (Normal file, 86 lines added)
@@ -0,0 +1,86 @@
"""
tokenizer.py

A simple character-level tokenizer that builds its own vocabulary from all text files.
Saves/loads vocab to/from JSON. You can extend this to a word-level tokenizer if you wish.
"""

import json
import os
from collections import Counter
from typing import List, Dict, Union


class CharTokenizer:
    def __init__(self, vocab_path: str, data_dir: str):
        """
        If vocab_path exists, load it; otherwise, build from raw text in data_dir.
        """
        self.vocab_path = vocab_path
        self.data_dir = data_dir
        self.stoi: Dict[str, int] = {}
        self.itos: Dict[int, str] = {}

        if os.path.isfile(self.vocab_path):
            self._load_vocab()
        else:
            self._build_vocab()

    def _build_vocab(self):
        """
        Read all .txt files under data_dir, count character frequencies,
        build a sorted vocabulary, and save to vocab_path.
        """
        counter = Counter()
        print(f"[tokenizer] Building vocabulary from data in '{self.data_dir}'...")
        for root, _, files in os.walk(self.data_dir):
            for fname in files:
                if not fname.lower().endswith(".txt"):
                    continue
                path = os.path.join(root, fname)
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read()
                counter.update(text)

        # Ensure a consistent ordering: sort by frequency descending, then Unicode codepoint
        sorted_chars = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        unique_chars = [ch for ch, _ in sorted_chars]

        # Add special tokens
        tokens = ["<pad>", "<unk>"] + unique_chars

        self.stoi = {ch: i for i, ch in enumerate(tokens)}
        self.itos = {i: ch for i, ch in enumerate(tokens)}

        # Save to JSON
        os.makedirs(os.path.dirname(self.vocab_path), exist_ok=True)
        with open(self.vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.stoi, f, ensure_ascii=False, indent=2)
        print(f"[tokenizer] Built vocab size = {len(self.stoi)}; saved to '{self.vocab_path}'.")

    def _load_vocab(self):
        """
        Load existing vocabulary from vocab_path.
        """
        print(f"[tokenizer] Loading vocabulary from '{self.vocab_path}'...")
        with open(self.vocab_path, "r", encoding="utf-8") as f:
            self.stoi = json.load(f)
        self.itos = {i: ch for ch, i in self.stoi.items()}
        print(f"[tokenizer] Loaded vocab size = {len(self.stoi)}.")

    def encode(self, text: str) -> List[int]:
        """
        Convert a string to a list of integer token IDs (character-level).
        Unrecognized chars map to <unk>.
        """
        unk_id = self.stoi.get("<unk>")
        return [self.stoi.get(ch, unk_id) for ch in text]

    def decode(self, token_ids: List[int]) -> str:
        """
        Convert a list of token IDs back into a string.
        """
        return "".join(self.itos.get(i, "<unk>") for i in token_ids)

    def vocab_size(self) -> int:
        return len(self.stoi)

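A round-trip sketch (assuming a vocabulary has already been built from `data/books/`) of the encode/decode contract; any character not seen when the vocabulary was built maps to `<unk>`:

```python
# Sketch: encode a string to character IDs and decode it back.
from tokenizer import CharTokenizer

tok = CharTokenizer(vocab_path="data/vocab.json", data_dir="data/books")
ids = tok.encode("Hello, Nora!")
print(ids)               # one integer per character
print(tok.decode(ids))   # "Hello, Nora!" if every character is in the vocab
print(tok.vocab_size())  # includes the <pad> and <unk> special tokens
```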
train.py (Normal file, 135 lines added)
@@ -0,0 +1,135 @@
"""
train.py

Training loop for Nora, with automatic mixed precision (AMP) to speed up on CUDA GPUs.
Uses tqdm for progress bars, logging for metrics, and gradient clipping + LR scheduler.
"""

import time
import logging
import math
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.nn.utils import clip_grad_norm_
from torch.amp import GradScaler, autocast  # <-- updated import


def train(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    scheduler,
    tokenizer,
    config,
    start_step: int = 0,
):
    """
    model: NoraTransformerLM
    dataloader: DataLoader for TextDataset
    optimizer: AdamW (or Adam)
    scheduler: LR scheduler with warmup
    tokenizer: CharTokenizer
    config: namespace from config.py
    start_step: if resuming from checkpoint
    """

    device = config.device
    model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.stoi["<pad>"])
    scaler = GradScaler()

    global_step = start_step
    steps_per_epoch = len(dataloader)
    total_steps = config.epochs * steps_per_epoch

    logging.info(
        f"[train] Starting training for {config.epochs} epochs, "
        f"{steps_per_epoch} steps/epoch, total approx {total_steps} steps."
    )

    for epoch in range(config.epochs):
        model.train()
        epoch_loss = 0.0
        epoch_start = time.time()

        # If you want to profile the first 100 steps, uncomment below:
        # if global_step == start_step:
        #     t0 = time.time()

        pbar = tqdm(
            enumerate(dataloader),
            total=steps_per_epoch,
            desc=f"Epoch {epoch + 1}",
            ncols=100,
            unit="step",
        )
        for step, (inputs, targets) in pbar:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()

            # Mixed precision forward/backward (specify device_type="cuda")
            with autocast(device_type="cuda", dtype=torch.float16):
                logits = model(inputs)  # (batch, seq_len, vocab_size)
                loss = criterion(
                    logits.view(-1, tokenizer.vocab_size()),
                    targets.view(-1),
                )

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            clip_grad_norm_(model.parameters(), config.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            epoch_loss += loss.item()
            global_step += 1

            # Log every log_interval steps
            if global_step % config.log_interval == 0:
                avg_loss = epoch_loss / (step + 1)
                ppl = math.exp(avg_loss)
                logging.info(
                    f"[step {global_step}/{total_steps}] "
                    f"avg_loss = {avg_loss:.4f}, ppl = {ppl:.2f}, "
                    f"lr = {scheduler.get_last_lr()[0]:.2e}"
                )

            # Save checkpoint every save_interval steps
            if global_step % config.save_interval == 0:
                from utils import save_checkpoint

                save_checkpoint(
                    model,
                    optimizer,
                    global_step,
                    config.checkpoint_dir,
                    tokenizer=tokenizer,
                )

            pbar.set_postfix({"loss": f"{loss.item():.4f}"})

            # (Optional) Profile first 100 steps
            # if global_step == start_step + 100:
            #     elapsed = time.time() - t0
            #     print(
            #         f"[profile] avg time/step over first 100 batches: "
            #         f"{elapsed/100:.4f} s"
            #     )

        epoch_time = time.time() - epoch_start
        avg_epoch_loss = epoch_loss / steps_per_epoch
        logging.info(
            f"[epoch {epoch + 1}/{config.epochs}] "
            f"avg_loss = {avg_epoch_loss:.4f}, time = {epoch_time:.1f}s"
        )

    # Final checkpoint at end of all epochs
    from utils import save_checkpoint

    save_checkpoint(model, optimizer, global_step, config.checkpoint_dir, tokenizer=tokenizer)
    logging.info("[train] Training complete.")

utils.py (Normal file, 84 lines added)
@@ -0,0 +1,84 @@
"""
utils.py

Common utilities: logging setup, checkpoint saving & loading, device checks, etc.
"""

import os
import logging
import torch


def setup_logging(log_file: str = None):
    """
    Set up logging to stdout (and optionally to a file).
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    formatter = logging.Formatter(
        "[%(asctime)s] [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Console handler
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    root.addHandler(ch)

    # File handler
    if log_file:
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        fh = logging.FileHandler(log_file)
        fh.setLevel(logging.INFO)
        fh.setFormatter(formatter)
        root.addHandler(fh)


def save_checkpoint(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    step: int,
    checkpoint_dir: str,
    tokenizer=None,
):
    """
    Save model state, optimizer state, and tokenizer (optional) to a checkpoint file.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    ckpt_path = os.path.join(checkpoint_dir, f"nora_step_{step}.pt")
    state = {
        "step": step,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }
    if tokenizer:
        # tokenizer.stoi is JSON-serializable
        state["tokenizer_stoi"] = tokenizer.stoi

    torch.save(state, ckpt_path)
    logging.info(f"[checkpoint] Saved checkpoint to {ckpt_path}")


def load_checkpoint(
    ckpt_path: str, model: torch.nn.Module, optimizer: torch.optim.Optimizer = None
):
    """
    Load model & optimizer state from a checkpoint. Returns step.
    If optimizer is None, only loads model weights.
    """
    if not os.path.isfile(ckpt_path):
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
    state = torch.load(ckpt_path, map_location="cpu")
    model.load_state_dict(state["model_state_dict"])
    step = state.get("step", 0)
    if optimizer and "optimizer_state_dict" in state:
        optimizer.load_state_dict(state["optimizer_state_dict"])
    logging.info(f"[checkpoint] Loaded checkpoint from {ckpt_path} (step {step})")
    return step


def get_default_device():
    """
    Return 'cuda' if available; otherwise 'cpu'.
    """
    return "cuda" if torch.cuda.is_available() else "cpu"

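The commit contains no inference script yet, although the Discord step mentioned in the commit message will need one. Below is a rough sketch of how generation could work with the pieces above, using temperature sampling over the character vocabulary. The `generate` function, its parameters, and the checkpoint path are placeholders rather than committed code, and the model dimensions must match whatever the checkpoint was trained with:

```python
# Sketch only: autoregressive character sampling with the committed model, tokenizer, and utils.
import torch

from model import NoraTransformerLM
from tokenizer import CharTokenizer
from utils import load_checkpoint


@torch.no_grad()
def generate(model, tok, prompt: str, max_new_tokens: int = 200,
             temperature: float = 0.8, seq_length: int = 128) -> str:
    model.eval()
    ids = tok.encode(prompt)
    for _ in range(max_new_tokens):
        context = torch.tensor([ids[-seq_length:]], dtype=torch.long)  # (1, <=seq_length)
        logits = model(context)[0, -1] / temperature                   # next-character logits
        next_id = torch.multinomial(torch.softmax(logits, dim=-1), 1).item()
        ids.append(next_id)
    return tok.decode(ids)


if __name__ == "__main__":
    tok = CharTokenizer(vocab_path="data/vocab.json", data_dir="data/books")
    # Sizes must match the training run; the defaults here mirror config.py's defaults.
    model = NoraTransformerLM(vocab_size=tok.vocab_size(), max_seq_len=128)
    load_checkpoint("checkpoints/nora_step_1000.pt", model)  # hypothetical checkpoint path
    print(generate(model, tok, "Once upon a time"))
```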