# 03_train_val_split.py
"""
Create a train/val split from data.txt or a small fallback, and save to disk.

What it does:
- Loads UTF-8 text, normalizes newlines to "\\n".
- Splits by character index (default 90/10) deterministically.
- Writes 'train.txt' and 'val.txt' with "\\n" line endings on every platform.

How to run:
    python 03_train_val_split.py
    # optional: python -m doctest -v 03_train_val_split.py

Notes:
- 'data.txt', 'train.txt' and 'val.txt' are resolved relative to the current
  working directory, so run the script from the directory holding 'data.txt'.
  If 'data.txt' is missing, a small built-in fallback text is used.
"""
from __future__ import annotations

from pathlib import Path
from typing import Tuple

# Tiny stand-in corpus used when 'data.txt' is not present.
FALLBACK = "abcdefg\nhijklmn\nopqrst\nuvwx\nyz\n"


def load_and_normalize(path: Path) -> str:
    """Load text from path if it exists; else use FALLBACK. Normalize to LF.

    Args:
        path: Path to 'data.txt'.

    Returns:
        Text with only '\\n' line endings.

    >>> load_and_normalize.__doc__ is not None
    True
    """
    text = path.read_text(encoding="utf-8") if path.exists() else FALLBACK
    # CRLF must be replaced before lone CR, otherwise each CRLF would turn
    # into two newlines.
    return text.replace("\r\n", "\n").replace("\r", "\n")


def split_indices(n: int, train_ratio: float = 0.9) -> Tuple[int, int]:
    """Return (train_end, val_start) indices for a 1D split of length n.

    Args:
        n: Total number of characters.
        train_ratio: Fraction for train portion (0.0 < train_ratio < 1.0)

    Returns:
        (train_end, val_start) where val_start == train_end.

    Raises:
        AssertionError: If train_ratio is not strictly between 0 and 1.

    >>> split_indices(100, 0.8)
    (80, 80)
    >>> split_indices(5, 0.6)
    (3, 3)
    """
    # Explicit check instead of an `assert` statement so the validation is not
    # stripped under `python -O`; AssertionError is kept so the exception type
    # seen by existing callers does not change.
    if not (0.0 < train_ratio < 1.0):
        raise AssertionError("train_ratio must be between 0 and 1")
    train_end = int(n * train_ratio)  # truncates toward zero
    return train_end, train_end


def _write_lf(path: Path, text: str) -> None:
    """Write text to path as UTF-8 without platform newline translation."""
    # newline="\n" disables the "\n" -> os.linesep translation, so the files
    # keep LF endings on Windows as well.
    with path.open("w", encoding="utf-8", newline="\n") as handle:
        handle.write(text)


def main() -> None:
    """Split 'data.txt' (or the fallback text) 90/10 and write both parts.

    Reads 'data.txt' from the current working directory, writes 'train.txt'
    and 'val.txt' there, and prints the character counts.
    """
    data_path = Path("data.txt")
    text = load_and_normalize(data_path)
    n = len(text)

    tr_end, va_start = split_indices(n, 0.9)
    train, val = text[:tr_end], text[va_start:]

    _write_lf(Path("train.txt"), train)
    _write_lf(Path("val.txt"), val)

    print(f"Total chars: {n}")
    print(f"Train chars: {len(train)}")
    print(f"Val chars: {len(val)}")
    print("Wrote train.txt and val.txt")


if __name__ == "__main__":
    main()