import os

import torch
from torch.utils.data import Dataset


class CharDataset(Dataset):
    """Char-level dataset built from all ``.txt`` files under ``books_dir``.

    Concatenates every text file (joined with newlines), builds a character
    vocabulary over the whole corpus, and serves ``(x, y)`` pairs of length
    ``block_size`` for next-character prediction, where ``y`` is ``x``
    shifted right by one position.

    Attributes:
        stoi: char -> integer id mapping.
        itos: integer id -> char mapping (inverse of ``stoi``).
        vocab_size: number of distinct characters in the corpus.
        data: the whole corpus encoded as a 1-D ``torch.long`` tensor.
        block_size: sequence length of each sample.
    """

    def __init__(self, books_dir: str, block_size: int):
        texts = []
        # Sort filenames: os.listdir order is arbitrary and filesystem-
        # dependent, which would make the concatenated corpus (and thus
        # every sample index) nondeterministic across runs/machines.
        for fn in sorted(os.listdir(books_dir)):
            if fn.lower().endswith('.txt'):
                path = os.path.join(books_dir, fn)
                with open(path, 'r', encoding='utf8') as f:
                    texts.append(f.read())
        if not texts:
            # Fail loudly instead of silently producing an empty vocab
            # and a negative __len__.
            raise ValueError(f"no .txt files found in {books_dir!r}")
        data = '\n'.join(texts)

        # Build the vocabulary; sorting gives a stable char -> id mapping.
        chars = sorted(set(data))
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.vocab_size = len(chars)

        # Encode the entire corpus once as a single long tensor; samples
        # are cheap views (slices) into it.
        self.data = torch.tensor(
            [self.stoi[ch] for ch in data], dtype=torch.long
        )
        self.block_size = block_size

    def __len__(self) -> int:
        # Last valid start index must leave room for x plus the
        # one-position-shifted y.
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        """Return ``(x, y)``: ``block_size`` ids and the same shifted by one."""
        x = self.data[idx: idx + self.block_size]
        y = self.data[idx + 1: idx + 1 + self.block_size]
        return x, y