Created NORA. She has been designed from scratch. At this point, I have determined the best hyperparameters for her training. The next step is to get her communicating on Discord and see how she handles it.
data_loader.py (new file, 63 lines)
@@ -0,0 +1,63 @@
"""
data_loader.py

Loads all .txt files from data_dir, concatenates them, tokenizes them,
and creates a Dataset of (input_seq, target_seq) for language modeling.
"""

import os
import torch
from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    def __init__(self, data_dir: str, tokenizer, seq_length: int):
        """
        - data_dir: folder of .txt public-domain books.
        - tokenizer: instance of CharTokenizer (from tokenizer.py).
        - seq_length: context length in tokens.
        """
        super().__init__()
        self.seq_length = seq_length
        self.tokenizer = tokenizer

        # Read and concatenate all text files into one long string
        texts = []
        for root, _, files in os.walk(data_dir):
            for fname in files:
                if not fname.lower().endswith(".txt"):
                    continue
                path = os.path.join(root, fname)
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    texts.append(f.read())
        full_text = "\n".join(texts)
        token_ids = self.tokenizer.encode(full_text)

        # Prepare (input, target) pairs: targets are inputs shifted right by one token
        self.examples = []
        stride = 32
        for i in range(0, len(token_ids) - seq_length, stride):
            inp = token_ids[i : i + seq_length]
            targ = token_ids[i + 1 : i + seq_length + 1]
            self.examples.append((inp, targ))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        inp, targ = self.examples[idx]
        return torch.tensor(inp, dtype=torch.long), torch.tensor(targ, dtype=torch.long)


def get_dataloader(
    data_dir: str, tokenizer, seq_length: int, batch_size: int, shuffle: bool = True
) -> DataLoader:
    dataset = TextDataset(data_dir, tokenizer, seq_length)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=True,
        num_workers=8,
        pin_memory=True,
    )
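For context, a minimal usage sketch (not part of this commit). It assumes CharTokenizer from tokenizer.py can be constructed with no arguments and exposes the encode() method the dataset relies on; the data/books path and the hyperparameter values are placeholders:

# Hypothetical wiring -- CharTokenizer's constructor, the data/books
# path, and the values below are assumptions, not part of this commit.
from tokenizer import CharTokenizer
from data_loader import get_dataloader

tokenizer = CharTokenizer()
loader = get_dataloader(
    data_dir="data/books",
    tokenizer=tokenizer,
    seq_length=128,
    batch_size=64,
)

# Each batch is a pair of LongTensors of shape (batch_size, seq_length)
inputs, targets = next(iter(loader))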