diff --git a/.gitignore b/.gitignore
index 2cd3f63..6ffc9c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -193,3 +193,8 @@ cython_debug/
 # refer to https://docs.cursor.com/context/ignore-files
 .cursorignore
 .cursorindexingignore
+
+*.txt
+/texts
+*.json
+*.pt
\ No newline at end of file
diff --git a/build_tokenizer.py b/build_tokenizer.py
new file mode 100644
index 0000000..1cb5545
--- /dev/null
+++ b/build_tokenizer.py
@@ -0,0 +1,21 @@
+from collections import Counter
+import json
+
+# Read corpus
+with open('corpus.txt', 'r', encoding='utf-8') as f:
+    text = f.read().lower()  # Normalize to lowercase
+    words = text.split()
+
+# Build vocabulary
+vocab_size = 10000
+word_counts = Counter(words).most_common(vocab_size - 4)  # Reserve 4 ids for special tokens
+vocab = {word: idx for idx, (word, _) in enumerate(word_counts)}
+vocab['<pad>'] = len(vocab)
+vocab['<unk>'] = len(vocab)
+vocab['<sos>'] = len(vocab)
+vocab['<eos>'] = len(vocab)
+
+# Save vocab
+with open('vocab.json', 'w') as f:
+    json.dump(vocab, f)
+print(f"Vocabulary of size {len(vocab)} saved to vocab.json")
\ No newline at end of file
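Note: build_tokenizer.py reserves the last four vocabulary ids for the special tokens that every script below depends on. A quick sanity check, as a sketch (it assumes vocab.json was produced by the script above):

    import json

    with open('vocab.json') as f:
        vocab = json.load(f)

    # With vocab_size = 10000 the special tokens land on the last four ids.
    print({tok: vocab[tok] for tok in ('<pad>', '<unk>', '<sos>', '<eos>')})
    assert len(vocab) <= 10000  # most_common() may return fewer on a small corpus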
diff --git a/download_corpus.py b/download_corpus.py
new file mode 100644
index 0000000..1e3dcd1
--- /dev/null
+++ b/download_corpus.py
@@ -0,0 +1,36 @@
+import gutenbergpy.textget
+import re
+
+
+# Download books by Gutenberg ID
+def download_gutenberg_book(book_id, output_file):
+    try:
+        raw_text = gutenbergpy.textget.get_text_by_id(book_id)
+        # Strip the Project Gutenberg header/footer markers
+        text = re.sub(r'\*\*\*.*?\*\*\*', '', raw_text.decode('utf-8'), flags=re.DOTALL)
+        text = re.sub(r'\n+', '\n', text).strip()
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(text)
+    except Exception as e:
+        print(f"Error downloading book {book_id}: {e}")
+
+
+# Download selected books
+books = [
+    (1342, 'pride_and_prejudice.txt'),
+    (45, 'anne_of_green_gables.txt'),
+    (74, 'tom_sawyer.txt')
+]
+for book_id, filename in books:
+    print(f"Downloading book ID {book_id}...")
+    download_gutenberg_book(book_id, filename)
+
+# Combine into corpus (read back only the books we downloaded,
+# so corpus.txt itself is not re-ingested on a second run)
+corpus = ''
+for _, filename in books:
+    with open(filename, 'r', encoding='utf-8') as f:
+        corpus += f.read() + '\n'
+with open('corpus.txt', 'w', encoding='utf-8') as f:
+    f.write(corpus)
+print("Corpus created at corpus.txt")
\ No newline at end of file
diff --git a/finetune_vivi.py b/finetune_vivi.py
new file mode 100644
index 0000000..ef765e4
--- /dev/null
+++ b/finetune_vivi.py
@@ -0,0 +1,71 @@
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+import torch.optim as optim
+import json
+
+# Define model (same as before)
+class VivianTransformer(nn.Module):
+    def __init__(self, vocab_size, d_model=128, n_layers=2, n_heads=4, d_ff=512):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
+        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout=0.1, batch_first=True)
+        self.transformer = nn.TransformerEncoder(encoder_layer, n_layers)
+        self.fc_out = nn.Linear(d_model, vocab_size)
+
+    def forward(self, x):
+        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
+        # Causal mask so each position only attends to itself and earlier tokens
+        mask = torch.triu(torch.ones(x.size(1), x.size(1), device=x.device, dtype=torch.bool), diagonal=1)
+        x = self.transformer(x, mask=mask)
+        return self.fc_out(x)
+
+# Conversation dataset
+class ViviDataset(Dataset):
+    def __init__(self, json_file, vocab, max_len=32):
+        with open(json_file, 'r') as f:
+            self.data = json.load(f)
+        self.vocab = vocab
+        self.max_len = max_len
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        user = self.data[idx]['user'].lower().split()
+        vivi = self.data[idx]['vivi'].lower().split()
+        seq = [self.vocab['<sos>']] + [self.vocab.get(word, self.vocab['<unk>']) for word in user + vivi] + [self.vocab['<eos>']]
+        seq = seq[:self.max_len] + [self.vocab['<pad>']] * (self.max_len - len(seq))
+        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])
+
+# Load vocab and data
+with open('vocab.json', 'r') as f:
+    vocab = json.load(f)
+dataset = ViviDataset('vivi_conversations.json', vocab)
+dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
+
+# Load model
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model = VivianTransformer(len(vocab)).to(device)
+model.load_state_dict(torch.load('vivi_base.pt', map_location=device))
+optimizer = optim.Adam(model.parameters(), lr=0.00005)  # Lower LR for fine-tuning
+criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
+
+# Fine-tune
+for epoch in range(10):
+    model.train()
+    total_loss = 0
+    for src, tgt in dataloader:
+        src, tgt = src.to(device), tgt.to(device)
+        optimizer.zero_grad()
+        output = model(src)
+        loss = criterion(output.view(-1, len(vocab)), tgt.view(-1))
+        loss.backward()
+        optimizer.step()
+        total_loss += loss.item()
+    print(f'Fine-tune Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}')
+
+# Save model
+torch.save(model.state_dict(), 'vivi_finetuned.pt')
+print("Fine-tuned model saved to vivi_finetuned.pt")
\ No newline at end of file
diff --git a/prepare_dataset.py b/prepare_dataset.py
new file mode 100644
index 0000000..1ac975d
--- /dev/null
+++ b/prepare_dataset.py
@@ -0,0 +1,31 @@
+import torch
+from torch.utils.data import Dataset
+import json
+
+
+class TextDataset(Dataset):
+    def __init__(self, corpus_file, vocab, max_len=32):
+        self.vocab = vocab
+        self.max_len = max_len
+        with open(corpus_file, 'r', encoding='utf-8') as f:
+            text = f.read().lower().split()
+        self.tokens = [self.vocab.get(word, self.vocab['<unk>']) for word in text]
+
+    def __len__(self):
+        return len(self.tokens) // self.max_len
+
+    def __getitem__(self, idx):
+        start = idx * self.max_len
+        seq = self.tokens[start:start + self.max_len]
+        if len(seq) < self.max_len:
+            seq += [self.vocab['<pad>']] * (self.max_len - len(seq))
+        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])
+
+# Load vocab
+with open('vocab.json', 'r') as f:
+    vocab = json.load(f)
+
+# Create dataset
+dataset = TextDataset('corpus.txt', vocab)
+torch.save(dataset, 'dataset.pt')
+print("Dataset saved to dataset.pt")
\ No newline at end of file
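A caveat on pickling the whole dataset: the loading script must have TextDataset importable (presumably why train_vivi.py redefines the class), and on PyTorch 2.6+ torch.load defaults to weights_only=True and refuses to unpickle custom classes. A sketch of the load under that assumption:

    # Needed on PyTorch >= 2.6 only; older versions unpickle by default.
    dataset = torch.load('dataset.pt', weights_only=False)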
diff --git a/talk_to_vivi.py b/talk_to_vivi.py
new file mode 100644
index 0000000..ba05108
--- /dev/null
+++ b/talk_to_vivi.py
@@ -0,0 +1,110 @@
+import torch
+import torch.nn as nn
+import json
+
+# Define model
+class VivianTransformer(nn.Module):
+    def __init__(self, vocab_size, d_model=128, n_layers=2, n_heads=4, d_ff=512):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
+        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout=0.1, batch_first=True)
+        self.transformer = nn.TransformerEncoder(encoder_layer, n_layers)
+        self.fc_out = nn.Linear(d_model, vocab_size)
+
+    def forward(self, x):
+        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
+        # Causal mask so each position only attends to itself and earlier tokens
+        mask = torch.triu(torch.ones(x.size(1), x.size(1), device=x.device, dtype=torch.bool), diagonal=1)
+        x = self.transformer(x, mask=mask)
+        return self.fc_out(x)
+
+# Check device
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Using device: {device}")
+if device.type == 'cuda':
+    print(f"GPU: {torch.cuda.get_device_name(0)}")
+else:
+    print("Warning: CUDA not available. Running on CPU will be slower.")
+
+# Load vocab
+try:
+    with open('vocab.json', 'r') as f:
+        vocab = json.load(f)
+except FileNotFoundError:
+    print("Error: vocab.json not found. Run build_tokenizer.py first.")
+    exit(1)
+
+# Load model: prefer the fine-tuned weights, fall back to the base model
+model = VivianTransformer(len(vocab)).to(device)
+try:
+    model.load_state_dict(torch.load('vivi_finetuned.pt', map_location=device))
+except FileNotFoundError:
+    print("vivi_finetuned.pt not found. Trying vivi_base.pt...")
+    try:
+        model.load_state_dict(torch.load('vivi_base.pt', map_location=device))
+    except FileNotFoundError:
+        print("Error: vivi_base.pt not found. Run train_vivi.py first.")
+        exit(1)
+except Exception as e:
+    print(f"Error loading model: {e}")
+    exit(1)
+model.eval()
+
+# Reverse vocab for decoding
+id2word = {idx: word for word, idx in vocab.items()}
+
+# Context memory
+context_memory = []
+memory_size = 5
+
+def generate_response(prompt, max_len=32, p=0.9):
+    global context_memory
+    context_memory.append(prompt)
+    if len(context_memory) > memory_size:
+        context_memory = context_memory[-memory_size:]
+    input_text = ' '.join(context_memory).lower()
+    input_ids = [vocab['<sos>']] + [vocab.get(word, vocab['<unk>']) for word in input_text.split()]
+    input_tensor = torch.tensor([input_ids], device=device)
+
+    with torch.no_grad():
+        for _ in range(max_len - len(input_ids)):
+            output = model(input_tensor)
+            logits = output[:, -1, :]
+            probs = torch.softmax(logits, dim=-1)
+            probs, indices = probs.sort(descending=True)
+            cum_probs = torch.cumsum(probs, dim=-1)
+            mask = cum_probs <= p
+            mask[..., 0] = True  # Always keep at least the top token
+            probs = probs[mask]
+            indices = indices[mask]
+            next_word_id = torch.multinomial(probs, 1).item()  # Index into the kept tokens
+            next_id = indices[next_word_id].item()
+            input_tensor = torch.cat([input_tensor, torch.tensor([[next_id]], device=device)], dim=1)
+            if next_id == vocab['<eos>']:
+                break
+
+    response_ids = input_tensor[0, len(input_ids):].tolist()
+    response = ' '.join(id2word.get(idx, '<unk>') for idx in response_ids if idx != vocab['<eos>'])
+    context_memory.append(response)
+    return response
+
+# Load any previous conversations so new ones are appended
+conversations = []
+try:
+    with open('vivi_conversations.json', 'r') as f:
+        conversations = json.load(f)
+except FileNotFoundError:
+    pass
+
+# Interactive loop
+print("Chat with Vivi! Type 'exit' or 'quit' to stop.")
+while True:
+    user_input = input("You: ")
+    if user_input.lower() in ['exit', 'quit']:
+        break
+    response = generate_response(user_input)
+    print(f"Vivi: {response}")
+    conversations.append({"user": user_input, "vivi": response})
+    with open('vivi_conversations.json', 'w') as f:
+        json.dump(conversations, f, indent=2)
\ No newline at end of file
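generate_response above uses top-p (nucleus) sampling: sort the next-token distribution, keep the prefix of tokens whose cumulative probability stays within p (always retaining the top token), and sample from what is left. The same logic in isolation, with made-up probabilities for illustration:

    import torch

    probs = torch.tensor([0.5, 0.3, 0.15, 0.05])  # sorted descending
    p = 0.9
    mask = torch.cumsum(probs, dim=-1) <= p       # keeps 0.5 and 0.3
    mask[..., 0] = True                           # top token always survives
    kept = probs[mask]
    pick = torch.multinomial(kept, 1).item()      # multinomial renormalizes
    print(pick)  # 0 or 1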
diff --git a/train_vivi.py b/train_vivi.py
new file mode 100644
index 0000000..e6084b7
--- /dev/null
+++ b/train_vivi.py
@@ -0,0 +1,96 @@
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+import torch.optim as optim
+import json
+
+# Define TextDataset (must match prepare_dataset.py so dataset.pt can be unpickled)
+class TextDataset(Dataset):
+    def __init__(self, corpus_file, vocab, max_len=32):
+        self.vocab = vocab
+        self.max_len = max_len
+        with open(corpus_file, 'r', encoding='utf-8') as f:
+            text = f.read().lower().split()
+        self.tokens = [self.vocab.get(word, self.vocab['<unk>']) for word in text]
+
+    def __len__(self):
+        return len(self.tokens) // self.max_len
+
+    def __getitem__(self, idx):
+        start = idx * self.max_len
+        seq = self.tokens[start:start + self.max_len]
+        if len(seq) < self.max_len:
+            seq += [self.vocab['<pad>']] * (self.max_len - len(seq))
+        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])
+
+# Define model
+class VivianTransformer(nn.Module):
+    def __init__(self, vocab_size, d_model=128, n_layers=2, n_heads=4, d_ff=512):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
+        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout=0.1, batch_first=True)
+        self.transformer = nn.TransformerEncoder(encoder_layer, n_layers)
+        self.fc_out = nn.Linear(d_model, vocab_size)
+
+    def forward(self, x):
+        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
+        # Causal mask so each position only attends to itself and earlier tokens
+        mask = torch.triu(torch.ones(x.size(1), x.size(1), device=x.device, dtype=torch.bool), diagonal=1)
+        x = self.transformer(x, mask=mask)
+        return self.fc_out(x)
+
+# Check device
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Using device: {device}")
+if device.type == 'cuda':
+    print(f"GPU: {torch.cuda.get_device_name(0)}")
+else:
+    print("Warning: CUDA not available. Training on CPU will be slower.")
+
+# Load vocab
+try:
+    with open('vocab.json', 'r') as f:
+        vocab = json.load(f)
+except FileNotFoundError:
+    print("Error: vocab.json not found. Run build_tokenizer.py first.")
+    exit(1)
+
+# Load dataset
+try:
+    dataset = torch.load('dataset.pt')
+except FileNotFoundError:
+    print("Error: dataset.pt not found. Run prepare_dataset.py first.")
+    exit(1)
+except Exception as e:
+    print(f"Error loading dataset.pt: {e}")
+    exit(1)
+
+# Create dataloader
+dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
+
+# Initialize model
+model = VivianTransformer(len(vocab)).to(device)
+optimizer = optim.Adam(model.parameters(), lr=0.0001)
+criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
+
+# Train
+print("Starting training...")
+for epoch in range(5):
+    model.train()
+    total_loss = 0
+    for batch_idx, (src, tgt) in enumerate(dataloader):
+        src, tgt = src.to(device), tgt.to(device)
+        optimizer.zero_grad()
+        output = model(src)
+        loss = criterion(output.view(-1, len(vocab)), tgt.view(-1))
+        loss.backward()
+        optimizer.step()
+        total_loss += loss.item()
+        if batch_idx % 100 == 0:
+            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")
+    print(f'Epoch {epoch+1}, Average Loss: {total_loss / len(dataloader):.4f}')
+
+# Save model
+torch.save(model.state_dict(), 'vivi_base.pt')
+print("Model saved to vivi_base.pt")
\ No newline at end of file
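Taken together, the scripts are meant to run in this order (inferred from the files each one reads and writes):

    python download_corpus.py   # writes corpus.txt
    python build_tokenizer.py   # writes vocab.json
    python prepare_dataset.py   # writes dataset.pt
    python train_vivi.py        # writes vivi_base.pt
    python talk_to_vivi.py      # chat; logs vivi_conversations.json
    python finetune_vivi.py     # writes vivi_finetuned.pt from the logged chats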