Added the basics of her code, updated to not include any extra files
.gitignore (vendored) | 5
@@ -193,3 +193,8 @@ cython_debug/
 # refer to https://docs.cursor.com/context/ignore-files
 .cursorignore
 .cursorindexingignore
+
+*.txt
+/texts
+*.json
+*.pt
build_tokenizer.py (new file) | 21
@@ -0,0 +1,21 @@
from collections import Counter
import json

# Read corpus
with open('corpus.txt', 'r', encoding='utf-8') as f:
    text = f.read().lower()  # Normalize to lowercase
words = text.split()

# Build vocabulary
vocab_size = 10000
word_counts = Counter(words).most_common(vocab_size - 4)  # Reserve 4 for special tokens
vocab = {word: idx for idx, (word, _) in enumerate(word_counts)}
vocab['<unk>'] = len(vocab)
vocab['<pad>'] = len(vocab)
vocab['<s>'] = len(vocab)
vocab['</s>'] = len(vocab)

# Save vocab
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print(f"Vocabulary of size {len(vocab)} saved to vocab.json")
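For reference, a minimal sketch of how the saved vocab.json is meant to be used downstream: words are lower-cased, split on whitespace, and mapped to ids, with <unk> as the fallback (the encode helper below is hypothetical, not part of these files):

import json

with open('vocab.json', 'r') as f:
    vocab = json.load(f)

def encode(sentence):
    # Same normalization as build_tokenizer.py: lowercase + whitespace split.
    return [vocab['<s>']] + [vocab.get(w, vocab['<unk>']) for w in sentence.lower().split()] + [vocab['</s>']]

print(encode("hello vivi"))  # a short list of ids, with <unk> standing in for out-of-vocabulary words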
download_corpus.py (new file) | 36
@@ -0,0 +1,36 @@
import gutenbergpy.textget
import re
import glob


# Download books by Gutenberg ID
def download_gutenberg_book(book_id, output_file):
    try:
        raw_text = gutenbergpy.textget.get_text_by_id(book_id)
        # Remove headers/footers
        text = re.sub(r'\*\*\*.*?\*\*\*', '', raw_text.decode('utf-8'), flags=re.DOTALL)
        text = re.sub(r'\n+', '\n', text).strip()
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)
    except Exception as e:
        print(f"Error downloading book {book_id}: {e}")


# Download selected books
books = [
    (1342, 'pride_and_prejudice.txt'),
    (45, 'anne_of_green_gables.txt'),
    (74, 'tom_sawyer.txt')
]
for book_id, filename in books:
    print(f"Downloading book ID {book_id}...")
    download_gutenberg_book(book_id, filename)

# Combine into corpus
corpus = ''
for file in glob.glob('*.txt'):
    with open(file, 'r', encoding='utf-8') as f:
        corpus += f.read() + '\n'
with open('corpus.txt', 'w', encoding='utf-8') as f:
    f.write(corpus)
print("Corpus created at corpus.txt")
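Note that the two re.sub calls above only delete the literal "*** ... ***" marker lines and collapse blank lines; the Project Gutenberg boilerplate before the START marker and after the END marker is left in the output. A stricter variant, as a sketch using the same re module (strip_gutenberg_boilerplate is a hypothetical helper, not in this commit), would keep only the text between the two markers:

import re

def strip_gutenberg_boilerplate(text):
    # Keep only the body between '*** START OF ...' and '*** END OF ...', when both markers are present.
    match = re.search(r'\*\*\*\s*START OF.*?\*\*\*(.*?)\*\*\*\s*END OF', text, flags=re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else text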
finetune_vivi.py (new file) | 68
@@ -0,0 +1,68 @@
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import json

# Define model (same as before)
class VivianTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, n_layers=2, n_heads=4, d_ff=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout=0.1)
        self.transformer = nn.TransformerEncoder(encoder_layer, n_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.transformer(x)
        return self.fc_out(x)

# Conversation dataset
class ViviDataset(Dataset):
    def __init__(self, json_file, vocab, max_len=32):
        with open(json_file, 'r') as f:
            self.data = json.load(f)
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        user = self.data[idx]['user'].lower().split()
        vivi = self.data[idx]['vivi'].lower().split()
        seq = [self.vocab['<s>']] + [self.vocab.get(word, self.vocab['<unk>']) for word in user + vivi] + [self.vocab['</s>']]
        seq = seq[:self.max_len] + [self.vocab['<pad>']] * (self.max_len - len(seq))
        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])

# Load vocab and data
with open('vocab.json', 'r') as f:
    vocab = json.load(f)
dataset = ViviDataset('vivi_conversations.json', vocab)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Load model
model = VivianTransformer(len(vocab)).cuda()
model.load_state_dict(torch.load('vivi_base.pt'))
optimizer = optim.Adam(model.parameters(), lr=0.00005)  # Lower LR for fine-tuning
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

# Fine-tune
for epoch in range(10):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.cuda(), tgt.cuda()
        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output.view(-1, len(vocab)), tgt.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Fine-tune Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}')

# Save model
torch.save(model.state_dict(), 'vivi_finetuned.pt')
print("Fine-tuned model saved to vivi_finetuned.pt")
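finetune_vivi.py assumes vivi_conversations.json already exists: ViviDataset reads it as a list of objects with 'user' and 'vivi' keys, which is exactly the format talk_to_vivi.py writes. A minimal sketch for seeding the file by hand (the example strings are placeholders):

import json

# Hypothetical seed data; the 'user' and 'vivi' keys match what ViviDataset expects.
seed = [
    {"user": "hello vivi", "vivi": "hello! how are you today?"},
    {"user": "what are you reading?", "vivi": "a little austen and a little twain."},
]
with open('vivi_conversations.json', 'w') as f:
    json.dump(seed, f, indent=2)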
prepare_dataset.py (new file) | 31
@@ -0,0 +1,31 @@
import torch
from torch.utils.data import Dataset
import json


class TextDataset(Dataset):
    def __init__(self, corpus_file, vocab, max_len=32):
        self.vocab = vocab
        self.max_len = max_len
        with open(corpus_file, 'r', encoding='utf-8') as f:
            text = f.read().lower().split()
        self.tokens = [self.vocab.get(word, self.vocab['<unk>']) for word in text]

    def __len__(self):
        return len(self.tokens) // self.max_len

    def __getitem__(self, idx):
        start = idx * self.max_len
        seq = self.tokens[start:start + self.max_len]
        if len(seq) < self.max_len:
            seq += [self.vocab['<pad>']] * (self.max_len - len(seq))
        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])

# Load vocab
with open('vocab.json', 'r') as f:
    vocab = json.load(f)

# Create dataset
dataset = TextDataset('corpus.txt', vocab)
torch.save(dataset, 'dataset.pt')
print("Dataset saved to dataset.pt")
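Each item TextDataset yields is an (input, target) pair of length max_len - 1, where the target is the input shifted one token to the left. A quick sanity check, as a sketch that could be appended to the script above (it relies on the dataset variable already defined there):

src, tgt = dataset[0]
print(src.shape, tgt.shape)            # torch.Size([31]) torch.Size([31]) for max_len=32
print(torch.equal(src[1:], tgt[:-1]))  # True: the two sequences overlap, offset by one token

Also note that torch.save(dataset, 'dataset.pt') pickles the whole object, so loading it later requires a script that defines TextDataset as well, which train_vivi.py does.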
talk_to_vivi.py (new file) | 109
@@ -0,0 +1,109 @@
import torch
import torch.nn as nn
import json

# Define model
class VivianTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, n_layers=2, n_heads=4, d_ff=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout=0.1)
        self.transformer = nn.TransformerEncoder(encoder_layer, n_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.transformer(x)
        return self.fc_out(x)

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Warning: CUDA not available. Running on CPU will be slower.")

# Load vocab
try:
    with open('vocab.json', 'r') as f:
        vocab = json.load(f)
except FileNotFoundError:
    print("Error: vocab.json not found. Run build_tokenizer.py first.")
    exit(1)

# Load model
try:
    model = VivianTransformer(len(vocab)).to(device)
    model.load_state_dict(torch.load('vivi_finetuned.pt', map_location=device))
except FileNotFoundError:
    print("Error: vivi_finetuned.pt not found. Trying vivi_base.pt...")
    try:
        model.load_state_dict(torch.load('vivi_base.pt', map_location=device))
    except FileNotFoundError:
        print("Error: vivi_base.pt not found. Run train_vivi.py first.")
        exit(1)
except Exception as e:
    print(f"Error loading model: {e}")
    exit(1)
model.eval()

# Reverse vocab for decoding
id2word = {idx: word for word, idx in vocab.items()}

# Context memory
context_memory = []
memory_size = 5

def generate_response(prompt, max_len=32, p=0.9):
    global context_memory
    context_memory.append(prompt)
    if len(context_memory) > memory_size:
        context_memory = context_memory[-memory_size:]
    input_text = ' '.join(context_memory).lower()
    input_ids = [vocab['<s>']] + [vocab.get(word, vocab['<unk>']) for word in input_text.split()]
    input_tensor = torch.tensor([input_ids], device=device)

    with torch.no_grad():
        for _ in range(max_len - len(input_ids)):
            output = model(input_tensor)
            logits = output[:, -1, :]
            probs = torch.softmax(logits, dim=-1)
            probs, indices = probs.sort(descending=True)
            cum_probs = torch.cumsum(probs, dim=-1)
            mask = cum_probs <= p
            if not mask.any():
                mask[0, 0] = True  # keep at least the top token (mask is 1 x vocab, so index the element, not the row)
            probs = probs[mask]
            indices = indices[mask]
            next_word_id = torch.multinomial(probs, 1).item()  # Get scalar index
            next_word_tensor = torch.tensor([[indices[next_word_id]]], device=device)
            input_tensor = torch.cat([input_tensor, next_word_tensor], dim=1)
            if indices[next_word_id].item() == vocab['</s>']:
                break

    response_ids = input_tensor[0, len(input_ids):].tolist()
    response = ' '.join(id2word.get(idx, '<unk>') for idx in response_ids if idx != vocab['<pad>'])
    context_memory.append(response)
    return response

# Save conversations
conversations = []
try:
    with open('vivi_conversations.json', 'r') as f:
        conversations = json.load(f)
except FileNotFoundError:
    pass

# Interactive loop
print("Chat with Vivi! Type 'exit' or 'quit' to stop.")
while True:
    user_input = input("You: ")
    if user_input.lower() in ['exit', 'quit']:
        break
    response = generate_response(user_input)
    print(f"Vivi: {response}")
    conversations.append({"user": user_input, "vivi": response})
    with open('vivi_conversations.json', 'w') as f:
        json.dump(conversations, f, indent=2)
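generate_response uses top-p (nucleus) sampling: the vocabulary distribution is sorted, only the tokens whose cumulative probability stays within p are kept, and the next word is sampled from that reduced set. A tiny standalone illustration of the same cutoff with toy numbers (not model output):

import torch

probs = torch.tensor([0.5, 0.3, 0.15, 0.05])  # already sorted, descending
cum_probs = torch.cumsum(probs, dim=-1)       # 0.50, 0.80, 0.95, 1.00
mask = cum_probs <= 0.9                       # with p = 0.9, only the first two tokens survive
print(mask)                                   # tensor([ True,  True, False, False])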
train_vivi.py (new file) | 94
@@ -0,0 +1,94 @@
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import json

# Define TextDataset
class TextDataset(Dataset):
    def __init__(self, corpus_file, vocab, max_len=32):
        self.vocab = vocab
        self.max_len = max_len
        with open(corpus_file, 'r', encoding='utf-8') as f:
            text = f.read().lower().split()
        self.tokens = [self.vocab.get(word, self.vocab['<unk>']) for word in text]

    def __len__(self):
        return len(self.tokens) // self.max_len

    def __getitem__(self, idx):
        start = idx * self.max_len
        seq = self.tokens[start:start + self.max_len]
        if len(seq) < self.max_len:
            seq += [self.vocab['<pad>']] * (self.max_len - len(seq))
        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])

# Define model
class VivianTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, n_layers=2, n_heads=4, d_ff=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout=0.1)
        self.transformer = nn.TransformerEncoder(encoder_layer, n_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.transformer(x)
        return self.fc_out(x)

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Warning: CUDA not available. Training on CPU will be slower.")

# Load vocab
try:
    with open('vocab.json', 'r') as f:
        vocab = json.load(f)
except FileNotFoundError:
    print("Error: vocab.json not found. Run build_tokenizer.py first.")
    exit(1)

# Load dataset
try:
    dataset = torch.load('dataset.pt')
except FileNotFoundError:
    print("Error: dataset.pt not found. Run prepare_dataset.py first.")
    exit(1)
except Exception as e:
    print(f"Error loading dataset.pt: {e}")
    exit(1)

# Create dataloader
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Initialize model
model = VivianTransformer(len(vocab)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

# Train
print("Starting training...")
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch_idx, (src, tgt) in enumerate(dataloader):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output.view(-1, len(vocab)), tgt.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")
    print(f'Epoch {epoch+1}, Average Loss: {total_loss / len(dataloader):.4f}')

# Save model
torch.save(model.state_dict(), 'vivi_base.pt')
print("Model saved to vivi_base.pt")
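Taken together, the scripts' own error messages ("Run build_tokenizer.py first", "Run prepare_dataset.py first", "Run train_vivi.py first") imply a pipeline order: download_corpus.py, then build_tokenizer.py, then prepare_dataset.py, then train_vivi.py; talk_to_vivi.py is the interactive chat, and finetune_vivi.py can be run afterwards on the conversations it logs. A hypothetical driver for the non-interactive steps (not part of this commit):

import subprocess
import sys

# Run the preparation and base-training scripts in order; check=True stops on a non-zero exit.
for script in ['download_corpus.py', 'build_tokenizer.py', 'prepare_dataset.py', 'train_vivi.py']:
    subprocess.run([sys.executable, script], check=True)
# talk_to_vivi.py is interactive; run it directly, then finetune_vivi.py on the logged conversations.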