Trying to use DeepSeek to help instead of ChatGPT
parent 326a7b81d7
commit ffcc60e205
.gitignore (vendored) · 1 addition
@@ -169,3 +169,4 @@ cython_debug/

# PyPI configuration file
.pypirc
+/dataset_cache.bin
config.py (new file) · 19 additions
@@ -0,0 +1,19 @@
import os
import torch
from dotenv import load_dotenv

load_dotenv()


class Config:
    model_dim = int(os.getenv("MODEL_DIM", 256))
    num_layers = int(os.getenv("NUM_LAYERS", 4))
    num_heads = int(os.getenv("HEADS", 8))
    vocab_size = int(os.getenv("VOCAB_SIZE", 30000))
    context_size = int(os.getenv("CONTEXT_SIZE", 512))
    batch_size = int(os.getenv("BATCH_SIZE", 8))
    lr = float(os.getenv("LEARNING_RATE", 1e-4))
    device = "cuda" if torch.cuda.is_available() else "cpu"


cfg = Config()
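The defaults above can be overridden through a .env file or plain environment variables. A minimal sketch (the variable names match the os.getenv keys in config.py; the values are arbitrary examples):

import os

# Hypothetical overrides: set before importing config. load_dotenv() does not
# overwrite variables that are already present in the environment.
os.environ["MODEL_DIM"] = "512"
os.environ["BATCH_SIZE"] = "16"

from config import cfg

print(cfg.model_dim, cfg.batch_size, cfg.device)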
main.py (deleted file) · 70 deletions
@@ -1,70 +0,0 @@
import discord
import requests
import json
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Replace with your bot token
BOT_TOKEN = os.getenv('DISCORD_TOKEN')

# Ollama configuration
OLLAMA_API_URL = 'http://192.168.1.159:11434/api/generate'  # Adjust if your Ollama setup is different

# Set up the Discord client
intents = discord.Intents.default()
intents.messages = True
intents.message_content = True

client = discord.Client(intents=intents)


# Function to query Ollama
def query_ollama(prompt):
    payload = {
        "prompt": prompt,
        "model": "nollama/mythomax-l2-13b:Q4_K_M"  # Replace with your Ollama model
    }
    try:
        response = requests.post(OLLAMA_API_URL, json=payload, stream=True)
        if response.status_code == 200:
            collected_response = ""
            # Stream and parse each line of JSON from the response
            for line in response.iter_lines(decode_unicode=True):
                if line.strip():  # Skip empty lines
                    try:
                        data = json.loads(line)  # Parse each line as JSON
                        collected_response += data.get("response", "")
                        if data.get("done", False):
                            break
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON line: {line}, Error: {e}")
            return collected_response.strip() or "No response from model."
        else:
            return f"Error: {response.status_code} - {response.text}"
    except requests.RequestException as e:
        return f"Error connecting to Ollama: {str(e)}"


# Event for when the bot is ready
@client.event
async def on_ready():
    print(f'We have logged in as {client.user}')


# Event for when a message is sent
@client.event
async def on_message(message):
    # Ignore the bot's own messages
    if message.author == client.user:
        return

    # Respond to all messages except those in DMs
    if not isinstance(message.channel, discord.DMChannel):
        response = query_ollama(message.content.strip())
        await message.channel.send(response)

# Run the bot
client.run(BOT_TOKEN)
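For reference, the same /api/generate endpoint can also be called without streaming by sending "stream": False, in which case Ollama returns one consolidated JSON object. A minimal sketch; the host, port, and model name are copied from the script above and may not match your setup:

import requests

OLLAMA_API_URL = 'http://192.168.1.159:11434/api/generate'  # same host/port as the deleted script

def query_ollama_once(prompt):
    # "stream": False asks Ollama for a single JSON object instead of line-delimited chunks.
    payload = {
        "prompt": prompt,
        "model": "nollama/mythomax-l2-13b:Q4_K_M",  # model name taken from the script above
        "stream": False,
    }
    response = requests.post(OLLAMA_API_URL, json=payload, timeout=120)
    response.raise_for_status()
    return response.json().get("response", "")

if __name__ == "__main__":
    print(query_ollama_once("Say hello in one sentence."))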
tokenizer.json (new file) · 107920 additions
File diff suppressed because it is too large
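The tokenizer.json diff is suppressed, but the file is a serialized tokenizers tokenizer (train.py below saves it after training). A small sketch of inspecting it, assuming the file sits in the working directory:

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
print("vocab size:", tok.get_vocab_size())
print("[PAD] id:", tok.token_to_id("[PAD]"))
print(tok.encode("The quick brown fox jumps over the lazy dog.").tokens)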
train.py (new file) · 159 additions
@@ -0,0 +1,159 @@
import torch
import torch.nn as nn
import time
import os
import numpy as np
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, decoders
from config import cfg
from torch.cuda.amp import autocast, GradScaler


# 1. Tokenizer Implementation (Modified)
class RubyTokenizer:
    def __init__(self):
        self.tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
        self.tokenizer.add_special_tokens(["[PAD]", "[UNK]"])
        self.tokenizer.decoder = decoders.ByteLevel()

    def train(self, texts):
        trainer = trainers.BpeTrainer(
            special_tokens=["[PAD]", "[UNK]"],
            vocab_size=cfg.vocab_size,
            min_frequency=2,  # Modified
            show_progress=True
        )
        self.tokenizer.train_from_iterator(
            (text.split() for text in texts),  # Modified: better word handling
            trainer=trainer
        )

    def encode(self, text):
        return self.tokenizer.encode(text).ids

    @property
    def pad_id(self):
        return self.tokenizer.token_to_id("[PAD]")  # Modified


# 2. Optimized Dataset (Modified padding handling)
class CachedDataset(Dataset):
    def __init__(self):
        # Tokens are stored as int32, i.e. 4 bytes per token.
        self.data = np.memmap("dataset_cache.bin",
                              dtype=np.int32,
                              mode="r",
                              shape=(os.path.getsize("dataset_cache.bin") // 4,))

    def __len__(self):
        return len(self.data) // cfg.context_size

    def __getitem__(self, idx):
        start = idx * cfg.context_size
        return torch.from_numpy(self.data[start:start + cfg.context_size].copy())


# 3. Transformer Model (Modified padding_idx)
class Transformer(nn.Module):
    def __init__(self, pad_id):
        super().__init__()
        self.embed = nn.Embedding(
            cfg.vocab_size,
            cfg.model_dim,
            padding_idx=pad_id  # Modified
        )
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=cfg.model_dim,
                nhead=cfg.num_heads,
                dim_feedforward=cfg.model_dim * 4,
                batch_first=True
            ) for _ in range(cfg.num_layers)
        ])
        self.head = nn.Linear(cfg.model_dim, cfg.vocab_size)

    def forward(self, x):
        x = self.embed(x)
        for block in self.blocks:
            x = block(x)
        return self.head(x)


# 4. Main Training Process (Critical fixes)
def main():
    # Initialize tokenizer
    tokenizer = RubyTokenizer()

    if not os.path.exists("dataset_cache.bin"):
        print("Creating dataset cache...")
        ds = load_dataset("openwebtext", split="train[:5%]")

        # Train and save tokenizer (Modified)
        if not os.path.exists("tokenizer.json"):
            print("Training tokenizer...")
            tokenizer.train([text for text in ds["text"] if len(text) > 100])
            tokenizer.tokenizer.save("tokenizer.json")
        else:
            tokenizer.tokenizer = Tokenizer.from_file("tokenizer.json")

        # Tokenize and cache data (Modified)
        all_tokens = []
        pad_id = tokenizer.pad_id

        for text in ds["text"]:
            tokens = tokenizer.encode(text)
            tokens = tokens[:cfg.context_size]  # Truncate after tokenization
            pad_len = cfg.context_size - len(tokens)
            all_tokens.extend(tokens + [pad_id] * pad_len)  # Modified

        memmap = np.memmap("dataset_cache.bin",
                           dtype=np.int32,
                           mode="w+",
                           shape=(len(all_tokens),))
        memmap[:] = np.array(all_tokens, dtype=np.int32)
        del memmap
    else:
        # Reuse the tokenizer trained on a previous run; otherwise the model
        # would get an untrained tokenizer whenever the cache already exists.
        tokenizer.tokenizer = Tokenizer.from_file("tokenizer.json")

    # Test tokenizer (Modified)
    test_text = "The quick brown fox jumps over the lazy dog."
    print("Tokenizer test:", tokenizer.tokenizer.encode(test_text).tokens)

    # Initialize model with pad_id (Modified)
    model = Transformer(pad_id=tokenizer.pad_id).to(cfg.device)
    opt = torch.optim.AdamW(model.parameters(), lr=cfg.lr)
    scaler = GradScaler()

    dataset = CachedDataset()
    loader = DataLoader(dataset,
                        batch_size=cfg.batch_size,
                        pin_memory=True,
                        shuffle=True)

    # Training loop (Modified loss calculation)
    start = time.time()
    for step, batch in enumerate(loader):
        batch = batch.to(cfg.device, non_blocking=True)

        inputs = batch[:, :-1]
        targets = batch[:, 1:]

        with autocast():
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(
                outputs.reshape(-1, cfg.vocab_size),
                targets.reshape(-1).long(),
                ignore_index=tokenizer.pad_id  # Modified
            )

        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()
        opt.zero_grad()

        if step % 10 == 0:
            elapsed = time.time() - start
            speed = (step + 1) * cfg.batch_size / elapsed
            print(f"Step {step} | Loss: {loss.item():.4f} | Speed: {speed:.1f} samples/s")


if __name__ == "__main__":
    main()
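train.py does not save a checkpoint, and the Transformer above has no causal mask or positional encodings, so the following is only a smoke-test sketch (assuming tokenizer.json exists in the working directory): it rebuilds the classes defined above, checks that a forward pass yields (batch, seq, vocab_size) logits, and runs a few greedy decoding steps on untrained weights.

import torch
from tokenizers import Tokenizer
from config import cfg
from train import RubyTokenizer, Transformer

tok = RubyTokenizer()
tok.tokenizer = Tokenizer.from_file("tokenizer.json")  # produced by train.py

model = Transformer(pad_id=tok.pad_id).to(cfg.device).eval()

# Forward-pass shape check: (batch, seq) -> (batch, seq, vocab)
ids = torch.tensor([tok.encode("The quick brown fox")], device=cfg.device)
with torch.no_grad():
    logits = model(ids)
print(logits.shape)

# A few greedy decoding steps (untrained weights, so the text is noise).
for _ in range(5):
    with torch.no_grad():
        next_id = model(ids)[0, -1].argmax().item()
    ids = torch.cat([ids, torch.tensor([[next_id]], device=cfg.device)], dim=1)
print(tok.tokenizer.decode(ids[0].tolist()))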