Added the basics of her code, updated to not include any extra files

2025-06-10 10:56:56 -04:00
parent 6d18d21f2a
commit 3d0e5410f1
7 changed files with 364 additions and 0 deletions

build_tokenizer.py Normal file

@@ -0,0 +1,21 @@
from collections import Counter
import json

# Read corpus
with open('corpus.txt', 'r', encoding='utf-8') as f:
    text = f.read().lower()  # Normalize to lowercase
words = text.split()

# Build vocabulary
vocab_size = 10000
word_counts = Counter(words).most_common(vocab_size - 4)  # Reserve 4 slots for special tokens
vocab = {word: idx for idx, (word, _) in enumerate(word_counts)}
vocab['<unk>'] = len(vocab)
vocab['<pad>'] = len(vocab)
vocab['<s>'] = len(vocab)
vocab['</s>'] = len(vocab)

# Save vocab
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print(f"Vocabulary of size {len(vocab)} saved to vocab.json")