Added the basics of her code, updated to not include any extra files
build_tokenizer.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from collections import Counter
import json

# Read corpus
with open('corpus.txt', 'r', encoding='utf-8') as f:
    text = f.read().lower()  # Normalize to lowercase
words = text.split()

# Build vocabulary
vocab_size = 10000
word_counts = Counter(words).most_common(vocab_size - 4)  # Reserve 4 for special tokens
vocab = {word: idx for idx, (word, _) in enumerate(word_counts)}
vocab['<unk>'] = len(vocab)
vocab['<pad>'] = len(vocab)
vocab['<s>'] = len(vocab)
vocab['</s>'] = len(vocab)

# Save vocab
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print(f"Vocabulary of size {len(vocab)} saved to vocab.json")
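As a usage note, here is a minimal sketch of how the saved vocab.json could be consumed to encode text. The encode helper and the sample sentence are hypothetical illustrations, not part of this commit; only vocab.json, the lowercase normalization, and the special tokens come from the script above.

import json

# Load the vocabulary written by build_tokenizer.py
with open('vocab.json', 'r') as f:
    vocab = json.load(f)

def encode(sentence):
    # Hypothetical helper: lowercase to mirror the corpus normalization,
    # wrap the sequence in <s>/</s>, and map unknown words to <unk>
    tokens = ['<s>'] + sentence.lower().split() + ['</s>']
    return [vocab.get(tok, vocab['<unk>']) for tok in tokens]

print(encode("Hello world"))  # in-vocab words map to their ids, others to <unk>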