Added the basics of her code, updated to not include any extra files
build_tokenizer.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from collections import Counter
import json

# Read corpus
with open('corpus.txt', 'r', encoding='utf-8') as f:
    text = f.read().lower()  # Normalize to lowercase
words = text.split()

# Build vocabulary
vocab_size = 10000
word_counts = Counter(words).most_common(vocab_size - 4)  # Reserve 4 for special tokens
vocab = {word: idx for idx, (word, _) in enumerate(word_counts)}
vocab['<unk>'] = len(vocab)
vocab['<pad>'] = len(vocab)
vocab['<s>'] = len(vocab)
vocab['</s>'] = len(vocab)

# Save vocab
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print(f"Vocabulary of size {len(vocab)} saved to vocab.json")
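As a usage note, here is a minimal sketch of how the saved vocab.json could be consumed to encode text. The encode helper and the sample sentence are hypothetical illustrations, not part of this commit; only vocab.json, the lowercase normalization, and the special tokens come from the script above.

import json

# Load the vocabulary written by build_tokenizer.py
with open('vocab.json', 'r') as f:
    vocab = json.load(f)

def encode(sentence):
    # Hypothetical helper: lowercase to mirror the corpus normalization,
    # wrap the sequence in <s>/</s>, and map unknown words to <unk>
    tokens = ['<s>'] + sentence.lower().split() + ['</s>']
    return [vocab.get(tok, vocab['<unk>']) for tok in tokens]

print(encode("Hello world"))  # in-vocab words map to their ids, others to <unk>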