diff --git a/model/tokenizer.py b/model/tokenizer.py
index 6359a79..7d60093 100644
--- a/model/tokenizer.py
+++ b/model/tokenizer.py
@@ -20,9 +20,9 @@ def save_vocab(vocab):
 
 class Tokenizer:
     def __init__(self):
-        self.vocab = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
-        self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<s>", 3: "</s>"}
-        self.next_id = 4
+        self.vocab = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "<mask>": 4}
+        self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<s>", 3: "</s>", 4: "<mask>"}
+        self.next_id = 5
 
     def tokenize(self, text):
         text = clean_unicode(text)  # 🚨 Always clean incoming text