From ec82d0ab6304d7f3cea5980b74831a1134c82c86 Mon Sep 17 00:00:00 2001
From: Dani
Date: Sun, 27 Apr 2025 16:21:26 -0400
Subject: [PATCH] Fixed a spacing error and confirmed that unicode was being
 removed.

---
 model/tokenizer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/model/tokenizer.py b/model/tokenizer.py
index 6359a79..7d60093 100644
--- a/model/tokenizer.py
+++ b/model/tokenizer.py
@@ -20,9 +20,9 @@ def save_vocab(vocab):
 
 class Tokenizer:
     def __init__(self):
-        self.vocab = {"<pad>": 0, "<unk>": 1, "<bos>": 2, "<eos>": 3}
-        self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<bos>", 3: "<eos>"}
-        self.next_id = 4
+        self.vocab = {"<pad>": 0, "<unk>": 1, "<bos>": 2, "<eos>": 3, "<sep>": 4}
+        self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<bos>", 3: "<eos>", 4: "<sep>"}
+        self.next_id = 5
 
     def tokenize(self, text):
         text = clean_unicode(text)  # 🚨 Always clean incoming text