Fixed a spacing error and confirmed that Unicode was being removed.

This commit is contained in:
Dani 2025-04-27 16:21:26 -04:00
parent 3a77b5db32
commit ec82d0ab63

View File

@@ -20,9 +20,9 @@ def save_vocab(vocab):
class Tokenizer: class Tokenizer:
def __init__(self): def __init__(self):
self.vocab = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3} self.vocab = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3, "<sep>": 4}
self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<start>", 3: "<end>"} self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<start>", 3: "<end>", 4: "<sep>"}
self.next_id = 4 self.next_id = 5
def tokenize(self, text): def tokenize(self, text):
text = clean_unicode(text) # 🚨 Always clean incoming text text = clean_unicode(text) # 🚨 Always clean incoming text