Fixed a spacing error and confirmed that Unicode was being removed.
This commit is contained in:
parent
3a77b5db32
commit
ec82d0ab63
@@ -20,9 +20,9 @@ def save_vocab(vocab):
|
|||||||
|
|
||||||
class Tokenizer:
|
class Tokenizer:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.vocab = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3}
|
self.vocab = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<end>": 3, "<sep>": 4}
|
||||||
self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<start>", 3: "<end>"}
|
self.reverse_vocab = {0: "<pad>", 1: "<unk>", 2: "<start>", 3: "<end>", 4: "<sep>"}
|
||||||
self.next_id = 4
|
self.next_id = 5
|
||||||
|
|
||||||
def tokenize(self, text):
|
def tokenize(self, text):
|
||||||
text = clean_unicode(text) # 🚨 Always clean incoming text
|
text = clean_unicode(text) # 🚨 Always clean incoming text
|
||||||
|
Loading…
x
Reference in New Issue
Block a user