import json


# Save vocabulary
def save_vocab():
    # One id per character: a-z plus space
    vocab = {char: idx for idx, char in enumerate("abcdefghijklmnopqrstuvwxyz ")}
    vocab["<unk>"] = len(vocab)  # Add unknown token for out-of-vocabulary characters
    with open('vocab.json', 'w') as f:
        json.dump(vocab, f)


# Load vocabulary
def load_vocab():
    with open('vocab.json', 'r') as f:
        return json.load(f)


# Tokenizer
def simple_tokenizer(text, vocab):
    # Convert text to lowercase and map unknown characters to the <unk> id
    text = text.lower()
    unk_token = vocab.get("<unk>", None)
    return [vocab[char] if char in vocab else unk_token for char in text]


# Detokenizer
def detokenizer(tokens, vocab):
    # Invert the vocab (id -> char) and join the characters back into a string
    reverse_vocab = {idx: char for char, idx in vocab.items()}
    return ''.join(reverse_vocab[token] for token in tokens)


if __name__ == "__main__":
    save_vocab()
    vocab = load_vocab()
    print(simple_tokenizer("hello world", vocab))
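    # Illustrative extras (an added sketch, not part of the original script):
    # round-trip the ids through detokenizer(), and show how a character
    # outside a-z/space maps to the <unk> id (27 with this vocab).
    tokens = simple_tokenizer("hello world", vocab)
    print(detokenizer(tokens, vocab))      # -> "hello world"
    print(simple_tokenizer("hi!", vocab))  # -> [7, 8, 27]; '!' maps to <unk>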