# tokenizer.py — character-level vocabulary build/save/load and a simple tokenizer/detokenizer.
import json
def save_vocab(path='vocab.json'):
    """Build a character-level vocabulary and write it to *path* as JSON.

    The vocabulary maps each lowercase ASCII letter and the space character
    to a unique integer index, plus an "<unk>" entry used as the fallback id
    for out-of-vocabulary characters.

    Args:
        path: Destination file for the JSON vocabulary (default 'vocab.json').

    Returns:
        The vocabulary dict that was written.
    """
    vocab = {char: idx for idx, char in enumerate("abcdefghijklmnopqrstuvwxyz ")}
    vocab["<unk>"] = len(vocab)  # next free index becomes the unknown token
    with open(path, 'w') as f:
        json.dump(vocab, f)
    return vocab
def load_vocab(path='vocab.json'):
    """Load a previously saved vocabulary from *path*.

    Args:
        path: JSON file produced by ``save_vocab`` (default 'vocab.json').

    Returns:
        The vocabulary as a dict mapping characters to integer indices.
    """
    with open(path, 'r') as f:
        return json.load(f)
def simple_tokenizer(text, vocab):
    """Tokenize *text* one character at a time against *vocab*.

    The text is lowercased first. Characters absent from the vocabulary map
    to the "<unk>" token's index, or to None when the vocabulary defines no
    "<unk>" entry.

    Args:
        text: Input string to tokenize.
        vocab: Dict mapping single characters (and "<unk>") to integer ids.

    Returns:
        A list of integer token ids (possibly containing None).
    """
    unk_token = vocab.get("<unk>")  # fallback id for out-of-vocabulary chars
    # Single dict lookup per character via .get, instead of `in` + indexing.
    return [vocab.get(char, unk_token) for char in text.lower()]
def detokenizer(tokens, vocab):
    """Convert a sequence of token ids back into a string.

    Args:
        tokens: Iterable of integer token ids.
        vocab: Dict mapping characters to integer ids (inverted internally).

    Returns:
        The decoded string. Raises KeyError on an id not present in *vocab*.
    """
    index_to_char = {}
    for char, idx in vocab.items():
        index_to_char[idx] = char
    pieces = [index_to_char[token] for token in tokens]
    return ''.join(pieces)
if __name__ == "__main__":
    # Demo: build and persist the vocabulary, reload it from disk,
    # then tokenize a sample sentence.
    save_vocab()
    loaded_vocab = load_vocab()
    print(simple_tokenizer("hello world", loaded_vocab))