import json

# Marker used as the unknown-token key in the vocabulary.
# NOTE(review): this looks like a stripped "<unk>" marker — the original
# tokenizer comment was truncated at the same spot. Kept as "" so existing
# vocab.json files keep working; confirm the intended token with the author.
UNK_TOKEN = ""

# Characters covered by the vocabulary: lowercase a-z plus the space.
ALPHABET = "abcdefghijklmnopqrstuvwxyz "


# Save vocabulary
def save_vocab(path='vocab.json'):
    """Build the char -> id vocabulary and write it to *path* as JSON.

    Ids 0..26 cover ALPHABET in order; the unknown token gets the next id.
    """
    vocab = {char: idx for idx, char in enumerate(ALPHABET)}
    vocab[UNK_TOKEN] = len(vocab)  # add unknown token with the next free id
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f)


# Load vocabulary
def load_vocab(path='vocab.json'):
    """Read a char -> id vocabulary back from the JSON file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Tokenizer
def simple_tokenizer(text, vocab):
    """Lowercase *text* and map each character to its id in *vocab*.

    Characters missing from *vocab* map to the unknown-token id
    (or to None when *vocab* has no unknown token).
    """
    text = text.lower()
    unk_id = vocab.get(UNK_TOKEN)  # None when the vocab lacks an unknown token
    # dict.get with a default does the membership test and lookup in one step
    return [vocab.get(char, unk_id) for char in text]


# Detokenizer
def detokenizer(tokens, vocab):
    """Invert *vocab* and join the characters for *tokens* into a string.

    Raises KeyError for any id not present in the vocabulary.
    """
    reverse_vocab = {idx: char for char, idx in vocab.items()}
    return ''.join(reverse_vocab[token] for token in tokens)


if __name__ == "__main__":
    save_vocab()
    vocab = load_vocab()
    print(simple_tokenizer("hello world", vocab))