Created NORA. She has been designed from scratch. At this point, I have determined the best hyperparameters for her training. The next step is to help her communicate on Discord and see how she handles it.
tokenizer.py (new file, 86 lines)
"""
tokenizer.py

A simple character-level tokenizer that builds its own vocabulary from all text files.
Saves/loads vocab to/from JSON. You can extend this to a word-level tokenizer if you wish.
"""

import json
import os
from collections import Counter
from typing import List, Dict


class CharTokenizer:
    def __init__(self, vocab_path: str, data_dir: str):
        """
        If vocab_path exists, load it; otherwise, build from raw text in data_dir.
        """
        self.vocab_path = vocab_path
        self.data_dir = data_dir
        self.stoi: Dict[str, int] = {}
        self.itos: Dict[int, str] = {}

        if os.path.isfile(self.vocab_path):
            self._load_vocab()
        else:
            self._build_vocab()

    def _build_vocab(self):
        """
        Read all .txt files under data_dir, count character frequencies,
        build a sorted vocabulary, and save to vocab_path.
        """
        counter = Counter()
        print(f"[tokenizer] Building vocabulary from data in '{self.data_dir}'...")
        for root, _, files in os.walk(self.data_dir):
            for fname in files:
                if not fname.lower().endswith(".txt"):
                    continue
                path = os.path.join(root, fname)
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read()
                counter.update(text)

        # Ensure a consistent ordering: sort by frequency descending, then Unicode codepoint
        sorted_chars = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        unique_chars = [ch for ch, _ in sorted_chars]

        # Add special tokens
        tokens = ["<pad>", "<unk>"] + unique_chars

        self.stoi = {ch: i for i, ch in enumerate(tokens)}
        self.itos = {i: ch for i, ch in enumerate(tokens)}

        # Save to JSON; guard against a bare filename whose dirname is empty,
        # since os.makedirs("") raises FileNotFoundError
        vocab_dir = os.path.dirname(self.vocab_path)
        if vocab_dir:
            os.makedirs(vocab_dir, exist_ok=True)
        with open(self.vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.stoi, f, ensure_ascii=False, indent=2)
        print(f"[tokenizer] Built vocab size = {len(self.stoi)}; saved to '{self.vocab_path}'.")

    def _load_vocab(self):
        """
        Load existing vocabulary from vocab_path.
        """
        print(f"[tokenizer] Loading vocabulary from '{self.vocab_path}'...")
        with open(self.vocab_path, "r", encoding="utf-8") as f:
            self.stoi = json.load(f)
        self.itos = {i: ch for ch, i in self.stoi.items()}
        print(f"[tokenizer] Loaded vocab size = {len(self.stoi)}.")

    def encode(self, text: str) -> List[int]:
        """
        Convert a string to a list of integer token IDs (character-level).
        Unrecognized chars map to <unk>.
        """
        unk_id = self.stoi.get("<unk>")
        return [self.stoi.get(ch, unk_id) for ch in text]

    def decode(self, token_ids: List[int]) -> str:
        """
        Convert a list of token IDs back into a string.
        """
        return "".join(self.itos.get(i, "<unk>") for i in token_ids)

    def vocab_size(self) -> int:
        return len(self.stoi)
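
A minimal usage sketch of the class above. The "vocab.json" and "data" paths are placeholders for illustration; this commit does not define where NORA's training text lives:

# Usage sketch (hypothetical paths; adjust to the real layout)
tok = CharTokenizer(vocab_path="vocab.json", data_dir="data")  # builds vocab on first run, loads it afterwards
ids = tok.encode("hello, NORA")   # characters outside the vocab map to <unk>
text = tok.decode(ids)            # inverse mapping; round-trips in-vocab characters
print(tok.vocab_size(), text)

Reserving index 0 for <pad> is convenient later when batching sequences for training, and sorting by frequency (then codepoint) makes token IDs deterministic for a given set of text files.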