feat: add vocabulary encoding/decoding script with character-level tokenization

This commit introduces a new script that implements character-level vocabulary building and text encoding/decoding functionality. The script loads text from train.txt or falls back to data.txt, normalizes line endings, builds character-to-id mappings, and includes round-trip encoding/decoding validation. It's designed for CPU-only operation using only Python standard library modules and provides clear error handling for unseen characters during encoding.
2025-09-23 20:57:48 -04:00
parent abba60a798
commit feecf05ee3
1 changed files with 125 additions and 0 deletions
--- a/04_vocab_encode_decode.py
+++ b/04_vocab_encode_decode.py
@@ -0,0 +1,125 @@
 # 04_vocab_encode_decode.py
 """
 Build a character vocabulary and encode/decode text.
 What it does:
 - Loads UTF-8 text from train.txt (preferred) or data.txt (fallback included).
 - Normalizes newlines to "\\n".
 - Builds char<->id mappings, then encodes and decodes a sample for a round-trip check.
 How to run:
    python 04_vocab_encode_decode.py
 Notes:
 - The vocabulary is built from the loaded text. Encoding a string with unseen characters
  will raise a KeyError (expected at this stage).
 - Everything is CPU-only and uses Python stdlib.
 """
 from __future__ import annotations
 from pathlib import Path
 from typing import Dict, List
 # Built-in sample text used when neither train.txt nor data.txt is present:
 # two short lines terminated by real newline characters.
 FALLBACK = "hello\nworld\n"
 def load_source() -> str:
    """Load source text from train.txt, then data.txt, else fallback; normalize newlines.
    Candidates are tried in order in the current working directory. A missing file
    moves on to the next candidate; FALLBACK is used when neither file exists.
    Returns:
        Text with only '\\n' newlines.
    >>> isinstance(load_source(), str)
    True
    """
    for name in ("train.txt", "data.txt"):
        try:
            # Read directly instead of exists()-then-read, so a file that
            # disappears between the check and the read cannot crash the script.
            text = Path(name).read_text(encoding="utf-8")
        except FileNotFoundError:
            continue
        break
    else:
        text = FALLBACK
    # Replace real CRLF / lone CR characters (not the two-character text
    # backslash + "r"); CRLF goes first so it does not turn into two newlines.
    return text.replace("\r\n", "\n").replace("\r", "\n")
 def build_vocab(text: str) -> Dict[str, int]:
    """Assign consecutive integer ids to the distinct characters of ``text``.
    Characters are ordered with ``sorted``, so the ids are deterministic for a
    given text and the smallest character receives id 0.
    Args:
        text: Source text.
    Returns:
        stoi dict mapping each unique character to an integer id.
    >>> stoi = build_vocab("ab\\n")
    >>> sorted(stoi) == ['\\n','a','b']
    True
    """
    stoi: Dict[str, int] = {}
    for symbol in sorted(set(text)):
        # The next free id equals the number of characters registered so far.
        stoi[symbol] = len(stoi)
    return stoi
 def invert(stoi: Dict[str, int]) -> Dict[int, str]:
    """Swap keys and values of a char->id mapping, producing id->char.
    If several characters share one id, the character visited last wins.
    Args:
        stoi: Mapping from characters to integer ids.
    Returns:
        itos dict mapping integer ids back to characters.
    >>> invert({'a': 0, 'b': 1})[1]
    'b'
    """
    itos: Dict[int, str] = {}
    for symbol, index in stoi.items():
        itos[index] = symbol
    return itos
 def encode(text: str, stoi: Dict[str, int]) -> List[int]:
    """Translate each character of ``text`` into its integer id from ``stoi``.
    Args:
        text: Input text.
        stoi: Character-to-index mapping.
    Returns:
        List of integer ids, one per character, in input order.
    Raises:
        KeyError: If a character is not in the vocabulary.
    >>> encode("ab", {'a':0, 'b':1})
    [0, 1]
    """
    # Plain subscript lookup per character; a missing character surfaces as KeyError.
    lookup = stoi.__getitem__
    return list(map(lookup, text))
 def decode(ids: List[int], itos: Dict[int, str]) -> str:
    """Turn a sequence of integer ids back into text by looking each one up in ``itos``.
    An id that is absent from ``itos`` raises KeyError.
    Args:
        ids: Sequence of token ids.
        itos: Index-to-character mapping.
    Returns:
        Decoded string.
    >>> decode([0, 1], {0:'a', 1:'b'})
    'ab'
    """
    pieces = [itos[token] for token in ids]
    return "".join(pieces)
 def main() -> None:
    """Run the demo: build the vocabulary and round-trip a sample through encode/decode.
    Prints the vocabulary size, the first 200 characters of the loaded text with each
    newline rendered as the visible two-character sequence backslash + n, the first
    40 encoded ids, and whether decoding the ids reproduces the sample.
    """
    text = load_source()
    stoi = build_vocab(text)
    itos = invert(stoi)
    # Take a small sample to test round-trip
    sample = text[:200]
    ids = encode(sample, stoi)
    roundtrip = decode(ids, itos)
    print(f"Vocab size: {len(stoi)}")
    print("Sample (first 200 chars, \\n shown literally):")
    # Escape real newline characters so the sample prints on a single line,
    # as the header above promises.
    print(sample.replace("\n", "\\n"))
    print("\nEncoded (first 40 ids):", ids[:40], "...")
    print("Roundtrip OK:", roundtrip == sample)
 # Run the demo only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    main()