This commit introduces a new script that implements a Laplace-smoothed bigram language model for computing validation perplexity. The implementation includes:

- Data loading and splitting functionality (90/10 train/validation split)
- Character vocabulary building from training data only
- Bigram counting and Laplace smoothing with alpha=1.0
- Negative log-likelihood and perplexity computation
- Proper handling of out-of-vocabulary characters during evaluation

The script can process existing train.txt/val.txt files or automatically split a data.txt file if the required input files are missing, making it self-contained and easy to use for language model evaluation tasks.
# 09_perplexity.py
"""
Compute validation perplexity for a Laplace-smoothed bigram model (NumPy).

What it does:
- Loads train/val text (or splits data.txt 90/10 if missing).
- Builds char vocab on train ONLY.
- Trains a Laplace-smoothed bigram model (alpha=1.0).
- Evaluates NLL and perplexity on val.

How to run:

    python 09_perplexity.py

    # doctests (optional)
    python -m doctest -v 09_perplexity.py

Notes:
- If val has many OOV characters (not seen in train), those positions are skipped.
- Perplexity = exp(mean NLL), where NLL is computed over observed bigrams.
"""
from __future__ import annotations

from pathlib import Path

import numpy as np

FALLBACK = "aabbaabb\n"


# ---------- IO ----------


def load_or_split():
    """Return (train_text, val_text), normalizing newlines to '\\n'.

    If train/val files exist, read them; else split data.txt 90/10.

    >>> tr, va = load_or_split()
    >>> isinstance(tr, str) and isinstance(va, str)
    True
    """
    tr_p, va_p = Path("train.txt"), Path("val.txt")
    if tr_p.exists() and va_p.exists():
        tr = tr_p.read_text(encoding="utf-8")
        va = va_p.read_text(encoding="utf-8")
    else:
        base = Path("data.txt")
        txt = base.read_text(encoding="utf-8") if base.exists() else FALLBACK
        cut = int(0.9 * len(txt))
        tr, va = txt[:cut], txt[cut:]
    # normalize CRLF/CR line endings to '\n' so counts are platform-independent
    to_lf = lambda s: s.replace("\r\n", "\n").replace("\r", "\n")
    return to_lf(tr), to_lf(va)


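# Note: the fallback 90/10 split above is by character offset, not by line,
# so the cut can land mid-line; for a character-level model this is harmless.

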
# ---------- vocab ----------


def build_vocab(text: str):
    """Build vocab from train text: returns (chars, stoi, itos)."""
    chars = sorted(set(text))
    stoi = {c: i for i, c in enumerate(chars)}
    itos = {i: c for c, i in stoi.items()}
    return chars, stoi, itos


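# Example: build_vocab("abba") returns
#   (['a', 'b'], {'a': 0, 'b': 1}, {0: 'a', 1: 'b'})

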
# ---------- model ----------


def bigram_counts(text: str, stoi: dict[str, int]) -> np.ndarray:
    """Return VxV bigram counts for text restricted to train vocab.

    >>> stoi = {'a': 0, 'b': 1}
    >>> M = bigram_counts("abba", stoi)
    >>> int(M[0, 1]), int(M[1, 1])
    (1, 1)
    """
    ids = [stoi[c] for c in text if c in stoi]
    V = len(stoi)
    mat = np.zeros((V, V), dtype=np.int64)
    for a, b in zip(ids[:-1], ids[1:]):
        mat[a, b] += 1
    return mat


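# Example: with stoi = {'a': 0, 'b': 1}, bigram_counts("abba", stoi) gives
#   [[0, 1],   # 'a'->'a': 0, 'a'->'b': 1
#    [1, 1]]   # 'b'->'a': 1, 'b'->'b': 1

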
def laplace(mat: np.ndarray, alpha: float = 1.0) -> np.ndarray:
    """Row-wise Laplace smoothing; returns probabilities.

    >>> import numpy as _np
    >>> P = laplace(_np.array([[0, 1], [0, 0]]), alpha=1.0)
    >>> _np.allclose(P.sum(axis=1), 1.0)
    True
    """
    P = mat.astype(np.float64) + alpha
    P /= P.sum(axis=1, keepdims=True)
    return P


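# For reference, with V = vocab size the smoothed estimate above is
#   P[a, b] = (count[a, b] + alpha) / (count[a, :].sum() + alpha * V)
# so every transition, seen or not, gets a nonzero probability.

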
# ---------- evaluation ----------


def nll_on_text(text: str, stoi: dict[str, int], P: np.ndarray) -> float:
    """Average negative log-likelihood over bigrams present in vocab.

    >>> import numpy as _np
    >>> stoi = {'a': 0, 'b': 1}; P = _np.array([[0.5, 0.5], [0.5, 0.5]])
    >>> round(nll_on_text("ab", stoi, P), 5)
    0.69315
    """
    ids = [stoi[c] for c in text if c in stoi]
    logs = []
    for a, b in zip(ids[:-1], ids[1:]):
        p = float(P[a, b])
        logs.append(-np.log(max(1e-12, p)))  # clamp to avoid log(0)
    return float(np.mean(logs)) if logs else float("inf")


def perplexity(nll: float) -> float:
    """Perplexity = exp(NLL)."""
    return float(np.exp(nll))


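# Sanity check: a uniform model over V symbols has NLL = log(V) and
# perplexity = V; V = 2 gives NLL ~ 0.69315 and perplexity 2.0, matching
# the doctest above.

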
def main() -> None:
    train, val = load_or_split()
    chars, stoi, itos = build_vocab(train)
    M = bigram_counts(train, stoi)
    P = laplace(M, alpha=1.0)

    nll = nll_on_text(val, stoi, P)
    ppl = perplexity(nll)
    print(f"Validation NLL: {nll:.4f}")
    print(f"Perplexity: {ppl:.4f}")


if __name__ == "__main__":
    main()
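
# Expected output format (values depend on your data):
#   Validation NLL: <float>
#   Perplexity: <float>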