From 674c53651c46e0414fe51e9b5837b4394eef65a9 Mon Sep 17 00:00:00 2001
From: Dani
Date: Wed, 24 Sep 2025 00:33:23 -0400
Subject: [PATCH] feat: add Laplace-smoothed bigram model perplexity computation script

This commit introduces a new script that implements a Laplace-smoothed
bigram language model for computing validation perplexity. The
implementation includes:

- Data loading and splitting functionality (90/10 train/validation split)
- Character vocabulary building from training data only
- Bigram counting and Laplace smoothing with alpha=1.0
- Negative log-likelihood and perplexity computation
- Proper handling of out-of-vocabulary characters during evaluation

The script can process existing train.txt/val.txt files or automatically
split a data.txt file if the required input files are missing, making it
self-contained and easy to use for language model evaluation tasks.
---
 09_perplexity.py | 130 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 09_perplexity.py

diff --git a/09_perplexity.py b/09_perplexity.py
new file mode 100644
index 0000000..dbecab5
--- /dev/null
+++ b/09_perplexity.py
@@ -0,0 +1,130 @@
+# 09_perplexity.py
+"""
+Compute validation perplexity for a Laplace-smoothed bigram model (NumPy).
+
+What it does:
+- Loads train/val text (or splits data.txt 90/10 if missing).
+- Builds char vocab on train ONLY.
+- Trains a Laplace-smoothed bigram model (alpha=1.0).
+- Evaluates NLL and perplexity on val.
+
+How to run:
+    python 09_perplexity.py
+    # doctests (optional)
+    python -m doctest -v 09_perplexity.py
+
+Notes:
+- If val has many OOV characters (not seen in train), those positions are skipped.
+- Perplexity = exp(mean NLL), where NLL is computed over observed bigrams.
+"""
+from __future__ import annotations
+from pathlib import Path
+import numpy as np
+
+FALLBACK = "aabbaabb\n"
+
+# ---------- IO ----------
+
+
+def load_or_split():
+    """Return (train_text, val_text), normalizing newlines to '\\n'.
+
+    If train/val files exist, read them; else split data.txt 90/10.
+
+    >>> tr, va = load_or_split()
+    >>> isinstance(tr, str) and isinstance(va, str)
+    True
+    """
+    tr_p, va_p = Path("train.txt"), Path("val.txt")
+    if tr_p.exists() and va_p.exists():
+        tr = tr_p.read_text(encoding="utf-8")
+        va = va_p.read_text(encoding="utf-8")
+    else:
+        base = Path("data.txt")
+        txt = base.read_text(encoding="utf-8") if base.exists() else FALLBACK
+        cut = int(0.9 * len(txt))
+        tr, va = txt[:cut], txt[cut:]
+    # normalize line endings to LF
+    to_lf = lambda s: s.replace("\r\n", "\n").replace("\r", "\n")
+    return to_lf(tr), to_lf(va)
+
+
+# ---------- vocab ----------
+
+def build_vocab(text: str):
+    """Build vocab from train text: returns (chars, stoi, itos)."""
+    chars = sorted(set(text))
+    stoi = {c: i for i, c in enumerate(chars)}
+    itos = {i: c for c, i in stoi.items()}
+    return chars, stoi, itos
+
+
+# ---------- model ----------
+
+def bigram_counts(text: str, stoi: dict[str, int]) -> np.ndarray:
+    """Return VxV bigram counts for text restricted to train vocab.
+
+    >>> import numpy as _np
+    >>> stoi = {'a': 0, 'b': 1}
+    >>> M = bigram_counts("abba", stoi)
+    >>> int(M[0, 1]), int(M[1, 1])
+    (1, 1)
+    """
+    ids = [stoi[c] for c in text if c in stoi]
+    V = len(stoi)
+    mat = np.zeros((V, V), dtype=np.int64)
+    for a, b in zip(ids[:-1], ids[1:]):
+        mat[a, b] += 1
+    return mat
+
+
+def laplace(mat: np.ndarray, alpha: float = 1.0) -> np.ndarray:
+    """Row-wise Laplace smoothing; returns probabilities.
+
+    >>> import numpy as _np
+    >>> P = laplace(_np.array([[0, 1], [0, 0]]), alpha=1.0)
+    >>> _np.allclose(P.sum(axis=1), 1.0)
+    True
+    """
+    P = mat.astype(np.float64) + alpha
+    P /= P.sum(axis=1, keepdims=True)
+    return P
+
+
+# ---------- evaluation ----------
+
+def nll_on_text(text: str, stoi: dict[str, int], P: np.ndarray) -> float:
+    """Average negative log-likelihood over bigrams present in vocab.
+
+    >>> import numpy as _np
+    >>> stoi = {'a': 0, 'b': 1}; P = _np.array([[0.5, 0.5], [0.5, 0.5]])
+    >>> round(nll_on_text("ab", stoi, P), 5)
+    0.69315
+    """
+    ids = [stoi[c] for c in text if c in stoi]
+    logs = []
+    for a, b in zip(ids[:-1], ids[1:]):
+        p = float(P[a, b])
+        logs.append(-np.log(max(1e-12, p)))
+    return float(np.mean(logs)) if logs else float("inf")
+
+
+def perplexity(nll: float) -> float:
+    """Perplexity = exp(NLL)."""
+    return float(np.exp(nll))
+
+
+def main() -> None:
+    train, val = load_or_split()
+    chars, stoi, itos = build_vocab(train)
+    M = bigram_counts(train, stoi)
+    P = laplace(M, alpha=1.0)
+
+    nll = nll_on_text(val, stoi, P)
+    ppl = perplexity(nll)
+    print(f"Validation NLL: {nll:.4f}")
+    print(f"Perplexity: {ppl:.4f}")
+
+
+if __name__ == "__main__":
+    main()
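A quick hand-check of the smoothing and perplexity math added above (a standalone sketch, not part of the patch; it only mirrors what bigram_counts, laplace, and nll_on_text do, on a toy string):

    import numpy as np

    stoi = {"a": 0, "b": 1}

    # Bigram counts of "abba": (a, b), (b, b), (b, a)
    counts = np.zeros((2, 2), dtype=np.int64)
    for x, y in zip("abba", "abba"[1:]):
        counts[stoi[x], stoi[y]] += 1

    # Row-wise Laplace smoothing with alpha=1.0, as in laplace()
    P = (counts + 1.0) / (counts + 1.0).sum(axis=1, keepdims=True)

    # Held-out "ab" has a single bigram (a, b): NLL = -log(2/3) ~ 0.4055,
    # and perplexity = exp(NLL) = 1.5
    nll = -np.log(P[stoi["a"], stoi["b"]])
    print(f"NLL={nll:.4f}  PPL={np.exp(nll):.4f}")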