feat: add uniform random text generator with reproducible sampling

This commit introduces a new script that generates random text by uniformly sampling characters from the training data's vocabulary. It loads text from train.txt or falls back to data.txt, normalizes line endings, builds a sorted character vocabulary, and samples characters using a fixed RNG seed for reproducibility. The implementation includes command-line arguments for specifying generation length and random seed, making it configurable while maintaining consistent output for the same inputs.
2025-09-23 21:18:57 -04:00
parent feecf05ee3
commit 538a44247b
1 changed files with 86 additions and 0 deletions
--- a/05_uniform_generator.py
+++ b/05_uniform_generator.py
@@ -0,0 +1,86 @@
+# 05_uniform_generator.py
+"""
+Generate random text uniformly across observed characters.
+
+What it does:
+- Loads text from train.txt (preferred) or data.txt (fallback if missing).
+- Normalizes newlines to "\\n".
+- Builds a sorted character vocabulary.
+- Samples characters uniformly using a fixed RNG seed for reproducibility.
+
+How to run:
+    python 05_uniform_generator.py
+    # Optional flags:
+    #   --length 300 --seed 123
+
+Notes:
+- This is a baseline; it ignores context. We'll get smarter in later lessons.
+- If no dataset is present, a tiny built-in fallback is used so the script always runs.
+"""
+
+from __future__ import annotations
+from pathlib import Path
+from typing import List
+import argparse
+import random
+
+FALLBACK = "abc\n"  # built-in text that load_text() falls back to
+
+
+def load_text() -> str:
+    """Load and normalize text from train.txt or data.txt, else use a fallback.
+
+    Returns:
+        A UTF-8 string with only LF ('\\n') newlines.
+
+    >>> isinstance(load_text(), str)
+    True
+    """
+    p = Path("train.txt") if Path("train.txt").exists() else Path("data.txt")
+    txt = p.read_text(encoding="utf-8") if p.exists() else FALLBACK
+    return txt.replace("\r\n", "\n").replace("\r", "\n")
+
+
+def build_vocab(text: str) -> List[str]:
+    """Return a sorted list of unique characters in the text.
+
+    >>> build_vocab("ba\\n") == ['\\n', 'a', 'b']
+    True
+    """
+    return sorted(set(text))
+
+
+def generate_uniform(vocab: List[str], n: int = 300, seed: int = 123) -> str:
+    """Sample characters uniformly at random from the vocabulary.
+
+    Args:
+        vocab: Sorted list of unique characters.
+        n: Number of characters to generate.
+        seed: RNG seed for reproducibility.
+
+    Returns:
+        Generated text string of length n.
+
+    >>> len(generate_uniform(['x'], n=5, seed=0)) == 5
+    True
+    """
+    rng = random.Random(seed)
+    return "".join(rng.choice(vocab) for _ in range(n))
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--length", type=int, default=300, help="Number of characters to generate.")
+    parser.add_argument("--seed", type=int, default=123, help="Random seed for reproducibility.")
+    args = parser.parse_args()
+
+    text = load_text()
+    vocab = build_vocab(text)
+    out = generate_uniform(vocab, n=args.length, seed=args.seed)
+
+    print(f"[uniform]  length={args.length} seed={args.seed} vocab_size={len(vocab)}")
+    print(out)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()