# 05_uniform_generator.py
"""
Generate random text uniformly across observed characters.

What it does:
- Loads text from train.txt (preferred) or data.txt (fallback if train.txt is
  missing or empty).
- Normalizes newlines to "\\n".
- Builds a sorted character vocabulary.
- Samples characters uniformly using a fixed RNG seed for reproducibility.

How to run:
    python 05_uniform_generator.py
    # Optional flags:
    #   --length 300 --seed 123

Notes:
- This is a baseline; it ignores context. We'll get smarter in later lessons.
- If no usable dataset is present, a tiny built-in fallback is used so the
  script always runs.
"""

from __future__ import annotations

from pathlib import Path
from typing import List
import argparse
import random

# Text used when neither dataset file provides any characters.
FALLBACK = "abc\n"

# Dataset files, tried in order of preference.
_DATASET_CANDIDATES = ("train.txt", "data.txt")


def _normalize_newlines(text: str) -> str:
    """Convert CRLF and lone CR line endings to LF."""
    return text.replace("\r\n", "\n").replace("\r", "\n")


def load_text() -> str:
    """Load and normalize text from train.txt or data.txt, else use a fallback.

    The candidates are tried in order. A file that is missing or empty is
    skipped, because an empty text would produce an empty vocabulary that
    cannot be sampled from. If no candidate yields text, FALLBACK is used.

    Returns:
        Text decoded as UTF-8, containing only LF ('\\n') newlines.

    >>> isinstance(load_text(), str)
    True
    """
    for name in _DATASET_CANDIDATES:
        # Read directly instead of checking exists() first: that avoids a
        # window in which the file could vanish between the check and the read.
        try:
            raw = Path(name).read_text(encoding="utf-8")
        except FileNotFoundError:
            continue
        if raw:
            return _normalize_newlines(raw)
    return _normalize_newlines(FALLBACK)


def build_vocab(text: str) -> List[str]:
    """Return a sorted list of unique characters in the text.

    >>> build_vocab("ba\\n") == ['\\n', 'a', 'b']
    True
    """
    return sorted(set(text))


def generate_uniform(vocab: List[str], n: int = 300, seed: int = 123) -> str:
    """Sample characters uniformly at random from the vocabulary.

    Args:
        vocab: Sorted list of unique characters.
        n: Number of characters to generate. Zero or a negative value
            produces an empty string.
        seed: RNG seed for reproducibility.

    Returns:
        Generated text string of length n (empty when n <= 0).

    Raises:
        IndexError: If vocab is empty and n > 0.

    >>> len(generate_uniform(['x'], n=5, seed=0)) == 5
    True
    >>> generate_uniform([], n=0)
    ''
    """
    # A private Random instance keeps the output reproducible without
    # touching the global RNG state.
    rng = random.Random(seed)
    return "".join(rng.choice(vocab) for _ in range(n))


def main(argv: List[str] | None = None) -> None:
    """Parse command-line flags, generate uniform text, and print it.

    Args:
        argv: Argument list to parse. When None, argparse reads sys.argv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--length", type=int, default=300, help="Number of characters to generate.")
    parser.add_argument("--seed", type=int, default=123, help="Random seed for reproducibility.")
    args = parser.parse_args(argv)
    if args.length < 0:
        # A negative length would silently print nothing; report it instead.
        parser.error("--length must be non-negative")

    text = load_text()
    vocab = build_vocab(text)
    out = generate_uniform(vocab, n=args.length, seed=args.seed)
    print(f"[uniform] length={args.length} seed={args.seed} vocab_size={len(vocab)}")
    print(out)


if __name__ == "__main__":
    main()