diff --git a/05_uniform_generator.py b/05_uniform_generator.py new file mode 100644 index 0000000..a57b34e --- /dev/null +++ b/05_uniform_generator.py @@ -0,0 +1,86 @@ +# 05_uniform_generator.py +""" +Generate random text uniformly across observed characters. + +What it does: +- Loads text from train.txt (preferred) or data.txt (fallback if missing). +- Normalizes newlines to "\\n". +- Builds a sorted character vocabulary. +- Samples characters uniformly using a fixed RNG seed for reproducibility. + +How to run: + python 05_uniform_generator.py + # Optional flags: + # --length 300 --seed 123 + +Notes: +- This is a baseline; it ignores context. We'll get smarter in later lessons. +- if no dataset is present, a tiny built-in fallback is used so the script always runs. +""" + +from __future__ import annotations +from pathlib import Path +from typing import List +import argparse +import random + +FALLBACK = "abc\n" + + +def load_text() -> str: + """Load and normalize text from train.txt or data.txt, else use a fallback. + + Returns: + A UTF-8 string with only LF ('\\n') newlines. + + >>> isinstance(load_text(), str) + True + """ + p = Path("train.txt") if Path("train.txt").exists() else Path("data.txt") + txt = p.read_text(encoding="utf-8") if p.exists() else FALLBACK + return txt.replace("\r\n", "\n").replace("\r", "\n") + + +def build_vocab(text: str) -> List[str]: + """Return a sorted list of unique characters in the text. + + >>> build_vocab("ba\\n") == ['\\n', 'a', 'b'] + True + """ + return sorted(set(text)) + + +def generate_uniform(vocab: List[str], n: int = 300, seed: int = 123) -> str: + """Sample characters uniformly at random from the vocabulary. + + Args: + vocab: Sorted list of unique characters. + n: Number of characters to generate. + seed: RNG seed for reproducibility. + + Returns: + Generated text string of length n. + + >>> len(generate_uniform(['x'], n=5, seed=0)) == 5 + True + """ + rng = random.Random(seed) + return "".join(rng.choice(vocab) for _ in range(n)) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--length", type=int, default=300, help="Number of characters to generate.") + parser.add_argument("--seed", type=int, default=123, help="Random seed for reproducibility.") + args = parser.parse_args() + + text = load_text() + vocab = build_vocab(text) + out = generate_uniform(vocab, n=args.length, seed=args.seed) + + print(f"[uniform] length={args.length} seed={args.seed} vocab_size={len(vocab)}") + print(out) + + +if __name__ == "__main__": + main()