feat: add uniform random text generator with reproducible sampling
This commit introduces a new script that generates random text by uniformly sampling characters from the training data's vocabulary. It loads text from train.txt or falls back to data.txt, normalizes line endings, builds a sorted character vocabulary, and samples characters using a fixed RNG seed for reproducibility. The implementation includes command-line arguments for specifying generation length and random seed, making it configurable while maintaining consistent output for the same inputs.
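For context, the core of the approach is just a seeded random.Random drawing characters with choice(); a minimal standalone sketch of the idea (the sample_uniform helper and the toy corpus below are illustrative, not part of the committed file):

import random

def sample_uniform(vocab, n, seed):
    # A fixed seed makes the output reproducible for the same vocab, n, and seed.
    rng = random.Random(seed)
    return "".join(rng.choice(vocab) for _ in range(n))

vocab = sorted(set("hello world\n"))  # tiny illustrative corpus
print(sample_uniform(vocab, n=20, seed=123))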
05_uniform_generator.py (new file, 86 lines added)
@@ -0,0 +1,86 @@
# 05_uniform_generator.py
"""
Generate random text uniformly across observed characters.

What it does:
- Loads text from train.txt (preferred) or data.txt (fallback if missing).
- Normalizes newlines to "\\n".
- Builds a sorted character vocabulary.
- Samples characters uniformly using a fixed RNG seed for reproducibility.

How to run:
python 05_uniform_generator.py
# Optional flags:
# --length 300 --seed 123

Notes:
- This is a baseline; it ignores context. We'll get smarter in later lessons.
- If no dataset is present, a tiny built-in fallback is used so the script always runs.
"""

from __future__ import annotations
from pathlib import Path
from typing import List
import argparse
import random

FALLBACK = "abc\n"


def load_text() -> str:
    """Load and normalize text from train.txt or data.txt, else use a fallback.

    Returns:
        A UTF-8 string with only LF ('\\n') newlines.

    >>> isinstance(load_text(), str)
    True
    """
    p = Path("train.txt") if Path("train.txt").exists() else Path("data.txt")
    txt = p.read_text(encoding="utf-8") if p.exists() else FALLBACK
    return txt.replace("\r\n", "\n").replace("\r", "\n")


def build_vocab(text: str) -> List[str]:
    """Return a sorted list of unique characters in the text.

    >>> build_vocab("ba\\n") == ['\\n', 'a', 'b']
    True
    """
    return sorted(set(text))


def generate_uniform(vocab: List[str], n: int = 300, seed: int = 123) -> str:
    """Sample characters uniformly at random from the vocabulary.

    Args:
        vocab: Sorted list of unique characters.
        n: Number of characters to generate.
        seed: RNG seed for reproducibility.

    Returns:
        Generated text string of length n.

    >>> len(generate_uniform(['x'], n=5, seed=0)) == 5
    True
    """
    rng = random.Random(seed)
    return "".join(rng.choice(vocab) for _ in range(n))


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--length", type=int, default=300, help="Number of characters to generate.")
    parser.add_argument("--seed", type=int, default=123, help="Random seed for reproducibility.")
    args = parser.parse_args()

    text = load_text()
    vocab = build_vocab(text)
    out = generate_uniform(vocab, n=args.length, seed=args.seed)

    print(f"[uniform] length={args.length} seed={args.seed} vocab_size={len(vocab)}")
    print(out)


if __name__ == "__main__":
    main()
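Beyond the command line, the same functions could be reused programmatically. A hypothetical sketch (the filename starts with a digit, so it has to be loaded via importlib rather than a plain import; the module name "uniform_generator" below is arbitrary):

import importlib.util

# Load 05_uniform_generator.py under an importable module name.
spec = importlib.util.spec_from_file_location("uniform_generator", "05_uniform_generator.py")
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

text = mod.load_text()
vocab = mod.build_vocab(text)
print(mod.generate_uniform(vocab, n=50, seed=0))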