# 02_char_freq.py
"""
Count character frequencies in a corpus and print a sorted table.

What it does:
- Loads UTF-8 text from train.txt or data.txt (with a tiny built-in fallback)
- Normalizes newlines to "\\n".
- Prints a frequency table (top 50 by default), showing "\\n" literally.

How to run:
    python 02_char_freq.py

Notes:
- Keep your dataset beside this script as 'data.txt' (or run Lesson 03 to create train/val)
- Output uses a simple ASCII table; each character is shown in its quoted
  repr form, so newline appears as the literal sequence "\\n".
"""

from __future__ import annotations
from collections import Counter
from itertools import islice
from pathlib import Path
from typing import Dict, Iterable, Tuple

FALLBACK = (
    "O Romeo, Romeo! wherefore art thou Romeo?\n"
    "Deny thy father and refuse thy name;\n"
)

# Corpus files tried in order of preference; the first one that exists wins.
DEFAULT_CANDIDATES: Tuple[str, ...] = ("train.txt", "data.txt")


def normalize_newlines(text: str) -> str:
    """Convert CRLF and lone CR line endings to LF ('\\n').

    Args:
        text: Input text with any mix of newline conventions.

    Returns:
        The same text containing only '\\n' newlines.

    >>> normalize_newlines("a\\rb\\r\\nc\\nd") == "a\\nb\\nc\\nd"
    True
    >>> normalize_newlines("")
    ''
    """
    # CRLF must be replaced first, otherwise each CRLF would become two LFs.
    return text.replace("\r\n", "\n").replace("\r", "\n")


def load_and_normalize(candidates: Iterable[str] = DEFAULT_CANDIDATES) -> str:
    """Load corpus text and normalize newlines to LF ('\\n').

    Args:
        candidates: File paths to try, in order. The first one that exists is
            read as UTF-8. Defaults to 'train.txt' then 'data.txt'.

    Returns:
        Text as a single string with only '\\n' newlines. If none of the
        candidate files exists, the built-in FALLBACK text is returned.

    Raises:
        OSError: If a candidate exists but cannot be read (e.g. it is a
            directory or permissions are missing).
        UnicodeDecodeError: If a candidate is not valid UTF-8.

    >>> load_and_normalize(["__no_such_corpus__.txt"]) == FALLBACK
    True
    """
    text = FALLBACK
    for candidate in candidates:
        try:
            # Read directly instead of exists()-then-read: one fewer stat call
            # and no window for the file to vanish between check and read.
            text = Path(candidate).read_text(encoding="utf-8")
        except FileNotFoundError:
            continue
        break
    # Text-mode reads already translate newlines, but normalizing explicitly
    # keeps the guarantee independent of how the text was obtained.
    return normalize_newlines(text)


def char_counts(text: str) -> Dict[str, int]:
    """Return a dict mapping character -> count.

    Args:
        text: Input text.

    Returns:
        Plain dictionary of counts, keyed in first-seen order.

    >>> char_counts("aba\\n")["a"] == 2
    True
    >>> isinstance(char_counts(""), dict)
    True
    >>> char_counts("")
    {}
    """
    # Counter does the tallying in C; convert back to a plain dict so callers
    # get exactly the documented type (and `{}` rather than `Counter()`).
    return dict(Counter(text))


def sorted_counts(counts: Dict[str, int]) -> Tuple[Tuple[str, int], ...]:
    """Sort counts by descending frequency, then by character ascending.

    Args:
        counts: Dictionary mapping char -> count.

    Returns:
        Tuple of (char, count) rows sorted by (-count, char).

    >>> out = sorted_counts({'b': 3, 'a': 3, 'c': 1})
    >>> out[0][1] == 3 and out[1][1] == 3 and out[-1] == ('c', 1)
    True
    >>> out[0][0], out[1][0]
    ('a', 'b')
    """
    # Negating the count gives descending frequency while keeping the
    # character tie-break ascending within a single sort key.
    return tuple(sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])))


def format_row(ch: str, c: int) -> str:
    """Format a single table row, showing newline as literal '\\n'.

    The character is rendered with ``repr`` so that whitespace is visible:
    a space appears as ``' '`` and a newline as the four characters
    quote, backslash, ``n``, quote. The label is right-aligned to the
    four-character width of the "Char" column header.

    Args:
        ch: The character.
        c: Count

    Returns:
        Formatted row string.

    >>> format_row('\\n', 5).endswith(" | 5")
    True
    >>> print(format_row('\\n', 5))
    '\\n' | 5
    >>> format_row('a', 12)
    " 'a' | 12"
    """
    # repr() already escapes the newline; pre-escaping it and then applying
    # !r would double the backslash and overflow the column.
    return f"{ch!r:>4} | {c}"


def print_table(rows: Iterable[Tuple[str, int]], limit: int = 50) -> None:
    """Print a simple table of (char, count)

    Args:
        rows: Iterable of (char, count)
        limit: Max rows to print; zero or a negative value prints only the
            header.
    """
    print("Char | Count")
    print("-----+------")
    # islice rejects negative stops, so clamp to keep "print nothing" behavior.
    for ch, c in islice(rows, max(limit, 0)):
        print(format_row(ch, c))


def main() -> None:
    """Load the corpus, print the top-50 frequency table and a summary line."""
    text = load_and_normalize()
    counts = char_counts(text)
    rows = sorted_counts(counts)
    print_table(rows, limit=50)
    print(f"\nTotal chars: {len(text)} | Unique chars: {len(counts)}")


if __name__ == "__main__":
    main()