# 02_char_freq.py
"""
Count character frequencies in a corpus and print a sorted table.

What it does:
- Loads UTF-8 text from train.txt or data.txt (with a tiny built-in fallback).
- Normalizes newlines to "\\n".
- Prints a frequency table (top 50 by default), showing "\\n" literally.

How to run:
    python 02_char_freq.py

Notes:
- Keep your dataset beside this script as 'data.txt' (or run Lesson 03 to
  create train/val). The file names are relative paths, so run the script
  from the directory that holds them.
- Output uses a simple ASCII table; every character is shown in its quoted,
  escaped form, so newline is shown as the literal sequence "\\n".
"""

from __future__ import annotations

from collections import Counter
from itertools import islice
from pathlib import Path
from typing import Dict, Iterable, Tuple

FALLBACK = (
    "O Romeo, Romeo! wherefore art thou Romeo?\n"
    "Deny thy father and refuse thy name;\n"
)

# Corpus files tried in order; the first one that can be opened wins.
CORPUS_CANDIDATES = ("train.txt", "data.txt")


def _normalize_newlines(text: str) -> str:
    """Convert CRLF and lone CR line endings to LF ('\\n').

    >>> _normalize_newlines("a\\rb\\r\\nc\\nd")
    'a\\nb\\nc\\nd'
    """
    # CRLF has to be replaced first; otherwise each pair would become two LFs.
    return text.replace("\r\n", "\n").replace("\r", "\n")


def load_and_normalize() -> str:
    """Load corpus text and normalize newlines to LF ('\\n').

    Tries 'train.txt' and then 'data.txt' (relative to the current working
    directory). When neither file exists, the built-in FALLBACK text is used.

    Returns:
        Text as a single string with only '\\n' newlines.

    Raises:
        OSError: If a candidate exists but cannot be read.
        UnicodeDecodeError: If a candidate is not valid UTF-8.
    """
    for name in CORPUS_CANDIDATES:
        try:
            # Read directly instead of checking exists() first: avoids a
            # check-then-use race and a redundant filesystem lookup.
            text = Path(name).read_text(encoding="utf-8")
        except FileNotFoundError:
            continue
        break
    else:
        # No candidate file was found.
        text = FALLBACK
    return _normalize_newlines(text)


def char_counts(text: str) -> Dict[str, int]:
    """Return a dict mapping character -> count.

    Keys appear in order of first occurrence in ``text``.

    Args:
        text: Input text.

    Returns:
        Dictionary of counts (a plain ``dict``; empty for empty input).

    >>> char_counts("aba\\n")["a"] == 2
    True
    >>> isinstance(char_counts(""), dict)
    True
    >>> char_counts("")
    {}
    """
    # Convert back to a plain dict so callers get exactly the documented type.
    return dict(Counter(text))


def sorted_counts(counts: Dict[str, int]) -> Tuple[Tuple[str, int], ...]:
    """Sort counts by descending frequency, then by character ascending.

    Args:
        counts: Dictionary mapping char -> count.

    Returns:
        Tuple of (char, count) rows sorted by (-count, char).

    >>> sorted_counts({'b': 3, 'a': 3, 'c': 1})
    (('a', 3), ('b', 3), ('c', 1))
    """
    # Negating the count gives descending frequency while the character
    # itself breaks ties in ascending order.
    return tuple(sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])))


def format_row(ch: str, c: int) -> str:
    """Format a single table row, showing newline as literal '\\n'.

    The character is rendered with ``repr`` (quoted and escaped) and
    right-aligned in a 4-wide column, which fits both ``'a'`` and ``'\\n'``.

    Args:
        ch: The character.
        c: Count.

    Returns:
        Formatted row string.

    >>> print(format_row('\\n', 5))
    '\\n' | 5
    >>> format_row('\\n', 5).endswith(" | 5")
    True
    """
    # repr() already escapes the newline; pre-escaping it by hand as well
    # would double the backslash and overflow the column.
    return f"{ch!r:>4} | {c}"


def print_table(rows: Iterable[Tuple[str, int]], limit: int = 50) -> None:
    """Print a simple table of (char, count).

    Args:
        rows: Iterable of (char, count).
        limit: Max rows to print; zero or a negative value prints only the
            header.
    """
    print("Char | Count")
    print("-----+------")
    # islice rejects negative stops, so clamp to keep "print nothing" behavior.
    for ch, c in islice(rows, max(limit, 0)):
        print(format_row(ch, c))


def main() -> None:
    """Load the corpus, count its characters, and print the top-50 table."""
    text = load_and_normalize()
    counts = char_counts(text)
    rows = sorted_counts(counts)
    print_table(rows, limit=50)
    print(f"\nTotal chars: {len(text)} | Unique chars: {len(counts)}")


if __name__ == "__main__":
    main()