# Aria/02_char_freq.py

# 02_char_freq.py
"""
Count character frequencies in a corpus and print a sorted table.

What it does:
- Loads UTF-8 text from train.txt or data.txt (with a tiny built-in fallback).
- Normalizes newlines to "\\n".
- Prints a frequency table (top 50 by default), showing "\\n" literally.

How to run:
    python 02_char_freq.py

Notes:
- Keep your dataset beside this script as 'data.txt' (or run Lesson 03 to create train/val).
- Output uses a simple ASCII table; newline is shown as the literal sequence "\\n".
"""

from __future__ import annotations

from collections import Counter
from pathlib import Path
from typing import Dict, Iterable, Tuple

# Tiny built-in corpus: load_and_normalize() falls back to this text when
# neither 'train.txt' nor 'data.txt' can be found via a relative path lookup.
FALLBACK = (
    "O Romeo, Romeo! wherefore art thou Romeo?\n"
    "Deny thy father and refuse thy name;\n"
)


def load_and_normalize() -> str:
    """Read the corpus and return it with every line ending converted to LF.

    The first existing path among 'train.txt' and 'data.txt' (checked in that
    order, relative to the current working directory) is read as UTF-8. When
    neither exists, the built-in FALLBACK text is used instead.

    Returns:
        The corpus as one string in which '\\r\\n' and lone '\\r' have been
        replaced by '\\n'.

    >>> load_and_normalize.__doc__ is not None  # doctest sanity check
    True
    """
    raw = FALLBACK
    for filename in ("train.txt", "data.txt"):
        candidate = Path(filename)
        if candidate.exists():
            raw = candidate.read_text(encoding="utf-8")
            break

    # Collapse CRLF first so the lone-CR pass below cannot turn one Windows
    # line ending into two newlines.
    without_crlf = raw.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")


def char_counts(text: str) -> Dict[str, int]:
    """Return a dict mapping character -> count.

    Args:
        text: Input text.

    Returns:
        A plain ``dict`` of counts whose keys appear in the order each
        character was first seen in ``text``. Empty input gives ``{}``.

    >>> char_counts("aba\\n")["a"] == 2
    True
    >>> isinstance(char_counts(""), dict)
    True
    >>> char_counts("")
    {}
    """
    # Counter replaces the hand-written tally loop; converting back to a plain
    # dict keeps the documented return type (and its repr, which the doctest
    # above relies on) while preserving first-seen key order.
    return dict(Counter(text))


def sorted_counts(counts: Dict[str, int]) -> Tuple[Tuple[str, int], ...]:
    """Order (char, count) pairs from most to least frequent.

    Ties on count are broken by the character itself, ascending.

    Args:
        counts: Dictionary mapping char -> count.

    Returns:
        Tuple of (char, count) rows sorted by (-count, char).

    >>> sorted_counts({'b': 3, 'a': 3, 'c': 1})
    (('a', 3), ('b', 3), ('c', 1))
    >>> sorted_counts({})
    ()
    """

    def rank(pair: Tuple[str, int]) -> Tuple[int, str]:
        # Negating the count makes an ascending sort put big counts first.
        char, freq = pair
        return (-freq, char)

    ordered = list(counts.items())
    ordered.sort(key=rank)
    return tuple(ordered)


def format_row(ch: str, c: int) -> str:
    """Format a single table row, showing newline as the literal '\\n'.

    The character is rendered with ``repr`` (quoted and escaped) and
    right-aligned to width 4, the width of the "Char" header. A newline
    therefore appears as the four-character text ``'\\n'`` and an ordinary
    character such as ``a`` appears as ``'a'`` with one space of padding.

    Args:
        ch: The character.
        c: Count

    Returns:
        Formatted row string of the form "<repr of ch> | <count>".

    >>> print(format_row('\\n', 5))
    '\\n' | 5
    >>> format_row('a', 3)
    " 'a' | 3"
    >>> format_row('\\n', 5).endswith(" | 5")
    True
    """
    # repr() already escapes the newline. Swapping in a pre-escaped label and
    # then applying !r escaped the backslash a second time, which printed two
    # backslashes and pushed the row past the 4-character column.
    return f"{ch!r:>4} | {c}"


def print_table(rows: Iterable[Tuple[str, int]], limit: int = 50) -> None:
    """Write a two-line header and then up to ``limit`` formatted rows to stdout.

    Args:
        rows: Iterable of (char, count)
        limit: Max rows to print
    """
    for header_line in ("Char | Count", "-----+------"):
        print(header_line)

    for position, (char, count) in enumerate(rows):
        # `position` equals the number of rows already printed.
        if position >= limit:
            break
        print(format_row(char, count))


def main() -> None:
    """Load the corpus, print its top-50 character table, then a summary line."""
    corpus = load_and_normalize()
    frequencies = char_counts(corpus)
    print_table(sorted_counts(frequencies), limit=50)

    summary = f"\nTotal chars: {len(corpus)} | Unique chars: {len(frequencies)}"
    print(summary)


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()