Files
Aria/02_char_freq.py
Dani a82efe6ea2 add character frequency counter script with newline normalization and sorted output
This commit introduces a new script (02_char_freq.py) that analyzes character frequencies in text corpora. The script loads text from train.txt or data.txt (with fallback), normalizes all line endings to LF, counts character occurrences, sorts them by frequency (descending) and character (ascending), then prints a formatted ASCII table showing the top 50 most frequent characters. Newlines are displayed as literal "\\n" sequences in the output. The script includes proper type hints, docstrings with doctests, and handles missing files gracefully with a built-in fallback text.
2025-09-23 20:08:30 -04:00

125 lines
3.1 KiB
Python

# 02_char_freq.py
"""
Count character frequencies in a corpus and print a sorted table.
What it does:
- Loads UTF-8 text from train.txt or data.txt (with a tiny built-in fallback)
- Normalizes newlines to "\\n".
- Prints a frequency table (top 50 by default), showing "\\n" literally.
How to run:
python 02_char_freq.py
Notes:
- Keep your dataset beside this script as 'data.txt' (or run Lesson 03 to create train/val).
- Output uses a simple ASCII table; newline is shown as the literal sequence "\\n".
"""
from __future__ import annotations

from collections import Counter
from pathlib import Path
from typing import Dict, Iterable, Tuple
# Tiny built-in corpus, used when neither train.txt nor data.txt can be found.
FALLBACK = "".join(
    [
        "O Romeo, Romeo! wherefore art thou Romeo?\n",
        "Deny thy father and refuse thy name;\n",
    ]
)
def load_and_normalize() -> str:
    """Load corpus text and normalize newlines to LF ('\\n').

    Tries 'train.txt' first, then 'data.txt' (both resolved against the
    current working directory). If neither file exists, the built-in
    FALLBACK text is used instead.

    Returns:
        Text as a single string with only '\\n' newlines.

    >>> "\\r" in load_and_normalize()
    False
    """
    text = None
    for name in ("train.txt", "data.txt"):
        # EAFP: read directly rather than exists()-then-read, so a file that
        # vanishes between the check and the read cannot crash the script.
        try:
            text = Path(name).read_text(encoding="utf-8")
        except FileNotFoundError:
            continue
        break
    if text is None:
        text = FALLBACK
    # Collapse CRLF first so it does not turn into two separate newlines.
    return text.replace("\r\n", "\n").replace("\r", "\n")
def char_counts(text: str) -> Dict[str, int]:
    """Return a dict mapping character -> count.

    Counting is delegated to ``collections.Counter``; the result is converted
    back to a plain ``dict`` so callers keep getting an ordinary dictionary
    (keys in first-seen order, empty input yields ``{}``).

    Args:
        text: Input text.

    Returns:
        Dictionary of counts.

    >>> char_counts("aba\\n")["a"] == 2
    True
    >>> isinstance(char_counts(""), dict)
    True
    >>> char_counts("")
    {}
    """
    return dict(Counter(text))
def sorted_counts(counts: Dict[str, int]) -> Tuple[Tuple[str, int], ...]:
    """Order (char, count) pairs from most to least frequent.

    Pairs with the same count are ordered by the character itself, ascending.

    Args:
        counts: Dictionary mapping char -> count.

    Returns:
        Tuple of (char, count) rows sorted by (-count, char).

    >>> sorted_counts({'b': 3, 'a': 3, 'c': 1})
    (('a', 3), ('b', 3), ('c', 1))
    """

    def rank(pair: Tuple[str, int]) -> Tuple[int, str]:
        symbol, freq = pair
        # Negating the count gives descending frequency under an ascending sort.
        return (-freq, symbol)

    ordered = list(counts.items())
    ordered.sort(key=rank)
    return tuple(ordered)
def format_row(ch: str, c: int) -> str:
    """Format a single table row, showing newline as literal '\\n'.

    The character is rendered with ``repr`` so whitespace and control
    characters are visible. ``repr`` already escapes a newline as ``'\\n'``
    (four characters, matching the column width), so no extra substitution
    is applied; escaping it a second time would print a doubled backslash.

    Args:
        ch: The character.
        c: Count.

    Returns:
        Formatted row string.

    >>> print(format_row('\\n', 5))
    '\\n' | 5
    >>> format_row('a', 12)
    " 'a' | 12"
    """
    label = repr(ch)
    return f"{label:>4} | {c}"
def print_table(rows: Iterable[Tuple[str, int]], limit: int = 50) -> None:
    """Print a two-line header followed by at most ``limit`` formatted rows.

    Args:
        rows: Iterable of (char, count).
        limit: Max rows to print.
    """
    header = ("Char | Count", "-----+------")
    for line in header:
        print(line)
    for index, (symbol, freq) in enumerate(rows):
        # Stop as soon as the row budget is used up.
        if index >= limit:
            break
        print(format_row(symbol, freq))
def main() -> None:
    """Load the corpus, print the top-50 frequency table, then the totals."""
    corpus = load_and_normalize()
    freq = char_counts(corpus)
    print_table(sorted_counts(freq), limit=50)
    print(f"\nTotal chars: {len(corpus)} | Unique chars: {len(freq)}")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()