add character frequency counter script with newline normalization and sorted output
This commit introduces a new script (02_char_freq.py) that analyzes character frequencies in text corpora. The script loads text from train.txt or data.txt (with fallback), normalizes all line endings to LF, counts character occurrences, sorts them by frequency (descending) and character (ascending), then prints a formatted ASCII table showing the top 50 most frequent characters. Newlines are displayed as literal "\\n" sequences in the output. The script includes proper type hints, docstrings with doctests, and handles missing files gracefully with a built-in fallback text.
This commit is contained in:
124
02_char_freq.py
Normal file
124
02_char_freq.py
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
# 02_char_freq.py
|
||||||
|
"""
|
||||||
|
Count character frequencies in a corpus and print a sorted table.
|
||||||
|
|
||||||
|
What it does:
|
||||||
|
- Loads UTF-8 text from train.txt or data.txt (with a tiny built-in fallback)
|
||||||
|
- Normalizes newlines to "\\n".
|
||||||
|
- Prints a frequency table (top 50 by default), showing "\\n" literally.
|
||||||
|
|
||||||
|
How to run:
|
||||||
|
python 02_char_freq.py
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Keep your dataset beside this script as 'data.txt' (or run Lesson 03 to create train/val).
|
||||||
|
- Output uses a simple ASCII table; newline is shown as the literal sequence "\\n".
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, Tuple
|
||||||
|
|
||||||
|
# Tiny built-in corpus; every line is terminated by a single "\n".
FALLBACK = "".join(
    f"{line}\n"
    for line in (
        "O Romeo, Romeo! wherefore art thou Romeo?",
        "Deny thy father and refuse thy name;",
    )
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_and_normalize() -> str:
|
||||||
|
"""Load corpus text and normalize newlines to LF ('\\n').
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Text as a single string with only '\\n' newlines.
|
||||||
|
|
||||||
|
>>> s = "a\\rb\\r\\nc\\nd"
|
||||||
|
>>> load_and_normalize.__doc__ is not None # doctest sanity check
|
||||||
|
True
|
||||||
|
"""
|
||||||
|
p = Path("train.txt") if Path("train.txt").exists() else Path("data.txt")
|
||||||
|
text = p.read_text(encoding="utf-8") if p.exists() else FALLBACK
|
||||||
|
return text.replace("\r\n", "\n").replace("\r", "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def char_counts(text: str) -> Dict[str, int]:
    """Tally how many times each character occurs in ``text``.

    Args:
        text: Input text.

    Returns:
        Dictionary mapping each distinct character to its number of
        occurrences, keyed in order of first appearance.

    >>> char_counts("aba\\n")["a"] == 2
    True
    >>> isinstance(char_counts(""), dict)
    True
    >>> char_counts("")
    {}
    """
    # Seed every distinct character with zero, then bump once per occurrence.
    tally: Dict[str, int] = dict.fromkeys(text, 0)
    for symbol in text:
        tally[symbol] += 1
    return tally
|
||||||
|
|
||||||
|
|
||||||
|
def sorted_counts(counts: Dict[str, int]) -> Tuple[Tuple[str, int], ...]:
    """Order (char, count) pairs from most to least frequent.

    Characters that share the same count are listed in ascending
    character order.

    Args:
        counts: Dictionary mapping char -> count.

    Returns:
        Tuple of (char, count) rows ordered by count descending, then by
        character ascending.

    >>> sorted_counts({'b': 3, 'a': 3, 'c': 1})
    (('a', 3), ('b', 3), ('c', 1))
    """
    ranked = list(counts.items())
    # Two stable passes: the secondary key (character) first, then the
    # primary key (count, descending). Stability keeps ties in char order.
    ranked.sort(key=lambda pair: pair[0])
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return tuple(ranked)
|
||||||
|
|
||||||
|
|
||||||
|
def format_row(ch: str, c: int) -> str:
    """Format a single table row, showing newline as literal '\\n'.

    The character is rendered with ``repr`` so that a newline (like any
    other control character) appears as a visible, quoted escape sequence
    instead of breaking the table layout.

    Args:
        ch: The character.
        c: Count.

    Returns:
        Formatted row string: the quoted character right-aligned in a
        4-wide column, followed by " | " and the count.

    >>> print(format_row('\\n', 5))
    '\\n' | 5
    >>> print(format_row('a', 12))
     'a' | 12
    >>> format_row('\\n', 5).endswith(" | 5")
    True
    """
    # repr() already escapes the newline. Substituting a backslash-n string
    # beforehand would be escaped a second time and print a doubled
    # backslash, which also overflows the 4-wide column.
    return f"{ch!r:>4} | {c}"
|
||||||
|
|
||||||
|
|
||||||
|
def print_table(rows: Iterable[Tuple[str, int]], limit: int = 50) -> None:
    """Write a two-column ASCII table of characters and their counts.

    A fixed header and separator line are always printed; data rows follow
    in the order given.

    Args:
        rows: Iterable of (char, count)
        limit: Max rows to print
    """
    print("Char | Count")
    print("-----+------")
    for shown, (ch, c) in enumerate(rows):
        # Stop as soon as `limit` rows have been written.
        if shown >= limit:
            break
        print(format_row(ch, c))
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Load the corpus, print its top-50 character table and a summary line."""
    corpus = load_and_normalize()
    frequencies = char_counts(corpus)
    print_table(sorted_counts(frequencies), limit=50)
    # Summary covers the whole corpus, not just the rows shown in the table.
    print(f"\nTotal chars: {len(corpus)} | Unique chars: {len(frequencies)}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user