Files
Aria/01_read_text.py
Dani 68d9d00123 feat: add text file reader with normalization and stats preview
Adds a new script to read local text files, normalize line endings, and display character statistics and previews. The script handles missing data files gracefully by using a fallback sample and provides detailed output including total characters, unique characters, and a 200-character preview with literal newline representations.
2025-09-23 12:44:22 -04:00

103 lines
2.4 KiB
Python

# 01_read_text.py
"""
Read a local text file, normalize newlines, and preview stats.
Usage:
python 01_read_text.py
Notes:
- Expects a UTF-8 'data.txt' in the current working directory. If missing, uses a built-in fallback sample.
- Prints total chars, unique chars, and a 200-char preview with literal "\\n".
"""
from __future__ import annotations
from pathlib import Path
from typing import Optional
# Built-in four-line sample returned by load_text() when no data file is found.
# Every line, including the last, ends with a newline.
FALLBACK = "".join(
    line + "\n"
    for line in (
        "From fairest creatures we desire increase,",
        "That thereby beauty's rose might never die,",
        "But as the riper should by time decease,",
        "His tender heir might bear his memory.",
    )
)
def load_text(path: Optional[Path]) -> str:
    """Load UTF-8 text from ``path``, or return the fallback sample.

    The fallback is used when ``path`` is ``None`` or does not point to a
    regular file (it is missing, or it is a directory).

    Args:
        path: Optional path to the text file.

    Returns:
        The decoded file contents, or ``FALLBACK`` when there is no file
        to read.

    Raises:
        UnicodeDecodeError: If the file exists but is not valid UTF-8.
        OSError: If the file exists but cannot be read.

    >>> isinstance(load_text(None), str)
    True
    """
    # is_file() rather than exists(): exists() is also true for directories,
    # where read_text() would raise instead of falling back to the sample.
    if path is not None and path.is_file():
        return path.read_text(encoding="utf-8")
    return FALLBACK
def normalize_newlines(text: str) -> str:
    r"""Return ``text`` with every line ending expressed as LF.

    Args:
        text: Raw text that may mix CRLF, CR, and LF line endings.

    Returns:
        The text with each CRLF pair and each lone CR replaced by '\n'.

    >>> normalize_newlines("a\r\nb\rc\nd") == "a\nb\nc\nd"
    True
    """
    # CRLF pairs are handled first so a pair collapses to one LF, not two.
    crlf_collapsed = "\n".join(text.split("\r\n"))
    return "\n".join(crlf_collapsed.split("\r"))
def make_preview(text: str, n_chars: int = 200) -> str:
    r"""Return the first ``n_chars`` characters of ``text`` with newlines escaped.

    Truncation happens before escaping, so ``n_chars`` counts characters of
    the input; each kept newline then takes two characters in the result.

    Args:
        text: Input text.
        n_chars: Max characters of ``text`` to preview. Negative values are
            treated as 0 rather than slicing from the end of the text.

    Returns:
        Preview string with each newline shown as the two characters '\n'.

    >>> make_preview("hi\nthere", 5)
    'hi\\nth'
    >>> make_preview("hello", -1)
    ''
    """
    # Clamp so a negative limit cannot turn into an "all but the last N" slice.
    limit = max(n_chars, 0)
    return text[:limit].replace("\n", "\\n")
def report_stats(text: str) -> str:
    """Build a two-line, human-readable character-count report.

    Args:
        text: Input text.

    Returns:
        A string whose first line is "Total chars: <count>" and whose second
        line is "Unique chars: <count>", with no padding around the line
        break and no trailing newline.

    >>> "Total chars:" in report_stats("abaca")
    True
    >>> report_stats("abaca").splitlines()
    ['Total chars: 5', 'Unique chars: 3']
    """
    total = len(text)
    # A set keeps one entry per distinct character.
    unique = len(set(text))
    return f"Total chars: {total}\nUnique chars: {unique}"
def main() -> None:
    """Entry point: load the text, normalize it, then print stats and a preview."""
    # Relative path, so it is resolved against the current working directory.
    source = Path("data.txt")
    raw = load_text(source)
    text = normalize_newlines(raw)

    stats = report_stats(text)
    print(stats)

    print("Preview (200 chars, \\n shown literally):")
    preview = make_preview(text, 200)
    print(preview)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()