feat: add text file reader with normalization and stats preview
Adds a new script to read local text files, normalize line endings, and display character statistics and previews. The script handles missing data files gracefully by using a fallback sample and provides detailed output including total characters, unique characters, and a 200-character preview with literal newline representations.
This commit is contained in:
102
01_read_text.py
Normal file
102
01_read_text.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
# 01_read_text.py
|
||||||
|
"""
|
||||||
|
Read a local text file, normalize newlines, and preview stats.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python 01_read_text.py
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Expects a UTF-8 'data.txt' in the same directory. If missing, uses a fallback sample.
|
||||||
|
- Prints total chars, unique chars, and a 200-char preview with literal "\\n".
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
# Built-in sample text used in place of the data file when none is found.
FALLBACK = (
    "From fairest creatures we desire increase,\n"
    "That thereby beauty's rose might never die,\n"
    "But as the riper should by time decease,\n"
    "His tender heir might bear his memory.\n"
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_text(path: Optional[Path]) -> str:
    """Load UTF-8 text from ``path``, or return the built-in fallback sample.

    The fallback is used when ``path`` is ``None``, does not exist, or is not
    a regular file (for example a directory that happens to carry the data
    file's name), so a missing data file never raises.

    Args:
        path: Optional path to the text file.

    Returns:
        The decoded file contents, or ``FALLBACK`` when no file can be read.

    Raises:
        UnicodeDecodeError: If the file exists but is not valid UTF-8.

    >>> isinstance(load_text(None), str)
    True
    """
    # Path objects are always truthy, so compare against None explicitly;
    # is_file() (rather than exists()) keeps directories away from read_text().
    if path is None or not path.is_file():
        return FALLBACK
    try:
        return path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # The file disappeared between the check above and the read.
        return FALLBACK
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_newlines(text: str) -> str:
    r"""Convert CRLF and lone CR line endings to LF ('\n').

    Args:
        text: Raw text.

    Returns:
        Text in which every '\r\n' and every remaining '\r' is replaced
        by '\n'.

    >>> normalize_newlines("a\r\nb\rc\nd") == "a\nb\nc\nd"
    True
    """
    # CRLF has to be collapsed first; otherwise its '\r' would be turned
    # into a second '\n' by the lone-CR replacement.
    return text.replace("\r\n", "\n").replace("\r", "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def make_preview(text: str, n_chars: int = 200) -> str:
    r"""Return a preview of the first ``n_chars`` characters, newlines escaped.

    Truncation happens before escaping, so a newline inside the preview
    counts as one character towards ``n_chars`` but is rendered as the two
    characters ``\n``.

    Args:
        text: Input text.
        n_chars: Max characters of ``text`` to preview. Zero or negative
            values yield an empty preview.

    Returns:
        Preview string (newlines shown as '\n').

    >>> make_preview("hi\nthere", 5)
    'hi\\nth'
    """
    # A negative bound would slice from the end (text[:-n]) and return almost
    # the whole text, so clamp it to zero to honour "max characters".
    head = text[: max(n_chars, 0)]
    return head.replace("\n", "\\n")
|
||||||
|
|
||||||
|
|
||||||
|
def report_stats(text: str) -> str:
    """Produce a human-readable stats report: total chars and unique chars.

    Args:
        text: Input text.

    Returns:
        A two-line report string giving the length of ``text`` and the
        number of distinct characters it contains.

    >>> "Total chars:" in report_stats("abaca")
    True
    """
    total = len(text)
    uniq = len(set(text))  # a set keeps one entry per distinct character
    return f"Total chars: {total} \n Unique chars: {uniq}"
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Entry point: load, normalize, and report a preview.

    Reads 'data.txt' (a relative path, so it is resolved against the current
    working directory), normalizes its line endings, then prints the stats
    report followed by a 200-character preview.
    """
    data_path = Path("data.txt")
    text = normalize_newlines(load_text(data_path))
    print(report_stats(text))
    print("Preview (200 chars, \\n shown literally):")
    print(make_preview(text, 200))
|
||||||
|
|
||||||
|
|
||||||
|
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user