feat: add text file reader with normalization and stats preview
Adds a new script to read local text files, normalize line endings, and display character statistics and previews. The script handles missing data files gracefully by using a fallback sample and provides detailed output including total characters, unique characters, and a 200-character preview with literal newline representations.
This commit is contained in:
102
01_read_text.py
Normal file
102
01_read_text.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# 01_read_text.py
|
||||
"""
|
||||
Read a local text file, normalize newlines, and preview stats.
|
||||
|
||||
Usage:
|
||||
python 01_read_text.py
|
||||
|
||||
Notes:
|
||||
- Expects a UTF-8 'data.txt' in the same directory. If missing, uses a fallback sample.
|
||||
- Prints total chars, unique chars, and a 200-char preview with literal "\\n".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Sample text that load_text() returns when no data file is read.
# Four lines, each terminated by "\n".
FALLBACK = (
    "From fairest creatures we desire increase,\n"
    "That thereby beauty's rose might never die,\n"
    "But as the riper should by time decease,\n"
    "His tender heir might bear his memory.\n"
)
|
||||
|
||||
|
||||
def load_text(path: Optional[Path]) -> str:
    """Load UTF-8 text from a path if provided/existing, else return fallback.

    Only a regular file is read. A missing path, or a path that names a
    directory, yields the fallback sample instead of raising.

    Args:
        path: Optional path to the text file.

    Returns:
        The file contents or a small fallback sample.

    Raises:
        UnicodeDecodeError: If the file exists but is not valid UTF-8.

    >>> isinstance(load_text(None), str)
    True
    """
    # No path given, or nothing file-like there (missing, or a directory).
    if not path or not path.is_file():
        return FALLBACK
    try:
        return path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # The file vanished between the check and the read.
        return FALLBACK
|
||||
|
||||
|
||||
def normalize_newlines(text: str) -> str:
    """Normalize every line ending in the text to LF.

    Args:
        text: Raw text that may mix CRLF, CR, and LF line endings.

    Returns:
        The text with each CRLF pair and each lone CR replaced by '\\n'.

    >>> normalize_newlines("a\\r\\nb\\rc\\nd") == "a\\nb\\nc\\nd"
    True
    """
    # Order matters: CRLF goes first so a pair becomes one LF, not two.
    for ending in ("\r\n", "\r"):
        text = text.replace(ending, "\n")
    return text
|
||||
|
||||
|
||||
def make_preview(text: str, n_chars: int = 200) -> str:
    """Return a first-N-chars preview with literal newlines escaped.

    The cut is made before escaping, so the result can be longer than
    ``n_chars`` when the previewed span contains newlines.

    Args:
        text: Input text.
        n_chars: Max characters to preview. Zero or a negative value
            yields an empty preview.

    Returns:
        Preview string (newlines shown as '\\n').

    >>> make_preview("hi\\nthere", 5)
    'hi\\\\nth'
    >>> make_preview("hello", -1)
    ''
    """
    # A negative bound would slice from the end of the text; clamp to zero.
    head = text[: max(n_chars, 0)]
    return head.replace("\n", "\\n")
|
||||
|
||||
|
||||
def report_stats(text: str) -> str:
    """Produce a human-readable stats report: total chars, unique chars.

    Args:
        text: Input text.

    Returns:
        A two-line report string: the total character count on the first
        line and the count of distinct characters on the second, with no
        stray whitespace around the line break.

    >>> "Total chars:" in report_stats("abaca")
    True
    >>> report_stats("abaca").splitlines()
    ['Total chars: 5', 'Unique chars: 3']
    """
    total = len(text)
    # A set keeps one entry per distinct character.
    uniq = len(set(text))
    return f"Total chars: {total}\nUnique chars: {uniq}"
|
||||
|
||||
|
||||
def main() -> None:
    """Entry point: load, normalize, and report a preview."""
    preview_len = 200
    # Look for data.txt next to this script, as the module notes describe,
    # rather than relative to the current working directory.
    data_path = Path(__file__).resolve().parent / "data.txt"
    text = normalize_newlines(load_text(data_path))
    print(report_stats(text))
    print(f"Preview ({preview_len} chars, \\n shown literally):")
    print(make_preview(text, preview_len))
|
||||
|
||||
|
||||
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user