Ruby/utils/unicleaner.py

40 lines
1.2 KiB
Python

import unicodedata
import re
# Patterns and lookup tables built once at import time.
RE_SPACES = re.compile(r"\s+")
# C0/C1 control characters plus DEL (U+0000-U+001F, U+007F-U+009F).
# This range also contains whitespace controls (tab, newline, carriage
# return, NEL, ...), so it must not be blindly substituted with ''.
RE_CONTROL_CHARS = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
# NOTE: despite the RE_ prefix, the two tables below are plain
# character -> replacement dicts, not compiled regexes.
RE_QUOTES = {
    '\u2018': "'",  # Left single quotation mark
    '\u2019': "'",  # Right single quotation mark
    '\u201C': '"',  # Left double quotation mark
    '\u201D': '"',  # Right double quotation mark
    '\u201E': '"',  # Double low-9 quotation mark
    '\u201F': '"',  # Double high-reversed-9 quotation mark
}
RE_DASHES = {
    '\u2013': '-',  # En dash
    '\u2014': '-',  # Em dash
}

# One translation table so quotes, dashes and the BOM are handled in a single
# pass over the string. U+FEFF (BOM / zero width no-break space) maps to None,
# which str.translate treats as "delete".
_CHAR_TABLE = str.maketrans({**RE_QUOTES, **RE_DASHES, '\uFEFF': None})


def _replace_control(match: "re.Match[str]") -> str:
    """Return ' ' for a whitespace control character, '' for any other."""
    return ' ' if match.group().isspace() else ''


def clean_unicode(text: str) -> str:
    """Return *text* reduced to a tidy, mostly-ASCII-punctuation form.

    Steps, in order:

    1. NFKC-normalize (collapses fullwidth forms, compatibility variants).
    2. Replace typographic quotes and en/em dashes with ASCII equivalents
       and delete byte-order marks (U+FEFF).
    3. Turn whitespace control characters (tab, newline, ...) into spaces
       and delete all other control characters.
    4. Collapse every run of whitespace to a single space.
    5. Strip leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty input yields an empty string.
    """
    # Normalize first: NFKC can itself produce characters that the later
    # steps handle (e.g. U+FE58 SMALL EM DASH becomes U+2014, and some
    # compatibility characters expand to a space plus a combining mark).
    text = unicodedata.normalize('NFKC', text)
    text = text.translate(_CHAR_TABLE)
    # Whitespace controls become spaces rather than vanishing, so words
    # separated only by a newline or tab are not glued together.
    text = RE_CONTROL_CHARS.sub(_replace_control, text)
    text = RE_SPACES.sub(' ', text)
    return text.strip()