"""Helpers for normalizing messy Unicode text into a plain, single-line form."""

import re
import unicodedata

# Compiled once at import time so repeated calls reuse the same pattern object.
RE_SPACES = re.compile(r"\s+")

# Every C0/C1 control character plus DEL. NOTE: this range also matches tab,
# LF, VT, FF and CR, so applying it before whitespace collapsing glues words
# together ("a\nb" -> "ab"). clean_unicode() therefore uses _RE_STRAY_CHARS
# below; this pattern is kept unchanged for any code that imports it.
RE_CONTROL_CHARS = re.compile(r"[\u0000-\u001F\u007F-\u009F]")

# Characters deleted outright by clean_unicode(): the control characters above
# except the whitespace ones (tab, LF, VT, FF, CR, NEL), plus the byte order
# mark U+FEFF.
_RE_STRAY_CHARS = re.compile(
    r"[\u0000-\u0008\u000E-\u001F\u007F-\u0084\u0086-\u009F\uFEFF]"
)

# Despite the RE_ prefix these are plain lookup tables, not regexes; the names
# are kept so existing imports continue to work.
RE_QUOTES = {
    '\u2018': "'",  # Left single quotation mark
    '\u2019': "'",  # Right single quotation mark
    '\u201C': '"',  # Left double quotation mark
    '\u201D': '"',  # Right double quotation mark
    '\u201E': '"',  # Double low-9 quotation mark
    '\u201F': '"',  # Double high-reversed-9 quotation mark
}

RE_DASHES = {
    '\u2013': '-',  # En dash
    '\u2014': '-',  # Em dash
}

# Single translation table so all quote/dash substitutions happen in one pass.
_PUNCTUATION_TABLE = str.maketrans({**RE_QUOTES, **RE_DASHES})


def clean_unicode(text: str) -> str:
    """Return *text* reduced to a tidy, single-line form.

    Steps, in order:

    1. Delete byte order marks and non-whitespace control characters.
    2. Apply NFKC normalization (folds fullwidth forms, compatibility
       variants and similar characters into their canonical equivalents).
    3. Replace typographic quotes and en/em dashes with ASCII ``'``, ``"``
       and ``-``.
    4. Collapse every run of whitespace (including tabs and newlines) into
       a single space.
    5. Strip leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if nothing printable remains.
    """
    # Remove stray characters first so they cannot sit between a base
    # character and its combining mark and block NFKC composition.
    text = _RE_STRAY_CHARS.sub('', text)

    # Normalize before punctuation replacement: NFKC can itself produce en/em
    # dashes (e.g. from U+FE58 SMALL EM DASH), which must still be mapped.
    text = unicodedata.normalize('NFKC', text)

    text = text.translate(_PUNCTUATION_TABLE)

    # Whitespace control characters were deliberately kept above so that they
    # become word separators here instead of vanishing.
    text = RE_SPACES.sub(' ', text)

    return text.strip()