# Ruby/utils/unicleaner.py

import unicodedata
import re

# Patterns and lookup tables, built once at import time.
RE_SPACES = re.compile(r"\s+")
RE_CONTROL_CHARS = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
# NOTE: despite the RE_ prefix, the two names below are plain
# character -> replacement mappings, not compiled regexes.
RE_QUOTES = {
    '\u2018': "'",  # Left single quotation mark
    '\u2019': "'",  # Right single quotation mark
    '\u201C': '"',  # Left double quotation mark
    '\u201D': '"',  # Right double quotation mark
    '\u201E': '"',  # Double low-9 quotation mark
    '\u201F': '"',  # Double high-reversed-9 quotation mark
}
RE_DASHES = {
    '\u2013': '-',  # En dash
    '\u2014': '-',  # Em dash
}

# One-pass translation table: typographic quotes and dashes become their
# ASCII equivalents, and the byte order mark (U+FEFF) is deleted.
_CHAR_TABLE = str.maketrans({**RE_QUOTES, **RE_DASHES, '\ufeff': None})


def _space_or_drop(match):
    """Return a space for a whitespace control character, else an empty string."""
    return ' ' if match.group().isspace() else ''


def clean_unicode(text: str) -> str:
    """Return *text* normalized to a tidy, single-spaced form.

    The steps, in order:

    1. NFKC-normalize, so compatibility forms (fullwidth letters, small or
       vertical dash variants, no-break spaces, ...) are folded before any
       replacement looks at them.
    2. Map the characters in ``RE_QUOTES`` and ``RE_DASHES`` to ASCII and
       delete the byte order mark (U+FEFF).
    3. Handle everything matched by ``RE_CONTROL_CHARS``: whitespace controls
       such as tab, newline and carriage return become a space so the words
       around them stay separated; every other control character is deleted.
    4. Collapse each run of whitespace to a single space and strip both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if nothing but whitespace,
        control characters and BOMs was supplied.
    """
    # Normalize first: NFKC can itself produce en/em dashes (e.g. from
    # U+FE58 SMALL EM DASH), which the table below must then still catch.
    text = unicodedata.normalize('NFKC', text)

    # Quotes, dashes and BOM in a single C-level pass.
    text = text.translate(_CHAR_TABLE)

    # Deleting "\n" or "\t" outright would fuse neighbouring words, so
    # whitespace controls are turned into spaces; the rest are removed.
    text = RE_CONTROL_CHARS.sub(_space_or_drop, text)

    # Collapse whitespace runs (including spaces left around removed
    # control characters), then trim the ends.
    return RE_SPACES.sub(' ', text).strip()