# Listing header (as captured from the file viewer): Python, 41 lines, 1.2 KiB
import re
import unicodedata

# Precompiled regexes (compiled once at import time).
RE_SPACES = re.compile(r"\s+")
RE_CONTROL_CHARS = re.compile(r"[\u0000-\u001F\u007F-\u009F]")

# NOTE: despite the RE_ prefix, the two tables below are plain dicts mapping a
# single character to its ASCII replacement; the names are kept unchanged so
# existing imports of these constants keep working.
RE_QUOTES = {
    '\u2018': "'",  # Left single quotation mark
    '\u2019': "'",  # Right single quotation mark
    '\u201C': '"',  # Left double quotation mark
    '\u201D': '"',  # Right double quotation mark
    '\u201E': '"',  # Double low-9 quotation mark
    '\u201F': '"',  # Double high-reversed-9 quotation mark
}
RE_DASHES = {
    '\u2013': '-',  # En dash
    '\u2014': '-',  # Em dash
}

# Characters deleted outright: C0/C1 controls plus the BOM (U+FEFF).
# The `(?!\s)` lookahead spares controls that are whitespace (tab, newline,
# carriage return, ...), so they reach the whitespace-collapsing step and
# become a single space instead of silently gluing adjacent words together.
_RE_STRIP_CHARS = re.compile(r"(?!\s)[\u0000-\u001F\u007F-\u009F\uFEFF]")

# Single translation table so quotes and dashes are replaced in one pass.
_PUNCT_TABLE = str.maketrans({**RE_QUOTES, **RE_DASHES})


def clean_unicode(text: str) -> str:
    """Return *text* with typographic noise normalized to plain characters.

    Steps, in order:

    1. Delete the BOM (U+FEFF) and non-whitespace control characters.
    2. Apply NFKC normalization (folds fullwidth forms, compatibility
       variants and similar characters to their canonical equivalents).
    3. Replace curly quotes and en/em dashes with ``'``, ``"`` and ``-``.
    4. Collapse every run of whitespace to a single space.
    5. Strip leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if nothing but whitespace,
        control characters or BOMs was supplied.

    >>> clean_unicode("\\ufeff\\u201cHi\\u201d\\n\\u2014 there")
    '"Hi" - there'
    """
    # Controls go first so they cannot sit between a base character and a
    # combining mark and block composition during normalization.
    text = _RE_STRIP_CHARS.sub('', text)

    text = unicodedata.normalize('NFKC', text)

    # Done after NFKC on purpose: normalization can itself produce en/em
    # dashes (e.g. U+FE58 SMALL EM DASH -> U+2014), which must be mapped too.
    text = text.translate(_PUNCT_TABLE)

    # NFKC turns no-break and other exotic spaces into ordinary whitespace,
    # so collapsing here also covers those.
    text = RE_SPACES.sub(' ', text)

    return text.strip()