import json
import os
import re
import time

from context.context import CONTEXT_FILE
from model.memory import DREAM_LOG_PATH
from model.tokenizer import VOCAB_PATH

CLEANUP_LOG = "data/logs/cleanup.log"


def log(msg):
    """Append a timestamped message to the cleanup log."""
    os.makedirs(os.path.dirname(CLEANUP_LOG), exist_ok=True)
    with open(CLEANUP_LOG, "a", encoding="utf-8") as f:
        f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} | {msg}\n")


def cleanup_vocab():
    """Drop malformed tokens from the vocabulary file."""
    if not os.path.exists(VOCAB_PATH):
        return
    with open(VOCAB_PATH, "r", encoding="utf-8") as f:
        vocab = json.load(f)
    removed = []
    for word in list(vocab.keys()):
        if (
            re.search(r"[^\w-]", word)  # characters outside word chars / hyphen
            or (len(word) <= 2 and not word.isalpha())  # short non-alphabetic fragments
            or "\ufffd" in word  # Unicode replacement character (encoding damage)
        ):
            removed.append(word)
            del vocab[word]
    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)
    if removed:
        log(f"Removed {len(removed)} malformed tokens: {removed[:5]}...")


def cleanup_dreams():
    """Drop low-score dream entries and cap the log at 100 entries."""
    if not os.path.exists(DREAM_LOG_PATH):
        return
    with open(DREAM_LOG_PATH, "r", encoding="utf-8") as f:
        dreams = json.load(f)
    # Entries missing a "score" key are treated as 0.0 and dropped rather
    # than raising a KeyError; at most the first 100 survivors are kept.
    filtered = [d for d in dreams if d.get("score", 0.0) >= 0.3][:100]
    with open(DREAM_LOG_PATH, "w", encoding="utf-8") as f:
        json.dump(filtered, f, indent=2)
    if len(filtered) < len(dreams):
        log(f"Removed {len(dreams) - len(filtered)} low-score dreams")


def cleanup_context():
    """Trim the context memory to its most recent 100 entries."""
    if not os.path.exists(CONTEXT_FILE):
        return
    with open(CONTEXT_FILE, "r", encoding="utf-8") as f:
        context = json.load(f)
    filtered = context[-100:]
    with open(CONTEXT_FILE, "w", encoding="utf-8") as f:
        json.dump(filtered, f, indent=2)
    if len(filtered) < len(context):
        log(f"Trimmed context memory from {len(context)} → {len(filtered)}")


def full_cleanup():
    """Run all cleanup passes: vocabulary, dream log, context memory."""
    cleanup_vocab()
    cleanup_dreams()
    cleanup_context()
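

# A minimal entry-point sketch (an assumption; the original module defined no
# CLI or __main__ guard): running the file directly performs one full cleanup
# pass, which is convenient for cron jobs or manual maintenance.
if __name__ == "__main__":
    full_cleanup()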