import json
import os
import re
import time

from context.context import CONTEXT_FILE
from model.memory import DREAM_LOG_PATH
from model.tokenizer import VOCAB_PATH

CLEANUP_LOG = "data/logs/cleanup.log"


def log(msg):
    """Append a timestamped message to the cleanup log."""
    os.makedirs(os.path.dirname(CLEANUP_LOG), exist_ok=True)
    with open(CLEANUP_LOG, "a", encoding="utf-8") as f:
        f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} | {msg}\n")


def cleanup_vocab():
    """Drop malformed tokens from the vocabulary file."""
    if not os.path.exists(VOCAB_PATH):
        return
    with open(VOCAB_PATH, "r", encoding="utf-8") as f:
        vocab = json.load(f)
    removed = []
    for word in list(vocab.keys()):
        if (
            re.search(r"[^\w-]", word)  # characters outside word chars / hyphen
            or (len(word) <= 2 and not word.isalpha())  # short non-alphabetic fragments
            or "\ufffd" in word  # Unicode replacement character (encoding damage)
        ):
            removed.append(word)
            del vocab[word]
    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)
    if removed:
        log(f"Removed {len(removed)} malformed tokens: {removed[:5]}...")


def cleanup_dreams():
    """Drop low-score dream entries and cap the log at 100 entries."""
    if not os.path.exists(DREAM_LOG_PATH):
        return
    with open(DREAM_LOG_PATH, "r", encoding="utf-8") as f:
        dreams = json.load(f)
    # Entries missing a "score" key are treated as 0.0 and dropped rather
    # than raising a KeyError; at most the first 100 survivors are kept.
    filtered = [d for d in dreams if d.get("score", 0.0) >= 0.3][:100]
    with open(DREAM_LOG_PATH, "w", encoding="utf-8") as f:
        json.dump(filtered, f, indent=2)
    if len(filtered) < len(dreams):
        log(f"Removed {len(dreams) - len(filtered)} low-score dreams")


def cleanup_context():
    """Trim the context memory to its most recent 100 entries."""
    if not os.path.exists(CONTEXT_FILE):
        return
    with open(CONTEXT_FILE, "r", encoding="utf-8") as f:
        context = json.load(f)
    filtered = context[-100:]
    with open(CONTEXT_FILE, "w", encoding="utf-8") as f:
        json.dump(filtered, f, indent=2)
    if len(filtered) < len(context):
        log(f"Trimmed context memory from {len(context)} → {len(filtered)}")


def full_cleanup():
    """Run all cleanup passes: vocabulary, dream log, context memory."""
    cleanup_vocab()
    cleanup_dreams()
    cleanup_context()
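

# A minimal entry-point sketch (an assumption; the original module defined no
# CLI or __main__ guard): running the file directly performs one full cleanup
# pass, which is convenient for cron jobs or manual maintenance.
if __name__ == "__main__":
    full_cleanup()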