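"""Maintenance routines for the model's on-disk state.

Prunes malformed vocabulary tokens, low-scoring dream entries, old context
memory, and weakly connected brainmap words, logging each pass to
data/logs/cleanup.log.
"""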
import re
import json
import os
import time

from model.tokenizer import VOCAB_PATH
from model.dreams import DREAM_LOG_PATH
from context.context import CONTEXT_FILE
from model.brainmap import load_brainmap, save_brainmap

CLEANUP_LOG = "data/logs/cleanup.log"


def log(msg):
    os.makedirs(os.path.dirname(CLEANUP_LOG), exist_ok=True)
    with open(CLEANUP_LOG, "a", encoding="utf-8") as f:
        f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} | {msg}\n")


def cleanup_vocab():
    if not os.path.exists(VOCAB_PATH):
        return

    with open(VOCAB_PATH, "r", encoding="utf-8") as f:
        vocab = json.load(f)

    removed = []
    for word in list(vocab.keys()):
        if re.search(r"[^\w-]", word):
            removed.append(word)
            del vocab[word]
        elif len(word) <= 2 and not word.isalpha():
            removed.append(word)
            del vocab[word]
        elif "\ufffd" in word:
            removed.append(word)
            del vocab[word]

    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)

    if removed:
        log(f"Removed {len(removed)} malformed tokens: {removed[:5]}...")


def cleanup_dreams():
    if not os.path.exists(DREAM_LOG_PATH):
        return

    with open(DREAM_LOG_PATH, "r", encoding="utf-8") as f:
        dreams = json.load(f)

    filtered = [d for d in dreams if d["score"] >= 0.3][:100]

    with open(DREAM_LOG_PATH, "w", encoding="utf-8") as f:
        json.dump(filtered, f, indent=2)

    if len(filtered) < len(dreams):
        log(f"Removed {len(dreams) - len(filtered)} low-score dreams")


def cleanup_context():
    if not os.path.exists(CONTEXT_FILE):
        return

    with open(CONTEXT_FILE, "r", encoding="utf-8") as f:
        context = json.load(f)

    filtered = context[-100:]

    with open(CONTEXT_FILE, "w", encoding="utf-8") as f:
        json.dump(filtered, f, indent=2)

    if len(filtered) < len(context):
        log(f"Trimmed context memory from {len(context)} → {len(filtered)}")


def cleanup_brainmap(min_neighbors=2, min_strength=2):
    load_brainmap()

    from model.brainmap import brainmap  # after load

    to_delete = []

    for word, neighbors in brainmap.items():
        # Remove weak neighbors
        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
        for n in weak_neighbors:
            del neighbors[n]

        # Mark lonely words
        if len(neighbors) < min_neighbors:
            to_delete.append(word)

    for word in to_delete:
        del brainmap[word]

    save_brainmap()

    if to_delete:
        log(f"Pruned {len(to_delete)} weak brainmap words")


def full_cleanup():
    cleanup_vocab()
    cleanup_dreams()
    cleanup_context()
    cleanup_brainmap()
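

# Usage sketch (an assumption, not part of the original module): since the
# cleanup helpers are plain module-level functions, a manual pass could be
# triggered by running this file directly.
if __name__ == "__main__":
    full_cleanup()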