import re
import json
import os
import shutil

import numpy as np
from sklearn.cluster import KMeans

from utils.unicleaner import clean_unicode

BRAINMAP_PATH = "data/memory/brainmap.json"  # actual connection data
BRAINMAP_CACHE_PATH = "data/memory/brainmap_cache.json"  # for dashboard rendering only

brainmap = {}

MAX_CONNECTIONS = 50  # Max neighbors to keep per word


def is_valid_brainword(word: str) -> bool:
    word = clean_unicode(word.strip())

    if len(word) < 3:
        return False
    if re.fullmatch(r"\d+", word):  # Pure numbers
        return False
    if re.fullmatch(r"(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|xiv|xv)", word.lower()):  # Roman numerals I-XV
        return False
    if not word.isascii():
        return False
    if re.search(r"[^a-zA-Z0-9\-]", word):  # Block weird characters except dash
        return False
    return True
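
# Illustrative expectations (added for clarity, assuming clean_unicode leaves plain
# ASCII input unchanged):
#   is_valid_brainword("hello")  -> True
#   is_valid_brainword("2024")   -> False  (pure number)
#   is_valid_brainword("vii")    -> False  (roman numeral)
#   is_valid_brainword("co-op")  -> True   (dashes are allowed)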


def load_brainmap():
    global brainmap
    if os.path.exists(BRAINMAP_PATH):
        with open(BRAINMAP_PATH, "r", encoding="utf-8") as f:
            brainmap = json.load(f)


def save_brainmap():
    with open(BRAINMAP_PATH, "w", encoding="utf-8") as f:
        json.dump(brainmap, f, indent=2)


def add_to_brainmap(words):
    if isinstance(words, str):
        words = words.split()

    cleaned_words = [w.lower() for w in words if is_valid_brainword(w)]

    updated = False

    for i, word in enumerate(cleaned_words):
        if word not in brainmap:
            brainmap[word] = {}
            updated = True

        # Co-occurrence window: two words before and two words after
        neighbors = cleaned_words[max(0, i - 2):i] + cleaned_words[i + 1:i + 3]
        for neighbor in neighbors:
            if neighbor == word or not is_valid_brainword(neighbor):
                continue
            previous_count = brainmap[word].get(neighbor, 0)
            brainmap[word][neighbor] = previous_count + 1
            if previous_count == 0:
                updated = True

        # Limit neighbors to the strongest MAX_CONNECTIONS links
        if len(brainmap[word]) > MAX_CONNECTIONS:
            brainmap[word] = dict(
                sorted(brainmap[word].items(), key=lambda x: x[1], reverse=True)[:MAX_CONNECTIONS]
            )

    if updated:
        save_brainmap()
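
# Illustrative example (added, not in the original source), assuming clean_unicode
# leaves plain ASCII words unchanged: starting from an empty brainmap,
#     add_to_brainmap("alpha beta gamma delta")
# links each word to up to two neighbors on either side, so brainmap["beta"]
# becomes {"alpha": 1, "gamma": 1, "delta": 1}.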


def prune_brainmap(min_neighbors=2, min_strength=2):
    """
    Remove weakly connected or isolated words from the brainmap.

    Args:
        min_neighbors (int): Minimum neighbors required to keep a word.
        min_strength (int): Minimum strength (connection count) for neighbors.
    """
    global brainmap
    to_delete = []

    for word, neighbors in brainmap.items():
        # Clean weak neighbors
        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
        for n in weak_neighbors:
            del neighbors[n]

        # Delete word if too few neighbors remain
        if len(neighbors) < min_neighbors:
            to_delete.append(word)

    for word in to_delete:
        del brainmap[word]

    save_brainmap()


def get_brainmap():
    return brainmap


def refresh_brainmap_cache(min_weight=2, max_nodes=300):
    """
    Generates a clustered brainmap view and writes it to:
    - data/memory/brainmap_cache.json (master copy)
    - static/brainmap.json (served to frontend)
    """
    map_data = get_brainmap()
    links = []
    seen_words = set()

    for word, connections in map_data.items():
        if not isinstance(connections, dict):
            print(f"[Brainmap] Skipping corrupted entry: {word} => {type(connections)}")
            continue
        for linked_word, weight in connections.items():
            if weight >= min_weight:
                links.append({
                    "source": word,
                    "target": linked_word,
                    "value": weight
                })
                seen_words.add(word)
                seen_words.add(linked_word)

    node_set = {link["source"] for link in links} | {link["target"] for link in links}
    nodes = sorted(node_set)
    if len(nodes) > max_nodes:
        nodes = nodes[:max_nodes]
        node_set = set(nodes)
        links = [l for l in links if l["source"] in node_set and l["target"] in node_set]

    # Build adjacency vectors so words with similar neighborhoods cluster together
    index_lookup = {word: i for i, word in enumerate(nodes)}
    word_vectors = []
    for word in nodes:
        vec = np.zeros(len(nodes), dtype=np.float32)
        connections = map_data.get(word, {})
        for other, strength in connections.items():
            if other in index_lookup:
                vec[index_lookup[other]] = strength
        word_vectors.append(vec)

    if len(word_vectors) < 2:
        print("[Brainmap] Not enough nodes to cluster.")
        return

    kmeans = KMeans(n_clusters=min(8, len(nodes)), n_init="auto")
    labels = kmeans.fit_predict(word_vectors)
    clustered_nodes = [{"id": word, "group": int(label)} for word, label in zip(nodes, labels)]

    output = {
        "nodes": clustered_nodes,
        "links": links
    }

    os.makedirs("data/memory", exist_ok=True)
    os.makedirs("static", exist_ok=True)

    cache_path = BRAINMAP_CACHE_PATH
    static_path = "static/brainmap.json"

    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    shutil.copyfile(cache_path, static_path)
    # print(f"[Brainmap] Cache written to {cache_path} and copied to {static_path}")
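

if __name__ == "__main__":
    # Minimal usage sketch (illustrative addition, not part of the original module).
    # Assumes the default paths above and scikit-learn >= 1.2 (for n_init="auto").
    os.makedirs("data/memory", exist_ok=True)  # save_brainmap() expects this directory
    load_brainmap()
    add_to_brainmap("neural networks learn patterns from data and networks adapt quickly")
    prune_brainmap(min_neighbors=1, min_strength=1)
    refresh_brainmap_cache(min_weight=1)
    print(f"[Brainmap] Tracking {len(get_brainmap())} words")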