Added a brainmap checker; fixed the trainer and reader

parent ec82d0ab63
commit 4d4b39b4c7
model/brainmap.py
@@ -1,59 +1,98 @@
 import os
 import re
 import json
-from collections import defaultdict
-import os
 from utils.unicleaner import clean_unicode

-BRAINMAP_FILE = "data/memory/brainmap.json"
+BRAINMAP_PATH = "data/memory/brainmap.json"
+brainmap = {}
+
+MAX_CONNECTIONS = 50  # Max neighbors to keep per word
+
+
+def is_valid_brainword(word: str) -> bool:
+    word = clean_unicode(word.strip())
+
+    if len(word) < 3:
+        return False
+    if re.fullmatch(r"\d+", word):  # Pure numbers
+        return False
+    if re.fullmatch(r"(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|xiv|xv)", word.lower()):  # Roman numerals
+        return False
+    if not word.isascii():
+        return False
+    if re.search(r"[^a-zA-Z0-9\-]", word):  # Block weird characters except dash
+        return False
+    return True


 def load_brainmap():
-    if os.path.exists(BRAINMAP_FILE):
-        with open(BRAINMAP_FILE, "r", encoding="utf-8") as f:
-            return json.load(f)
-    return {}
+    global brainmap
+    if os.path.exists(BRAINMAP_PATH):
+        with open(BRAINMAP_PATH, "r", encoding="utf-8") as f:
+            brainmap = json.load(f)


-def save_brainmap(map_data):
-    with open(BRAINMAP_FILE, "w", encoding="utf-8") as f:
-        json.dump(map_data, f, indent=2)
+def save_brainmap():
+    with open(BRAINMAP_PATH, "w", encoding="utf-8") as f:
+        json.dump(brainmap, f, indent=2)


-brain_map = load_brainmap()
-
-
-def update_brainmap(words):
-    for i, word in enumerate(words):
-        for j in range(i + 1, len(words)):
-            w1 = word
-            w2 = words[j]
-            if w1 == w2:
-                continue
-            if w1 not in brain_map:
-                brain_map[w1] = {}
-            if w2 not in brain_map[w1]:
-                brain_map[w1][w2] = 0
-            brain_map[w1][w2] += 1
-    save_brainmap(brain_map)
+def add_to_brainmap(words):
+    if isinstance(words, str):
+        words = words.split()
+
+    cleaned_words = [w.lower() for w in words if is_valid_brainword(w)]
+
+    updated = False
+
+    for i, word in enumerate(cleaned_words):
+        if word not in brainmap:
+            brainmap[word] = {}
+            updated = True
+
+        neighbors = cleaned_words[max(0, i - 2):i] + cleaned_words[i + 1:i + 3]
+        for neighbor in neighbors:
+            if neighbor == word or not is_valid_brainword(neighbor):
+                continue
+            previous_count = brainmap[word].get(neighbor, 0)
+            brainmap[word][neighbor] = previous_count + 1
+            if previous_count == 0:
+                updated = True
+
+        # Keep only the MAX_CONNECTIONS strongest neighbors
+        if len(brainmap[word]) > MAX_CONNECTIONS:
+            brainmap[word] = dict(sorted(brainmap[word].items(),
+                                         key=lambda x: x[1], reverse=True)[:MAX_CONNECTIONS])
+
+    if updated:
+        save_brainmap()
+
+
+def prune_brainmap(min_neighbors=2, min_strength=2):
+    """
+    Remove weakly connected or isolated words from the brainmap.
+
+    Args:
+        min_neighbors (int): Minimum neighbors required to keep a word.
+        min_strength (int): Minimum strength (connection count) for neighbors.
+    """
+    global brainmap
+    to_delete = []
+
+    for word, neighbors in brainmap.items():
+        # Drop weak neighbors
+        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
+        for n in weak_neighbors:
+            del neighbors[n]
+
+        # Delete the word if too few neighbors remain
+        if len(neighbors) < min_neighbors:
+            to_delete.append(word)
+
+    for word in to_delete:
+        del brainmap[word]
+
+    save_brainmap()


 def get_brainmap():
-    return brain_map
+    return brainmap


 def fix_brainmap(brainmap: dict) -> dict:
     cleaned_brainmap = {}

     for word, value in brainmap.items():
         cleaned_word = clean_unicode(word.strip())

         # Skip bad entries
         if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
             continue

         # Merge duplicates
         if cleaned_word in cleaned_brainmap:
             cleaned_brainmap[cleaned_word] += value
         else:
             cleaned_brainmap[cleaned_word] = value

     return cleaned_brainmap
-    return brainmap
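For reference, the reworked module is driven as load, then add, then prune, with all state going through the module-level brainmap dict. A minimal usage sketch (the module path is taken from the trainer import below; the sample sentence is invented):

from model.brainmap import load_brainmap, add_to_brainmap, prune_brainmap, get_brainmap

load_brainmap()                                             # populate the module-level dict from disk
add_to_brainmap("quick brown foxes jump over lazy hounds")  # accepts a string or a token list

# Each valid word maps its neighbors (a +/-2 word window) to co-occurrence
# counts, capped at the MAX_CONNECTIONS strongest neighbors per word.
for word, neighbors in get_brainmap().items():
    print(word, "->", neighbors)

prune_brainmap(min_neighbors=2, min_strength=2)  # periodic cleanup of weak/isolated words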
model/brainmap_analysis.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import json
+import os
+
+BRAINMAP_PATH = "data/memory/brainmap.json"
+
+
+def analyze_brainmap(path=BRAINMAP_PATH):
+    if not os.path.exists(path):
+        print("No brainmap found.")
+        return
+
+    with open(path, "r", encoding="utf-8") as f:
+        brainmap = json.load(f)
+
+    total_words = len(brainmap)
+    total_neighbors = 0
+    orphan_words = 0
+    weak_links = 0
+
+    for word, neighbors in brainmap.items():
+        num_neighbors = len(neighbors)
+        total_neighbors += num_neighbors
+
+        if num_neighbors <= 1:
+            orphan_words += 1
+
+        weak_links += sum(1 for strength in neighbors.values() if strength <= 2)
+
+    avg_neighbors = total_neighbors / total_words if total_words else 0
+
+    print("📖 Brainmap Analysis:")
+    print(f"- Total Words: {total_words}")
+    print(f"- Average Neighbors per Word: {avg_neighbors:.2f}")
+    print(f"- Orphan Words (<=1 neighbor): {orphan_words}")
+    print(f"- Weak Connections (strength <=2): {weak_links}")
+
+
+if __name__ == "__main__":
+    analyze_brainmap()
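The checker assumes the on-disk shape that model/brainmap.py writes: a dict of words, each mapping neighbor words to integer strengths. A hand-made example of that shape, with the numbers the script would report for it (values invented):

# Illustrative contents of data/memory/brainmap.json:
brainmap = {
    "quick": {"brown": 4, "foxes": 2},  # 2 neighbors; "foxes" is a weak link (strength <= 2)
    "lazy": {"hounds": 1},              # orphan word (<= 1 neighbor) with one weak link
}
# analyze_brainmap() on this input reports: 2 total words,
# 1.50 average neighbors per word, 1 orphan word, 2 weak connections.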
model/trainer.py
@@ -2,7 +2,7 @@ import torch
 import time
 from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, get_optimizer, expand_lock
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
-from model.brainmap import update_brainmap
+from model.brainmap import add_to_brainmap
 from context.context import add_to_context, get_recent_context

 LOSS_FILE = "data/logs/loss.log"

@@ -69,7 +69,7 @@ def train_on_message(text: str, source: str = "user"):
         log_loss(loss.item())
         log_vocab_growth()
         add_to_context(text, source=source)
-        update_brainmap(augmented_text.split())
+        add_to_brainmap(augmented_text.split())

     finally:
         expand_lock.release()
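Since add_to_brainmap splits strings itself (the isinstance check above), the .split() at this call site is optional; both forms below feed the same tokens (the sentence is a stand-in for augmented_text):

add_to_brainmap("the model trains on this paragraph")          # split inside add_to_brainmap
add_to_brainmap("the model trains on this paragraph".split())  # pre-split, as the trainer does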
reader module (file path not shown in this view)
@@ -1,13 +1,13 @@
 import os
 import asyncio
 import json
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
-import json

 BOOK_DIR = "data/books"
 PROGRESS_FILE = "data/memory/book_progress.json"
-READ_DELAY = 0.2  # seconds between lines
+READ_DELAY = 0.2  # seconds between paragraphs
+PARAGRAPH_MIN_LENGTH = 20

@@ -19,7 +19,7 @@ def load_progress():
     if os.path.exists(PROGRESS_FILE):
         with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
             return json.load(f)
-    return {}
+    return {"progress": {}, "completed": []}


 def save_progress(prog):

@@ -29,9 +29,23 @@
 async def read_books_forever():
     books = get_books()
-    progress = load_progress()
+    progress_data = load_progress()
+    progress = progress_data.get("progress", {})
+    completed_books = progress_data.get("completed", [])

     while True:
-        for book in books:
+        # Filter out completed books
+        available_books = [b for b in books if b not in completed_books]
+
+        if not available_books:
+            print("[Reader] All books completed. Resetting progress.")
+            progress_data = {"progress": {}, "completed": []}
+            save_progress(progress_data)
+            available_books = books  # Re-enable all books
+            progress = {}
+            completed_books = []
+
+        for book in available_books:
             path = os.path.join(BOOK_DIR, book)
             if not os.path.exists(path):
                 continue

@@ -56,10 +70,12 @@
                     paragraph += " " + line

             progress[book] = idx
-            save_progress(progress)
+            progress_data["progress"] = progress
+            save_progress(progress_data)

             # train last paragraph if any
             if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
                 train_on_message(paragraph.strip(), source="book")
                 await asyncio.sleep(READ_DELAY)
                 set_next_action(READ_DELAY, "Reading")
+
+            # End of book
+            if idx >= len(lines):
+                print(f"[Reader] Finished reading {book}.")
+                completed_books.append(book)
+                progress_data["completed"] = list(set(completed_books))  # Avoid duplicates
+                save_progress(progress_data)
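With these changes the progress file carries two keys instead of a flat per-book dict. The shape that load_progress() and save_progress() now round-trip looks like this (book names invented):

# Illustrative contents of data/memory/book_progress.json:
progress_data = {
    "progress": {"moby_dick.txt": 1042},  # last line index reached in each unfinished book
    "completed": ["dracula.txt"],         # books read to the end; skipped until a full reset
}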