From 4d4b39b4c71349cd9eec113015cc21c74505c964 Mon Sep 17 00:00:00 2001
From: Dani
Date: Sun, 27 Apr 2025 16:40:50 -0400
Subject: [PATCH] Added a brainmap checker; fixed the trainer and reader

---
 model/brainmap.py          | 125 ++++++++++++++++++++++++-------
 model/brainmap_analysis.py |  39 ++++++++++++
 model/trainer.py           |   4 +-
 reader/reader.py           |  38 +++++++----
 4 files changed, 150 insertions(+), 56 deletions(-)
 create mode 100644 model/brainmap_analysis.py

diff --git a/model/brainmap.py b/model/brainmap.py
index 99d34f3..ab14c67 100644
--- a/model/brainmap.py
+++ b/model/brainmap.py
@@ -1,59 +1,98 @@
-import os
+import re
 import json
-from collections import defaultdict
+import os
 from utils.unicleaner import clean_unicode
 
-BRAINMAP_FILE = "data/memory/brainmap.json"
+BRAINMAP_PATH = "data/memory/brainmap.json"
+brainmap = {}
+
+MAX_CONNECTIONS = 50  # Max neighbors to keep per word
+
+
+def is_valid_brainword(word: str) -> bool:
+    word = clean_unicode(word.strip())
+
+    if len(word) < 3:
+        return False
+    if re.fullmatch(r"\d+", word):  # Pure numbers
+        return False
+    if re.fullmatch(r"(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|xiv|xv)", word.lower()):
+        return False
+    if not word.isascii():
+        return False
+    if re.search(r"[^a-zA-Z0-9\-]", word):  # Block weird characters except dash
+        return False
+    return True
 
 
 def load_brainmap():
-    if os.path.exists(BRAINMAP_FILE):
-        with open(BRAINMAP_FILE, "r", encoding="utf-8") as f:
-            return json.load(f)
-    return {}
+    global brainmap
+    if os.path.exists(BRAINMAP_PATH):
+        with open(BRAINMAP_PATH, "r", encoding="utf-8") as f:
+            brainmap = json.load(f)
 
 
-def save_brainmap(map_data):
-    with open(BRAINMAP_FILE, "w", encoding="utf-8") as f:
-        json.dump(map_data, f, indent=2)
+def save_brainmap():
+    with open(BRAINMAP_PATH, "w", encoding="utf-8") as f:
+        json.dump(brainmap, f, indent=2)
 
 
-brain_map = load_brainmap()
+def add_to_brainmap(words):
+    if isinstance(words, str):
+        words = words.split()
 
+    cleaned_words = [w.lower() for w in words if is_valid_brainword(w)]
 
-def update_brainmap(words):
-    for i, word in enumerate(words):
-        for j in range(i+1, len(words)):
-            w1 = word
-            w2 = words[j]
-            if w1 == w2:
+    updated = False
+
+    for i, word in enumerate(cleaned_words):
+        if word not in brainmap:
+            brainmap[word] = {}
+            updated = True
+
+        neighbors = cleaned_words[max(0, i-2):i] + cleaned_words[i+1:i+3]
+        for neighbor in neighbors:
+            if neighbor == word or not is_valid_brainword(neighbor):
                 continue
-            if w1 not in brain_map:
-                brain_map[w1] = {}
-            if w2 not in brain_map[w1]:
-                brain_map[w1][w2] = 0
-            brain_map[w1][w2] += 1
-    save_brainmap(brain_map)
+            previous_count = brainmap[word].get(neighbor, 0)
+            brainmap[word][neighbor] = previous_count + 1
+            if previous_count == 0:
+                updated = True
+
+        # Limit neighbors
+        if len(brainmap[word]) > MAX_CONNECTIONS:
+            brainmap[word] = dict(sorted(brainmap[word].items(), key=lambda x: x[1], reverse=True)[:MAX_CONNECTIONS])
+
+    if updated:
+        save_brainmap()
+
+
+def prune_brainmap(min_neighbors=2, min_strength=2):
+    """
+    Remove weakly connected or isolated words from the brainmap.
+
+    Args:
+        min_neighbors (int): Minimum neighbors required to keep a word.
+        min_strength (int): Minimum strength (connection count) for neighbors.
+    """
+    global brainmap
+    to_delete = []
+
+    for word, neighbors in brainmap.items():
+        # Clean weak neighbors
+        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
+        for n in weak_neighbors:
+            del neighbors[n]
+
+        # Delete word if too few neighbors remain
+        if len(neighbors) < min_neighbors:
+            to_delete.append(word)
+
+    for word in to_delete:
+        del brainmap[word]
+
+    save_brainmap()
 
 
 def get_brainmap():
-    return brain_map
-
-
-def fix_brainmap(brainmap: dict) -> dict:
-    cleaned_brainmap = {}
-
-    for word, value in brainmap.items():
-        cleaned_word = clean_unicode(word.strip())
-
-        # Skip bad entries
-        if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
-            continue
-
-        # Merge duplicates (case-insensitive optional)
-        if cleaned_word in cleaned_brainmap:
-            cleaned_brainmap[cleaned_word] += value
-        else:
-            cleaned_brainmap[cleaned_word] = value
-
-    return cleaned_brainmap
+    return brainmap
diff --git a/model/brainmap_analysis.py b/model/brainmap_analysis.py
new file mode 100644
index 0000000..a651f97
--- /dev/null
+++ b/model/brainmap_analysis.py
@@ -0,0 +1,39 @@
+import json
+import os
+
+BRAINMAP_PATH = "data/memory/brainmap.json"
+
+
+def analyze_brainmap(path=BRAINMAP_PATH):
+    if not os.path.exists(path):
+        print("No brainmap found.")
+        return
+
+    with open(path, "r", encoding="utf-8") as f:
+        brainmap = json.load(f)
+
+    total_words = len(brainmap)
+    total_neighbors = 0
+    orphan_words = 0
+    weak_links = 0
+
+    for word, neighbors in brainmap.items():
+        num_neighbors = len(neighbors)
+        total_neighbors += num_neighbors
+
+        if num_neighbors <= 1:
+            orphan_words += 1
+
+        weak_links += sum(1 for strength in neighbors.values() if strength <= 2)
+
+    avg_neighbors = total_neighbors / total_words if total_words else 0
+
+    print(f"📖 Brainmap Analysis:")
+    print(f"- Total Words: {total_words}")
+    print(f"- Average Neighbors per Word: {avg_neighbors:.2f}")
+    print(f"- Orphan Words (<=1 neighbor): {orphan_words}")
+    print(f"- Weak Connections (strength <=2): {weak_links}")
+
+
+if __name__ == "__main__":
+    analyze_brainmap()
diff --git a/model/trainer.py b/model/trainer.py
index 2b3df70..48d45dc 100644
--- a/model/trainer.py
+++ b/model/trainer.py
@@ -2,7 +2,7 @@ import torch
 import time
 from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, get_optimizer, expand_lock
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
-from model.brainmap import update_brainmap
+from model.brainmap import add_to_brainmap
 from context.context import add_to_context, get_recent_context
 
 LOSS_FILE = "data/logs/loss.log"
@@ -69,7 +69,7 @@ def train_on_message(text: str, source: str = "user"):
             log_loss(loss.item())
             log_vocab_growth()
             add_to_context(text, source=source)
-            update_brainmap(augmented_text.split())
+            add_to_brainmap(augmented_text.split())
     finally:
         expand_lock.release()
 
diff --git a/reader/reader.py b/reader/reader.py
index 9783ea0..65f4158 100644
--- a/reader/reader.py
+++ b/reader/reader.py
@@ -1,13 +1,13 @@
 import os
 import asyncio
+import json
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
-import json
 
 BOOK_DIR = "data/books"
 PROGRESS_FILE = "data/memory/book_progress.json"
-READ_DELAY = 0.2  # seconds between lines
+READ_DELAY = 0.2  # seconds between paragraphs
 PARAGRAPH_MIN_LENGTH = 20
 
 
@@ -19,7 +19,7 @@ def load_progress():
     if os.path.exists(PROGRESS_FILE):
         with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
-    return {}
+    return {"progress": {}, "completed": []}
 
 
 def save_progress(prog):
@@ -29,9 +29,23 @@ def save_progress(prog):
 
 async def read_books_forever():
     books = get_books()
-    progress = load_progress()
+    progress_data = load_progress()
+    progress = progress_data.get("progress", {})
+    completed_books = progress_data.get("completed", [])
+
     while True:
-        for book in books:
+        # Filter out completed books
+        available_books = [b for b in books if b not in completed_books]
+
+        if not available_books:
+            print("[Reader] All books completed. Resetting progress.")
+            progress_data = {"progress": {}, "completed": []}
+            save_progress(progress_data)
+            available_books = books  # Re-enable all books
+            progress = {}
+            completed_books = []
+
+        for book in available_books:
             path = os.path.join(BOOK_DIR, book)
             if not os.path.exists(path):
                 continue
@@ -56,10 +70,12 @@ async def read_books_forever():
                    paragraph += " " + line
 
             progress[book] = idx
-            save_progress(progress)
+            progress_data["progress"] = progress
+            save_progress(progress_data)
 
-        # train last paragraph if any
-        if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-            train_on_message(paragraph.strip(), source="book")
-            await asyncio.sleep(READ_DELAY)
-        set_next_action(READ_DELAY, "Reading")
+            # End of book
+            if idx >= len(lines):
+                print(f"[Reader] Finished reading {book}.")
+                completed_books.append(book)
+                progress_data["completed"] = list(set(completed_books))  # Avoid duplicates
+                save_progress(progress_data)
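
Notes (not part of the patch):

A few illustrative inputs for the new is_valid_brainword() filter. The words are arbitrary examples; the expected results follow from the rules in the diff above:

    from model.brainmap import is_valid_brainword

    is_valid_brainword("reader")     # True  - plain ASCII word, 3+ characters
    is_valid_brainword("ab")         # False - shorter than 3 characters
    is_valid_brainword("1234")       # False - pure number
    is_valid_brainword("xiv")        # False - roman numeral
    is_valid_brainword("naïve")      # False - non-ASCII
    is_valid_brainword("self-test")  # True  - dash is the only punctuation allowed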
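
A minimal sketch of how the new brainmap API fits together, assuming the data/memory/ directory already exists; the sample sentence is made up. add_to_brainmap() accepts either a string or a list of words, writes to disk only when something changed, and caps each word at MAX_CONNECTIONS neighbors:

    from model.brainmap import load_brainmap, add_to_brainmap, prune_brainmap, get_brainmap
    from model.brainmap_analysis import analyze_brainmap

    load_brainmap()                                  # populate the in-memory dict from disk, if the file exists
    add_to_brainmap("the reader feeds every trained paragraph into the brainmap")
    print(len(get_brainmap()))                       # number of words currently tracked
    analyze_brainmap()                               # re-reads the saved file and prints summary stats
    prune_brainmap(min_neighbors=2, min_strength=2)  # drop weak links and isolated words, then save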
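
The reader's progress file also changes shape: instead of a flat {book: line_index} map it now keeps that map under "progress" plus a "completed" list, and load_progress() returns this structure when no file exists yet. A small round-trip sketch, assuming a fresh or new-format progress file; the book file names are hypothetical:

    from reader.reader import load_progress, save_progress

    data = load_progress()                          # {"progress": {...}, "completed": [...]}
    data["progress"]["example_book.txt"] = 120      # resume this book at line index 120
    data["completed"].append("finished_book.txt")   # mark another book as done
    save_progress(data)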