commit 4d4b39b4c7
parent ec82d0ab63

Added a brainmap checker; fixed the trainer and reader.
@@ -1,59 +1,98 @@
-import os
+import re
 import json
-from collections import defaultdict
+import os
 from utils.unicleaner import clean_unicode
 
-BRAINMAP_FILE = "data/memory/brainmap.json"
+BRAINMAP_PATH = "data/memory/brainmap.json"
+brainmap = {}
 
+MAX_CONNECTIONS = 50  # Max neighbors to keep per word
+
+
+def is_valid_brainword(word: str) -> bool:
+    word = clean_unicode(word.strip())
+
+    if len(word) < 3:
+        return False
+    if re.fullmatch(r"\d+", word):  # Pure numbers
+        return False
+    if re.fullmatch(r"(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|xiv|xv)", word.lower()):
+        return False
+    if not word.isascii():
+        return False
+    if re.search(r"[^a-zA-Z0-9\-]", word):  # Block weird characters except dash
+        return False
+    return True
 
 
 def load_brainmap():
-    if os.path.exists(BRAINMAP_FILE):
-        with open(BRAINMAP_FILE, "r", encoding="utf-8") as f:
-            return json.load(f)
-    return {}
+    global brainmap
+    if os.path.exists(BRAINMAP_PATH):
+        with open(BRAINMAP_PATH, "r", encoding="utf-8") as f:
+            brainmap = json.load(f)
 
 
-def save_brainmap(map_data):
-    with open(BRAINMAP_FILE, "w", encoding="utf-8") as f:
-        json.dump(map_data, f, indent=2)
+def save_brainmap():
+    with open(BRAINMAP_PATH, "w", encoding="utf-8") as f:
+        json.dump(brainmap, f, indent=2)
 
 
-brain_map = load_brainmap()
-
-
-def update_brainmap(words):
-    for i, word in enumerate(words):
-        for j in range(i+1, len(words)):
-            w1 = word
-            w2 = words[j]
-            if w1 == w2:
-                continue
-            if w1 not in brain_map:
-                brain_map[w1] = {}
-            if w2 not in brain_map[w1]:
-                brain_map[w1][w2] = 0
-            brain_map[w1][w2] += 1
-    save_brainmap(brain_map)
+def add_to_brainmap(words):
+    if isinstance(words, str):
+        words = words.split()
+
+    cleaned_words = [w.lower() for w in words if is_valid_brainword(w)]
+
+    updated = False
+    for i, word in enumerate(cleaned_words):
+        if word not in brainmap:
+            brainmap[word] = {}
+            updated = True
+
+        neighbors = cleaned_words[max(0, i-2):i] + cleaned_words[i+1:i+3]
+        for neighbor in neighbors:
+            if neighbor == word or not is_valid_brainword(neighbor):
+                continue
+            previous_count = brainmap[word].get(neighbor, 0)
+            brainmap[word][neighbor] = previous_count + 1
+            if previous_count == 0:
+                updated = True
+
+        # Limit neighbors
+        if len(brainmap[word]) > MAX_CONNECTIONS:
+            brainmap[word] = dict(sorted(brainmap[word].items(), key=lambda x: x[1], reverse=True)[:MAX_CONNECTIONS])
+
+    if updated:
+        save_brainmap()
+
+
+def prune_brainmap(min_neighbors=2, min_strength=2):
+    """
+    Remove weakly connected or isolated words from the brainmap.
+
+    Args:
+        min_neighbors (int): Minimum neighbors required to keep a word.
+        min_strength (int): Minimum strength (connection count) for neighbors.
+    """
+    global brainmap
+    to_delete = []
+
+    for word, neighbors in brainmap.items():
+        # Clean weak neighbors
+        weak_neighbors = [n for n, count in neighbors.items() if count < min_strength]
+        for n in weak_neighbors:
+            del neighbors[n]
+
+        # Delete word if too few neighbors remain
+        if len(neighbors) < min_neighbors:
+            to_delete.append(word)
+
+    for word in to_delete:
+        del brainmap[word]
+
+    save_brainmap()
 
 
 def get_brainmap():
-    return brain_map
-
-
-def fix_brainmap(brainmap: dict) -> dict:
-    cleaned_brainmap = {}
-
-    for word, value in brainmap.items():
-        cleaned_word = clean_unicode(word.strip())
-
-        # Skip bad entries
-        if not cleaned_word or cleaned_word in {"...", "-", "--", "''", '""'}:
-            continue
-
-        # Merge duplicates (case-insensitive optional)
-        if cleaned_word in cleaned_brainmap:
-            cleaned_brainmap[cleaned_word] += value
-        else:
-            cleaned_brainmap[cleaned_word] = value
-
-    return cleaned_brainmap
+    return brainmap
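Note on the rewritten graph update: add_to_brainmap replaces the old update_brainmap, which linked every word to every later word in the message (quadratic in message length, and it wrote the file on every call). The new version links each valid, lowercased word only to its neighbors within two positions on either side, bumps a per-pair count, trims each word to its MAX_CONNECTIONS strongest neighbors, and saves only when something changed. A minimal usage sketch, assuming the module is importable as model.brainmap (the path the trainer's import suggests) and that data/memory/ exists for the save:

# Sketch, not part of the commit: exercise the new windowed update.
from model.brainmap import add_to_brainmap, get_brainmap, load_brainmap

load_brainmap()  # populate the module-level dict from disk, if present
add_to_brainmap("The quick brown fox jumps over the lazy dog")

# Each word maps its +/-2-position neighbors to co-occurrence counts, e.g.
# get_brainmap()["quick"] -> {"the": 1, "brown": 1, "fox": 1}
print(get_brainmap().get("quick"))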
model/brainmap_analysis.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import json
+import os
+
+BRAINMAP_PATH = "data/memory/brainmap.json"
+
+
+def analyze_brainmap(path=BRAINMAP_PATH):
+    if not os.path.exists(path):
+        print("No brainmap found.")
+        return
+
+    with open(path, "r", encoding="utf-8") as f:
+        brainmap = json.load(f)
+
+    total_words = len(brainmap)
+    total_neighbors = 0
+    orphan_words = 0
+    weak_links = 0
+
+    for word, neighbors in brainmap.items():
+        num_neighbors = len(neighbors)
+        total_neighbors += num_neighbors
+
+        if num_neighbors <= 1:
+            orphan_words += 1
+
+        weak_links += sum(1 for strength in neighbors.values() if strength <= 2)
+
+    avg_neighbors = total_neighbors / total_words if total_words else 0
+
+    print(f"📖 Brainmap Analysis:")
+    print(f"- Total Words: {total_words}")
+    print(f"- Average Neighbors per Word: {avg_neighbors:.2f}")
+    print(f"- Orphan Words (<=1 neighbor): {orphan_words}")
+    print(f"- Weak Connections (strength <=2): {weak_links}")
+
+
+if __name__ == "__main__":
+    analyze_brainmap()
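The checker reads the brainmap JSON and prints headline stats: word count, average degree, orphan words, and weak links. Since analyze_brainmap takes a path parameter, it can be pointed at a throwaway file for a quick sanity check; the toy graph below is invented for illustration:

# Sketch, not part of the commit: run the analyzer on a hand-built map.
import json
from model.brainmap_analysis import analyze_brainmap

sample = {
    "fox": {"quick": 3, "lazy": 1},  # "lazy" is a weak link (strength <= 2)
    "dog": {"lazy": 4},              # one neighbor -> counted as an orphan
    "xyzzy": {},                     # no neighbors -> also an orphan
}
with open("sample_brainmap.json", "w", encoding="utf-8") as f:
    json.dump(sample, f)

analyze_brainmap(path="sample_brainmap.json")
# Expected: 3 words, 1.00 avg neighbors, 2 orphans, 1 weak connection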
@@ -2,7 +2,7 @@ import torch
 import time
 from model.dynamic_expand import expand_model_if_needed, _last_expansion_time, get_optimizer, expand_lock
 from model.brain_state import model, tokenizer, DEVICE, loss_fn
-from model.brainmap import update_brainmap
+from model.brainmap import add_to_brainmap
 from context.context import add_to_context, get_recent_context
 
 LOSS_FILE = "data/logs/loss.log"
@@ -69,7 +69,7 @@ def train_on_message(text: str, source: str = "user"):
         log_loss(loss.item())
         log_vocab_growth()
         add_to_context(text, source=source)
-        update_brainmap(augmented_text.split())
+        add_to_brainmap(augmented_text.split())
 
     finally:
         expand_lock.release()
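The trainer change is just the rename at both call sites; behaviorally, each trained message now feeds the windowed, validated update above instead of the old all-pairs one. Since add_to_brainmap splits strings itself, the explicit .split() at the call site is harmless but optional — a sketch of the two equivalent forms:

# Sketch, not part of the commit: both call forms reach the same code path.
from model.brainmap import add_to_brainmap

add_to_brainmap("hello brave new world")             # split internally
add_to_brainmap(["hello", "brave", "new", "world"])  # pre-tokenized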
@@ -1,13 +1,13 @@
 import os
 import asyncio
+import json
 from model.trainer import train_on_message
 from model.scheduler import set_next_action
 from reader.filter import is_valid_line
-import json
 
 BOOK_DIR = "data/books"
 PROGRESS_FILE = "data/memory/book_progress.json"
-READ_DELAY = 0.2  # seconds between lines
+READ_DELAY = 0.2  # seconds between paragraphs
 PARAGRAPH_MIN_LENGTH = 20
 
 
@@ -19,7 +19,7 @@ def load_progress():
     if os.path.exists(PROGRESS_FILE):
         with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
             return json.load(f)
-    return {}
+    return {"progress": {}, "completed": []}
 
 
 def save_progress(prog):
@@ -29,9 +29,23 @@ def save_progress(prog):
 
 async def read_books_forever():
     books = get_books()
-    progress = load_progress()
+    progress_data = load_progress()
+    progress = progress_data.get("progress", {})
+    completed_books = progress_data.get("completed", [])
 
     while True:
-        for book in books:
+        # Filter out completed books
+        available_books = [b for b in books if b not in completed_books]
+
+        if not available_books:
+            print("[Reader] All books completed. Resetting progress.")
+            progress_data = {"progress": {}, "completed": []}
+            save_progress(progress_data)
+            available_books = books  # Re-enable all books
+            progress = {}
+            completed_books = []
+
+        for book in available_books:
             path = os.path.join(BOOK_DIR, book)
             if not os.path.exists(path):
                 continue
@@ -56,10 +70,12 @@ async def read_books_forever():
                     paragraph += " " + line
 
             progress[book] = idx
-            save_progress(progress)
+            progress_data["progress"] = progress
+            save_progress(progress_data)
 
-            # train last paragraph if any
-            if paragraph and len(paragraph) > PARAGRAPH_MIN_LENGTH:
-                train_on_message(paragraph.strip(), source="book")
-                await asyncio.sleep(READ_DELAY)
-            set_next_action(READ_DELAY, "Reading")
+            # End of book
+            if idx >= len(lines):
+                print(f"[Reader] Finished reading {book}.")
+                completed_books.append(book)
+                progress_data["completed"] = list(set(completed_books))  # Avoid duplicates
+                save_progress(progress_data)
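Reader note: load_progress now returns {"progress": {...}, "completed": [...]} instead of a bare {book: line_index} map, and finished books are skipped until every book is done, at which point progress resets. An existing old-format book_progress.json would come back through progress_data.get("progress", {}) as empty, silently dropping saved positions; a hypothetical one-off migration (the helper name and layout are assumptions, not part of the commit) could preserve them:

# Hypothetical migration for pre-existing flat progress files.
import json

PROGRESS_FILE = "data/memory/book_progress.json"

def migrate_progress_file():
    with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)
    if "progress" not in data:  # old flat {book: line_index} layout
        wrapped = {"progress": data, "completed": []}
        with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
            json.dump(wrapped, f, indent=2)

if __name__ == "__main__":
    migrate_progress_file()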