Prevented new books from being uploaded by mistake.

Added a graph to track vocab growth.
This commit is contained in:
Dani 2025-04-26 23:45:06 -04:00
parent a9b4871420
commit 8d7cf38f1b
4 changed files with 85 additions and 3 deletions

3
.gitignore vendored
View File

@ -169,8 +169,7 @@ cython_debug/
#.idea/
.vscode/launch.json
/data/books/alice_in_wonderland.txt
/data/books/wizard_of_oz.txt
/data/books/*
/data/memory/context.json
/data/memory/dreams.json
data/memory/brainmap.json

View File

@ -10,6 +10,7 @@ from context.context import load_context
import json
import os
import time
import datetime
app = Flask(__name__)
@ -27,6 +28,21 @@ def load_loss_data():
return [float(line.strip().split(",")[1]) for line in lines[-50:]]
def load_vocab_growth():
    """Load vocab-growth samples from the log as (time_str, vocab_size) tuples.

    Each log line has the form ``"<unix_timestamp>,<vocab_size>"`` (written by
    ``log_vocab_growth``).  Returns an empty list when the log file does not
    exist.  Blank or malformed lines are skipped instead of raising, so a
    partially written line cannot take down the dashboard page.
    """
    path = "data/logs/vocab_growth.log"
    if not os.path.exists(path):
        return []
    data = []
    with open(path, "r", encoding="utf-8") as f:
        # Stream line-by-line rather than readlines(): same result, no
        # intermediate list for a log that grows on every training step.
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                timestamp, vocab_size = line.split(",")
                # Reformat the raw epoch timestamp to a human-readable label.
                readable_time = datetime.datetime.fromtimestamp(
                    float(timestamp)
                ).strftime("%H:%M:%S")
                data.append((readable_time, int(vocab_size)))
            except ValueError:
                # Wrong field count or non-numeric fields: skip the bad line.
                continue
    return data
def update_next_cycle(seconds):
    """Schedule the next training cycle *seconds* from now.

    Stores an absolute epoch deadline in the module-global
    ``next_cycle_time``, which the dashboard reads for its countdown.
    """
    global next_cycle_time
    deadline = time.time() + seconds
    next_cycle_time = deadline
@ -55,10 +71,13 @@ def growth():
vocab_size = len(tokenizer.vocab)
brainmap_size = len(get_brainmap())
memory_size = len(load_context())
vocab_growth = load_vocab_growth()
return render_template("growth.html",
vocab_size=vocab_size,
brainmap_size=brainmap_size,
memory_size=memory_size)
memory_size=memory_size,
vocab_growth=vocab_growth)
@app.route("/brainmap")

View File

@ -2,6 +2,7 @@
<html lang="en">
<head>
<meta charset="UTF-8">
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<title>Ruby's Brain Growth</title>
<style>
body {
@ -46,5 +47,61 @@
<div class="stat">Brain Map Size: {{ brainmap_size }}</div>
<div class="stat">Memory Entries: {{ memory_size }}</div>
<div class="divider"></div>
<h2>🧠 Vocabulary Growth Over Time</h2>
<canvas id="vocabChart" width="600" height="300"></canvas>
<script>
// Render the vocabulary-growth line chart using Chart.js
// (loaded from the CDN <script> tag added in <head>).
const ctx = document.getElementById('vocabChart').getContext('2d');
const vocabData = {
// X-axis labels: server-rendered time strings (entry[0] of each
// (time_str, vocab_size) tuple passed in as `vocab_growth`).
labels: [
{% for entry in vocab_growth %}
"{{ entry[0] }}",
{% endfor %}
],
datasets: [{
label: 'Vocab Size',
// Y values: vocab size (entry[1]), emitted in the same order as labels.
data: [
{% for entry in vocab_growth %}
{{ entry[1] }},
{% endfor %}
],
fill: true,
borderColor: 'rgb(75, 192, 192)',
backgroundColor: 'rgba(75, 192, 192, 0.2)',
tension: 0.3
}]
};
// Build the chart itself; labels/data come from vocabData above.
const vocabChart = new Chart(ctx, {
type: 'line',
data: vocabData,
options: {
scales: {
x: {
// Thin out the time labels so a long log stays readable.
ticks: {
autoSkip: true,
maxTicksLimit: 10 // only show up to 10 x-axis labels
},
title: {
display: true,
text: 'Time'
}
},
y: {
title: {
display: true,
text: 'Vocabulary Size'
},
beginAtZero: true
}
}
}
});
</script>
</body>
</html>

View File

@ -5,6 +5,12 @@ from model.brain_state import model, tokenizer, DEVICE, loss_fn
from context.context import add_to_context, get_recent_context
LOSS_FILE = "data/logs/loss.log"
VOCAB_GROWTH_FILE = "data/logs/vocab_growth.log"
def log_vocab_growth():
    """Append one ``"<epoch>,<vocab size>"`` sample to the vocab growth log."""
    sample = f"{time.time()},{len(tokenizer.vocab)}\n"
    with open(VOCAB_GROWTH_FILE, "a", encoding="utf-8") as log:
        log.write(sample)
def log_loss(value: float):
@ -45,4 +51,5 @@ def train_on_message(text: str, source: str = "user"):
opt.step()
log_loss(loss.item())
log_vocab_growth()
add_to_context(text, source=source)