Added another learning source for Nora. Also added the requirements.

This commit is contained in:
2025-06-09 14:25:11 -04:00
parent da23742671
commit 5d53ba7cb8
14 changed files with 1070 additions and 78 deletions

88
persona_manager.py Normal file
View File

@ -0,0 +1,88 @@
# persona_manager.py
import os
import torch
import asyncio
import re
from tokenizer import CharTokenizer
from model import NoraTransformerLM
# Path where Nora's generated persona text is persisted between runs.
PERSONA_PATH = "nora_persona.txt"

# 1) A meta-prompt that explicitly tells Nora to invent a persona and avoid quoting.
#    The prompt deliberately ends with "Nora:" — generate_persona() uses that
#    marker to strip the echoed prompt from the model's raw output.
#    (Fixed "brandnew" -> "brand-new": the hyphen was lost to encoding mangling,
#    the same corruption visible in this file's comments.)
PERSONA_META_PROMPT = (
    "Below, Nora will create a brand-new identity for herself. "
    "She must NOT quote from any books or passages she has read. "
    "Instead, she should invent her own style, voice, quirks, and personality traits as if she were a completely new person. "
    "Her persona should be flirty, playful, curious, and speak in full sentences. "
    "Write at least three paragraphs in your own words.\n\n"
    "Nora, please invent and write your complete persona now:\n\nNora:"
)
async def generate_persona(model: NoraTransformerLM, tokenizer: CharTokenizer, device: str) -> str:
    """
    Have Nora compose an original persona for herself, discouraging verbatim quotes.

    The blocking model.generate call is pushed onto a worker thread so the
    event loop stays responsive. Returns the cleaned-up generated text.
    """
    # Sample up to 512 tokens at temperature 1.2 with top_k disabled (0) --
    # a looser sampling setup intended to favor creative, less-memorized text.
    raw = await asyncio.to_thread(
        model.generate,
        tokenizer,
        device,
        PERSONA_META_PROMPT,
        512,   # allow several paragraphs
        1.2,   # higher temperature for more creativity
        0,     # top_k=0 disables fixed-k truncation
    )

    # The prompt itself ends with "Nora:", so everything up to the final
    # occurrence of that marker is prompt echo -- keep only what follows.
    marker = "Nora:"
    persona_text = (raw.split(marker)[-1] if marker in raw else raw).strip()

    def _mask_repeated_runs(text: str) -> str:
        # Crude anti-quoting pass: any word-like run of 6+ characters that
        # occurs more than once is assumed to be a quoted fragment and is
        # masked out with an ellipsis placeholder.
        cleaned = text
        for m in re.finditer(r"\b[\w',]{6,}\b", text):
            token = m.group(0)
            if cleaned.count(token) > 1:
                cleaned = cleaned.replace(token, "[…]")
        return cleaned

    return _mask_repeated_runs(persona_text)
def ensure_persona_file(model: NoraTransformerLM, tokenizer: CharTokenizer, device: str):
    """
    Create nora_persona.txt on first run; do nothing if it already exists.

    Synchronous entry point: drives the async generator via asyncio.run,
    so it must not be called from inside a running event loop.
    """
    if not os.path.isfile(PERSONA_PATH):
        print("[persona] No persona found. Generating a new, original persona…")
        text = asyncio.run(generate_persona(model, tokenizer, device))
        # Persist the freshly generated persona for future sessions.
        with open(PERSONA_PATH, "w", encoding="utf-8") as fh:
            fh.write(text)
        print(f"[persona] Wrote new persona to {PERSONA_PATH}.")
async def maybe_update_persona(model: NoraTransformerLM, tokenizer: CharTokenizer, device: str):
    """
    Regenerate Nora's persona on her request, overwriting the saved file.

    Returns the newly generated persona text.
    """
    print("[persona] Updating persona at Nora's request…")
    new_persona = await generate_persona(model, tokenizer, device)
    # Overwrite the stored persona so future runs pick up the new identity.
    with open(PERSONA_PATH, "w", encoding="utf-8") as fh:
        fh.write(new_persona)
    print(f"[persona] Updated persona in {PERSONA_PATH}.")
    return new_persona