Added another learning source for Nora. Also added the requirements.
This commit is contained in:
88
persona_manager.py
Normal file
88
persona_manager.py
Normal file
@ -0,0 +1,88 @@
|
||||
# persona_manager.py
|
||||
|
||||
import os
|
||||
import torch
|
||||
import asyncio
|
||||
import re
|
||||
|
||||
from tokenizer import CharTokenizer
|
||||
from model import NoraTransformerLM
|
||||
|
||||
PERSONA_PATH = "nora_persona.txt"
|
||||
|
||||
# 1) A meta-prompt that explicitly tells Nora to invent a persona and avoid quoting:
|
||||
PERSONA_META_PROMPT = (
|
||||
"Below, Nora will create a brand‐new identity for herself. "
|
||||
"She must NOT quote from any books or passages she has read. "
|
||||
"Instead, she should invent her own style, voice, quirks, and personality traits as if she were a completely new person. "
|
||||
"Her persona should be flirty, playful, curious, and speak in full sentences. "
|
||||
"Write at least three paragraphs in your own words.\n\n"
|
||||
"Nora, please invent and write your complete persona now:\n\nNora:"
|
||||
)
|
||||
|
||||
|
||||
async def generate_persona(model: NoraTransformerLM, tokenizer: CharTokenizer, device: str) -> str:
|
||||
"""
|
||||
Ask Nora to write out her own, original persona, avoiding any verbatim quotes.
|
||||
Returns the raw generated text.
|
||||
"""
|
||||
# We’ll ask for up to 512 tokens, with higher temperature and top_p sampling.
|
||||
# That combination tends to produce more creative, less‐memorizable text.
|
||||
raw = await asyncio.to_thread(
|
||||
model.generate,
|
||||
tokenizer,
|
||||
device,
|
||||
PERSONA_META_PROMPT,
|
||||
512, # allow several paragraphs
|
||||
1.2, # higher temperature for more creativity
|
||||
0 # top_k=0 means no fixed-k; we’ll apply top_p filtering instead
|
||||
)
|
||||
|
||||
# At this point, “raw” may include the word “Nora:” etc. Strip everything before “Nora:”
|
||||
if "Nora:" in raw:
|
||||
persona_text = raw.split("Nora:")[-1].strip()
|
||||
else:
|
||||
persona_text = raw.strip()
|
||||
|
||||
# Now apply a simple post‐filter: remove any long spans that match exact sequences in the book corpus.
|
||||
# This is optional but helps ensure she didn’t copy large chunks verbatim. We check for 6+ character substrings
|
||||
# appearing more than once in her output.
|
||||
def remove_long_quotes(text: str) -> str:
|
||||
filtered = text
|
||||
# find any substring of length ≥6 that appears twice; we’ll just guess she’s quoting if it’s repeated.
|
||||
for match in re.finditer(r"\b[\w',]{6,}\b", text):
|
||||
substr = match.group(0)
|
||||
if filtered.count(substr) > 1:
|
||||
filtered = filtered.replace(substr, "[…]")
|
||||
return filtered
|
||||
|
||||
persona_text = remove_long_quotes(persona_text)
|
||||
return persona_text
|
||||
|
||||
|
||||
def ensure_persona_file(model: NoraTransformerLM, tokenizer: CharTokenizer, device: str):
|
||||
"""
|
||||
If nora_persona.txt does not exist, generate one (ensuring originality).
|
||||
"""
|
||||
if os.path.isfile(PERSONA_PATH):
|
||||
return
|
||||
|
||||
print("[persona] No persona found. Generating a new, original persona…")
|
||||
persona_text = asyncio.run(generate_persona(model, tokenizer, device))
|
||||
|
||||
# Save to disk
|
||||
with open(PERSONA_PATH, "w", encoding="utf-8") as f:
|
||||
f.write(persona_text)
|
||||
print(f"[persona] Wrote new persona to {PERSONA_PATH}.")
|
||||
|
||||
|
||||
async def maybe_update_persona(model: NoraTransformerLM, tokenizer: CharTokenizer, device: str):
|
||||
"""
|
||||
Regenerate Nora’s persona if she requests it, overwriting the file.
|
||||
"""
|
||||
print("[persona] Updating persona at Nora's request…")
|
||||
persona_text = await generate_persona(model, tokenizer, device)
|
||||
with open(PERSONA_PATH, "w", encoding="utf-8") as f:
|
||||
f.write(persona_text)
|
||||
print(f"[persona] Updated persona in {PERSONA_PATH}.")
|
||||
return persona_text
|
Reference in New Issue
Block a user