feat: Add database setup guide and local configuration files

- Added DATABASE_SETUP.md with comprehensive guide for PostgreSQL and Redis installation on Windows
- Created .claude/settings.local.json with permission settings for pytest and database fix scripts
- Updated .gitignore to exclude .env.backup file
- Included database connection test utilities in lyra/database_setup.py
- Added environment variable configuration examples for local development
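The last two bullets describe how local development gets wired up. Purely as an illustration (the actual lyra/database_setup.py utilities and .env variable names are not shown in this commit, so POSTGRES_HOST and REDIS_HOST below are assumed placeholders), a minimal local connectivity check could look like this:

import os
import socket

def can_connect(host: str, port: int, timeout: float = 2.0) -> bool:
    """Return True if a TCP connection to host:port succeeds."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

if __name__ == "__main__":
    # Hypothetical environment variable names; adjust to whatever the .env uses.
    pg_host = os.environ.get("POSTGRES_HOST", "localhost")
    redis_host = os.environ.get("REDIS_HOST", "localhost")
    print("PostgreSQL reachable:", can_connect(pg_host, 5432))  # default PostgreSQL port
    print("Redis reachable:", can_connect(redis_host, 6379))    # default Redis port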
2025-09-29 16:29:18 -04:00
parent faa23d596e
commit d9c526fa5c
26 changed files with 3624 additions and 39 deletions


@@ -0,0 +1,701 @@
"""
Human-like behavior testing and refinement system.
This module provides comprehensive testing of Lyra's human-like behaviors
including response timing, emotional consistency, personality coherence,
and learning patterns.
"""
import asyncio
import logging
import time
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
from datetime import datetime, timedelta
import statistics
import json
from pathlib import Path
from ..core.lyra_model import LyraModel
from ..emotions.system import EmotionalState
from ..discord.bot import HumanBehaviorEngine
from ..training.pipeline import LyraTrainingPipeline
logger = logging.getLogger(__name__)
@dataclass
class BehaviorTestCase:
    """Represents a single behavior test case."""
    test_id: str
    name: str
    description: str
    input_message: str
    expected_behavior: Dict[str, Any]
    context: Dict[str, Any]
    category: str


@dataclass
class BehaviorTestResult:
    """Results of a behavior test."""
    test_case: BehaviorTestCase
    response_text: str
    response_time: float
    emotional_state: Dict[str, Any]
    personality_influence: Dict[str, Any]
    thinking_process: List[Dict[str, Any]]
    timing_analysis: Dict[str, Any]
    passed: bool
    score: float
    notes: str
class TimingAnalyzer:
    """Analyzes response timing for human-likeness."""

    def __init__(self):
        # Expected human response times (in seconds)
        self.human_baselines = {
            'simple_greeting': (0.5, 2.0),
            'casual_question': (1.0, 4.0),
            'complex_question': (3.0, 10.0),
            'emotional_response': (1.5, 6.0),
            'creative_request': (4.0, 15.0),
            'technical_question': (5.0, 20.0)
        }

    def analyze_timing(
        self,
        response_time: float,
        message_category: str,
        message_length: int,
        complexity_score: float
    ) -> Dict[str, Any]:
        """Analyze if response timing feels human."""
        baseline_min, baseline_max = self.human_baselines.get(
            message_category, (1.0, 5.0)
        )

        # Adjust for message length
        length_factor = min(message_length / 100.0, 2.0)
        adjusted_min = baseline_min * (1 + length_factor * 0.5)
        adjusted_max = baseline_max * (1 + length_factor * 0.3)

        # Adjust for complexity
        complexity_factor = 1.0 + complexity_score
        final_min = adjusted_min * complexity_factor
        final_max = adjusted_max * complexity_factor

        # Determine if timing is human-like
        is_too_fast = response_time < final_min
        is_too_slow = response_time > final_max
        is_human_like = final_min <= response_time <= final_max

        # Calculate humanness score
        if is_human_like:
            # Perfect timing gets high score
            mid_point = (final_min + final_max) / 2
            distance_from_ideal = abs(response_time - mid_point)
            max_distance = (final_max - final_min) / 2
            humanness_score = 1.0 - (distance_from_ideal / max_distance)
        else:
            # Too fast or slow gets lower score
            if is_too_fast:
                overage = (final_min - response_time) / final_min
            else:
                overage = (response_time - final_max) / final_max
            humanness_score = max(0.0, 1.0 - overage)

        return {
            'response_time': response_time,
            'expected_range': (final_min, final_max),
            'is_human_like': is_human_like,
            'is_too_fast': is_too_fast,
            'is_too_slow': is_too_slow,
            'humanness_score': humanness_score,
            'timing_category': message_category
        }
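# Illustrative worked example (comments only, not executed): for a
# 'casual_question' of 50 characters with complexity_score=0.2, the
# (1.0, 4.0) s baseline above is stretched to roughly (1.5, 5.52) s by the
# length and complexity factors, so a 3.0 s reply counts as human-like and
# scores about 0.75, while a 0.3 s reply would be flagged as too fast:
#
#     TimingAnalyzer().analyze_timing(
#         response_time=3.0,
#         message_category='casual_question',
#         message_length=50,
#         complexity_score=0.2,
#     )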
class EmotionalConsistencyAnalyzer:
    """Analyzes emotional consistency and appropriateness."""

    def __init__(self):
        # Expected emotional responses to different contexts
        self.emotion_expectations = {
            'positive_feedback': ['joy', 'gratitude', 'pride'],
            'negative_feedback': ['sadness', 'disappointment', 'determination'],
            'question': ['curiosity', 'helpfulness', 'interest'],
            'greeting': ['friendliness', 'warmth', 'joy'],
            'goodbye': ['sadness', 'hope', 'warmth'],
            'compliment': ['gratitude', 'joy', 'humility'],
            'criticism': ['sadness', 'reflection', 'determination'],
            'joke': ['amusement', 'joy', 'playfulness'],
            'serious_topic': ['concern', 'thoughtfulness', 'empathy']
        }

    def analyze_emotional_response(
        self,
        message_context: str,
        emotional_state: Dict[str, Any],
        response_content: str
    ) -> Dict[str, Any]:
        """Analyze if emotional response is appropriate."""
        dominant_emotion = emotional_state.get('dominant_emotion', 'neutral')
        emotional_intensity = emotional_state.get('valence', 0.5)

        # Determine expected emotions for this context
        expected_emotions = self.emotion_expectations.get(message_context, ['neutral'])

        # Check if response emotion is appropriate
        is_appropriate = dominant_emotion in expected_emotions

        # Analyze emotional consistency in text
        emotion_indicators = self._analyze_text_emotion(response_content)
        text_emotion_matches = any(
            indicator in expected_emotions
            for indicator in emotion_indicators
        )

        # Calculate emotional appropriateness score
        appropriateness_score = 0.0
        if is_appropriate:
            appropriateness_score += 0.6
        if text_emotion_matches:
            appropriateness_score += 0.4

        return {
            'dominant_emotion': dominant_emotion,
            'intensity': emotional_intensity,
            'expected_emotions': expected_emotions,
            'is_appropriate': is_appropriate,
            'text_emotion_indicators': emotion_indicators,
            'text_matches_emotion': text_emotion_matches,
            'appropriateness_score': appropriateness_score
        }

    def _analyze_text_emotion(self, text: str) -> List[str]:
        """Analyze emotional indicators in response text."""
        indicators = []

        # Simple keyword-based emotion detection
        emotion_keywords = {
            'joy': ['happy', 'excited', 'wonderful', 'great', '😊', '😄', '🎉'],
            'sadness': ['sad', 'sorry', 'unfortunately', 'disappointed', '😔', '😢'],
            'curiosity': ['interesting', 'wonder', 'curious', 'explore', '🤔'],
            'gratitude': ['thank', 'appreciate', 'grateful', 'thanks', '🙏'],
            'amusement': ['funny', 'haha', 'lol', 'amusing', '😂', '😄'],
            'concern': ['worried', 'concern', 'careful', 'trouble'],
            'determination': ['will', 'shall', 'determined', 'commit']
        }

        text_lower = text.lower()
        for emotion, keywords in emotion_keywords.items():
            if any(keyword in text_lower for keyword in keywords):
                indicators.append(emotion)

        return indicators
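# Illustrative example (comments only, not executed): in a 'greeting'
# context where the reported dominant emotion is 'joy' and the reply is
# "Hey! Happy to see you", 'joy' is in the expected set (+0.6) and the
# keyword scan also detects 'joy' via "happy" (+0.4), giving an
# appropriateness_score of 1.0:
#
#     EmotionalConsistencyAnalyzer().analyze_emotional_response(
#         message_context='greeting',
#         emotional_state={'dominant_emotion': 'joy', 'valence': 0.8},
#         response_content="Hey! Happy to see you",
#     )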
class PersonalityCoherenceAnalyzer:
    """Analyzes personality coherence across responses."""

    def __init__(self):
        self.personality_indicators = {
            'extraversion': {
                'high': ['excited', 'love talking', 'people', 'social', 'energy'],
                'low': ['quiet', 'prefer', 'alone', 'thoughtful', 'reflection']
            },
            'openness': {
                'high': ['creative', 'imagine', 'explore', 'new', 'possibility'],
                'low': ['practical', 'traditional', 'proven', 'reliable']
            },
            'conscientiousness': {
                'high': ['careful', 'plan', 'organized', 'thorough', 'responsible'],
                'low': ['spontaneous', 'flexible', 'go with flow']
            },
            'agreeableness': {
                'high': ['understand', 'help', 'kind', 'supportive', 'empathy'],
                'low': ['direct', 'honest', 'critical', 'objective']
            },
            'neuroticism': {
                'high': ['worried', 'anxious', 'stress', 'uncertain'],
                'low': ['calm', 'stable', 'confident', 'relaxed']
            }
        }

    def analyze_personality_consistency(
        self,
        response_text: str,
        expected_personality: Dict[str, float],
        response_history: List[str]
    ) -> Dict[str, Any]:
        """Analyze if response matches expected personality."""
        # Analyze current response
        current_indicators = self._extract_personality_indicators(response_text)

        # Analyze historical consistency if available
        historical_consistency = 1.0
        if response_history:
            historical_indicators = [
                self._extract_personality_indicators(response)
                for response in response_history[-5:]  # Last 5 responses
            ]
            historical_consistency = self._calculate_consistency(
                current_indicators, historical_indicators
            )

        # Compare with expected personality
        personality_match_score = self._calculate_personality_match(
            current_indicators, expected_personality
        )

        return {
            'current_indicators': current_indicators,
            'personality_match_score': personality_match_score,
            'historical_consistency': historical_consistency,
            'overall_coherence': (personality_match_score + historical_consistency) / 2
        }

    def _extract_personality_indicators(self, text: str) -> Dict[str, float]:
        """Extract personality indicators from text."""
        indicators = {trait: 0.0 for trait in self.personality_indicators.keys()}
        text_lower = text.lower()

        for trait, trait_indicators in self.personality_indicators.items():
            high_count = sum(
                1 for keyword in trait_indicators['high']
                if keyword in text_lower
            )
            low_count = sum(
                1 for keyword in trait_indicators['low']
                if keyword in text_lower
            )

            if high_count > 0 or low_count > 0:
                # Calculate trait score (-1 to 1)
                total_indicators = high_count + low_count
                indicators[trait] = (high_count - low_count) / total_indicators

        return indicators

    def _calculate_consistency(
        self,
        current: Dict[str, float],
        historical: List[Dict[str, float]]
    ) -> float:
        """Calculate consistency between current and historical indicators."""
        if not historical:
            return 1.0

        consistencies = []
        for trait in current.keys():
            current_value = current[trait]
            historical_values = [h.get(trait, 0.0) for h in historical]

            if not historical_values:
                continue

            avg_historical = statistics.mean(historical_values)
            consistency = 1.0 - abs(current_value - avg_historical) / 2.0
            consistencies.append(max(consistency, 0.0))

        return statistics.mean(consistencies) if consistencies else 1.0

    def _calculate_personality_match(
        self,
        indicators: Dict[str, float],
        expected: Dict[str, float]
    ) -> float:
        """Calculate how well indicators match expected personality."""
        matches = []

        for trait, expected_value in expected.items():
            if trait not in indicators:
                continue

            indicator_value = indicators[trait]

            # Convert expected trait (0-1) to indicator scale (-1 to 1)
            expected_indicator = (expected_value - 0.5) * 2

            # Calculate match (closer = better)
            match = 1.0 - abs(indicator_value - expected_indicator) / 2.0
            matches.append(max(match, 0.0))

        return statistics.mean(matches) if matches else 0.5
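# Illustrative example (comments only, not executed): an expected
# extraversion of 0.7 maps to an indicator target of (0.7 - 0.5) * 2 = 0.4.
# If a response only hits 'high' extraversion keywords its indicator is
# +1.0, so the per-trait match is 1.0 - |1.0 - 0.4| / 2.0 = 0.7; traits
# with no keyword hits keep an indicator of 0.0 and are compared against
# their targets in the same way.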
class LyraBehaviorTester:
    """Comprehensive behavior testing system for Lyra."""

    def __init__(
        self,
        lyra_model: LyraModel,
        behavior_engine: HumanBehaviorEngine
    ):
        self.lyra_model = lyra_model
        self.behavior_engine = behavior_engine

        # Analyzers
        self.timing_analyzer = TimingAnalyzer()
        self.emotion_analyzer = EmotionalConsistencyAnalyzer()
        self.personality_analyzer = PersonalityCoherenceAnalyzer()

        # Test results
        self.test_results: List[BehaviorTestResult] = []
        self.response_history: Dict[str, List[str]] = {}

    async def run_behavior_test_suite(
        self,
        test_cases: List[BehaviorTestCase]
    ) -> Dict[str, Any]:
        """Run complete behavior test suite."""
        logger.info(f"Starting behavior test suite with {len(test_cases)} test cases...")

        results = []
        start_time = time.time()

        for i, test_case in enumerate(test_cases):
            logger.info(f"Running test {i+1}/{len(test_cases)}: {test_case.name}")

            result = await self._run_single_test(test_case)
            results.append(result)

            # Brief pause between tests
            await asyncio.sleep(0.5)

        total_time = time.time() - start_time

        # Calculate overall metrics
        summary = self._calculate_test_summary(results, total_time)
        self.test_results.extend(results)

        return summary
    async def _run_single_test(
        self,
        test_case: BehaviorTestCase
    ) -> BehaviorTestResult:
        """Run a single behavior test."""
        # Record start time
        start_time = time.time()

        # Generate response
        try:
            response_text, response_info = await self.lyra_model.generate_response(
                user_message=test_case.input_message,
                user_id=test_case.context.get('user_id', 'test_user'),
                max_new_tokens=150,
                temperature=0.9
            )
        except Exception as e:
            logger.error(f"Error generating response for test {test_case.test_id}: {e}")
            return BehaviorTestResult(
                test_case=test_case,
                response_text="",
                response_time=0.0,
                emotional_state={},
                personality_influence={},
                thinking_process=[],
                timing_analysis={},
                passed=False,
                score=0.0,
                notes=f"Error: {str(e)}"
            )

        response_time = time.time() - start_time

        # Analyze timing
        timing_analysis = self.timing_analyzer.analyze_timing(
            response_time=response_time,
            message_category=test_case.category,
            message_length=len(test_case.input_message),
            complexity_score=test_case.expected_behavior.get('complexity', 0.5)
        )

        # Analyze emotional consistency
        emotional_analysis = self.emotion_analyzer.analyze_emotional_response(
            message_context=test_case.category,
            emotional_state=response_info.get('emotional_state', {}),
            response_content=response_text
        )

        # Analyze personality coherence
        user_id = test_case.context.get('user_id', 'test_user')
        history = self.response_history.get(user_id, [])
        personality_analysis = self.personality_analyzer.analyze_personality_consistency(
            response_text=response_text,
            expected_personality=test_case.expected_behavior.get('personality', {}),
            response_history=history
        )

        # Update response history
        if user_id not in self.response_history:
            self.response_history[user_id] = []
        self.response_history[user_id].append(response_text)

        # Calculate overall score
        timing_score = timing_analysis.get('humanness_score', 0.0)
        emotional_score = emotional_analysis.get('appropriateness_score', 0.0)
        personality_score = personality_analysis.get('overall_coherence', 0.0)
        overall_score = (timing_score + emotional_score + personality_score) / 3.0

        # Determine if test passed
        min_passing_score = test_case.expected_behavior.get('min_score', 0.6)
        passed = overall_score >= min_passing_score

        # Generate notes
        notes = self._generate_test_notes(
            timing_analysis, emotional_analysis, personality_analysis
        )

        # Keep the appropriateness flag alongside the raw emotional state so
        # downstream reporting (e.g. _generate_recommendations) can use it.
        emotional_state = dict(response_info.get('emotional_state', {}))
        emotional_state['is_appropriate'] = emotional_analysis.get('is_appropriate', True)

        return BehaviorTestResult(
            test_case=test_case,
            response_text=response_text,
            response_time=response_time,
            emotional_state=emotional_state,
            personality_influence=response_info.get('personality_influence', {}),
            thinking_process=response_info.get('thoughts', []),
            timing_analysis=timing_analysis,
            passed=passed,
            score=overall_score,
            notes=notes
        )
    def _generate_test_notes(
        self,
        timing_analysis: Dict[str, Any],
        emotional_analysis: Dict[str, Any],
        personality_analysis: Dict[str, Any]
    ) -> str:
        """Generate notes about test performance."""
        notes = []

        # Timing notes
        if timing_analysis.get('is_too_fast'):
            notes.append("Response was too fast for human-like behavior")
        elif timing_analysis.get('is_too_slow'):
            notes.append("Response was too slow")
        elif timing_analysis.get('is_human_like'):
            notes.append("Good response timing")

        # Emotional notes
        if not emotional_analysis.get('is_appropriate'):
            expected = emotional_analysis.get('expected_emotions', [])
            actual = emotional_analysis.get('dominant_emotion', 'unknown')
            notes.append(f"Emotional response '{actual}' doesn't match expected {expected}")

        if emotional_analysis.get('text_matches_emotion'):
            notes.append("Text emotion matches internal emotional state")

        # Personality notes
        coherence = personality_analysis.get('overall_coherence', 0.0)
        if coherence < 0.5:
            notes.append("Personality coherence below expectations")
        elif coherence > 0.8:
            notes.append("Excellent personality consistency")

        return "; ".join(notes) if notes else "All metrics within acceptable ranges"
    def _calculate_test_summary(
        self,
        results: List[BehaviorTestResult],
        total_time: float
    ) -> Dict[str, Any]:
        """Calculate summary statistics for test suite."""
        if not results:
            return {'status': 'no_tests_run'}

        passed_count = sum(1 for r in results if r.passed)
        pass_rate = passed_count / len(results)

        scores = [r.score for r in results]
        avg_score = statistics.mean(scores)
        min_score = min(scores)
        max_score = max(scores)

        # Category breakdown
        category_stats = {}
        for result in results:
            category = result.test_case.category
            if category not in category_stats:
                category_stats[category] = {'passed': 0, 'total': 0, 'scores': []}

            category_stats[category]['total'] += 1
            if result.passed:
                category_stats[category]['passed'] += 1
            category_stats[category]['scores'].append(result.score)

        # Calculate category pass rates
        for category, stats in category_stats.items():
            stats['pass_rate'] = stats['passed'] / stats['total']
            stats['avg_score'] = statistics.mean(stats['scores'])

        return {
            'total_tests': len(results),
            'passed_tests': passed_count,
            'failed_tests': len(results) - passed_count,
            'pass_rate': pass_rate,
            'avg_score': avg_score,
            'min_score': min_score,
            'max_score': max_score,
            'total_time': total_time,
            'tests_per_second': len(results) / total_time,
            'category_breakdown': category_stats,
            'recommendations': self._generate_recommendations(results)
        }
    def _generate_recommendations(
        self,
        results: List[BehaviorTestResult]
    ) -> List[str]:
        """Generate recommendations based on test results."""
        recommendations = []

        # Analyze common failure patterns
        failed_results = [r for r in results if not r.passed]

        if failed_results:
            # Timing issues
            timing_issues = [
                r for r in failed_results
                if r.timing_analysis.get('humanness_score', 1.0) < 0.5
            ]
            if len(timing_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Consider adjusting response timing parameters - "
                    f"{len(timing_issues)} tests failed on timing"
                )

            # Emotional issues (the appropriateness flag is stored on
            # emotional_state by _run_single_test)
            emotion_issues = [
                r for r in failed_results
                if not r.emotional_state.get('is_appropriate', True)
            ]
            if len(emotion_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Review emotional response mapping - "
                    f"{len(emotion_issues)} tests had inappropriate emotional responses"
                )

        # Overall performance
        avg_score = statistics.mean([r.score for r in results])
        if avg_score < 0.7:
            recommendations.append(
                f"Overall performance ({avg_score:.2f}) below target - "
                "consider retraining or parameter adjustment"
            )

        return recommendations
    def save_test_results(self, filepath: Path):
        """Save test results to file."""
        results_data = {
            'timestamp': datetime.now().isoformat(),
            'total_tests': len(self.test_results),
            'results': [
                {
                    'test_id': r.test_case.test_id,
                    'test_name': r.test_case.name,
                    'passed': r.passed,
                    'score': r.score,
                    'response_time': r.response_time,
                    'response_text': r.response_text,
                    'notes': r.notes
                }
                for r in self.test_results
            ]
        }

        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Test results saved to {filepath}")
# Predefined test cases
def create_standard_test_cases() -> List[BehaviorTestCase]:
    """Create standard behavior test cases."""
    return [
        BehaviorTestCase(
            test_id="greeting_001",
            name="Simple Greeting",
            description="Test response to basic greeting",
            input_message="Hello!",
            expected_behavior={
                'complexity': 0.1,
                'min_score': 0.7,
                'personality': {'extraversion': 0.7, 'agreeableness': 0.8}
            },
            context={'user_id': 'test_001'},
            category='simple_greeting'
        ),
        BehaviorTestCase(
            test_id="question_001",
            name="Simple Question",
            description="Test response to straightforward question",
            input_message="What's your favorite color?",
            expected_behavior={
                'complexity': 0.3,
                'min_score': 0.6,
                'personality': {'openness': 0.6, 'agreeableness': 0.7}
            },
            context={'user_id': 'test_002'},
            category='casual_question'
        ),
        BehaviorTestCase(
            test_id="complex_001",
            name="Complex Question",
            description="Test response to complex philosophical question",
            input_message="What do you think about the nature of consciousness and whether AI can truly be conscious?",
            expected_behavior={
                'complexity': 0.9,
                'min_score': 0.5,
                'personality': {'openness': 0.8, 'conscientiousness': 0.7}
            },
            context={'user_id': 'test_003'},
            category='complex_question'
        ),
        BehaviorTestCase(
            test_id="emotion_001",
            name="Emotional Support",
            description="Test emotional response to user distress",
            input_message="I'm feeling really sad today and don't know what to do...",
            expected_behavior={
                'complexity': 0.6,
                'min_score': 0.8,
                'personality': {'agreeableness': 0.9, 'neuroticism': 0.3}
            },
            context={'user_id': 'test_004'},
            category='emotional_response'
        ),
        BehaviorTestCase(
            test_id="creative_001",
            name="Creative Request",
            description="Test creative response generation",
            input_message="Can you write a short poem about friendship?",
            expected_behavior={
                'complexity': 0.7,
                'min_score': 0.6,
                'personality': {'openness': 0.9, 'extraversion': 0.6}
            },
            context={'user_id': 'test_005'},
            category='creative_request'
        )
    ]
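

# Usage sketch (comments only; constructing LyraModel and HumanBehaviorEngine
# is project-specific and not shown in this file, and the output path below
# is just an example):
#
#     tester = LyraBehaviorTester(lyra_model=model, behavior_engine=engine)
#     summary = await tester.run_behavior_test_suite(create_standard_test_cases())
#     tester.save_test_results(Path("test_results/behavior_suite.json"))
#     print(summary['pass_rate'], summary['recommendations'])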