"""
|
|
Human-like behavior testing and refinement system.
|
|
|
|
This module provides comprehensive testing of Lyra's human-like behaviors
|
|
including response timing, emotional consistency, personality coherence,
|
|
and learning patterns.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import time
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timedelta
|
|
import statistics
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from ..core.lyra_model import LyraModel
|
|
from ..emotions.system import EmotionalState
|
|
from ..discord.bot import HumanBehaviorEngine
|
|
from ..training.pipeline import LyraTrainingPipeline
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class BehaviorTestCase:
    """Represents a single behavior test case."""
    test_id: str
    name: str
    description: str
    input_message: str
    expected_behavior: Dict[str, Any]
    context: Dict[str, Any]
    category: str


@dataclass
class BehaviorTestResult:
    """Results of a behavior test."""
    test_case: BehaviorTestCase
    response_text: str
    response_time: float
    emotional_state: Dict[str, Any]
    personality_influence: Dict[str, Any]
    thinking_process: List[Dict[str, Any]]
    timing_analysis: Dict[str, Any]
    passed: bool
    score: float
    notes: str


class TimingAnalyzer:
    """Analyzes response timing for human-likeness."""

    def __init__(self):
        # Expected human response times (in seconds)
        self.human_baselines = {
            'simple_greeting': (0.5, 2.0),
            'casual_question': (1.0, 4.0),
            'complex_question': (3.0, 10.0),
            'emotional_response': (1.5, 6.0),
            'creative_request': (4.0, 15.0),
            'technical_question': (5.0, 20.0)
        }

    def analyze_timing(
        self,
        response_time: float,
        message_category: str,
        message_length: int,
        complexity_score: float
    ) -> Dict[str, Any]:
        """Analyze if response timing feels human."""

        baseline_min, baseline_max = self.human_baselines.get(
            message_category, (1.0, 5.0)
        )

        # Adjust for message length
        length_factor = min(message_length / 100.0, 2.0)
        adjusted_min = baseline_min * (1 + length_factor * 0.5)
        adjusted_max = baseline_max * (1 + length_factor * 0.3)

        # Adjust for complexity
        complexity_factor = 1.0 + complexity_score
        final_min = adjusted_min * complexity_factor
        final_max = adjusted_max * complexity_factor

        # Determine if timing is human-like
        is_too_fast = response_time < final_min
        is_too_slow = response_time > final_max
        is_human_like = final_min <= response_time <= final_max

        # Calculate humanness score
        if is_human_like:
            # Perfect timing gets high score
            mid_point = (final_min + final_max) / 2
            distance_from_ideal = abs(response_time - mid_point)
            max_distance = (final_max - final_min) / 2
            humanness_score = 1.0 - (distance_from_ideal / max_distance)
        else:
            # Too fast or slow gets lower score
            if is_too_fast:
                overage = (final_min - response_time) / final_min
            else:
                overage = (response_time - final_max) / final_max

            humanness_score = max(0.0, 1.0 - overage)

        return {
            'response_time': response_time,
            'expected_range': (final_min, final_max),
            'is_human_like': is_human_like,
            'is_too_fast': is_too_fast,
            'is_too_slow': is_too_slow,
            'humanness_score': humanness_score,
            'timing_category': message_category
        }


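# Worked example (illustrative): for a 'casual_question' (baseline 1.0-4.0 s)
# with a 100-character message, length_factor is 1.0, giving an adjusted range
# of (1.5, 5.2) s; a complexity score of 0.5 then widens it to (2.25, 7.8) s.
# A 4.0 s response falls inside that range and scores roughly 0.63 on
# humanness, since it sits about a third of the way from the midpoint to the edge.

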
class EmotionalConsistencyAnalyzer:
    """Analyzes emotional consistency and appropriateness."""

    def __init__(self):
        # Expected emotional responses to different contexts
        self.emotion_expectations = {
            'positive_feedback': ['joy', 'gratitude', 'pride'],
            'negative_feedback': ['sadness', 'disappointment', 'determination'],
            'question': ['curiosity', 'helpfulness', 'interest'],
            'greeting': ['friendliness', 'warmth', 'joy'],
            'goodbye': ['sadness', 'hope', 'warmth'],
            'compliment': ['gratitude', 'joy', 'humility'],
            'criticism': ['sadness', 'reflection', 'determination'],
            'joke': ['amusement', 'joy', 'playfulness'],
            'serious_topic': ['concern', 'thoughtfulness', 'empathy']
        }

    def analyze_emotional_response(
        self,
        message_context: str,
        emotional_state: Dict[str, Any],
        response_content: str
    ) -> Dict[str, Any]:
        """Analyze if emotional response is appropriate."""

        dominant_emotion = emotional_state.get('dominant_emotion', 'neutral')
        emotional_intensity = emotional_state.get('valence', 0.5)

        # Determine expected emotions for this context. The behavior tests pass
        # timing-style categories (e.g. 'simple_greeting'), so an approximate
        # alias map bridges them to the expectation keys above instead of
        # silently falling back to 'neutral'.
        category_aliases = {
            'simple_greeting': 'greeting',
            'casual_question': 'question',
            'complex_question': 'question',
            'technical_question': 'question',
            'emotional_response': 'serious_topic',
            'creative_request': 'question'
        }
        context_key = category_aliases.get(message_context, message_context)
        expected_emotions = self.emotion_expectations.get(context_key, ['neutral'])

        # Check if response emotion is appropriate
        is_appropriate = dominant_emotion in expected_emotions

        # Analyze emotional consistency in text
        emotion_indicators = self._analyze_text_emotion(response_content)
        text_emotion_matches = any(
            indicator in expected_emotions
            for indicator in emotion_indicators
        )

        # Calculate emotional appropriateness score
        appropriateness_score = 0.0
        if is_appropriate:
            appropriateness_score += 0.6
        if text_emotion_matches:
            appropriateness_score += 0.4

        return {
            'dominant_emotion': dominant_emotion,
            'intensity': emotional_intensity,
            'expected_emotions': expected_emotions,
            'is_appropriate': is_appropriate,
            'text_emotion_indicators': emotion_indicators,
            'text_matches_emotion': text_emotion_matches,
            'appropriateness_score': appropriateness_score
        }

    def _analyze_text_emotion(self, text: str) -> List[str]:
        """Analyze emotional indicators in response text."""
        indicators = []

        # Simple keyword-based emotion detection
        emotion_keywords = {
            'joy': ['happy', 'excited', 'wonderful', 'great', '😊', '😄', '🎉'],
            'sadness': ['sad', 'sorry', 'unfortunately', 'disappointed', '😔', '😢'],
            'curiosity': ['interesting', 'wonder', 'curious', 'explore', '🤔'],
            'gratitude': ['thank', 'appreciate', 'grateful', 'thanks', '🙏'],
            'amusement': ['funny', 'haha', 'lol', 'amusing', '😂', '😄'],
            'concern': ['worried', 'concern', 'careful', 'trouble'],
            'determination': ['will', 'shall', 'determined', 'commit']
        }

        text_lower = text.lower()
        for emotion, keywords in emotion_keywords.items():
            if any(keyword in text_lower for keyword in keywords):
                indicators.append(emotion)

        return indicators


class PersonalityCoherenceAnalyzer:
    """Analyzes personality coherence across responses."""

    def __init__(self):
        self.personality_indicators = {
            'extraversion': {
                'high': ['excited', 'love talking', 'people', 'social', 'energy'],
                'low': ['quiet', 'prefer', 'alone', 'thoughtful', 'reflection']
            },
            'openness': {
                'high': ['creative', 'imagine', 'explore', 'new', 'possibility'],
                'low': ['practical', 'traditional', 'proven', 'reliable']
            },
            'conscientiousness': {
                'high': ['careful', 'plan', 'organized', 'thorough', 'responsible'],
                'low': ['spontaneous', 'flexible', 'go with flow']
            },
            'agreeableness': {
                'high': ['understand', 'help', 'kind', 'supportive', 'empathy'],
                'low': ['direct', 'honest', 'critical', 'objective']
            },
            'neuroticism': {
                'high': ['worried', 'anxious', 'stress', 'uncertain'],
                'low': ['calm', 'stable', 'confident', 'relaxed']
            }
        }

    def analyze_personality_consistency(
        self,
        response_text: str,
        expected_personality: Dict[str, float],
        response_history: List[str]
    ) -> Dict[str, Any]:
        """Analyze if response matches expected personality."""

        # Analyze current response
        current_indicators = self._extract_personality_indicators(response_text)

        # Analyze historical consistency if available
        historical_consistency = 1.0
        if response_history:
            historical_indicators = [
                self._extract_personality_indicators(response)
                for response in response_history[-5:]  # Last 5 responses
            ]
            historical_consistency = self._calculate_consistency(
                current_indicators, historical_indicators
            )

        # Compare with expected personality
        personality_match_score = self._calculate_personality_match(
            current_indicators, expected_personality
        )

        return {
            'current_indicators': current_indicators,
            'personality_match_score': personality_match_score,
            'historical_consistency': historical_consistency,
            'overall_coherence': (personality_match_score + historical_consistency) / 2
        }

    def _extract_personality_indicators(self, text: str) -> Dict[str, float]:
        """Extract personality indicators from text."""
        indicators = {trait: 0.0 for trait in self.personality_indicators.keys()}
        text_lower = text.lower()

        for trait, trait_indicators in self.personality_indicators.items():
            high_count = sum(
                1 for keyword in trait_indicators['high']
                if keyword in text_lower
            )
            low_count = sum(
                1 for keyword in trait_indicators['low']
                if keyword in text_lower
            )

            if high_count > 0 or low_count > 0:
                # Calculate trait score (-1 to 1)
                total_indicators = high_count + low_count
                indicators[trait] = (high_count - low_count) / total_indicators

        return indicators

    def _calculate_consistency(
        self,
        current: Dict[str, float],
        historical: List[Dict[str, float]]
    ) -> float:
        """Calculate consistency between current and historical indicators."""
        if not historical:
            return 1.0

        consistencies = []
        for trait in current.keys():
            current_value = current[trait]
            historical_values = [h.get(trait, 0.0) for h in historical]

            if not historical_values:
                continue

            avg_historical = statistics.mean(historical_values)
            consistency = 1.0 - abs(current_value - avg_historical) / 2.0
            consistencies.append(max(consistency, 0.0))

        return statistics.mean(consistencies) if consistencies else 1.0

    def _calculate_personality_match(
        self,
        indicators: Dict[str, float],
        expected: Dict[str, float]
    ) -> float:
        """Calculate how well indicators match expected personality."""
        matches = []

        for trait, expected_value in expected.items():
            if trait not in indicators:
                continue

            indicator_value = indicators[trait]

            # Convert expected trait (0-1) to indicator scale (-1 to 1)
            expected_indicator = (expected_value - 0.5) * 2

            # Calculate match (closer = better)
            match = 1.0 - abs(indicator_value - expected_indicator) / 2.0
            matches.append(max(match, 0.0))

        return statistics.mean(matches) if matches else 0.5


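# Scale note (illustrative): expected traits are given on a 0-1 scale and are
# mapped to the -1..1 indicator scale in _calculate_personality_match. For
# example, an expected extraversion of 0.7 maps to +0.4; a response whose
# extraversion indicator comes out at +0.5 then scores 1 - |0.5 - 0.4| / 2 = 0.95.

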
class LyraBehaviorTester:
    """Comprehensive behavior testing system for Lyra."""

    def __init__(
        self,
        lyra_model: LyraModel,
        behavior_engine: HumanBehaviorEngine
    ):
        self.lyra_model = lyra_model
        self.behavior_engine = behavior_engine

        # Analyzers
        self.timing_analyzer = TimingAnalyzer()
        self.emotion_analyzer = EmotionalConsistencyAnalyzer()
        self.personality_analyzer = PersonalityCoherenceAnalyzer()

        # Test results
        self.test_results: List[BehaviorTestResult] = []
        self.response_history: Dict[str, List[str]] = {}

    async def run_behavior_test_suite(
        self,
        test_cases: List[BehaviorTestCase]
    ) -> Dict[str, Any]:
        """Run complete behavior test suite."""
        logger.info(f"Starting behavior test suite with {len(test_cases)} test cases...")

        results = []
        start_time = time.time()

        for i, test_case in enumerate(test_cases):
            logger.info(f"Running test {i+1}/{len(test_cases)}: {test_case.name}")

            result = await self._run_single_test(test_case)
            results.append(result)

            # Brief pause between tests
            await asyncio.sleep(0.5)

        total_time = time.time() - start_time

        # Calculate overall metrics
        summary = self._calculate_test_summary(results, total_time)

        self.test_results.extend(results)

        return summary

    async def _run_single_test(
        self,
        test_case: BehaviorTestCase
    ) -> BehaviorTestResult:
        """Run a single behavior test."""

        # Record start time
        start_time = time.time()

        # Generate response
        try:
            response_text, response_info = await self.lyra_model.generate_response(
                user_message=test_case.input_message,
                user_id=test_case.context.get('user_id', 'test_user'),
                max_new_tokens=150,
                temperature=0.9
            )
        except Exception as e:
            logger.error(f"Error generating response for test {test_case.test_id}: {e}")
            return BehaviorTestResult(
                test_case=test_case,
                response_text="",
                response_time=0.0,
                emotional_state={},
                personality_influence={},
                thinking_process=[],
                timing_analysis={},
                passed=False,
                score=0.0,
                notes=f"Error: {str(e)}"
            )

        response_time = time.time() - start_time

        # Analyze timing
        timing_analysis = self.timing_analyzer.analyze_timing(
            response_time=response_time,
            message_category=test_case.category,
            message_length=len(test_case.input_message),
            complexity_score=test_case.expected_behavior.get('complexity', 0.5)
        )

        # Analyze emotional consistency
        emotional_analysis = self.emotion_analyzer.analyze_emotional_response(
            message_context=test_case.category,
            emotional_state=response_info.get('emotional_state', {}),
            response_content=response_text
        )

        # Analyze personality coherence
        user_id = test_case.context.get('user_id', 'test_user')
        history = self.response_history.get(user_id, [])

        personality_analysis = self.personality_analyzer.analyze_personality_consistency(
            response_text=response_text,
            expected_personality=test_case.expected_behavior.get('personality', {}),
            response_history=history
        )

        # Update response history
        if user_id not in self.response_history:
            self.response_history[user_id] = []
        self.response_history[user_id].append(response_text)

        # Calculate overall score
        timing_score = timing_analysis.get('humanness_score', 0.0)
        emotional_score = emotional_analysis.get('appropriateness_score', 0.0)
        personality_score = personality_analysis.get('overall_coherence', 0.0)

        overall_score = (timing_score + emotional_score + personality_score) / 3.0

        # Determine if test passed
        min_passing_score = test_case.expected_behavior.get('min_score', 0.6)
        passed = overall_score >= min_passing_score

        # Generate notes
        notes = self._generate_test_notes(
            timing_analysis, emotional_analysis, personality_analysis
        )

        return BehaviorTestResult(
            test_case=test_case,
            response_text=response_text,
            response_time=response_time,
            emotional_state=response_info.get('emotional_state', {}),
            personality_influence=response_info.get('personality_influence', {}),
            thinking_process=response_info.get('thoughts', []),
            timing_analysis=timing_analysis,
            passed=passed,
            score=overall_score,
            notes=notes
        )

    def _generate_test_notes(
        self,
        timing_analysis: Dict[str, Any],
        emotional_analysis: Dict[str, Any],
        personality_analysis: Dict[str, Any]
    ) -> str:
        """Generate notes about test performance."""
        notes = []

        # Timing notes
        if timing_analysis.get('is_too_fast'):
            notes.append("Response was too fast for human-like behavior")
        elif timing_analysis.get('is_too_slow'):
            notes.append("Response was too slow")
        elif timing_analysis.get('is_human_like'):
            notes.append("Good response timing")

        # Emotional notes
        if not emotional_analysis.get('is_appropriate'):
            expected = emotional_analysis.get('expected_emotions', [])
            actual = emotional_analysis.get('dominant_emotion', 'unknown')
            notes.append(f"Emotional response '{actual}' doesn't match expected {expected}")

        if emotional_analysis.get('text_matches_emotion'):
            notes.append("Text emotion matches internal emotional state")

        # Personality notes
        coherence = personality_analysis.get('overall_coherence', 0.0)
        if coherence < 0.5:
            notes.append("Personality coherence below expectations")
        elif coherence > 0.8:
            notes.append("Excellent personality consistency")

        return "; ".join(notes) if notes else "All metrics within acceptable ranges"

    def _calculate_test_summary(
        self,
        results: List[BehaviorTestResult],
        total_time: float
    ) -> Dict[str, Any]:
        """Calculate summary statistics for test suite."""

        if not results:
            return {'status': 'no_tests_run'}

        passed_count = sum(1 for r in results if r.passed)
        pass_rate = passed_count / len(results)

        scores = [r.score for r in results]
        avg_score = statistics.mean(scores)
        min_score = min(scores)
        max_score = max(scores)

        # Category breakdown
        category_stats = {}
        for result in results:
            category = result.test_case.category
            if category not in category_stats:
                category_stats[category] = {'passed': 0, 'total': 0, 'scores': []}

            category_stats[category]['total'] += 1
            if result.passed:
                category_stats[category]['passed'] += 1
            category_stats[category]['scores'].append(result.score)

        # Calculate category pass rates
        for category, stats in category_stats.items():
            stats['pass_rate'] = stats['passed'] / stats['total']
            stats['avg_score'] = statistics.mean(stats['scores'])

        return {
            'total_tests': len(results),
            'passed_tests': passed_count,
            'failed_tests': len(results) - passed_count,
            'pass_rate': pass_rate,
            'avg_score': avg_score,
            'min_score': min_score,
            'max_score': max_score,
            'total_time': total_time,
            'tests_per_second': len(results) / total_time,
            'category_breakdown': category_stats,
            'recommendations': self._generate_recommendations(results)
        }

    def _generate_recommendations(
        self,
        results: List[BehaviorTestResult]
    ) -> List[str]:
        """Generate recommendations based on test results."""
        recommendations = []

        # Analyze common failure patterns
        failed_results = [r for r in results if not r.passed]

        if failed_results:
            # Timing issues
            timing_issues = [
                r for r in failed_results
                if r.timing_analysis.get('humanness_score', 1.0) < 0.5
            ]
            if len(timing_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Consider adjusting response timing parameters - "
                    f"{len(timing_issues)} tests failed on timing"
                )

            # Emotional issues (the result stores only the raw emotional state,
            # so mismatches are detected via the notes generated for each test)
            emotion_issues = [
                r for r in failed_results
                if "doesn't match expected" in r.notes
            ]
            if len(emotion_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Review emotional response mapping - "
                    f"{len(emotion_issues)} tests had inappropriate emotional responses"
                )

        # Overall performance
        avg_score = statistics.mean([r.score for r in results])
        if avg_score < 0.7:
            recommendations.append(
                f"Overall performance ({avg_score:.2f}) below target - "
                "consider retraining or parameter adjustment"
            )

        return recommendations

    def save_test_results(self, filepath: Path):
        """Save test results to file."""
        results_data = {
            'timestamp': datetime.now().isoformat(),
            'total_tests': len(self.test_results),
            'results': [
                {
                    'test_id': r.test_case.test_id,
                    'test_name': r.test_case.name,
                    'passed': r.passed,
                    'score': r.score,
                    'response_time': r.response_time,
                    'response_text': r.response_text,
                    'notes': r.notes
                }
                for r in self.test_results
            ]
        }

        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Test results saved to {filepath}")


# Predefined test cases
def create_standard_test_cases() -> List[BehaviorTestCase]:
    """Create standard behavior test cases."""
    return [
        BehaviorTestCase(
            test_id="greeting_001",
            name="Simple Greeting",
            description="Test response to basic greeting",
            input_message="Hello!",
            expected_behavior={
                'complexity': 0.1,
                'min_score': 0.7,
                'personality': {'extraversion': 0.7, 'agreeableness': 0.8}
            },
            context={'user_id': 'test_001'},
            category='simple_greeting'
        ),

        BehaviorTestCase(
            test_id="question_001",
            name="Simple Question",
            description="Test response to straightforward question",
            input_message="What's your favorite color?",
            expected_behavior={
                'complexity': 0.3,
                'min_score': 0.6,
                'personality': {'openness': 0.6, 'agreeableness': 0.7}
            },
            context={'user_id': 'test_002'},
            category='casual_question'
        ),

        BehaviorTestCase(
            test_id="complex_001",
            name="Complex Question",
            description="Test response to complex philosophical question",
            input_message="What do you think about the nature of consciousness and whether AI can truly be conscious?",
            expected_behavior={
                'complexity': 0.9,
                'min_score': 0.5,
                'personality': {'openness': 0.8, 'conscientiousness': 0.7}
            },
            context={'user_id': 'test_003'},
            category='complex_question'
        ),

        BehaviorTestCase(
            test_id="emotion_001",
            name="Emotional Support",
            description="Test emotional response to user distress",
            input_message="I'm feeling really sad today and don't know what to do...",
            expected_behavior={
                'complexity': 0.6,
                'min_score': 0.8,
                'personality': {'agreeableness': 0.9, 'neuroticism': 0.3}
            },
            context={'user_id': 'test_004'},
            category='emotional_response'
        ),

        BehaviorTestCase(
            test_id="creative_001",
            name="Creative Request",
            description="Test creative response generation",
            input_message="Can you write a short poem about friendship?",
            expected_behavior={
                'complexity': 0.7,
                'min_score': 0.6,
                'personality': {'openness': 0.9, 'extraversion': 0.6}
            },
            context={'user_id': 'test_005'},
            category='creative_request'
        )
    ]
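

# Minimal usage sketch. Assumptions: this is run in a context where the
# relative imports above resolve (e.g. via `python -m ...`), and LyraModel /
# HumanBehaviorEngine can be constructed without arguments here; adjust to the
# real constructors.
if __name__ == "__main__":
    async def _demo() -> None:
        tester = LyraBehaviorTester(
            lyra_model=LyraModel(),                  # assumed no-arg constructor
            behavior_engine=HumanBehaviorEngine()    # assumed no-arg constructor
        )
        summary = await tester.run_behavior_test_suite(create_standard_test_cases())
        tester.save_test_results(Path("test_results/behavior_tests.json"))
        print(json.dumps(summary, indent=2, default=str))

    asyncio.run(_demo())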