""" Human-like behavior testing and refinement system. This module provides comprehensive testing of Lyra's human-like behaviors including response timing, emotional consistency, personality coherence, and learning patterns. """ import asyncio import logging import time from typing import Dict, List, Optional, Any, Tuple from dataclasses import dataclass from datetime import datetime, timedelta import statistics import json from pathlib import Path from ..core.lyra_model import LyraModel from ..emotions.system import EmotionalState from ..discord.bot import HumanBehaviorEngine from ..training.pipeline import LyraTrainingPipeline logger = logging.getLogger(__name__) @dataclass class BehaviorTestCase: """Represents a single behavior test case.""" test_id: str name: str description: str input_message: str expected_behavior: Dict[str, Any] context: Dict[str, Any] category: str @dataclass class BehaviorTestResult: """Results of a behavior test.""" test_case: BehaviorTestCase response_text: str response_time: float emotional_state: Dict[str, Any] personality_influence: Dict[str, Any] thinking_process: List[Dict[str, Any]] timing_analysis: Dict[str, Any] passed: bool score: float notes: str class TimingAnalyzer: """Analyzes response timing for human-likeness.""" def __init__(self): # Expected human response times (in seconds) self.human_baselines = { 'simple_greeting': (0.5, 2.0), 'casual_question': (1.0, 4.0), 'complex_question': (3.0, 10.0), 'emotional_response': (1.5, 6.0), 'creative_request': (4.0, 15.0), 'technical_question': (5.0, 20.0) } def analyze_timing( self, response_time: float, message_category: str, message_length: int, complexity_score: float ) -> Dict[str, Any]: """Analyze if response timing feels human.""" baseline_min, baseline_max = self.human_baselines.get( message_category, (1.0, 5.0) ) # Adjust for message length length_factor = min(message_length / 100.0, 2.0) adjusted_min = baseline_min * (1 + length_factor * 0.5) adjusted_max = baseline_max * (1 + length_factor * 0.3) # Adjust for complexity complexity_factor = 1.0 + complexity_score final_min = adjusted_min * complexity_factor final_max = adjusted_max * complexity_factor # Determine if timing is human-like is_too_fast = response_time < final_min is_too_slow = response_time > final_max is_human_like = final_min <= response_time <= final_max # Calculate humanness score if is_human_like: # Perfect timing gets high score mid_point = (final_min + final_max) / 2 distance_from_ideal = abs(response_time - mid_point) max_distance = (final_max - final_min) / 2 humanness_score = 1.0 - (distance_from_ideal / max_distance) else: # Too fast or slow gets lower score if is_too_fast: overage = (final_min - response_time) / final_min else: overage = (response_time - final_max) / final_max humanness_score = max(0.0, 1.0 - overage) return { 'response_time': response_time, 'expected_range': (final_min, final_max), 'is_human_like': is_human_like, 'is_too_fast': is_too_fast, 'is_too_slow': is_too_slow, 'humanness_score': humanness_score, 'timing_category': message_category } class EmotionalConsistencyAnalyzer: """Analyzes emotional consistency and appropriateness.""" def __init__(self): # Expected emotional responses to different contexts self.emotion_expectations = { 'positive_feedback': ['joy', 'gratitude', 'pride'], 'negative_feedback': ['sadness', 'disappointment', 'determination'], 'question': ['curiosity', 'helpfulness', 'interest'], 'greeting': ['friendliness', 'warmth', 'joy'], 'goodbye': ['sadness', 'hope', 'warmth'], 'compliment': 

class EmotionalConsistencyAnalyzer:
    """Analyzes emotional consistency and appropriateness."""

    def __init__(self):
        # Expected emotional responses to different contexts
        self.emotion_expectations = {
            'positive_feedback': ['joy', 'gratitude', 'pride'],
            'negative_feedback': ['sadness', 'disappointment', 'determination'],
            'question': ['curiosity', 'helpfulness', 'interest'],
            'greeting': ['friendliness', 'warmth', 'joy'],
            'goodbye': ['sadness', 'hope', 'warmth'],
            'compliment': ['gratitude', 'joy', 'humility'],
            'criticism': ['sadness', 'reflection', 'determination'],
            'joke': ['amusement', 'joy', 'playfulness'],
            'serious_topic': ['concern', 'thoughtfulness', 'empathy']
        }

    def analyze_emotional_response(
        self,
        message_context: str,
        emotional_state: Dict[str, Any],
        response_content: str
    ) -> Dict[str, Any]:
        """Analyze if emotional response is appropriate."""
        dominant_emotion = emotional_state.get('dominant_emotion', 'neutral')
        # Valence is used as a proxy for emotional intensity
        emotional_intensity = emotional_state.get('valence', 0.5)

        # Determine expected emotions for this context
        expected_emotions = self.emotion_expectations.get(message_context, ['neutral'])

        # Check if response emotion is appropriate
        is_appropriate = dominant_emotion in expected_emotions

        # Analyze emotional consistency in text
        emotion_indicators = self._analyze_text_emotion(response_content)
        text_emotion_matches = any(
            indicator in expected_emotions
            for indicator in emotion_indicators
        )

        # Calculate emotional appropriateness score
        appropriateness_score = 0.0
        if is_appropriate:
            appropriateness_score += 0.6
        if text_emotion_matches:
            appropriateness_score += 0.4

        return {
            'dominant_emotion': dominant_emotion,
            'intensity': emotional_intensity,
            'expected_emotions': expected_emotions,
            'is_appropriate': is_appropriate,
            'text_emotion_indicators': emotion_indicators,
            'text_matches_emotion': text_emotion_matches,
            'appropriateness_score': appropriateness_score
        }

    def _analyze_text_emotion(self, text: str) -> List[str]:
        """Analyze emotional indicators in response text."""
        indicators = []

        # Simple keyword-based emotion detection
        emotion_keywords = {
            'joy': ['happy', 'excited', 'wonderful', 'great', '😊', '😄', '🎉'],
            'sadness': ['sad', 'sorry', 'unfortunately', 'disappointed', '😔', '😢'],
            'curiosity': ['interesting', 'wonder', 'curious', 'explore', '🤔'],
            'gratitude': ['thank', 'appreciate', 'grateful', 'thanks', '🙏'],
            'amusement': ['funny', 'haha', 'lol', 'amusing', '😂', '😄'],
            'concern': ['worried', 'concern', 'careful', 'trouble'],
            'determination': ['will', 'shall', 'determined', 'commit']
        }

        text_lower = text.lower()
        for emotion, keywords in emotion_keywords.items():
            if any(keyword in text_lower for keyword in keywords):
                indicators.append(emotion)

        return indicators
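
# Example (illustrative only; the emotional_state dict shape is assumed):
#
#     analyzer = EmotionalConsistencyAnalyzer()
#     analysis = analyzer.analyze_emotional_response(
#         message_context='compliment',
#         emotional_state={'dominant_emotion': 'gratitude', 'valence': 0.8},
#         response_content="Thank you, that really means a lot! 😊",
#     )
#     # 'gratitude' is expected for compliments and the text contains "thank",
#     # so both checks pass and appropriateness_score comes out at 1.0.
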

class PersonalityCoherenceAnalyzer:
    """Analyzes personality coherence across responses."""

    def __init__(self):
        self.personality_indicators = {
            'extraversion': {
                'high': ['excited', 'love talking', 'people', 'social', 'energy'],
                'low': ['quiet', 'prefer', 'alone', 'thoughtful', 'reflection']
            },
            'openness': {
                'high': ['creative', 'imagine', 'explore', 'new', 'possibility'],
                'low': ['practical', 'traditional', 'proven', 'reliable']
            },
            'conscientiousness': {
                'high': ['careful', 'plan', 'organized', 'thorough', 'responsible'],
                'low': ['spontaneous', 'flexible', 'go with flow']
            },
            'agreeableness': {
                'high': ['understand', 'help', 'kind', 'supportive', 'empathy'],
                'low': ['direct', 'honest', 'critical', 'objective']
            },
            'neuroticism': {
                'high': ['worried', 'anxious', 'stress', 'uncertain'],
                'low': ['calm', 'stable', 'confident', 'relaxed']
            }
        }

    def analyze_personality_consistency(
        self,
        response_text: str,
        expected_personality: Dict[str, float],
        response_history: List[str]
    ) -> Dict[str, Any]:
        """Analyze if response matches expected personality."""
        # Analyze current response
        current_indicators = self._extract_personality_indicators(response_text)

        # Analyze historical consistency if available
        historical_consistency = 1.0
        if response_history:
            historical_indicators = [
                self._extract_personality_indicators(response)
                for response in response_history[-5:]  # Last 5 responses
            ]
            historical_consistency = self._calculate_consistency(
                current_indicators, historical_indicators
            )

        # Compare with expected personality
        personality_match_score = self._calculate_personality_match(
            current_indicators, expected_personality
        )

        return {
            'current_indicators': current_indicators,
            'personality_match_score': personality_match_score,
            'historical_consistency': historical_consistency,
            'overall_coherence': (personality_match_score + historical_consistency) / 2
        }

    def _extract_personality_indicators(self, text: str) -> Dict[str, float]:
        """Extract personality indicators from text."""
        indicators = {trait: 0.0 for trait in self.personality_indicators.keys()}
        text_lower = text.lower()

        for trait, trait_indicators in self.personality_indicators.items():
            high_count = sum(
                1 for keyword in trait_indicators['high']
                if keyword in text_lower
            )
            low_count = sum(
                1 for keyword in trait_indicators['low']
                if keyword in text_lower
            )

            if high_count > 0 or low_count > 0:
                # Calculate trait score (-1 to 1)
                total_indicators = high_count + low_count
                indicators[trait] = (high_count - low_count) / total_indicators

        return indicators

    def _calculate_consistency(
        self,
        current: Dict[str, float],
        historical: List[Dict[str, float]]
    ) -> float:
        """Calculate consistency between current and historical indicators."""
        if not historical:
            return 1.0

        consistencies = []
        for trait in current.keys():
            current_value = current[trait]
            historical_values = [h.get(trait, 0.0) for h in historical]

            if not historical_values:
                continue

            avg_historical = statistics.mean(historical_values)
            consistency = 1.0 - abs(current_value - avg_historical) / 2.0
            consistencies.append(max(consistency, 0.0))

        return statistics.mean(consistencies) if consistencies else 1.0

    def _calculate_personality_match(
        self,
        indicators: Dict[str, float],
        expected: Dict[str, float]
    ) -> float:
        """Calculate how well indicators match expected personality."""
        matches = []
        for trait, expected_value in expected.items():
            if trait not in indicators:
                continue

            indicator_value = indicators[trait]
            # Convert expected trait (0-1) to indicator scale (-1 to 1)
            expected_indicator = (expected_value - 0.5) * 2

            # Calculate match (closer = better)
            match = 1.0 - abs(indicator_value - expected_indicator) / 2.0
            matches.append(max(match, 0.0))

        return statistics.mean(matches) if matches else 0.5
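
# Example of the trait-scale conversion used above (illustrative only; calls into
# private helpers purely to show the arithmetic):
#
#     analyzer = PersonalityCoherenceAnalyzer()
#     indicators = analyzer._extract_personality_indicators(
#         "I'd love to explore something new and creative with people!"
#     )
#     # "explore", "new", "creative" -> openness +1.0; "people" -> extraversion +1.0
#     score = analyzer._calculate_personality_match(
#         indicators, {'openness': 0.9, 'extraversion': 0.7}
#     )
#     # Expected 0.9 maps to +0.8 and 0.7 maps to +0.4 on the -1..1 scale,
#     # giving per-trait matches of 0.9 and 0.7 and an overall score of 0.8.
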

class LyraBehaviorTester:
    """Comprehensive behavior testing system for Lyra."""

    def __init__(
        self,
        lyra_model: LyraModel,
        behavior_engine: HumanBehaviorEngine
    ):
        self.lyra_model = lyra_model
        self.behavior_engine = behavior_engine

        # Analyzers
        self.timing_analyzer = TimingAnalyzer()
        self.emotion_analyzer = EmotionalConsistencyAnalyzer()
        self.personality_analyzer = PersonalityCoherenceAnalyzer()

        # Test results
        self.test_results: List[BehaviorTestResult] = []
        self.response_history: Dict[str, List[str]] = {}

    async def run_behavior_test_suite(
        self,
        test_cases: List[BehaviorTestCase]
    ) -> Dict[str, Any]:
        """Run complete behavior test suite."""
        logger.info(f"Starting behavior test suite with {len(test_cases)} test cases...")

        results = []
        start_time = time.time()

        for i, test_case in enumerate(test_cases):
            logger.info(f"Running test {i+1}/{len(test_cases)}: {test_case.name}")

            result = await self._run_single_test(test_case)
            results.append(result)

            # Brief pause between tests
            await asyncio.sleep(0.5)

        total_time = time.time() - start_time

        # Calculate overall metrics
        summary = self._calculate_test_summary(results, total_time)
        self.test_results.extend(results)

        return summary

    async def _run_single_test(
        self,
        test_case: BehaviorTestCase
    ) -> BehaviorTestResult:
        """Run a single behavior test."""
        # Record start time
        start_time = time.time()

        # Generate response
        try:
            response_text, response_info = await self.lyra_model.generate_response(
                user_message=test_case.input_message,
                user_id=test_case.context.get('user_id', 'test_user'),
                max_new_tokens=150,
                temperature=0.9
            )
        except Exception as e:
            logger.error(f"Error generating response for test {test_case.test_id}: {e}")
            return BehaviorTestResult(
                test_case=test_case,
                response_text="",
                response_time=0.0,
                emotional_state={},
                personality_influence={},
                thinking_process=[],
                timing_analysis={},
                passed=False,
                score=0.0,
                notes=f"Error: {str(e)}"
            )

        response_time = time.time() - start_time

        # Analyze timing
        timing_analysis = self.timing_analyzer.analyze_timing(
            response_time=response_time,
            message_category=test_case.category,
            message_length=len(test_case.input_message),
            complexity_score=test_case.expected_behavior.get('complexity', 0.5)
        )

        # Analyze emotional consistency
        emotional_analysis = self.emotion_analyzer.analyze_emotional_response(
            message_context=test_case.category,
            emotional_state=response_info.get('emotional_state', {}),
            response_content=response_text
        )

        # Analyze personality coherence
        user_id = test_case.context.get('user_id', 'test_user')
        history = self.response_history.get(user_id, [])
        personality_analysis = self.personality_analyzer.analyze_personality_consistency(
            response_text=response_text,
            expected_personality=test_case.expected_behavior.get('personality', {}),
            response_history=history
        )

        # Update response history
        if user_id not in self.response_history:
            self.response_history[user_id] = []
        self.response_history[user_id].append(response_text)

        # Calculate overall score
        timing_score = timing_analysis.get('humanness_score', 0.0)
        emotional_score = emotional_analysis.get('appropriateness_score', 0.0)
        personality_score = personality_analysis.get('overall_coherence', 0.0)

        overall_score = (timing_score + emotional_score + personality_score) / 3.0

        # Determine if test passed
        min_passing_score = test_case.expected_behavior.get('min_score', 0.6)
        passed = overall_score >= min_passing_score

        # Generate notes
        notes = self._generate_test_notes(
            timing_analysis, emotional_analysis, personality_analysis
        )

        return BehaviorTestResult(
            test_case=test_case,
            response_text=response_text,
            response_time=response_time,
            emotional_state=response_info.get('emotional_state', {}),
            personality_influence=response_info.get('personality_influence', {}),
            thinking_process=response_info.get('thoughts', []),
            timing_analysis=timing_analysis,
            passed=passed,
            score=overall_score,
            notes=notes
        )
    def _generate_test_notes(
        self,
        timing_analysis: Dict[str, Any],
        emotional_analysis: Dict[str, Any],
        personality_analysis: Dict[str, Any]
    ) -> str:
        """Generate notes about test performance."""
        notes = []

        # Timing notes
        if timing_analysis.get('is_too_fast'):
            notes.append("Response was too fast for human-like behavior")
        elif timing_analysis.get('is_too_slow'):
            notes.append("Response was too slow")
        elif timing_analysis.get('is_human_like'):
            notes.append("Good response timing")

        # Emotional notes
        if not emotional_analysis.get('is_appropriate'):
            expected = emotional_analysis.get('expected_emotions', [])
            actual = emotional_analysis.get('dominant_emotion', 'unknown')
            notes.append(f"Emotional response '{actual}' doesn't match expected {expected}")

        if emotional_analysis.get('text_matches_emotion'):
            notes.append("Text emotion matches internal emotional state")

        # Personality notes
        coherence = personality_analysis.get('overall_coherence', 0.0)
        if coherence < 0.5:
            notes.append("Personality coherence below expectations")
        elif coherence > 0.8:
            notes.append("Excellent personality consistency")

        return "; ".join(notes) if notes else "All metrics within acceptable ranges"

    def _calculate_test_summary(
        self,
        results: List[BehaviorTestResult],
        total_time: float
    ) -> Dict[str, Any]:
        """Calculate summary statistics for test suite."""
        if not results:
            return {'status': 'no_tests_run'}

        passed_count = sum(1 for r in results if r.passed)
        pass_rate = passed_count / len(results)

        scores = [r.score for r in results]
        avg_score = statistics.mean(scores)
        min_score = min(scores)
        max_score = max(scores)

        # Category breakdown
        category_stats = {}
        for result in results:
            category = result.test_case.category
            if category not in category_stats:
                category_stats[category] = {'passed': 0, 'total': 0, 'scores': []}

            category_stats[category]['total'] += 1
            if result.passed:
                category_stats[category]['passed'] += 1
            category_stats[category]['scores'].append(result.score)

        # Calculate category pass rates
        for category, stats in category_stats.items():
            stats['pass_rate'] = stats['passed'] / stats['total']
            stats['avg_score'] = statistics.mean(stats['scores'])

        return {
            'total_tests': len(results),
            'passed_tests': passed_count,
            'failed_tests': len(results) - passed_count,
            'pass_rate': pass_rate,
            'avg_score': avg_score,
            'min_score': min_score,
            'max_score': max_score,
            'total_time': total_time,
            'tests_per_second': len(results) / total_time,
            'category_breakdown': category_stats,
            'recommendations': self._generate_recommendations(results)
        }

    def _generate_recommendations(
        self,
        results: List[BehaviorTestResult]
    ) -> List[str]:
        """Generate recommendations based on test results."""
        recommendations = []

        # Analyze common failure patterns
        failed_results = [r for r in results if not r.passed]

        if failed_results:
            # Timing issues
            timing_issues = [
                r for r in failed_results
                if r.timing_analysis.get('humanness_score', 1.0) < 0.5
            ]
            if len(timing_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Consider adjusting response timing parameters - "
                    f"{len(timing_issues)} tests failed on timing"
                )

            # Emotional issues: the emotional analysis itself is not stored on
            # BehaviorTestResult, so detect mismatches via the note emitted by
            # _generate_test_notes
            emotion_issues = [
                r for r in failed_results
                if "doesn't match expected" in r.notes
            ]
            if len(emotion_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Review emotional response mapping - "
                    f"{len(emotion_issues)} tests had inappropriate emotional responses"
                )

        # Overall performance
        avg_score = statistics.mean([r.score for r in results])
        if avg_score < 0.7:
            recommendations.append(
                f"Overall performance ({avg_score:.2f}) below target - "
                "consider retraining or parameter adjustment"
            )

        return recommendations

    def save_test_results(self, filepath: Path):
        """Save test results to file."""
        results_data = {
            'timestamp': datetime.now().isoformat(),
            'total_tests': len(self.test_results),
            'results': [
                {
                    'test_id': r.test_case.test_id,
                    'test_name': r.test_case.name,
                    'passed': r.passed,
                    'score': r.score,
                    'response_time': r.response_time,
                    'response_text': r.response_text,
                    'notes': r.notes
                }
                for r in self.test_results
            ]
        }

        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Test results saved to {filepath}")
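
# Example of driving the tester (illustrative only; constructing LyraModel and
# HumanBehaviorEngine is project-specific and assumed to happen elsewhere, and the
# output path below is hypothetical):
#
#     tester = LyraBehaviorTester(lyra_model=model, behavior_engine=engine)
#     summary = asyncio.run(tester.run_behavior_test_suite(create_standard_test_cases()))
#     print(f"Pass rate: {summary['pass_rate']:.0%}, avg score: {summary['avg_score']:.2f}")
#     tester.save_test_results(Path("test_results/behavior_suite.json"))
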
name="Simple Greeting", description="Test response to basic greeting", input_message="Hello!", expected_behavior={ 'complexity': 0.1, 'min_score': 0.7, 'personality': {'extraversion': 0.7, 'agreeableness': 0.8} }, context={'user_id': 'test_001'}, category='simple_greeting' ), BehaviorTestCase( test_id="question_001", name="Simple Question", description="Test response to straightforward question", input_message="What's your favorite color?", expected_behavior={ 'complexity': 0.3, 'min_score': 0.6, 'personality': {'openness': 0.6, 'agreeableness': 0.7} }, context={'user_id': 'test_002'}, category='casual_question' ), BehaviorTestCase( test_id="complex_001", name="Complex Question", description="Test response to complex philosophical question", input_message="What do you think about the nature of consciousness and whether AI can truly be conscious?", expected_behavior={ 'complexity': 0.9, 'min_score': 0.5, 'personality': {'openness': 0.8, 'conscientiousness': 0.7} }, context={'user_id': 'test_003'}, category='complex_question' ), BehaviorTestCase( test_id="emotion_001", name="Emotional Support", description="Test emotional response to user distress", input_message="I'm feeling really sad today and don't know what to do...", expected_behavior={ 'complexity': 0.6, 'min_score': 0.8, 'personality': {'agreeableness': 0.9, 'neuroticism': 0.3} }, context={'user_id': 'test_004'}, category='emotional_response' ), BehaviorTestCase( test_id="creative_001", name="Creative Request", description="Test creative response generation", input_message="Can you write a short poem about friendship?", expected_behavior={ 'complexity': 0.7, 'min_score': 0.6, 'personality': {'openness': 0.9, 'extraversion': 0.6} }, context={'user_id': 'test_005'}, category='creative_request' ) ]