feat: Add database setup guide and local configuration files
- Added DATABASE_SETUP.md with a comprehensive guide for PostgreSQL and Redis installation on Windows
- Created .claude/settings.local.json with permission settings for pytest and database fix scripts
- Updated .gitignore to exclude .env.backup
- Included database connection test utilities in lyra/database_setup.py
- Added environment variable configuration examples for local development
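For local development, a minimal .env along the lines the guide describes might look like the sketch below (variable names are illustrative assumptions; DATABASE_SETUP.md defines the actual ones):

    # Hypothetical local development settings (names are placeholders)
    DATABASE_URL=postgresql://lyra:lyra@localhost:5432/lyra
    REDIS_URL=redis://localhost:6379/0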
lyra/testing/behavior_tests.py (new file, 701 lines)

@@ -0,0 +1,701 @@
"""
|
||||
Human-like behavior testing and refinement system.
|
||||
|
||||
This module provides comprehensive testing of Lyra's human-like behaviors
|
||||
including response timing, emotional consistency, personality coherence,
|
||||
and learning patterns.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, List, Optional, Any, Tuple
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta
|
||||
import statistics
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from ..core.lyra_model import LyraModel
|
||||
from ..emotions.system import EmotionalState
|
||||
from ..discord.bot import HumanBehaviorEngine
|
||||
from ..training.pipeline import LyraTrainingPipeline
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class BehaviorTestCase:
    """Represents a single behavior test case."""
    test_id: str
    name: str
    description: str
    input_message: str
    expected_behavior: Dict[str, Any]
    context: Dict[str, Any]
    category: str


@dataclass
class BehaviorTestResult:
    """Results of a behavior test."""
    test_case: BehaviorTestCase
    response_text: str
    response_time: float
    emotional_state: Dict[str, Any]
    emotional_analysis: Dict[str, Any]  # output of EmotionalConsistencyAnalyzer
    personality_influence: Dict[str, Any]
    thinking_process: List[Dict[str, Any]]
    timing_analysis: Dict[str, Any]
    passed: bool
    score: float
    notes: str


class TimingAnalyzer:
    """Analyzes response timing for human-likeness."""

    def __init__(self):
        # Expected human response times (in seconds) per message category
        self.human_baselines = {
            'simple_greeting': (0.5, 2.0),
            'casual_question': (1.0, 4.0),
            'complex_question': (3.0, 10.0),
            'emotional_response': (1.5, 6.0),
            'creative_request': (4.0, 15.0),
            'technical_question': (5.0, 20.0)
        }

    def analyze_timing(
        self,
        response_time: float,
        message_category: str,
        message_length: int,
        complexity_score: float
    ) -> Dict[str, Any]:
        """Analyze whether response timing feels human."""

        baseline_min, baseline_max = self.human_baselines.get(
            message_category, (1.0, 5.0)
        )

        # Adjust for message length (factor capped at 2.0 for very long messages)
        length_factor = min(message_length / 100.0, 2.0)
        adjusted_min = baseline_min * (1 + length_factor * 0.5)
        adjusted_max = baseline_max * (1 + length_factor * 0.3)

        # Adjust for complexity
        complexity_factor = 1.0 + complexity_score
        final_min = adjusted_min * complexity_factor
        final_max = adjusted_max * complexity_factor

        # Determine whether timing is human-like
        is_too_fast = response_time < final_min
        is_too_slow = response_time > final_max
        is_human_like = final_min <= response_time <= final_max

        # Calculate humanness score
        if is_human_like:
            # Timing near the middle of the expected range scores highest
            mid_point = (final_min + final_max) / 2
            distance_from_ideal = abs(response_time - mid_point)
            max_distance = (final_max - final_min) / 2
            humanness_score = 1.0 - (distance_from_ideal / max_distance)
        else:
            # Too fast or too slow scores lower, in proportion to the deviation
            if is_too_fast:
                deviation = (final_min - response_time) / final_min
            else:
                deviation = (response_time - final_max) / final_max

            humanness_score = max(0.0, 1.0 - deviation)

        return {
            'response_time': response_time,
            'expected_range': (final_min, final_max),
            'is_human_like': is_human_like,
            'is_too_fast': is_too_fast,
            'is_too_slow': is_too_slow,
            'humanness_score': humanness_score,
            'timing_category': message_category
        }


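# Example (illustrative values, not part of the shipped test suite): a
# 3-second reply to a short casual question lands inside the adjusted range.
#
#     analyzer = TimingAnalyzer()
#     report = analyzer.analyze_timing(
#         response_time=3.0,
#         message_category='casual_question',
#         message_length=25,
#         complexity_score=0.2,
#     )
#     # baseline (1.0, 4.0) adjusts to roughly (1.35, 5.16), so:
#     assert report['is_human_like']

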
class EmotionalConsistencyAnalyzer:
    """Analyzes emotional consistency and appropriateness."""

    def __init__(self):
        # Expected emotional responses to different contexts
        self.emotion_expectations = {
            'positive_feedback': ['joy', 'gratitude', 'pride'],
            'negative_feedback': ['sadness', 'disappointment', 'determination'],
            'question': ['curiosity', 'helpfulness', 'interest'],
            'greeting': ['friendliness', 'warmth', 'joy'],
            'goodbye': ['sadness', 'hope', 'warmth'],
            'compliment': ['gratitude', 'joy', 'humility'],
            'criticism': ['sadness', 'reflection', 'determination'],
            'joke': ['amusement', 'joy', 'playfulness'],
            'serious_topic': ['concern', 'thoughtfulness', 'empathy']
        }

    def analyze_emotional_response(
        self,
        message_context: str,
        emotional_state: Dict[str, Any],
        response_content: str
    ) -> Dict[str, Any]:
        """Analyze whether the emotional response is appropriate."""

        dominant_emotion = emotional_state.get('dominant_emotion', 'neutral')
        # Valence is used as a proxy for intensity here
        emotional_intensity = emotional_state.get('valence', 0.5)

        # Determine expected emotions for this context
        expected_emotions = self.emotion_expectations.get(message_context, ['neutral'])

        # Check whether the internal emotion is appropriate for the context
        is_appropriate = dominant_emotion in expected_emotions

        # Check whether the response text expresses a matching emotion
        emotion_indicators = self._analyze_text_emotion(response_content)
        text_emotion_matches = any(
            indicator in expected_emotions
            for indicator in emotion_indicators
        )

        # Weighted appropriateness score: internal state counts for 0.6,
        # textual expression for 0.4
        appropriateness_score = 0.0
        if is_appropriate:
            appropriateness_score += 0.6
        if text_emotion_matches:
            appropriateness_score += 0.4

        return {
            'dominant_emotion': dominant_emotion,
            'intensity': emotional_intensity,
            'expected_emotions': expected_emotions,
            'is_appropriate': is_appropriate,
            'text_emotion_indicators': emotion_indicators,
            'text_matches_emotion': text_emotion_matches,
            'appropriateness_score': appropriateness_score
        }

    def _analyze_text_emotion(self, text: str) -> List[str]:
        """Analyze emotional indicators in response text."""
        indicators = []

        # Simple keyword-based emotion detection
        emotion_keywords = {
            'joy': ['happy', 'excited', 'wonderful', 'great', '😊', '😄', '🎉'],
            'sadness': ['sad', 'sorry', 'unfortunately', 'disappointed', '😔', '😢'],
            'curiosity': ['interesting', 'wonder', 'curious', 'explore', '🤔'],
            'gratitude': ['thank', 'appreciate', 'grateful', 'thanks', '🙏'],
            'amusement': ['funny', 'haha', 'lol', 'amusing', '😂', '😄'],
            'concern': ['worried', 'concern', 'careful', 'trouble'],
            'determination': ['will', 'shall', 'determined', 'commit']
        }

        text_lower = text.lower()
        for emotion, keywords in emotion_keywords.items():
            if any(keyword in text_lower for keyword in keywords):
                indicators.append(emotion)

        return indicators


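# Example (illustrative): a warm reply to a greeting scores on both the
# internal state (0.6) and the matching text keywords (0.4).
#
#     analyzer = EmotionalConsistencyAnalyzer()
#     result = analyzer.analyze_emotional_response(
#         message_context='greeting',
#         emotional_state={'dominant_emotion': 'joy', 'valence': 0.8},
#         response_content="Hi! Happy to see you 😊",
#     )
#     assert result['appropriateness_score'] == 1.0

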
class PersonalityCoherenceAnalyzer:
    """Analyzes personality coherence across responses."""

    def __init__(self):
        # Keyword indicators for the high and low poles of each Big Five trait
        self.personality_indicators = {
            'extraversion': {
                'high': ['excited', 'love talking', 'people', 'social', 'energy'],
                'low': ['quiet', 'prefer', 'alone', 'thoughtful', 'reflection']
            },
            'openness': {
                'high': ['creative', 'imagine', 'explore', 'new', 'possibility'],
                'low': ['practical', 'traditional', 'proven', 'reliable']
            },
            'conscientiousness': {
                'high': ['careful', 'plan', 'organized', 'thorough', 'responsible'],
                'low': ['spontaneous', 'flexible', 'go with flow']
            },
            'agreeableness': {
                'high': ['understand', 'help', 'kind', 'supportive', 'empathy'],
                'low': ['direct', 'honest', 'critical', 'objective']
            },
            'neuroticism': {
                'high': ['worried', 'anxious', 'stress', 'uncertain'],
                'low': ['calm', 'stable', 'confident', 'relaxed']
            }
        }

    def analyze_personality_consistency(
        self,
        response_text: str,
        expected_personality: Dict[str, float],
        response_history: List[str]
    ) -> Dict[str, Any]:
        """Analyze whether the response matches the expected personality."""

        # Analyze the current response
        current_indicators = self._extract_personality_indicators(response_text)

        # Analyze historical consistency if history is available
        historical_consistency = 1.0
        if response_history:
            historical_indicators = [
                self._extract_personality_indicators(response)
                for response in response_history[-5:]  # last 5 responses
            ]
            historical_consistency = self._calculate_consistency(
                current_indicators, historical_indicators
            )

        # Compare with the expected personality profile
        personality_match_score = self._calculate_personality_match(
            current_indicators, expected_personality
        )

        return {
            'current_indicators': current_indicators,
            'personality_match_score': personality_match_score,
            'historical_consistency': historical_consistency,
            'overall_coherence': (personality_match_score + historical_consistency) / 2
        }

    def _extract_personality_indicators(self, text: str) -> Dict[str, float]:
        """Extract personality indicators from text."""
        indicators = {trait: 0.0 for trait in self.personality_indicators.keys()}
        text_lower = text.lower()

        for trait, trait_indicators in self.personality_indicators.items():
            high_count = sum(
                1 for keyword in trait_indicators['high']
                if keyword in text_lower
            )
            low_count = sum(
                1 for keyword in trait_indicators['low']
                if keyword in text_lower
            )

            if high_count > 0 or low_count > 0:
                # Trait score in [-1, 1]: positive leans high, negative leans low
                total_indicators = high_count + low_count
                indicators[trait] = (high_count - low_count) / total_indicators

        return indicators

    def _calculate_consistency(
        self,
        current: Dict[str, float],
        historical: List[Dict[str, float]]
    ) -> float:
        """Calculate consistency between current and historical indicators."""
        if not historical:
            return 1.0

        consistencies = []
        for trait in current.keys():
            current_value = current[trait]
            historical_values = [h.get(trait, 0.0) for h in historical]

            if not historical_values:
                continue

            avg_historical = statistics.mean(historical_values)
            # Indicator values span [-1, 1], so the maximum difference is 2
            consistency = 1.0 - abs(current_value - avg_historical) / 2.0
            consistencies.append(max(consistency, 0.0))

        return statistics.mean(consistencies) if consistencies else 1.0

    def _calculate_personality_match(
        self,
        indicators: Dict[str, float],
        expected: Dict[str, float]
    ) -> float:
        """Calculate how well indicators match the expected personality."""
        matches = []

        for trait, expected_value in expected.items():
            if trait not in indicators:
                continue

            indicator_value = indicators[trait]

            # Convert expected trait (0 to 1) to the indicator scale (-1 to 1)
            expected_indicator = (expected_value - 0.5) * 2

            # Calculate match (closer = better)
            match = 1.0 - abs(indicator_value - expected_indicator) / 2.0
            matches.append(max(match, 0.0))

        return statistics.mean(matches) if matches else 0.5


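# Worked example of the scale conversion above (illustrative): an expected
# extraversion of 0.7 maps to (0.7 - 0.5) * 2 = 0.4 on the indicator scale,
# so a response whose extraversion indicator is 0.4 is a perfect match (1.0),
# while an indicator of -0.6 scores 1.0 - |-0.6 - 0.4| / 2.0 = 0.5.

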
class LyraBehaviorTester:
    """Comprehensive behavior testing system for Lyra."""

    def __init__(
        self,
        lyra_model: LyraModel,
        behavior_engine: HumanBehaviorEngine
    ):
        self.lyra_model = lyra_model
        self.behavior_engine = behavior_engine

        # Analyzers
        self.timing_analyzer = TimingAnalyzer()
        self.emotion_analyzer = EmotionalConsistencyAnalyzer()
        self.personality_analyzer = PersonalityCoherenceAnalyzer()

        # Test results
        self.test_results: List[BehaviorTestResult] = []
        self.response_history: Dict[str, List[str]] = {}

    async def run_behavior_test_suite(
        self,
        test_cases: List[BehaviorTestCase]
    ) -> Dict[str, Any]:
        """Run the complete behavior test suite."""
        logger.info(f"Starting behavior test suite with {len(test_cases)} test cases...")

        results = []
        start_time = time.time()

        for i, test_case in enumerate(test_cases):
            logger.info(f"Running test {i+1}/{len(test_cases)}: {test_case.name}")

            result = await self._run_single_test(test_case)
            results.append(result)

            # Brief pause between tests
            await asyncio.sleep(0.5)

        total_time = time.time() - start_time

        # Calculate overall metrics
        summary = self._calculate_test_summary(results, total_time)

        self.test_results.extend(results)

        return summary

    async def _run_single_test(
        self,
        test_case: BehaviorTestCase
    ) -> BehaviorTestResult:
        """Run a single behavior test."""

        # Record start time
        start_time = time.time()

        # Generate response
        try:
            response_text, response_info = await self.lyra_model.generate_response(
                user_message=test_case.input_message,
                user_id=test_case.context.get('user_id', 'test_user'),
                max_new_tokens=150,
                temperature=0.9
            )
        except Exception as e:
            logger.error(f"Error generating response for test {test_case.test_id}: {e}")
            return BehaviorTestResult(
                test_case=test_case,
                response_text="",
                response_time=0.0,
                emotional_state={},
                emotional_analysis={},
                personality_influence={},
                thinking_process=[],
                timing_analysis={},
                passed=False,
                score=0.0,
                notes=f"Error: {e}"
            )

        response_time = time.time() - start_time

        # Analyze timing
        timing_analysis = self.timing_analyzer.analyze_timing(
            response_time=response_time,
            message_category=test_case.category,
            message_length=len(test_case.input_message),
            complexity_score=test_case.expected_behavior.get('complexity', 0.5)
        )

        # Analyze emotional consistency
        emotional_analysis = self.emotion_analyzer.analyze_emotional_response(
            message_context=test_case.category,
            emotional_state=response_info.get('emotional_state', {}),
            response_content=response_text
        )

        # Analyze personality coherence
        user_id = test_case.context.get('user_id', 'test_user')
        history = self.response_history.get(user_id, [])

        personality_analysis = self.personality_analyzer.analyze_personality_consistency(
            response_text=response_text,
            expected_personality=test_case.expected_behavior.get('personality', {}),
            response_history=history
        )

        # Update response history
        self.response_history.setdefault(user_id, []).append(response_text)

        # Overall score is the mean of the three sub-scores
        timing_score = timing_analysis.get('humanness_score', 0.0)
        emotional_score = emotional_analysis.get('appropriateness_score', 0.0)
        personality_score = personality_analysis.get('overall_coherence', 0.0)

        overall_score = (timing_score + emotional_score + personality_score) / 3.0

        # Determine whether the test passed
        min_passing_score = test_case.expected_behavior.get('min_score', 0.6)
        passed = overall_score >= min_passing_score

        # Generate notes
        notes = self._generate_test_notes(
            timing_analysis, emotional_analysis, personality_analysis
        )

        return BehaviorTestResult(
            test_case=test_case,
            response_text=response_text,
            response_time=response_time,
            emotional_state=response_info.get('emotional_state', {}),
            emotional_analysis=emotional_analysis,
            personality_influence=response_info.get('personality_influence', {}),
            thinking_process=response_info.get('thoughts', []),
            timing_analysis=timing_analysis,
            passed=passed,
            score=overall_score,
            notes=notes
        )

    def _generate_test_notes(
        self,
        timing_analysis: Dict[str, Any],
        emotional_analysis: Dict[str, Any],
        personality_analysis: Dict[str, Any]
    ) -> str:
        """Generate notes about test performance."""
        notes = []

        # Timing notes
        if timing_analysis.get('is_too_fast'):
            notes.append("Response was too fast for human-like behavior")
        elif timing_analysis.get('is_too_slow'):
            notes.append("Response was too slow")
        elif timing_analysis.get('is_human_like'):
            notes.append("Good response timing")

        # Emotional notes
        if not emotional_analysis.get('is_appropriate'):
            expected = emotional_analysis.get('expected_emotions', [])
            actual = emotional_analysis.get('dominant_emotion', 'unknown')
            notes.append(f"Emotional response '{actual}' doesn't match expected {expected}")

        if emotional_analysis.get('text_matches_emotion'):
            notes.append("Text emotion matches internal emotional state")

        # Personality notes
        coherence = personality_analysis.get('overall_coherence', 0.0)
        if coherence < 0.5:
            notes.append("Personality coherence below expectations")
        elif coherence > 0.8:
            notes.append("Excellent personality consistency")

        return "; ".join(notes) if notes else "All metrics within acceptable ranges"

    def _calculate_test_summary(
        self,
        results: List[BehaviorTestResult],
        total_time: float
    ) -> Dict[str, Any]:
        """Calculate summary statistics for a test suite run."""

        if not results:
            return {'status': 'no_tests_run'}

        passed_count = sum(1 for r in results if r.passed)
        pass_rate = passed_count / len(results)

        scores = [r.score for r in results]
        avg_score = statistics.mean(scores)
        min_score = min(scores)
        max_score = max(scores)

        # Category breakdown
        category_stats = {}
        for result in results:
            category = result.test_case.category
            if category not in category_stats:
                category_stats[category] = {'passed': 0, 'total': 0, 'scores': []}

            category_stats[category]['total'] += 1
            if result.passed:
                category_stats[category]['passed'] += 1
            category_stats[category]['scores'].append(result.score)

        # Calculate per-category pass rates and average scores
        for category, stats in category_stats.items():
            stats['pass_rate'] = stats['passed'] / stats['total']
            stats['avg_score'] = statistics.mean(stats['scores'])

        return {
            'total_tests': len(results),
            'passed_tests': passed_count,
            'failed_tests': len(results) - passed_count,
            'pass_rate': pass_rate,
            'avg_score': avg_score,
            'min_score': min_score,
            'max_score': max_score,
            'total_time': total_time,
            'tests_per_second': len(results) / total_time if total_time > 0 else 0.0,
            'category_breakdown': category_stats,
            'recommendations': self._generate_recommendations(results)
        }

    def _generate_recommendations(
        self,
        results: List[BehaviorTestResult]
    ) -> List[str]:
        """Generate recommendations based on test results."""
        recommendations = []

        # Analyze common failure patterns
        failed_results = [r for r in results if not r.passed]

        if failed_results:
            # Timing issues
            timing_issues = [
                r for r in failed_results
                if r.timing_analysis.get('humanness_score', 1.0) < 0.5
            ]
            if len(timing_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Consider adjusting response timing parameters - "
                    f"{len(timing_issues)} tests failed on timing"
                )

            # Emotional issues
            emotion_issues = [
                r for r in failed_results
                if not r.emotional_analysis.get('is_appropriate', True)
            ]
            if len(emotion_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Review emotional response mapping - "
                    f"{len(emotion_issues)} tests had inappropriate emotional responses"
                )

        # Overall performance
        avg_score = statistics.mean([r.score for r in results])
        if avg_score < 0.7:
            recommendations.append(
                f"Overall performance ({avg_score:.2f}) below target - "
                "consider retraining or parameter adjustment"
            )

        return recommendations

    def save_test_results(self, filepath: Path):
        """Save test results to a JSON file."""
        results_data = {
            'timestamp': datetime.now().isoformat(),
            'total_tests': len(self.test_results),
            'results': [
                {
                    'test_id': r.test_case.test_id,
                    'test_name': r.test_case.name,
                    'passed': r.passed,
                    'score': r.score,
                    'response_time': r.response_time,
                    'response_text': r.response_text,
                    'notes': r.notes
                }
                for r in self.test_results
            ]
        }

        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Test results saved to {filepath}")


# Predefined test cases
def create_standard_test_cases() -> List[BehaviorTestCase]:
    """Create standard behavior test cases."""
    return [
        BehaviorTestCase(
            test_id="greeting_001",
            name="Simple Greeting",
            description="Test response to a basic greeting",
            input_message="Hello!",
            expected_behavior={
                'complexity': 0.1,
                'min_score': 0.7,
                'personality': {'extraversion': 0.7, 'agreeableness': 0.8}
            },
            context={'user_id': 'test_001'},
            category='simple_greeting'
        ),
        BehaviorTestCase(
            test_id="question_001",
            name="Simple Question",
            description="Test response to a straightforward question",
            input_message="What's your favorite color?",
            expected_behavior={
                'complexity': 0.3,
                'min_score': 0.6,
                'personality': {'openness': 0.6, 'agreeableness': 0.7}
            },
            context={'user_id': 'test_002'},
            category='casual_question'
        ),
        BehaviorTestCase(
            test_id="complex_001",
            name="Complex Question",
            description="Test response to a complex philosophical question",
            input_message="What do you think about the nature of consciousness and whether AI can truly be conscious?",
            expected_behavior={
                'complexity': 0.9,
                'min_score': 0.5,
                'personality': {'openness': 0.8, 'conscientiousness': 0.7}
            },
            context={'user_id': 'test_003'},
            category='complex_question'
        ),
        BehaviorTestCase(
            test_id="emotion_001",
            name="Emotional Support",
            description="Test emotional response to user distress",
            input_message="I'm feeling really sad today and don't know what to do...",
            expected_behavior={
                'complexity': 0.6,
                'min_score': 0.8,
                'personality': {'agreeableness': 0.9, 'neuroticism': 0.3}
            },
            context={'user_id': 'test_004'},
            category='emotional_response'
        ),
        BehaviorTestCase(
            test_id="creative_001",
            name="Creative Request",
            description="Test creative response generation",
            input_message="Can you write a short poem about friendship?",
            expected_behavior={
                'complexity': 0.7,
                'min_score': 0.6,
                'personality': {'openness': 0.9, 'extraversion': 0.6}
            },
            context={'user_id': 'test_005'},
            category='creative_request'
        )
    ]
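

# Minimal usage sketch. Constructing LyraModel and HumanBehaviorEngine depends
# on the wider codebase and is not shown in this file, so `model` and `engine`
# below are assumed to exist; the output path is illustrative:
#
#     tester = LyraBehaviorTester(lyra_model=model, behavior_engine=engine)
#     summary = asyncio.run(tester.run_behavior_test_suite(create_standard_test_cases()))
#     tester.save_test_results(Path("test_results/behavior_results.json"))
#     print(f"Pass rate: {summary['pass_rate']:.0%}, avg score: {summary['avg_score']:.2f}")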