"""
|
|
Human-like behavior testing and refinement system.
|
|
|
|
This module provides comprehensive testing of Lyra's human-like behaviors
|
|
including response timing, emotional consistency, personality coherence,
|
|
and learning patterns.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import time
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timedelta
|
|
import statistics
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from ..core.lyra_model import LyraModel
|
|
from ..emotions.system import EmotionalState
|
|
from ..discord.bot import HumanBehaviorEngine
|
|
from ..training.pipeline import LyraTrainingPipeline
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class BehaviorTestCase:
    """Represents a single behavior test case."""
    test_id: str
    name: str
    description: str
    input_message: str
    expected_behavior: Dict[str, Any]
    context: Dict[str, Any]
    category: str


@dataclass
class BehaviorTestResult:
    """Results of a behavior test."""
    test_case: BehaviorTestCase
    response_text: str
    response_time: float
    emotional_state: Dict[str, Any]
    personality_influence: Dict[str, Any]
    thinking_process: List[Dict[str, Any]]
    timing_analysis: Dict[str, Any]
    passed: bool
    score: float
    notes: str


class TimingAnalyzer:
    """Analyzes response timing for human-likeness."""

    def __init__(self):
        # Expected human response times (in seconds)
        self.human_baselines = {
            'simple_greeting': (0.5, 2.0),
            'casual_question': (1.0, 4.0),
            'complex_question': (3.0, 10.0),
            'emotional_response': (1.5, 6.0),
            'creative_request': (4.0, 15.0),
            'technical_question': (5.0, 20.0)
        }

    def analyze_timing(
        self,
        response_time: float,
        message_category: str,
        message_length: int,
        complexity_score: float
    ) -> Dict[str, Any]:
        """Analyze if response timing feels human."""

        baseline_min, baseline_max = self.human_baselines.get(
            message_category, (1.0, 5.0)
        )

        # Adjust for message length
        length_factor = min(message_length / 100.0, 2.0)
        adjusted_min = baseline_min * (1 + length_factor * 0.5)
        adjusted_max = baseline_max * (1 + length_factor * 0.3)

        # Adjust for complexity
        complexity_factor = 1.0 + complexity_score
        final_min = adjusted_min * complexity_factor
        final_max = adjusted_max * complexity_factor

        # Determine if timing is human-like
        is_too_fast = response_time < final_min
        is_too_slow = response_time > final_max
        is_human_like = final_min <= response_time <= final_max

        # Calculate humanness score
        if is_human_like:
            # Perfect timing gets high score
            mid_point = (final_min + final_max) / 2
            distance_from_ideal = abs(response_time - mid_point)
            max_distance = (final_max - final_min) / 2
            humanness_score = 1.0 - (distance_from_ideal / max_distance)
        else:
            # Too fast or slow gets lower score
            if is_too_fast:
                overage = (final_min - response_time) / final_min
            else:
                overage = (response_time - final_max) / final_max

            humanness_score = max(0.0, 1.0 - overage)

        return {
            'response_time': response_time,
            'expected_range': (final_min, final_max),
            'is_human_like': is_human_like,
            'is_too_fast': is_too_fast,
            'is_too_slow': is_too_slow,
            'humanness_score': humanness_score,
            'timing_category': message_category
        }


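# Worked example (illustrative): for a 'casual_question' (baseline 1.0-4.0 s)
# with a 100-character message, length_factor is 1.0, giving an adjusted range
# of (1.5, 5.2) s; a complexity score of 0.5 then widens it to (2.25, 7.8) s.
# A 4.0 s response falls inside that range and scores roughly 0.63 on
# humanness, since it sits about a third of the way from the midpoint to the edge.

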
class EmotionalConsistencyAnalyzer:
    """Analyzes emotional consistency and appropriateness."""

    def __init__(self):
        # Expected emotional responses to different contexts
        self.emotion_expectations = {
            'positive_feedback': ['joy', 'gratitude', 'pride'],
            'negative_feedback': ['sadness', 'disappointment', 'determination'],
            'question': ['curiosity', 'helpfulness', 'interest'],
            'greeting': ['friendliness', 'warmth', 'joy'],
            'goodbye': ['sadness', 'hope', 'warmth'],
            'compliment': ['gratitude', 'joy', 'humility'],
            'criticism': ['sadness', 'reflection', 'determination'],
            'joke': ['amusement', 'joy', 'playfulness'],
            'serious_topic': ['concern', 'thoughtfulness', 'empathy']
        }

    def analyze_emotional_response(
        self,
        message_context: str,
        emotional_state: Dict[str, Any],
        response_content: str
    ) -> Dict[str, Any]:
        """Analyze if emotional response is appropriate."""

        dominant_emotion = emotional_state.get('dominant_emotion', 'neutral')
        emotional_intensity = emotional_state.get('valence', 0.5)

        # Determine expected emotions for this context. The behavior tests pass
        # timing-style categories (e.g. 'simple_greeting'), so an approximate
        # alias map bridges them to the expectation keys above instead of
        # silently falling back to 'neutral'.
        category_aliases = {
            'simple_greeting': 'greeting',
            'casual_question': 'question',
            'complex_question': 'question',
            'technical_question': 'question',
            'emotional_response': 'serious_topic',
            'creative_request': 'question'
        }
        context_key = category_aliases.get(message_context, message_context)
        expected_emotions = self.emotion_expectations.get(context_key, ['neutral'])

        # Check if response emotion is appropriate
        is_appropriate = dominant_emotion in expected_emotions

        # Analyze emotional consistency in text
        emotion_indicators = self._analyze_text_emotion(response_content)
        text_emotion_matches = any(
            indicator in expected_emotions
            for indicator in emotion_indicators
        )

        # Calculate emotional appropriateness score
        appropriateness_score = 0.0
        if is_appropriate:
            appropriateness_score += 0.6
        if text_emotion_matches:
            appropriateness_score += 0.4

        return {
            'dominant_emotion': dominant_emotion,
            'intensity': emotional_intensity,
            'expected_emotions': expected_emotions,
            'is_appropriate': is_appropriate,
            'text_emotion_indicators': emotion_indicators,
            'text_matches_emotion': text_emotion_matches,
            'appropriateness_score': appropriateness_score
        }

    def _analyze_text_emotion(self, text: str) -> List[str]:
        """Analyze emotional indicators in response text."""
        indicators = []

        # Simple keyword-based emotion detection
        emotion_keywords = {
            'joy': ['happy', 'excited', 'wonderful', 'great', '😊', '😄', '🎉'],
            'sadness': ['sad', 'sorry', 'unfortunately', 'disappointed', '😔', '😢'],
            'curiosity': ['interesting', 'wonder', 'curious', 'explore', '🤔'],
            'gratitude': ['thank', 'appreciate', 'grateful', 'thanks', '🙏'],
            'amusement': ['funny', 'haha', 'lol', 'amusing', '😂', '😄'],
            'concern': ['worried', 'concern', 'careful', 'trouble'],
            'determination': ['will', 'shall', 'determined', 'commit']
        }

        text_lower = text.lower()
        for emotion, keywords in emotion_keywords.items():
            if any(keyword in text_lower for keyword in keywords):
                indicators.append(emotion)

        return indicators


class PersonalityCoherenceAnalyzer:
    """Analyzes personality coherence across responses."""

    def __init__(self):
        self.personality_indicators = {
            'extraversion': {
                'high': ['excited', 'love talking', 'people', 'social', 'energy'],
                'low': ['quiet', 'prefer', 'alone', 'thoughtful', 'reflection']
            },
            'openness': {
                'high': ['creative', 'imagine', 'explore', 'new', 'possibility'],
                'low': ['practical', 'traditional', 'proven', 'reliable']
            },
            'conscientiousness': {
                'high': ['careful', 'plan', 'organized', 'thorough', 'responsible'],
                'low': ['spontaneous', 'flexible', 'go with flow']
            },
            'agreeableness': {
                'high': ['understand', 'help', 'kind', 'supportive', 'empathy'],
                'low': ['direct', 'honest', 'critical', 'objective']
            },
            'neuroticism': {
                'high': ['worried', 'anxious', 'stress', 'uncertain'],
                'low': ['calm', 'stable', 'confident', 'relaxed']
            }
        }

    def analyze_personality_consistency(
        self,
        response_text: str,
        expected_personality: Dict[str, float],
        response_history: List[str]
    ) -> Dict[str, Any]:
        """Analyze if response matches expected personality."""

        # Analyze current response
        current_indicators = self._extract_personality_indicators(response_text)

        # Analyze historical consistency if available
        historical_consistency = 1.0
        if response_history:
            historical_indicators = [
                self._extract_personality_indicators(response)
                for response in response_history[-5:]  # Last 5 responses
            ]
            historical_consistency = self._calculate_consistency(
                current_indicators, historical_indicators
            )

        # Compare with expected personality
        personality_match_score = self._calculate_personality_match(
            current_indicators, expected_personality
        )

        return {
            'current_indicators': current_indicators,
            'personality_match_score': personality_match_score,
            'historical_consistency': historical_consistency,
            'overall_coherence': (personality_match_score + historical_consistency) / 2
        }

    def _extract_personality_indicators(self, text: str) -> Dict[str, float]:
        """Extract personality indicators from text."""
        indicators = {trait: 0.0 for trait in self.personality_indicators.keys()}
        text_lower = text.lower()

        for trait, trait_indicators in self.personality_indicators.items():
            high_count = sum(
                1 for keyword in trait_indicators['high']
                if keyword in text_lower
            )
            low_count = sum(
                1 for keyword in trait_indicators['low']
                if keyword in text_lower
            )

            if high_count > 0 or low_count > 0:
                # Calculate trait score (-1 to 1)
                total_indicators = high_count + low_count
                indicators[trait] = (high_count - low_count) / total_indicators

        return indicators

    def _calculate_consistency(
        self,
        current: Dict[str, float],
        historical: List[Dict[str, float]]
    ) -> float:
        """Calculate consistency between current and historical indicators."""
        if not historical:
            return 1.0

        consistencies = []
        for trait in current.keys():
            current_value = current[trait]
            historical_values = [h.get(trait, 0.0) for h in historical]

            if not historical_values:
                continue

            avg_historical = statistics.mean(historical_values)
            consistency = 1.0 - abs(current_value - avg_historical) / 2.0
            consistencies.append(max(consistency, 0.0))

        return statistics.mean(consistencies) if consistencies else 1.0

    def _calculate_personality_match(
        self,
        indicators: Dict[str, float],
        expected: Dict[str, float]
    ) -> float:
        """Calculate how well indicators match expected personality."""
        matches = []

        for trait, expected_value in expected.items():
            if trait not in indicators:
                continue

            indicator_value = indicators[trait]

            # Convert expected trait (0-1) to indicator scale (-1 to 1)
            expected_indicator = (expected_value - 0.5) * 2

            # Calculate match (closer = better)
            match = 1.0 - abs(indicator_value - expected_indicator) / 2.0
            matches.append(max(match, 0.0))

        return statistics.mean(matches) if matches else 0.5


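# Scale note (illustrative): expected traits are given on a 0-1 scale and are
# mapped to the -1..1 indicator scale in _calculate_personality_match. For
# example, an expected extraversion of 0.7 maps to +0.4; a response whose
# extraversion indicator comes out at +0.5 then scores 1 - |0.5 - 0.4| / 2 = 0.95.

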
class LyraBehaviorTester:
    """Comprehensive behavior testing system for Lyra."""

    def __init__(
        self,
        lyra_model: LyraModel,
        behavior_engine: HumanBehaviorEngine
    ):
        self.lyra_model = lyra_model
        self.behavior_engine = behavior_engine

        # Analyzers
        self.timing_analyzer = TimingAnalyzer()
        self.emotion_analyzer = EmotionalConsistencyAnalyzer()
        self.personality_analyzer = PersonalityCoherenceAnalyzer()

        # Test results
        self.test_results: List[BehaviorTestResult] = []
        self.response_history: Dict[str, List[str]] = {}

    async def run_behavior_test_suite(
        self,
        test_cases: List[BehaviorTestCase]
    ) -> Dict[str, Any]:
        """Run complete behavior test suite."""
        logger.info(f"Starting behavior test suite with {len(test_cases)} test cases...")

        results = []
        start_time = time.time()

        for i, test_case in enumerate(test_cases):
            logger.info(f"Running test {i+1}/{len(test_cases)}: {test_case.name}")

            result = await self._run_single_test(test_case)
            results.append(result)

            # Brief pause between tests
            await asyncio.sleep(0.5)

        total_time = time.time() - start_time

        # Calculate overall metrics
        summary = self._calculate_test_summary(results, total_time)

        self.test_results.extend(results)

        return summary

    async def _run_single_test(
        self,
        test_case: BehaviorTestCase
    ) -> BehaviorTestResult:
        """Run a single behavior test."""

        # Record start time
        start_time = time.time()

        # Generate response
        try:
            response_text, response_info = await self.lyra_model.generate_response(
                user_message=test_case.input_message,
                user_id=test_case.context.get('user_id', 'test_user'),
                max_new_tokens=150,
                temperature=0.9
            )
        except Exception as e:
            logger.error(f"Error generating response for test {test_case.test_id}: {e}")
            return BehaviorTestResult(
                test_case=test_case,
                response_text="",
                response_time=0.0,
                emotional_state={},
                personality_influence={},
                thinking_process=[],
                timing_analysis={},
                passed=False,
                score=0.0,
                notes=f"Error: {str(e)}"
            )

        response_time = time.time() - start_time

        # Analyze timing
        timing_analysis = self.timing_analyzer.analyze_timing(
            response_time=response_time,
            message_category=test_case.category,
            message_length=len(test_case.input_message),
            complexity_score=test_case.expected_behavior.get('complexity', 0.5)
        )

        # Analyze emotional consistency
        emotional_analysis = self.emotion_analyzer.analyze_emotional_response(
            message_context=test_case.category,
            emotional_state=response_info.get('emotional_state', {}),
            response_content=response_text
        )

        # Analyze personality coherence
        user_id = test_case.context.get('user_id', 'test_user')
        history = self.response_history.get(user_id, [])

        personality_analysis = self.personality_analyzer.analyze_personality_consistency(
            response_text=response_text,
            expected_personality=test_case.expected_behavior.get('personality', {}),
            response_history=history
        )

        # Update response history
        if user_id not in self.response_history:
            self.response_history[user_id] = []
        self.response_history[user_id].append(response_text)

        # Calculate overall score
        timing_score = timing_analysis.get('humanness_score', 0.0)
        emotional_score = emotional_analysis.get('appropriateness_score', 0.0)
        personality_score = personality_analysis.get('overall_coherence', 0.0)

        overall_score = (timing_score + emotional_score + personality_score) / 3.0

        # Determine if test passed
        min_passing_score = test_case.expected_behavior.get('min_score', 0.6)
        passed = overall_score >= min_passing_score

        # Generate notes
        notes = self._generate_test_notes(
            timing_analysis, emotional_analysis, personality_analysis
        )

        return BehaviorTestResult(
            test_case=test_case,
            response_text=response_text,
            response_time=response_time,
            emotional_state=response_info.get('emotional_state', {}),
            personality_influence=response_info.get('personality_influence', {}),
            thinking_process=response_info.get('thoughts', []),
            timing_analysis=timing_analysis,
            passed=passed,
            score=overall_score,
            notes=notes
        )

    def _generate_test_notes(
        self,
        timing_analysis: Dict[str, Any],
        emotional_analysis: Dict[str, Any],
        personality_analysis: Dict[str, Any]
    ) -> str:
        """Generate notes about test performance."""
        notes = []

        # Timing notes
        if timing_analysis.get('is_too_fast'):
            notes.append("Response was too fast for human-like behavior")
        elif timing_analysis.get('is_too_slow'):
            notes.append("Response was too slow")
        elif timing_analysis.get('is_human_like'):
            notes.append("Good response timing")

        # Emotional notes
        if not emotional_analysis.get('is_appropriate'):
            expected = emotional_analysis.get('expected_emotions', [])
            actual = emotional_analysis.get('dominant_emotion', 'unknown')
            notes.append(f"Emotional response '{actual}' doesn't match expected {expected}")

        if emotional_analysis.get('text_matches_emotion'):
            notes.append("Text emotion matches internal emotional state")

        # Personality notes
        coherence = personality_analysis.get('overall_coherence', 0.0)
        if coherence < 0.5:
            notes.append("Personality coherence below expectations")
        elif coherence > 0.8:
            notes.append("Excellent personality consistency")

        return "; ".join(notes) if notes else "All metrics within acceptable ranges"

    def _calculate_test_summary(
        self,
        results: List[BehaviorTestResult],
        total_time: float
    ) -> Dict[str, Any]:
        """Calculate summary statistics for test suite."""

        if not results:
            return {'status': 'no_tests_run'}

        passed_count = sum(1 for r in results if r.passed)
        pass_rate = passed_count / len(results)

        scores = [r.score for r in results]
        avg_score = statistics.mean(scores)
        min_score = min(scores)
        max_score = max(scores)

        # Category breakdown
        category_stats = {}
        for result in results:
            category = result.test_case.category
            if category not in category_stats:
                category_stats[category] = {'passed': 0, 'total': 0, 'scores': []}

            category_stats[category]['total'] += 1
            if result.passed:
                category_stats[category]['passed'] += 1
            category_stats[category]['scores'].append(result.score)

        # Calculate category pass rates
        for category, stats in category_stats.items():
            stats['pass_rate'] = stats['passed'] / stats['total']
            stats['avg_score'] = statistics.mean(stats['scores'])

        return {
            'total_tests': len(results),
            'passed_tests': passed_count,
            'failed_tests': len(results) - passed_count,
            'pass_rate': pass_rate,
            'avg_score': avg_score,
            'min_score': min_score,
            'max_score': max_score,
            'total_time': total_time,
            'tests_per_second': len(results) / total_time,
            'category_breakdown': category_stats,
            'recommendations': self._generate_recommendations(results)
        }

    def _generate_recommendations(
        self,
        results: List[BehaviorTestResult]
    ) -> List[str]:
        """Generate recommendations based on test results."""
        recommendations = []

        # Analyze common failure patterns
        failed_results = [r for r in results if not r.passed]

        if failed_results:
            # Timing issues
            timing_issues = [
                r for r in failed_results
                if r.timing_analysis.get('humanness_score', 1.0) < 0.5
            ]
            if len(timing_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Consider adjusting response timing parameters - "
                    f"{len(timing_issues)} tests failed on timing"
                )

            # Emotional issues (the result stores only the raw emotional state,
            # so mismatches are detected via the notes generated for each test)
            emotion_issues = [
                r for r in failed_results
                if "doesn't match expected" in r.notes
            ]
            if len(emotion_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Review emotional response mapping - "
                    f"{len(emotion_issues)} tests had inappropriate emotional responses"
                )

        # Overall performance
        avg_score = statistics.mean([r.score for r in results])
        if avg_score < 0.7:
            recommendations.append(
                f"Overall performance ({avg_score:.2f}) below target - "
                "consider retraining or parameter adjustment"
            )

        return recommendations

    def save_test_results(self, filepath: Path):
        """Save test results to file."""
        results_data = {
            'timestamp': datetime.now().isoformat(),
            'total_tests': len(self.test_results),
            'results': [
                {
                    'test_id': r.test_case.test_id,
                    'test_name': r.test_case.name,
                    'passed': r.passed,
                    'score': r.score,
                    'response_time': r.response_time,
                    'response_text': r.response_text,
                    'notes': r.notes
                }
                for r in self.test_results
            ]
        }

        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Test results saved to {filepath}")


# Predefined test cases
def create_standard_test_cases() -> List[BehaviorTestCase]:
    """Create standard behavior test cases."""
    return [
        BehaviorTestCase(
            test_id="greeting_001",
            name="Simple Greeting",
            description="Test response to basic greeting",
            input_message="Hello!",
            expected_behavior={
                'complexity': 0.1,
                'min_score': 0.7,
                'personality': {'extraversion': 0.7, 'agreeableness': 0.8}
            },
            context={'user_id': 'test_001'},
            category='simple_greeting'
        ),

        BehaviorTestCase(
            test_id="question_001",
            name="Simple Question",
            description="Test response to straightforward question",
            input_message="What's your favorite color?",
            expected_behavior={
                'complexity': 0.3,
                'min_score': 0.6,
                'personality': {'openness': 0.6, 'agreeableness': 0.7}
            },
            context={'user_id': 'test_002'},
            category='casual_question'
        ),

        BehaviorTestCase(
            test_id="complex_001",
            name="Complex Question",
            description="Test response to complex philosophical question",
            input_message="What do you think about the nature of consciousness and whether AI can truly be conscious?",
            expected_behavior={
                'complexity': 0.9,
                'min_score': 0.5,
                'personality': {'openness': 0.8, 'conscientiousness': 0.7}
            },
            context={'user_id': 'test_003'},
            category='complex_question'
        ),

        BehaviorTestCase(
            test_id="emotion_001",
            name="Emotional Support",
            description="Test emotional response to user distress",
            input_message="I'm feeling really sad today and don't know what to do...",
            expected_behavior={
                'complexity': 0.6,
                'min_score': 0.8,
                'personality': {'agreeableness': 0.9, 'neuroticism': 0.3}
            },
            context={'user_id': 'test_004'},
            category='emotional_response'
        ),

        BehaviorTestCase(
            test_id="creative_001",
            name="Creative Request",
            description="Test creative response generation",
            input_message="Can you write a short poem about friendship?",
            expected_behavior={
                'complexity': 0.7,
                'min_score': 0.6,
                'personality': {'openness': 0.9, 'extraversion': 0.6}
            },
            context={'user_id': 'test_005'},
            category='creative_request'
        )
    ]
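

# Minimal usage sketch. Assumptions: this is run in a context where the
# relative imports above resolve (e.g. via `python -m ...`), and LyraModel /
# HumanBehaviorEngine can be constructed without arguments here; adjust to the
# real constructors.
if __name__ == "__main__":
    async def _demo() -> None:
        tester = LyraBehaviorTester(
            lyra_model=LyraModel(),                  # assumed no-arg constructor
            behavior_engine=HumanBehaviorEngine()    # assumed no-arg constructor
        )
        summary = await tester.run_behavior_test_suite(create_standard_test_cases())
        tester.save_test_results(Path("test_results/behavior_tests.json"))
        print(json.dumps(summary, indent=2, default=str))

    asyncio.run(_demo())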