feat: Add database setup guide and local configuration files
- Added DATABASE_SETUP.md with a comprehensive guide for PostgreSQL and Redis installation on Windows
- Created .claude/settings.local.json with permission settings for pytest and database fix scripts
- Updated .gitignore to exclude .env.backup
- Included database connection test utilities in lyra/database_setup.py
- Added environment variable configuration examples for local development
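For local development, a minimal .env along the lines the guide describes might look like the sketch below (variable names are illustrative assumptions; DATABASE_SETUP.md defines the actual ones):

    # Hypothetical local development settings (names are placeholders)
    DATABASE_URL=postgresql://lyra:lyra@localhost:5432/lyra
    REDIS_URL=redis://localhost:6379/0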
lyra/testing/behavior_tests.py (new file, 701 lines)

@@ -0,0 +1,701 @@
"""
|
||||
Human-like behavior testing and refinement system.
|
||||
|
||||
This module provides comprehensive testing of Lyra's human-like behaviors
|
||||
including response timing, emotional consistency, personality coherence,
|
||||
and learning patterns.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, List, Optional, Any, Tuple
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta
|
||||
import statistics
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from ..core.lyra_model import LyraModel
|
||||
from ..emotions.system import EmotionalState
|
||||
from ..discord.bot import HumanBehaviorEngine
|
||||
from ..training.pipeline import LyraTrainingPipeline
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class BehaviorTestCase:
    """Represents a single behavior test case."""
    test_id: str
    name: str
    description: str
    input_message: str
    expected_behavior: Dict[str, Any]
    context: Dict[str, Any]
    category: str


@dataclass
class BehaviorTestResult:
    """Results of a behavior test."""
    test_case: BehaviorTestCase
    response_text: str
    response_time: float
    emotional_state: Dict[str, Any]
    emotional_analysis: Dict[str, Any]  # output of EmotionalConsistencyAnalyzer
    personality_influence: Dict[str, Any]
    thinking_process: List[Dict[str, Any]]
    timing_analysis: Dict[str, Any]
    passed: bool
    score: float
    notes: str


class TimingAnalyzer:
    """Analyzes response timing for human-likeness."""

    def __init__(self):
        # Expected human response times (in seconds) per message category
        self.human_baselines = {
            'simple_greeting': (0.5, 2.0),
            'casual_question': (1.0, 4.0),
            'complex_question': (3.0, 10.0),
            'emotional_response': (1.5, 6.0),
            'creative_request': (4.0, 15.0),
            'technical_question': (5.0, 20.0)
        }

    def analyze_timing(
        self,
        response_time: float,
        message_category: str,
        message_length: int,
        complexity_score: float
    ) -> Dict[str, Any]:
        """Analyze whether response timing feels human."""

        baseline_min, baseline_max = self.human_baselines.get(
            message_category, (1.0, 5.0)
        )

        # Adjust for message length (factor capped at 2.0 for very long messages)
        length_factor = min(message_length / 100.0, 2.0)
        adjusted_min = baseline_min * (1 + length_factor * 0.5)
        adjusted_max = baseline_max * (1 + length_factor * 0.3)

        # Adjust for complexity
        complexity_factor = 1.0 + complexity_score
        final_min = adjusted_min * complexity_factor
        final_max = adjusted_max * complexity_factor

        # Determine whether timing is human-like
        is_too_fast = response_time < final_min
        is_too_slow = response_time > final_max
        is_human_like = final_min <= response_time <= final_max

        # Calculate humanness score
        if is_human_like:
            # Timing near the middle of the expected range scores highest
            mid_point = (final_min + final_max) / 2
            distance_from_ideal = abs(response_time - mid_point)
            max_distance = (final_max - final_min) / 2
            humanness_score = 1.0 - (distance_from_ideal / max_distance)
        else:
            # Too fast or too slow scores lower, in proportion to the deviation
            if is_too_fast:
                deviation = (final_min - response_time) / final_min
            else:
                deviation = (response_time - final_max) / final_max

            humanness_score = max(0.0, 1.0 - deviation)

        return {
            'response_time': response_time,
            'expected_range': (final_min, final_max),
            'is_human_like': is_human_like,
            'is_too_fast': is_too_fast,
            'is_too_slow': is_too_slow,
            'humanness_score': humanness_score,
            'timing_category': message_category
        }


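# Example (illustrative values, not part of the shipped test suite): a
# 3-second reply to a short casual question lands inside the adjusted range.
#
#     analyzer = TimingAnalyzer()
#     report = analyzer.analyze_timing(
#         response_time=3.0,
#         message_category='casual_question',
#         message_length=25,
#         complexity_score=0.2,
#     )
#     # baseline (1.0, 4.0) adjusts to roughly (1.35, 5.16), so:
#     assert report['is_human_like']

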
class EmotionalConsistencyAnalyzer:
    """Analyzes emotional consistency and appropriateness."""

    def __init__(self):
        # Expected emotional responses to different contexts
        self.emotion_expectations = {
            'positive_feedback': ['joy', 'gratitude', 'pride'],
            'negative_feedback': ['sadness', 'disappointment', 'determination'],
            'question': ['curiosity', 'helpfulness', 'interest'],
            'greeting': ['friendliness', 'warmth', 'joy'],
            'goodbye': ['sadness', 'hope', 'warmth'],
            'compliment': ['gratitude', 'joy', 'humility'],
            'criticism': ['sadness', 'reflection', 'determination'],
            'joke': ['amusement', 'joy', 'playfulness'],
            'serious_topic': ['concern', 'thoughtfulness', 'empathy']
        }

    def analyze_emotional_response(
        self,
        message_context: str,
        emotional_state: Dict[str, Any],
        response_content: str
    ) -> Dict[str, Any]:
        """Analyze whether the emotional response is appropriate."""

        dominant_emotion = emotional_state.get('dominant_emotion', 'neutral')
        # Valence is used as a proxy for intensity here
        emotional_intensity = emotional_state.get('valence', 0.5)

        # Determine expected emotions for this context
        expected_emotions = self.emotion_expectations.get(message_context, ['neutral'])

        # Check whether the internal emotion is appropriate for the context
        is_appropriate = dominant_emotion in expected_emotions

        # Check whether the response text expresses a matching emotion
        emotion_indicators = self._analyze_text_emotion(response_content)
        text_emotion_matches = any(
            indicator in expected_emotions
            for indicator in emotion_indicators
        )

        # Weighted appropriateness score: internal state counts for 0.6,
        # textual expression for 0.4
        appropriateness_score = 0.0
        if is_appropriate:
            appropriateness_score += 0.6
        if text_emotion_matches:
            appropriateness_score += 0.4

        return {
            'dominant_emotion': dominant_emotion,
            'intensity': emotional_intensity,
            'expected_emotions': expected_emotions,
            'is_appropriate': is_appropriate,
            'text_emotion_indicators': emotion_indicators,
            'text_matches_emotion': text_emotion_matches,
            'appropriateness_score': appropriateness_score
        }

    def _analyze_text_emotion(self, text: str) -> List[str]:
        """Analyze emotional indicators in response text."""
        indicators = []

        # Simple keyword-based emotion detection
        emotion_keywords = {
            'joy': ['happy', 'excited', 'wonderful', 'great', '😊', '😄', '🎉'],
            'sadness': ['sad', 'sorry', 'unfortunately', 'disappointed', '😔', '😢'],
            'curiosity': ['interesting', 'wonder', 'curious', 'explore', '🤔'],
            'gratitude': ['thank', 'appreciate', 'grateful', 'thanks', '🙏'],
            'amusement': ['funny', 'haha', 'lol', 'amusing', '😂', '😄'],
            'concern': ['worried', 'concern', 'careful', 'trouble'],
            'determination': ['will', 'shall', 'determined', 'commit']
        }

        text_lower = text.lower()
        for emotion, keywords in emotion_keywords.items():
            if any(keyword in text_lower for keyword in keywords):
                indicators.append(emotion)

        return indicators


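# Example (illustrative): a warm reply to a greeting scores on both the
# internal state (0.6) and the matching text keywords (0.4).
#
#     analyzer = EmotionalConsistencyAnalyzer()
#     result = analyzer.analyze_emotional_response(
#         message_context='greeting',
#         emotional_state={'dominant_emotion': 'joy', 'valence': 0.8},
#         response_content="Hi! Happy to see you 😊",
#     )
#     assert result['appropriateness_score'] == 1.0

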
class PersonalityCoherenceAnalyzer:
    """Analyzes personality coherence across responses."""

    def __init__(self):
        # Keyword indicators for the high and low poles of each Big Five trait
        self.personality_indicators = {
            'extraversion': {
                'high': ['excited', 'love talking', 'people', 'social', 'energy'],
                'low': ['quiet', 'prefer', 'alone', 'thoughtful', 'reflection']
            },
            'openness': {
                'high': ['creative', 'imagine', 'explore', 'new', 'possibility'],
                'low': ['practical', 'traditional', 'proven', 'reliable']
            },
            'conscientiousness': {
                'high': ['careful', 'plan', 'organized', 'thorough', 'responsible'],
                'low': ['spontaneous', 'flexible', 'go with flow']
            },
            'agreeableness': {
                'high': ['understand', 'help', 'kind', 'supportive', 'empathy'],
                'low': ['direct', 'honest', 'critical', 'objective']
            },
            'neuroticism': {
                'high': ['worried', 'anxious', 'stress', 'uncertain'],
                'low': ['calm', 'stable', 'confident', 'relaxed']
            }
        }

    def analyze_personality_consistency(
        self,
        response_text: str,
        expected_personality: Dict[str, float],
        response_history: List[str]
    ) -> Dict[str, Any]:
        """Analyze whether the response matches the expected personality."""

        # Analyze the current response
        current_indicators = self._extract_personality_indicators(response_text)

        # Analyze historical consistency if history is available
        historical_consistency = 1.0
        if response_history:
            historical_indicators = [
                self._extract_personality_indicators(response)
                for response in response_history[-5:]  # last 5 responses
            ]
            historical_consistency = self._calculate_consistency(
                current_indicators, historical_indicators
            )

        # Compare with the expected personality profile
        personality_match_score = self._calculate_personality_match(
            current_indicators, expected_personality
        )

        return {
            'current_indicators': current_indicators,
            'personality_match_score': personality_match_score,
            'historical_consistency': historical_consistency,
            'overall_coherence': (personality_match_score + historical_consistency) / 2
        }

    def _extract_personality_indicators(self, text: str) -> Dict[str, float]:
        """Extract personality indicators from text."""
        indicators = {trait: 0.0 for trait in self.personality_indicators.keys()}
        text_lower = text.lower()

        for trait, trait_indicators in self.personality_indicators.items():
            high_count = sum(
                1 for keyword in trait_indicators['high']
                if keyword in text_lower
            )
            low_count = sum(
                1 for keyword in trait_indicators['low']
                if keyword in text_lower
            )

            if high_count > 0 or low_count > 0:
                # Trait score in [-1, 1]: positive leans high, negative leans low
                total_indicators = high_count + low_count
                indicators[trait] = (high_count - low_count) / total_indicators

        return indicators

    def _calculate_consistency(
        self,
        current: Dict[str, float],
        historical: List[Dict[str, float]]
    ) -> float:
        """Calculate consistency between current and historical indicators."""
        if not historical:
            return 1.0

        consistencies = []
        for trait in current.keys():
            current_value = current[trait]
            historical_values = [h.get(trait, 0.0) for h in historical]

            if not historical_values:
                continue

            avg_historical = statistics.mean(historical_values)
            # Indicator values span [-1, 1], so the maximum difference is 2
            consistency = 1.0 - abs(current_value - avg_historical) / 2.0
            consistencies.append(max(consistency, 0.0))

        return statistics.mean(consistencies) if consistencies else 1.0

    def _calculate_personality_match(
        self,
        indicators: Dict[str, float],
        expected: Dict[str, float]
    ) -> float:
        """Calculate how well indicators match the expected personality."""
        matches = []

        for trait, expected_value in expected.items():
            if trait not in indicators:
                continue

            indicator_value = indicators[trait]

            # Convert expected trait (0 to 1) to the indicator scale (-1 to 1)
            expected_indicator = (expected_value - 0.5) * 2

            # Calculate match (closer = better)
            match = 1.0 - abs(indicator_value - expected_indicator) / 2.0
            matches.append(max(match, 0.0))

        return statistics.mean(matches) if matches else 0.5


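# Worked example of the scale conversion above (illustrative): an expected
# extraversion of 0.7 maps to (0.7 - 0.5) * 2 = 0.4 on the indicator scale,
# so a response whose extraversion indicator is 0.4 is a perfect match (1.0),
# while an indicator of -0.6 scores 1.0 - |-0.6 - 0.4| / 2.0 = 0.5.

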
class LyraBehaviorTester:
    """Comprehensive behavior testing system for Lyra."""

    def __init__(
        self,
        lyra_model: LyraModel,
        behavior_engine: HumanBehaviorEngine
    ):
        self.lyra_model = lyra_model
        self.behavior_engine = behavior_engine

        # Analyzers
        self.timing_analyzer = TimingAnalyzer()
        self.emotion_analyzer = EmotionalConsistencyAnalyzer()
        self.personality_analyzer = PersonalityCoherenceAnalyzer()

        # Test results
        self.test_results: List[BehaviorTestResult] = []
        self.response_history: Dict[str, List[str]] = {}

    async def run_behavior_test_suite(
        self,
        test_cases: List[BehaviorTestCase]
    ) -> Dict[str, Any]:
        """Run the complete behavior test suite."""
        logger.info(f"Starting behavior test suite with {len(test_cases)} test cases...")

        results = []
        start_time = time.time()

        for i, test_case in enumerate(test_cases):
            logger.info(f"Running test {i+1}/{len(test_cases)}: {test_case.name}")

            result = await self._run_single_test(test_case)
            results.append(result)

            # Brief pause between tests
            await asyncio.sleep(0.5)

        total_time = time.time() - start_time

        # Calculate overall metrics
        summary = self._calculate_test_summary(results, total_time)

        self.test_results.extend(results)

        return summary

    async def _run_single_test(
        self,
        test_case: BehaviorTestCase
    ) -> BehaviorTestResult:
        """Run a single behavior test."""

        # Record start time
        start_time = time.time()

        # Generate response
        try:
            response_text, response_info = await self.lyra_model.generate_response(
                user_message=test_case.input_message,
                user_id=test_case.context.get('user_id', 'test_user'),
                max_new_tokens=150,
                temperature=0.9
            )
        except Exception as e:
            logger.error(f"Error generating response for test {test_case.test_id}: {e}")
            return BehaviorTestResult(
                test_case=test_case,
                response_text="",
                response_time=0.0,
                emotional_state={},
                emotional_analysis={},
                personality_influence={},
                thinking_process=[],
                timing_analysis={},
                passed=False,
                score=0.0,
                notes=f"Error: {e}"
            )

        response_time = time.time() - start_time

        # Analyze timing
        timing_analysis = self.timing_analyzer.analyze_timing(
            response_time=response_time,
            message_category=test_case.category,
            message_length=len(test_case.input_message),
            complexity_score=test_case.expected_behavior.get('complexity', 0.5)
        )

        # Analyze emotional consistency
        emotional_analysis = self.emotion_analyzer.analyze_emotional_response(
            message_context=test_case.category,
            emotional_state=response_info.get('emotional_state', {}),
            response_content=response_text
        )

        # Analyze personality coherence
        user_id = test_case.context.get('user_id', 'test_user')
        history = self.response_history.get(user_id, [])

        personality_analysis = self.personality_analyzer.analyze_personality_consistency(
            response_text=response_text,
            expected_personality=test_case.expected_behavior.get('personality', {}),
            response_history=history
        )

        # Update response history
        self.response_history.setdefault(user_id, []).append(response_text)

        # Overall score is the mean of the three sub-scores
        timing_score = timing_analysis.get('humanness_score', 0.0)
        emotional_score = emotional_analysis.get('appropriateness_score', 0.0)
        personality_score = personality_analysis.get('overall_coherence', 0.0)

        overall_score = (timing_score + emotional_score + personality_score) / 3.0

        # Determine whether the test passed
        min_passing_score = test_case.expected_behavior.get('min_score', 0.6)
        passed = overall_score >= min_passing_score

        # Generate notes
        notes = self._generate_test_notes(
            timing_analysis, emotional_analysis, personality_analysis
        )

        return BehaviorTestResult(
            test_case=test_case,
            response_text=response_text,
            response_time=response_time,
            emotional_state=response_info.get('emotional_state', {}),
            emotional_analysis=emotional_analysis,
            personality_influence=response_info.get('personality_influence', {}),
            thinking_process=response_info.get('thoughts', []),
            timing_analysis=timing_analysis,
            passed=passed,
            score=overall_score,
            notes=notes
        )

    def _generate_test_notes(
        self,
        timing_analysis: Dict[str, Any],
        emotional_analysis: Dict[str, Any],
        personality_analysis: Dict[str, Any]
    ) -> str:
        """Generate notes about test performance."""
        notes = []

        # Timing notes
        if timing_analysis.get('is_too_fast'):
            notes.append("Response was too fast for human-like behavior")
        elif timing_analysis.get('is_too_slow'):
            notes.append("Response was too slow")
        elif timing_analysis.get('is_human_like'):
            notes.append("Good response timing")

        # Emotional notes
        if not emotional_analysis.get('is_appropriate'):
            expected = emotional_analysis.get('expected_emotions', [])
            actual = emotional_analysis.get('dominant_emotion', 'unknown')
            notes.append(f"Emotional response '{actual}' doesn't match expected {expected}")

        if emotional_analysis.get('text_matches_emotion'):
            notes.append("Text emotion matches internal emotional state")

        # Personality notes
        coherence = personality_analysis.get('overall_coherence', 0.0)
        if coherence < 0.5:
            notes.append("Personality coherence below expectations")
        elif coherence > 0.8:
            notes.append("Excellent personality consistency")

        return "; ".join(notes) if notes else "All metrics within acceptable ranges"

    def _calculate_test_summary(
        self,
        results: List[BehaviorTestResult],
        total_time: float
    ) -> Dict[str, Any]:
        """Calculate summary statistics for a test suite run."""

        if not results:
            return {'status': 'no_tests_run'}

        passed_count = sum(1 for r in results if r.passed)
        pass_rate = passed_count / len(results)

        scores = [r.score for r in results]
        avg_score = statistics.mean(scores)
        min_score = min(scores)
        max_score = max(scores)

        # Category breakdown
        category_stats = {}
        for result in results:
            category = result.test_case.category
            if category not in category_stats:
                category_stats[category] = {'passed': 0, 'total': 0, 'scores': []}

            category_stats[category]['total'] += 1
            if result.passed:
                category_stats[category]['passed'] += 1
            category_stats[category]['scores'].append(result.score)

        # Calculate per-category pass rates and average scores
        for category, stats in category_stats.items():
            stats['pass_rate'] = stats['passed'] / stats['total']
            stats['avg_score'] = statistics.mean(stats['scores'])

        return {
            'total_tests': len(results),
            'passed_tests': passed_count,
            'failed_tests': len(results) - passed_count,
            'pass_rate': pass_rate,
            'avg_score': avg_score,
            'min_score': min_score,
            'max_score': max_score,
            'total_time': total_time,
            'tests_per_second': len(results) / total_time if total_time > 0 else 0.0,
            'category_breakdown': category_stats,
            'recommendations': self._generate_recommendations(results)
        }

    def _generate_recommendations(
        self,
        results: List[BehaviorTestResult]
    ) -> List[str]:
        """Generate recommendations based on test results."""
        recommendations = []

        # Analyze common failure patterns
        failed_results = [r for r in results if not r.passed]

        if failed_results:
            # Timing issues
            timing_issues = [
                r for r in failed_results
                if r.timing_analysis.get('humanness_score', 1.0) < 0.5
            ]
            if len(timing_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Consider adjusting response timing parameters - "
                    f"{len(timing_issues)} tests failed on timing"
                )

            # Emotional issues
            emotion_issues = [
                r for r in failed_results
                if not r.emotional_analysis.get('is_appropriate', True)
            ]
            if len(emotion_issues) > len(failed_results) * 0.3:
                recommendations.append(
                    "Review emotional response mapping - "
                    f"{len(emotion_issues)} tests had inappropriate emotional responses"
                )

        # Overall performance
        avg_score = statistics.mean([r.score for r in results])
        if avg_score < 0.7:
            recommendations.append(
                f"Overall performance ({avg_score:.2f}) below target - "
                "consider retraining or parameter adjustment"
            )

        return recommendations

    def save_test_results(self, filepath: Path):
        """Save test results to a JSON file."""
        results_data = {
            'timestamp': datetime.now().isoformat(),
            'total_tests': len(self.test_results),
            'results': [
                {
                    'test_id': r.test_case.test_id,
                    'test_name': r.test_case.name,
                    'passed': r.passed,
                    'score': r.score,
                    'response_time': r.response_time,
                    'response_text': r.response_text,
                    'notes': r.notes
                }
                for r in self.test_results
            ]
        }

        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Test results saved to {filepath}")


# Predefined test cases
def create_standard_test_cases() -> List[BehaviorTestCase]:
    """Create standard behavior test cases."""
    return [
        BehaviorTestCase(
            test_id="greeting_001",
            name="Simple Greeting",
            description="Test response to a basic greeting",
            input_message="Hello!",
            expected_behavior={
                'complexity': 0.1,
                'min_score': 0.7,
                'personality': {'extraversion': 0.7, 'agreeableness': 0.8}
            },
            context={'user_id': 'test_001'},
            category='simple_greeting'
        ),
        BehaviorTestCase(
            test_id="question_001",
            name="Simple Question",
            description="Test response to a straightforward question",
            input_message="What's your favorite color?",
            expected_behavior={
                'complexity': 0.3,
                'min_score': 0.6,
                'personality': {'openness': 0.6, 'agreeableness': 0.7}
            },
            context={'user_id': 'test_002'},
            category='casual_question'
        ),
        BehaviorTestCase(
            test_id="complex_001",
            name="Complex Question",
            description="Test response to a complex philosophical question",
            input_message="What do you think about the nature of consciousness and whether AI can truly be conscious?",
            expected_behavior={
                'complexity': 0.9,
                'min_score': 0.5,
                'personality': {'openness': 0.8, 'conscientiousness': 0.7}
            },
            context={'user_id': 'test_003'},
            category='complex_question'
        ),
        BehaviorTestCase(
            test_id="emotion_001",
            name="Emotional Support",
            description="Test emotional response to user distress",
            input_message="I'm feeling really sad today and don't know what to do...",
            expected_behavior={
                'complexity': 0.6,
                'min_score': 0.8,
                'personality': {'agreeableness': 0.9, 'neuroticism': 0.3}
            },
            context={'user_id': 'test_004'},
            category='emotional_response'
        ),
        BehaviorTestCase(
            test_id="creative_001",
            name="Creative Request",
            description="Test creative response generation",
            input_message="Can you write a short poem about friendship?",
            expected_behavior={
                'complexity': 0.7,
                'min_score': 0.6,
                'personality': {'openness': 0.9, 'extraversion': 0.6}
            },
            context={'user_id': 'test_005'},
            category='creative_request'
        )
    ]
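

# Minimal usage sketch. Constructing LyraModel and HumanBehaviorEngine depends
# on the wider codebase and is not shown in this file, so `model` and `engine`
# below are assumed to exist; the output path is illustrative:
#
#     tester = LyraBehaviorTester(lyra_model=model, behavior_engine=engine)
#     summary = asyncio.run(tester.run_behavior_test_suite(create_standard_test_cases()))
#     tester.save_test_results(Path("test_results/behavior_results.json"))
#     print(f"Pass rate: {summary['pass_rate']:.0%}, avg score: {summary['avg_score']:.2f}")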