feat(04-04): create pattern extraction system
- Created src/memory/personality/__init__.py module structure
- Implemented PatternExtractor class with multi-dimensional analysis:
  - Topics: Track frequently discussed subjects and user interests
  - Sentiment: Analyze emotional tone and sentiment patterns
  - Interaction: Response times, question asking, information sharing
  - Temporal: Communication style by time of day/week
  - Response styles: Formality level, verbosity, emoji/humor use
- Pattern extraction methods for all dimensions with confidence scoring
- Lightweight analysis techniques to avoid computational overhead
- Pattern validation with stability tracking and outlier detection
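For orientation, a minimal usage sketch (illustrative, not part of the commit; it assumes src/ is on sys.path, that the sibling layer_manager and adaptation modules imported by __init__.py are present, and only the conversation-dict shape the extractors read via .get()):

# Hypothetical usage sketch -- module paths and data are assumptions
from memory.personality import PatternExtractor

extractor = PatternExtractor()
conversations = [
    {
        "messages": [
            {
                "role": "user",
                "content": "My code keeps failing, any idea why?",
                "timestamp": "2024-01-05T09:15:00Z",
            },
            {
                "role": "assistant",
                "content": "Happy to help debug the software.",
                "timestamp": "2024-01-05T09:15:30Z",
            },
        ]
    }
]

topics = extractor.extract_topic_patterns(conversations)        # -> TopicPatterns
sentiment = extractor.extract_sentiment_patterns(conversations)  # -> SentimentPatterns
print(topics.frequent_topics, sentiment.emotional_tone)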
16  src/memory/personality/__init__.py  Normal file
@@ -0,0 +1,16 @@
"""
Personality learning module for Mai.

This module provides pattern extraction, personality layer management,
and adaptive personality learning from conversation data.
"""

from .pattern_extractor import PatternExtractor
from .layer_manager import LayerManager
from .adaptation import PersonalityAdaptation

__all__ = [
    "PatternExtractor",
    "LayerManager",
    "PersonalityAdaptation",
]
851  src/memory/personality/pattern_extractor.py  Normal file
@@ -0,0 +1,851 @@
"""
Pattern extraction system for personality learning.

This module extracts multi-dimensional patterns from conversations,
including topics, sentiment, interaction patterns, temporal patterns,
and response styles.
"""

import re
import math
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple, Set
from collections import Counter, defaultdict
from dataclasses import dataclass, field
import statistics

# Import conversation models (path hack so the module resolves models/ two levels up)
import sys
import os

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from models.conversation import Message, MessageRole, ConversationMetadata
@dataclass
class TopicPatterns:
    """Topic pattern analysis results."""

    frequent_topics: List[Tuple[str, float]] = field(default_factory=list)
    topic_diversity: float = 0.0
    topic_transitions: Dict[str, List[str]] = field(default_factory=dict)
    user_interests: List[str] = field(default_factory=list)
    confidence_score: float = 0.0


@dataclass
class SentimentPatterns:
    """Sentiment pattern analysis results."""

    overall_sentiment: float = 0.0  # -1 to 1 scale
    sentiment_variance: float = 0.0
    emotional_tone: str = "neutral"
    sentiment_keywords: Dict[str, int] = field(default_factory=dict)
    mood_fluctuations: List[Tuple[datetime, float]] = field(default_factory=list)
    confidence_score: float = 0.0


@dataclass
class InteractionPatterns:
    """Interaction pattern analysis results."""

    question_frequency: float = 0.0
    information_sharing: float = 0.0
    response_time_avg: float = 0.0
    conversation_balance: float = 0.0  # share of messages sent by the user
    engagement_level: float = 0.0
    confidence_score: float = 0.0


@dataclass
class TemporalPatterns:
    """Temporal pattern analysis results."""

    preferred_times: List[Tuple[str, float]] = field(
        default_factory=list
    )  # (hour, frequency)
    day_of_week_patterns: Dict[str, float] = field(default_factory=dict)
    conversation_duration: float = 0.0
    session_frequency: float = 0.0
    time_based_style: Dict[str, str] = field(default_factory=dict)
    confidence_score: float = 0.0


@dataclass
class ResponseStylePatterns:
    """Response style pattern analysis results."""

    formality_level: float = 0.0  # 0 = casual, 1 = formal
    verbosity: float = 0.0  # average message length in words
    emoji_usage: float = 0.0
    humor_frequency: float = 0.0
    directness: float = 0.0  # 0 = circumlocutory, 1 = direct
    confidence_score: float = 0.0
class PatternExtractor:
    """
    Multi-dimensional pattern extraction from conversations.

    Extracts patterns across topics, sentiment, interaction styles,
    temporal preferences, and response styles, with confidence scoring
    and stability tracking.
    """

    def __init__(self):
        """Initialize pattern extractor with analysis configurations."""
        self.logger = logging.getLogger(__name__)

        # Sentiment keyword dictionaries
        self.positive_words = {
            "good", "great", "excellent", "amazing", "wonderful", "fantastic",
            "love", "like", "enjoy", "happy", "pleased", "satisfied",
            "perfect", "awesome", "brilliant", "outstanding", "superb",
            "delightful",
        }

        self.negative_words = {
            "bad", "terrible", "awful", "horrible", "hate", "dislike",
            "angry", "sad", "frustrated", "disappointed", "annoyed", "upset",
            "worried", "concerned", "problem", "issue", "error", "wrong",
            "fail", "failed",
        }

        # Topic extraction keywords
        self.topic_indicators = {
            "technology": ["computer", "software", "code", "programming", "app", "system"],
            "work": ["job", "career", "project", "task", "meeting", "deadline"],
            "personal": ["family", "friend", "relationship", "home", "life", "health"],
            "entertainment": ["movie", "music", "game", "book", "show", "play"],
            "learning": ["study", "learn", "course", "education", "knowledge", "skill"],
        }

        # Formality indicators
        self.formal_indicators = ["please", "thank", "regards", "sincerely", "would", "could"]
        self.casual_indicators = ["hey", "yo", "sup", "lol", "omg", "btw", "idk"]

        # Pattern stability tracking
        self._pattern_history: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    def extract_topic_patterns(
        self, conversations: List[Dict[str, Any]]
    ) -> TopicPatterns:
        """
        Extract topic patterns from conversations.

        Args:
            conversations: List of conversation dictionaries with messages

        Returns:
            TopicPatterns object with extracted topic information
        """
        try:
            self.logger.info("Extracting topic patterns from conversations")

            # Collect all text content and track topic transitions
            all_text = []
            topic_transitions = defaultdict(list)

            for conv in conversations:
                messages = conv.get("messages", [])
                # Reset per conversation so transitions don't cross boundaries
                last_topic = None
                for msg in messages:
                    if msg.get("role") in ["user", "assistant"]:
                        content = msg.get("content", "").lower()
                        all_text.append(content)

                        # Extract current topic
                        current_topic = self._identify_main_topic(content)
                        if current_topic and last_topic and current_topic != last_topic:
                            topic_transitions[last_topic].append(current_topic)
                        last_topic = current_topic

            # Frequency analysis
            topic_counts = Counter()
            for text in all_text:
                topic = self._identify_main_topic(text)
                if topic:
                    topic_counts[topic] += 1

            # Calculate frequent topics as (topic, share of classified messages)
            total_topics = sum(topic_counts.values())
            frequent_topics = (
                [
                    (topic, count / total_topics)
                    for topic, count in topic_counts.most_common(10)
                ]
                if total_topics > 0
                else []
            )

            # Calculate topic diversity (Shannon entropy)
            topic_diversity = self._calculate_diversity(topic_counts)

            # Extract user interests (the five most frequent topics)
            user_interests = [topic for topic, _ in frequent_topics[:5]]

            # Calculate confidence score
            confidence = self._calculate_topic_confidence(
                topic_counts, len(all_text), frequent_topics
            )

            return TopicPatterns(
                frequent_topics=frequent_topics,
                topic_diversity=topic_diversity,
                topic_transitions=dict(topic_transitions),
                user_interests=user_interests,
                confidence_score=confidence,
            )

        except Exception as e:
            self.logger.error(f"Failed to extract topic patterns: {e}")
            return TopicPatterns(confidence_score=0.0)
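    # Illustrative result (not part of this commit): with two messages mentioning
    # "code"/"software" and one mentioning "meeting", frequent_topics comes back
    # as [("technology", 0.67), ("work", 0.33)] (shares of classified messages)
    # and user_interests as ["technology", "work"].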
    def extract_sentiment_patterns(
        self, conversations: List[Dict[str, Any]]
    ) -> SentimentPatterns:
        """
        Extract sentiment patterns from conversations.

        Args:
            conversations: List of conversation dictionaries with messages

        Returns:
            SentimentPatterns object with extracted sentiment information
        """
        try:
            self.logger.info("Extracting sentiment patterns from conversations")

            sentiment_scores = []
            sentiment_keywords = Counter()
            mood_fluctuations = []

            for conv in conversations:
                messages = conv.get("messages", [])
                for msg in messages:
                    if msg.get("role") in ["user", "assistant"]:
                        content = msg.get("content", "").lower()

                        # Calculate sentiment score
                        score = self._calculate_sentiment_score(content)
                        sentiment_scores.append(score)

                        # Track sentiment keywords
                        for word in self.positive_words:
                            if word in content:
                                sentiment_keywords[f"positive_{word}"] += 1
                        for word in self.negative_words:
                            if word in content:
                                sentiment_keywords[f"negative_{word}"] += 1

                        # Track mood over time
                        if "timestamp" in msg:
                            timestamp = msg["timestamp"]
                            if isinstance(timestamp, str):
                                timestamp = datetime.fromisoformat(
                                    timestamp.replace("Z", "+00:00")
                                )
                            mood_fluctuations.append((timestamp, score))

            # Calculate overall sentiment
            overall_sentiment = (
                statistics.mean(sentiment_scores) if sentiment_scores else 0.0
            )

            # Calculate sentiment variance
            sentiment_variance = (
                statistics.variance(sentiment_scores)
                if len(sentiment_scores) > 1
                else 0.0
            )

            # Determine emotional tone
            emotional_tone = self._classify_emotional_tone(overall_sentiment)

            # Calculate confidence score
            confidence = self._calculate_sentiment_confidence(
                sentiment_scores, len(sentiment_keywords)
            )

            return SentimentPatterns(
                overall_sentiment=overall_sentiment,
                sentiment_variance=sentiment_variance,
                emotional_tone=emotional_tone,
                sentiment_keywords=dict(sentiment_keywords),
                mood_fluctuations=mood_fluctuations,
                confidence_score=confidence,
            )

        except Exception as e:
            self.logger.error(f"Failed to extract sentiment patterns: {e}")
            return SentimentPatterns(confidence_score=0.0)
    def extract_interaction_patterns(
        self, conversations: List[Dict[str, Any]]
    ) -> InteractionPatterns:
        """
        Extract interaction patterns from conversations.

        Args:
            conversations: List of conversation dictionaries with messages

        Returns:
            InteractionPatterns object with extracted interaction information
        """
        try:
            self.logger.info("Extracting interaction patterns from conversations")

            question_count = 0
            info_sharing_count = 0
            response_times = []
            user_messages = 0
            assistant_messages = 0
            engagement_indicators = []

            # Phrase lists checked per message (hoisted out of the loop)
            info_sharing_phrases = ["because", "since", "due to", "reason is", "explanation"]
            engagement_words = ["interesting", "tell me more", "fascinating", "cool", "wow"]

            for conv in conversations:
                messages = conv.get("messages", [])
                prev_timestamp = None

                for msg in messages:
                    role = msg.get("role")
                    content = msg.get("content", "").lower()

                    # Count questions asked by the user
                    if "?" in content and role == "user":
                        question_count += 1

                    # Count information sharing
                    if any(phrase in content for phrase in info_sharing_phrases):
                        info_sharing_count += 1

                    # Track message counts for balance
                    if role == "user":
                        user_messages += 1
                    elif role == "assistant":
                        assistant_messages += 1

                    # Calculate response times
                    if prev_timestamp and "timestamp" in msg:
                        try:
                            curr_time = msg["timestamp"]
                            if isinstance(curr_time, str):
                                curr_time = datetime.fromisoformat(
                                    curr_time.replace("Z", "+00:00")
                                )

                            time_diff = (curr_time - prev_timestamp).total_seconds()
                            if 0 < time_diff < 3600:  # within a reasonable range
                                response_times.append(time_diff)
                        except Exception:
                            pass

                    # Track engagement indicators
                    engagement_indicators.append(
                        1 if any(word in content for word in engagement_words) else 0
                    )

                    prev_timestamp = msg.get("timestamp")
                    if isinstance(prev_timestamp, str):
                        prev_timestamp = datetime.fromisoformat(
                            prev_timestamp.replace("Z", "+00:00")
                        )

            # Calculate metrics
            total_messages = user_messages + assistant_messages
            question_frequency = question_count / max(user_messages, 1)
            information_sharing = info_sharing_count / max(total_messages, 1)
            response_time_avg = (
                statistics.mean(response_times) if response_times else 0.0
            )
            conversation_balance = user_messages / max(total_messages, 1)
            engagement_level = (
                statistics.mean(engagement_indicators) if engagement_indicators else 0.0
            )

            # Calculate confidence score
            confidence = self._calculate_interaction_confidence(
                total_messages, len(response_times), question_count
            )

            return InteractionPatterns(
                question_frequency=question_frequency,
                information_sharing=information_sharing,
                response_time_avg=response_time_avg,
                conversation_balance=conversation_balance,
                engagement_level=engagement_level,
                confidence_score=confidence,
            )

        except Exception as e:
            self.logger.error(f"Failed to extract interaction patterns: {e}")
            return InteractionPatterns(confidence_score=0.0)
    def extract_temporal_patterns(
        self, conversations: List[Dict[str, Any]]
    ) -> TemporalPatterns:
        """
        Extract temporal patterns from conversations.

        Args:
            conversations: List of conversation dictionaries with messages

        Returns:
            TemporalPatterns object with extracted temporal information
        """
        try:
            self.logger.info("Extracting temporal patterns from conversations")

            hour_counts = Counter()
            day_counts = Counter()
            conversation_durations = []
            session_start_times = []

            for conv in conversations:
                messages = conv.get("messages", [])
                if not messages:
                    continue

                # Collect parseable timestamps for this conversation
                timestamps = []
                for msg in messages:
                    if "timestamp" in msg:
                        try:
                            timestamp = msg["timestamp"]
                            if isinstance(timestamp, str):
                                timestamp = datetime.fromisoformat(
                                    timestamp.replace("Z", "+00:00")
                                )
                            timestamps.append(timestamp)
                        except Exception:
                            continue

                if timestamps:
                    # Calculate duration in minutes
                    duration = (max(timestamps) - min(timestamps)).total_seconds() / 60
                    conversation_durations.append(duration)

                    # Count hour and day patterns
                    for timestamp in timestamps:
                        hour_counts[timestamp.hour] += 1
                        day_counts[timestamp.strftime("%A")] += 1

                    # Track session start time
                    session_start_times.append(min(timestamps))

            # Calculate preferred times
            total_hours = sum(hour_counts.values())
            preferred_times = (
                [
                    (str(hour), count / total_hours)
                    for hour, count in hour_counts.most_common(5)
                ]
                if total_hours > 0
                else []
            )

            # Calculate day of week patterns
            total_days = sum(day_counts.values())
            day_of_week_patterns = (
                {day: count / total_days for day, count in day_counts.items()}
                if total_days > 0
                else {}
            )

            # Calculate average conversation duration
            avg_duration = (
                statistics.mean(conversation_durations)
                if conversation_durations
                else 0.0
            )

            # Calculate session frequency (sessions per day)
            if session_start_times:
                time_span = (
                    max(session_start_times) - min(session_start_times)
                ).days + 1
                session_frequency = len(session_start_times) / max(time_span, 1)
            else:
                session_frequency = 0.0

            # Time-based style analysis
            time_based_style = self._analyze_time_based_styles(conversations)

            # Calculate confidence score
            confidence = self._calculate_temporal_confidence(
                len(conversations), total_hours, len(session_start_times)
            )

            return TemporalPatterns(
                preferred_times=preferred_times,
                day_of_week_patterns=day_of_week_patterns,
                conversation_duration=avg_duration,
                session_frequency=session_frequency,
                time_based_style=time_based_style,
                confidence_score=confidence,
            )

        except Exception as e:
            self.logger.error(f"Failed to extract temporal patterns: {e}")
            return TemporalPatterns(confidence_score=0.0)
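    # Illustrative output shape (not part of this commit): an evening-heavy user
    # might yield preferred_times = [("21", 0.4), ("22", 0.25), ...] and
    # day_of_week_patterns = {"Saturday": 0.7, "Monday": 0.3}.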
    def extract_response_style_patterns(
        self, conversations: List[Dict[str, Any]]
    ) -> ResponseStylePatterns:
        """
        Extract response style patterns from conversations.

        Args:
            conversations: List of conversation dictionaries with messages

        Returns:
            ResponseStylePatterns object with extracted response style information
        """
        try:
            self.logger.info("Extracting response style patterns from conversations")

            message_lengths = []
            formality_scores = []
            emoji_counts = []
            humor_indicators = []
            directness_scores = []

            # Emoji ranges and humor markers (hoisted out of the loop)
            emoji_pattern = re.compile(
                r"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
                r"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]"
            )
            humor_words = ["lol", "haha", "funny", "joke", "hilarious", "😂", "😄"]

            for conv in conversations:
                messages = conv.get("messages", [])
                for msg in messages:
                    if msg.get("role") in ["user", "assistant"]:
                        content = msg.get("content", "")

                        # Message length in words (verbosity)
                        message_lengths.append(len(content.split()))

                        # Formality level
                        formality_scores.append(self._calculate_formality(content))

                        # Emoji usage
                        emoji_counts.append(len(emoji_pattern.findall(content)))

                        # Humor frequency
                        humor_indicators.append(
                            1 if any(word in content.lower() for word in humor_words) else 0
                        )

                        # Directness (simple vs complex sentences)
                        directness_scores.append(self._calculate_directness(content))

            # Calculate averages
            verbosity = statistics.mean(message_lengths) if message_lengths else 0.0
            formality_level = (
                statistics.mean(formality_scores) if formality_scores else 0.0
            )
            emoji_usage = statistics.mean(emoji_counts) if emoji_counts else 0.0
            humor_frequency = (
                statistics.mean(humor_indicators) if humor_indicators else 0.0
            )
            directness = (
                statistics.mean(directness_scores) if directness_scores else 0.0
            )

            # Calculate confidence score
            confidence = self._calculate_style_confidence(
                len(message_lengths), len(formality_scores)
            )

            return ResponseStylePatterns(
                formality_level=formality_level,
                verbosity=verbosity,
                emoji_usage=emoji_usage,
                humor_frequency=humor_frequency,
                directness=directness,
                confidence_score=confidence,
            )

        except Exception as e:
            self.logger.error(f"Failed to extract response style patterns: {e}")
            return ResponseStylePatterns(confidence_score=0.0)
    def _identify_main_topic(self, text: str) -> Optional[str]:
        """Identify the main topic of a text snippet."""
        topic_scores = defaultdict(int)

        for topic, keywords in self.topic_indicators.items():
            for keyword in keywords:
                if keyword in text:
                    topic_scores[topic] += 1

        if topic_scores:
            return max(topic_scores, key=topic_scores.get)
        return None
    def _calculate_diversity(self, counts: Counter) -> float:
        """Calculate Shannon entropy diversity."""
        total = sum(counts.values())
        if total == 0:
            return 0.0

        entropy = 0.0
        for count in counts.values():
            probability = count / total
            if probability > 0:
                # statistics has no log(); use math.log2 for Shannon entropy
                entropy -= probability * math.log2(probability)

        return entropy
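    # Worked example (illustrative): Counter({"technology": 2, "work": 2})
    # gives p = 0.5 for each topic, so entropy = -2 * 0.5 * log2(0.5) = 1.0 bit;
    # a single dominant topic, Counter({"work": 4}), gives 0.0.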
    def _calculate_sentiment_score(self, text: str) -> float:
        """Calculate sentiment score for text (-1 to 1)."""
        positive_count = sum(1 for word in self.positive_words if word in text)
        negative_count = sum(1 for word in self.negative_words if word in text)

        total_sentiment_words = positive_count + negative_count
        if total_sentiment_words == 0:
            return 0.0

        return (positive_count - negative_count) / total_sentiment_words
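    # Worked example (illustrative): "this is great but there is a problem"
    # matches one positive word ("great") and one negative word ("problem"),
    # so the score is (1 - 1) / 2 = 0.0. Note the simple substring matching:
    # "like" would also fire inside "unlikely".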
    def _classify_emotional_tone(self, sentiment: float) -> str:
        """Classify emotional tone from sentiment score."""
        if sentiment > 0.3:
            return "positive"
        elif sentiment < -0.3:
            return "negative"
        else:
            return "neutral"
    def _calculate_formality(self, text: str) -> float:
        """Calculate formality level (0 = casual, 1 = formal)."""
        text_lower = text.lower()
        formal_count = sum(1 for word in self.formal_indicators if word in text_lower)
        casual_count = sum(1 for word in self.casual_indicators if word in text_lower)

        # Base formality on presence of formal indicators and absence of casual ones
        if formal_count > 0 and casual_count == 0:
            return 0.8
        elif formal_count == 0 and casual_count > 0:
            return 0.2
        elif formal_count > casual_count:
            return 0.6
        elif casual_count > formal_count:
            return 0.4
        else:
            return 0.5
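    # Worked example (illustrative): "Could I please get the report?" matches
    # two formal indicators ("could", "please") and no casual ones -> 0.8;
    # "hey, lol that broke" matches only casual indicators -> 0.2. Substring
    # matching applies here too ("yo" also fires on "you").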
    def _calculate_directness(self, text: str) -> float:
        """Calculate directness (0 = circumlocutory, 1 = direct)."""
        # Heuristic: shorter sentences and fewer subordinate clauses read as more direct.
        # Filter empty segments so trailing periods don't skew the average
        # (str.split never returns an empty list, so the old emptiness check was dead).
        sentences = [s for s in text.split(".") if s.strip()]
        if not sentences:
            return 0.5

        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
        subordinate_indicators = [
            "because",
            "although",
            "however",
            "therefore",
            "meanwhile",
        ]
        subordinate_count = sum(
            1 for indicator in subordinate_indicators if indicator in text.lower()
        )

        # Directness decreases with longer sentences and more subordinate clauses
        directness = 1.0 - (avg_sentence_length / 50.0) - (subordinate_count * 0.1)
        return max(0.0, min(1.0, directness))
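    # Worked example (illustrative): a single 10-word sentence with no
    # subordinate-clause markers scores 1.0 - 10/50 - 0 = 0.8; long, heavily
    # qualified sentences drift toward 0.0.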
    def _analyze_time_based_styles(
        self, conversations: List[Dict[str, Any]]
    ) -> Dict[str, str]:
        """Analyze how communication style changes by time of day."""
        time_styles = {}

        for conv in conversations:
            messages = conv.get("messages", [])
            for msg in messages:
                if "timestamp" in msg:
                    try:
                        timestamp = msg["timestamp"]
                        if isinstance(timestamp, str):
                            timestamp = datetime.fromisoformat(
                                timestamp.replace("Z", "+00:00")
                            )

                        hour = timestamp.hour
                        content = msg.get("content", "").lower()

                        # Simple style classification by time
                        if 6 <= hour < 12:  # Morning
                            # Check formal indicators against the message content
                            style = (
                                "morning_formal"
                                if any(word in content for word in self.formal_indicators)
                                else "morning_casual"
                            )
                        elif 12 <= hour < 18:  # Afternoon
                            style = (
                                "afternoon_direct"
                                if len(content.split()) < 10
                                else "afternoon_detailed"
                            )
                        elif 18 <= hour < 22:  # Evening
                            style = "evening_relaxed"
                        else:  # Night
                            style = "night_concise"

                        time_styles[f"{hour}:00"] = style
                    except Exception:
                        continue

        return time_styles
    def _calculate_topic_confidence(
        self, topic_counts: Counter, total_messages: int, frequent_topics: List
    ) -> float:
        """Calculate confidence score for topic patterns."""
        if total_messages == 0:
            return 0.0

        # Confidence based on topic clarity and frequency. frequent_topics holds
        # (topic, share) pairs, so the shares already sum to a 0-1 coverage value
        topic_coverage = sum(share for _, share in frequent_topics)
        topic_variety = len(topic_counts) / max(total_messages, 1)

        return min(1.0, (topic_coverage + topic_variety) / 2)
    def _calculate_sentiment_confidence(
        self, sentiment_scores: List[float], keyword_count: int
    ) -> float:
        """Calculate confidence score for sentiment patterns."""
        if not sentiment_scores:
            return 0.0

        # Confidence based on consistency and keyword evidence; clamp at zero
        # since stdev can exceed 1.0 on the -1..1 scale
        sentiment_consistency = max(
            0.0,
            1.0
            - (statistics.stdev(sentiment_scores) if len(sentiment_scores) > 1 else 0.0),
        )
        keyword_evidence = min(1.0, keyword_count / len(sentiment_scores))

        return (sentiment_consistency + keyword_evidence) / 2
    def _calculate_interaction_confidence(
        self, total_messages: int, response_times: int, questions: int
    ) -> float:
        """Calculate confidence score for interaction patterns."""
        if total_messages == 0:
            return 0.0

        # Confidence based on data completeness: more messages, timed responses,
        # and observed questions all raise confidence, each capped at 1.0
        message_coverage = min(1.0, total_messages / 10)
        response_coverage = min(1.0, response_times / max(total_messages // 2, 1))
        question_coverage = min(1.0, questions / max(total_messages // 10, 1))

        return (message_coverage + response_coverage + question_coverage) / 3
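    # Worked example (illustrative): 20 messages, 8 timed responses, 2 questions
    # -> message_coverage = 1.0, response_coverage = 8/10 = 0.8,
    #    question_coverage = min(1.0, 2/2) = 1.0, confidence = 2.8 / 3 ≈ 0.93.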
    def _calculate_temporal_confidence(
        self, conversations: int, hour_data: int, sessions: int
    ) -> float:
        """Calculate confidence score for temporal patterns."""
        if conversations == 0:
            return 0.0

        # Confidence based on temporal data spread
        conversation_coverage = min(1.0, conversations / 5)
        hour_coverage = min(1.0, hour_data / 24)
        session_coverage = min(1.0, sessions / 3)

        return (conversation_coverage + hour_coverage + session_coverage) / 3
    def _calculate_style_confidence(self, messages: int, formality_data: int) -> float:
        """Calculate confidence score for style patterns."""
        if messages == 0:
            return 0.0

        # Confidence based on style data completeness
        message_coverage = min(1.0, messages / 10)
        formality_coverage = min(1.0, formality_data / max(messages, 1))

        return (message_coverage + formality_coverage) / 2