- Added ArchivalManager for JSON export/import with gzip compression
- Implemented organized directory structure by year/month
- Added batch archival operations and restore functionality
- Created RetentionPolicy with importance-based scoring
- Smart retention considers engagement, topics, and user-marked importance
- MemoryManager integrates compression and archival automatically
- Added automatic compression triggering and archival scheduling
- Comprehensive archival statistics and retention recommendations
- Support for backup integration and restore verification
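A minimal usage sketch of the retention flow below (the import path for `RetentionPolicy` and the `SQLiteManager("memory.db")` constructor argument are assumptions, not confirmed by this change; the `SQLiteManager` methods used are the ones the module itself calls):

```python
from memory.storage.sqlite_manager import SQLiteManager
from memory.retention.retention_policy import RetentionPolicy  # module path assumed

db = SQLiteManager("memory.db")  # constructor argument assumed
policy = RetentionPolicy(db)

# Rank recent conversations and print the recommended compression level
conversations = [
    db.get_conversation(c["id"], include_messages=True)
    for c in db.get_recent_conversations(limit=50)
]
for rec in policy.get_retention_recommendations([c for c in conversations if c]):
    print(rec["importance_score"], rec["recommended_level"], rec["title"])
```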
"""
|
|
Smart retention policies for conversation preservation.
|
|
|
|
Implements value-based retention scoring that keeps important
|
|
conversations longer while efficiently managing storage usage.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from collections import defaultdict
|
|
import statistics
|
|
|
|
import sys
|
|
import os
|
|
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
|
|
|
from memory.storage.sqlite_manager import SQLiteManager
|
|
|
|
|
|

class RetentionPolicy:
    """
    Smart retention policy engine.

    Calculates conversation importance scores and determines
    which conversations should be retained or compressed.
    """

    def __init__(self, sqlite_manager: SQLiteManager):
        """
        Initialize retention policy.

        Args:
            sqlite_manager: SQLite manager instance for data access
        """
        self.db_manager = sqlite_manager
        self.logger = logging.getLogger(__name__)

        # Retention policy parameters
        self.important_threshold = 0.7  # Above this = retain full
        self.preserve_threshold = 0.4  # Above this = lighter compression
        self.user_marked_multiplier = 1.5  # Boost for user-marked important

        # Engagement scoring weights
        self.weights = {
            "message_count": 0.2,  # More messages = higher engagement
            "response_quality": 0.25,  # Back-and-forth conversation
            "topic_diversity": 0.15,  # Multiple topics = important
            "time_span": 0.1,  # Longer duration = important
            "user_marked": 0.2,  # User explicitly marked important
            "question_density": 0.1,  # Questions = seeking information
        }

    def calculate_importance_score(self, conversation: Dict[str, Any]) -> float:
        """
        Calculate importance score for a conversation.

        Args:
            conversation: Conversation data with messages and metadata

        Returns:
            Importance score between 0.0 and 1.0
        """
        try:
            messages = conversation.get("messages", [])
            if not messages:
                return 0.0

            # Extract basic metrics
            message_count = len(messages)
            user_messages = [m for m in messages if m["role"] == "user"]
            assistant_messages = [m for m in messages if m["role"] == "assistant"]

            # Calculate engagement metrics
            scores = {}

            # 1. Message count score (normalized; 20 messages = full score)
            scores["message_count"] = min(message_count / 20, 1.0)

            # 2. Response quality (back-and-forth ratio; close to 1.0 = balanced)
            if len(user_messages) > 0 and len(assistant_messages) > 0:
                ratio = min(len(assistant_messages), len(user_messages)) / max(
                    len(assistant_messages), len(user_messages)
                )
                scores["response_quality"] = ratio
            else:
                scores["response_quality"] = 0.5

            # 3. Topic diversity (variety in content)
            scores["topic_diversity"] = self._calculate_topic_diversity(messages)

            # 4. Time span (conversation duration)
            scores["time_span"] = self._calculate_time_span_score(messages)

            # 5. User marked important
            metadata = conversation.get("metadata", {})
            user_marked = metadata.get("user_marked_important", False)
            scores["user_marked"] = self.user_marked_multiplier if user_marked else 1.0

            # 6. Question density (information seeking)
            scores["question_density"] = self._calculate_question_density(user_messages)

            # Calculate weighted final score
            final_score = 0.0
            for factor, weight in self.weights.items():
                final_score += scores.get(factor, 0.0) * weight

            # Clamp to the 0-1 range
            final_score = max(0.0, min(1.0, final_score))

            self.logger.debug(
                f"Importance score for {conversation.get('id')}: {final_score:.3f}"
            )
            return final_score

        except Exception as e:
            self.logger.error(f"Failed to calculate importance score: {e}")
            return 0.5  # Default to neutral
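
    # A minimal worked example of the weighted combination above (hypothetical
    # factor values; the weights are the defaults from __init__):
    #   scores = {"message_count": 0.5, "response_quality": 1.0,
    #             "topic_diversity": 0.33, "time_span": 0.1,
    #             "user_marked": 1.0, "question_density": 0.4}
    #   final = 0.5*0.2 + 1.0*0.25 + 0.33*0.15 + 0.1*0.1 + 1.0*0.2 + 0.4*0.1
    #         = 0.10 + 0.25 + 0.05 + 0.01 + 0.20 + 0.04 ≈ 0.65
    #   0.65 is below important_threshold (0.7) but above preserve_threshold
    #   (0.4), so this conversation would land in the "key_points" tier.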

    def _calculate_topic_diversity(self, messages: List[Dict[str, Any]]) -> float:
        """Calculate topic diversity score from messages."""
        try:
            # Simple topic-based diversity using keyword categories
            topic_keywords = {
                "technical": [
                    "code", "programming", "algorithm", "function",
                    "bug", "debug", "api", "database",
                ],
                "personal": [
                    "feel", "think", "opinion", "prefer", "like",
                    "personal", "life",
                ],
                "work": [
                    "project", "task", "deadline", "meeting", "team",
                    "work", "job",
                ],
                "learning": [
                    "learn", "study", "understand", "explain",
                    "tutorial", "help",
                ],
                "planning": ["plan", "schedule", "organize", "goal", "strategy"],
                "creative": ["design", "create", "write", "art", "music", "story"],
            }

            topic_counts = defaultdict(int)

            for message in messages:
                if message["role"] in ["user", "assistant"]:
                    content = message["content"].lower()

                    # Count topic occurrences
                    for topic, keywords in topic_keywords.items():
                        for keyword in keywords:
                            if keyword in content:
                                topic_counts[topic] += 1

            # Diversity = number of topics with significant presence
            significant_topics = sum(1 for count in topic_counts.values() if count >= 2)
            diversity_score = min(significant_topics / len(topic_keywords), 1.0)

            return diversity_score

        except Exception as e:
            self.logger.error(f"Failed to calculate topic diversity: {e}")
            return 0.5
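
    # Worked example (hypothetical): if a conversation mentions "code" in one
    # message and "debug" in another (technical: 2), plus "project" and
    # "deadline" (work: 2), two of the six topics reach the significance
    # threshold of 2, giving a diversity score of 2 / 6 ≈ 0.33.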

    def _calculate_time_span_score(self, messages: List[Dict[str, Any]]) -> float:
        """Calculate time span score based on conversation duration."""
        try:
            timestamps = []
            for message in messages:
                if "timestamp" in message:
                    try:
                        ts = datetime.fromisoformat(message["timestamp"])
                        timestamps.append(ts)
                    except (ValueError, TypeError):
                        # Skip malformed timestamps
                        continue

            if len(timestamps) < 2:
                return 0.1  # Very short conversation

            duration = max(timestamps) - min(timestamps)
            duration_hours = duration.total_seconds() / 3600

            # Score based on duration (24 hours = full score)
            return min(duration_hours / 24, 1.0)

        except Exception as e:
            self.logger.error(f"Failed to calculate time span: {e}")
            return 0.5

    def _calculate_question_density(self, user_messages: List[Dict[str, Any]]) -> float:
        """Calculate question density from user messages."""
        try:
            if not user_messages:
                return 0.0

            question_count = 0
            total_words = 0

            for message in user_messages:
                content = message["content"]
                # Count question marks and interrogative words
                question_marks = content.count("?")
                question_words = len(
                    re.findall(
                        r"\b(how|what|when|where|why|which|who|can|could|would|should|is|are|do|does)\b",
                        content,
                        re.IGNORECASE,
                    )
                )
                question_count += question_marks + question_words

                # Count words
                total_words += len(content.split())

            if total_words == 0:
                return 0.0

            question_ratio = question_count / total_words
            return min(question_ratio * 5, 1.0)  # Normalize

        except Exception as e:
            self.logger.error(f"Failed to calculate question density: {e}")
            return 0.5
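
    # Worked example (hypothetical): for the message "How does this work?",
    # question_count = 1 question mark + 2 interrogative words ("how", "does")
    # = 3, and total_words = 4, so the ratio is 0.75 and the score is
    # min(0.75 * 5, 1.0) = 1.0.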

    def should_retain_full(
        self, conversation: Dict[str, Any], importance_score: Optional[float] = None
    ) -> bool:
        """
        Determine if a conversation should be retained in full form.

        Args:
            conversation: Conversation data
            importance_score: Pre-calculated importance score (optional)

        Returns:
            True if the conversation should be retained in full
        """
        if importance_score is None:
            importance_score = self.calculate_importance_score(conversation)

        # Conversations the user explicitly marked important are always retained
        metadata = conversation.get("metadata", {})
        if metadata.get("user_marked_important", False):
            return True

        # High importance score
        if importance_score >= self.important_threshold:
            return True

        # Recent, reasonably important conversations (within 30 days)
        created_at = conversation.get("created_at")
        if created_at:
            try:
                conv_date = datetime.fromisoformat(created_at)
                if (datetime.now() - conv_date).days <= 30 and importance_score >= 0.5:
                    return True
            except (ValueError, TypeError):
                pass

        return False

    def should_retain_compressed(
        self, conversation: Dict[str, Any], importance_score: Optional[float] = None
    ) -> Tuple[bool, str]:
        """
        Determine whether a conversation should be compressed, and to what level.

        Args:
            conversation: Conversation data
            importance_score: Pre-calculated importance score (optional)

        Returns:
            Tuple of (should_compress, recommended_compression_level)
        """
        if importance_score is None:
            importance_score = self.calculate_importance_score(conversation)

        # Check whether the conversation should be retained in full
        if self.should_retain_full(conversation, importance_score):
            return False, "full"

        # Determine compression level based on importance
        if importance_score >= self.preserve_threshold:
            # Important: lighter compression (key points)
            return True, "key_points"
        elif importance_score >= 0.2:
            # Moderately important: summary compression
            return True, "summary"
        else:
            # Low importance: metadata only
            return True, "metadata"
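
    # Score-to-level mapping with the default thresholds:
    #   >= 0.7 (or user-marked, or <= 30 days old with >= 0.5)  -> "full"
    #   >= 0.4                                                  -> "key_points"
    #   >= 0.2                                                  -> "summary"
    #   <  0.2                                                  -> "metadata"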

    def update_retention_policy(self, policy_settings: Dict[str, Any]) -> None:
        """
        Update retention policy parameters.

        Args:
            policy_settings: Dictionary of policy parameter updates
        """
        try:
            if "important_threshold" in policy_settings:
                self.important_threshold = float(policy_settings["important_threshold"])
            if "preserve_threshold" in policy_settings:
                self.preserve_threshold = float(policy_settings["preserve_threshold"])
            if "user_marked_multiplier" in policy_settings:
                self.user_marked_multiplier = float(
                    policy_settings["user_marked_multiplier"]
                )
            if "weights" in policy_settings:
                self.weights.update(policy_settings["weights"])

            self.logger.info(f"Updated retention policy: {policy_settings}")

        except Exception as e:
            self.logger.error(f"Failed to update retention policy: {e}")
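
    # Usage sketch (hypothetical settings): raise the bar for full retention
    # and reweight engagement. Note that weights.update() merges keys, so the
    # example below keeps the weight sum at 1.0:
    #   policy.update_retention_policy({
    #       "important_threshold": 0.8,
    #       "weights": {"message_count": 0.3, "time_span": 0.0},
    #   })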

    def get_retention_recommendations(
        self, conversations: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get retention recommendations for multiple conversations.

        Args:
            conversations: List of conversations to analyze

        Returns:
            List of recommendations with scores and actions
        """
        recommendations = []

        for conversation in conversations:
            try:
                importance_score = self.calculate_importance_score(conversation)
                should_compress, compression_level = self.should_retain_compressed(
                    conversation, importance_score
                )

                recommendation = {
                    "conversation_id": conversation.get("id"),
                    "title": conversation.get("title"),
                    "created_at": conversation.get("created_at"),
                    "importance_score": importance_score,
                    "should_compress": should_compress,
                    "recommended_level": compression_level,
                    "user_marked_important": conversation.get("metadata", {}).get(
                        "user_marked_important", False
                    ),
                    "message_count": len(conversation.get("messages", [])),
                    "retention_reason": self._get_retention_reason(
                        importance_score, compression_level
                    ),
                }

                recommendations.append(recommendation)

            except Exception as e:
                self.logger.error(
                    f"Failed to analyze conversation {conversation.get('id')}: {e}"
                )
                continue

        # Sort by importance score (highest first)
        recommendations.sort(key=lambda x: x["importance_score"], reverse=True)
        return recommendations
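
    # Each recommendation looks roughly like this (hypothetical values):
    #   {"conversation_id": "abc123", "title": "Debugging session",
    #    "created_at": "2024-01-01T10:00:00", "importance_score": 0.82,
    #    "should_compress": False, "recommended_level": "full",
    #    "user_marked_important": False, "message_count": 18,
    #    "retention_reason": "High importance - retained full"}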

    def _get_retention_reason(
        self, importance_score: float, compression_level: str
    ) -> str:
        """Get a human-readable reason for the retention decision."""
        if compression_level == "full":
            if importance_score >= self.important_threshold:
                return "High importance - retained full"
            else:
                return "Recent conversation - retained full"
        elif compression_level == "key_points":
            return f"Moderate importance ({importance_score:.2f}) - key points retained"
        elif compression_level == "summary":
            return f"Standard importance ({importance_score:.2f}) - summary compression"
        else:
            return f"Low importance ({importance_score:.2f}) - metadata only"

    def mark_conversation_important(
        self, conversation_id: str, important: bool = True
    ) -> bool:
        """
        Mark a conversation as user-important.

        Args:
            conversation_id: ID of the conversation to mark
            important: Whether to mark as important (True) or not important (False)

        Returns:
            True if marked successfully
        """
        try:
            conversation = self.db_manager.get_conversation(
                conversation_id, include_messages=False
            )
            if not conversation:
                self.logger.error(f"Conversation {conversation_id} not found")
                return False

            # Update metadata
            metadata = conversation.get("metadata", {})
            metadata["user_marked_important"] = important
            metadata["marked_important_at"] = datetime.now().isoformat()

            self.db_manager.update_conversation_metadata(conversation_id, metadata)

            self.logger.info(
                f"Marked conversation {conversation_id} as "
                f"{'important' if important else 'not important'}"
            )
            return True

        except Exception as e:
            self.logger.error(
                f"Failed to mark conversation {conversation_id} important: {e}"
            )
            return False

    def get_important_conversations(self) -> List[Dict[str, Any]]:
        """
        Get all user-marked important conversations.

        Returns:
            List of important conversations
        """
        try:
            recent_conversations = self.db_manager.get_recent_conversations(limit=1000)

            important_conversations = []
            for conversation in recent_conversations:
                full_conversation = self.db_manager.get_conversation(
                    conversation["id"], include_messages=True
                )
                if full_conversation:
                    metadata = full_conversation.get("metadata", {})
                    if metadata.get("user_marked_important", False):
                        important_conversations.append(full_conversation)

            return important_conversations

        except Exception as e:
            self.logger.error(f"Failed to get important conversations: {e}")
            return []

    def get_retention_stats(self) -> Dict[str, Any]:
        """
        Get retention policy statistics.

        Returns:
            Dictionary with retention statistics
        """
        try:
            recent_conversations = self.db_manager.get_recent_conversations(limit=500)

            stats = {
                "total_conversations": len(recent_conversations),
                "important_marked": 0,
                "importance_distribution": {"high": 0, "medium": 0, "low": 0},
                "average_importance": 0.0,
                "compression_recommendations": {
                    "full": 0,
                    "key_points": 0,
                    "summary": 0,
                    "metadata": 0,
                },
            }

            importance_scores = []

            for conv_data in recent_conversations:
                conversation = self.db_manager.get_conversation(
                    conv_data["id"], include_messages=True
                )
                if not conversation:
                    continue

                importance_score = self.calculate_importance_score(conversation)
                importance_scores.append(importance_score)

                # Check if the user marked it important
                metadata = conversation.get("metadata", {})
                if metadata.get("user_marked_important", False):
                    stats["important_marked"] += 1

                # Categorize importance
                if importance_score >= self.important_threshold:
                    stats["importance_distribution"]["high"] += 1
                elif importance_score >= self.preserve_threshold:
                    stats["importance_distribution"]["medium"] += 1
                else:
                    stats["importance_distribution"]["low"] += 1

                # Compression recommendations
                should_compress, level = self.should_retain_compressed(
                    conversation, importance_score
                )
                if level in stats["compression_recommendations"]:
                    stats["compression_recommendations"][level] += 1
                else:
                    stats["compression_recommendations"]["full"] += 1

            if importance_scores:
                stats["average_importance"] = statistics.mean(importance_scores)

            return stats

        except Exception as e:
            self.logger.error(f"Failed to get retention stats: {e}")
            return {}
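

# A minimal smoke-test sketch, not part of the original module. It assumes
# SQLiteManager can be constructed from a database path; adjust to the real
# constructor signature.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    db = SQLiteManager("memory.db")  # hypothetical constructor argument
    policy = RetentionPolicy(db)

    # Score an in-memory conversation without touching the database
    conversation = {
        "id": "demo-1",
        "messages": [
            {"role": "user", "content": "How do I debug this function?",
             "timestamp": "2024-01-01T10:00:00"},
            {"role": "assistant", "content": "Start by adding logging around the call.",
             "timestamp": "2024-01-01T10:05:00"},
        ],
        "metadata": {},
    }
    score = policy.calculate_importance_score(conversation)
    compress, level = policy.should_retain_compressed(conversation, score)
    print(f"score={score:.3f} compress={compress} level={level}")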