Mai/src/memory/backup/retention.py

Commit 8c58b1d070 (Mai Development), 2026-01-27 23:56:49 -05:00
feat(04-03): create JSON archival and smart retention systems

- Added ArchivalManager for JSON export/import with gzip compression
- Implemented an organized directory structure by year/month
- Added batch archival operations and restore functionality
- Created RetentionPolicy with importance-based scoring
- Smart retention considers engagement, topic diversity, and user-marked importance
- MemoryManager integrates compression and archival automatically
- Added automatic compression triggering and archival scheduling
- Comprehensive archival statistics and retention recommendations
- Support for backup integration and restore verification

"""
Smart retention policies for conversation preservation.
Implements value-based retention scoring that keeps important
conversations longer while efficiently managing storage usage.
"""
import logging
import re
from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional, Tuple
from collections import defaultdict
import statistics
import sys
import os

# Path hack so the absolute package import below resolves when this module
# is loaded from inside the source tree.
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from memory.storage.sqlite_manager import SQLiteManager


class RetentionPolicy:
    """
    Smart retention policy engine.

    Calculates conversation importance scores and determines
    which conversations should be retained or compressed.
    """

def __init__(self, sqlite_manager: SQLiteManager):
"""
Initialize retention policy.
Args:
sqlite_manager: SQLite manager instance for data access
"""
self.db_manager = sqlite_manager
self.logger = logging.getLogger(__name__)
# Retention policy parameters
        self.important_threshold = 0.7  # At or above this score: retain in full
        self.preserve_threshold = 0.4  # At or above this score: lighter compression
        self.user_marked_multiplier = 1.5  # Boost for user-marked important
# Engagement scoring weights
self.weights = {
"message_count": 0.2, # More messages = higher engagement
"response_quality": 0.25, # Back-and-forth conversation
"topic_diversity": 0.15, # Multiple topics = important
"time_span": 0.1, # Longer duration = important
"user_marked": 0.2, # User explicitly marked important
"question_density": 0.1, # Questions = seeking information
}
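
        # Illustrative arithmetic (hypothetical per-factor scores, not taken
        # from real data): a 10-message back-and-forth covering 2 of the 6
        # topic buckets over ~6 hours with a modest question rate might yield
        #   message_count     0.50 * 0.20 = 0.100
        #   response_quality  1.00 * 0.25 = 0.250
        #   topic_diversity   0.33 * 0.15 = 0.050
        #   time_span         0.25 * 0.10 = 0.025
        #   user_marked       0.00 * 0.20 = 0.000
        #   question_density  0.40 * 0.10 = 0.040
        # for a final score of about 0.46, below the 0.7 retain-full bar.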

    def calculate_importance_score(self, conversation: Dict[str, Any]) -> float:
        """
        Calculate importance score for a conversation.

        Args:
            conversation: Conversation data with messages and metadata

        Returns:
            Importance score between 0.0 and 1.0
        """
try:
messages = conversation.get("messages", [])
if not messages:
return 0.0
# Extract basic metrics
message_count = len(messages)
user_messages = [m for m in messages if m["role"] == "user"]
assistant_messages = [m for m in messages if m["role"] == "assistant"]
# Calculate engagement metrics
scores = {}
# 1. Message count score (normalized)
scores["message_count"] = min(
message_count / 20, 1.0
) # 20 messages = full score
# 2. Response quality (back-and-forth ratio)
if len(user_messages) > 0 and len(assistant_messages) > 0:
ratio = min(len(assistant_messages), len(user_messages)) / max(
len(assistant_messages), len(user_messages)
)
scores["response_quality"] = ratio # Close to 1.0 = good conversation
else:
scores["response_quality"] = 0.5
# 3. Topic diversity (variety in content)
scores["topic_diversity"] = self._calculate_topic_diversity(messages)
# 4. Time span (conversation duration)
scores["time_span"] = self._calculate_time_span_score(messages)
            # 5. User marked important (0.0 when unmarked, so the factor acts
            # as a boost rather than a baseline credit for every conversation)
            metadata = conversation.get("metadata", {})
            user_marked = metadata.get("user_marked_important", False)
            scores["user_marked"] = self.user_marked_multiplier if user_marked else 0.0
# 6. Question density (information seeking)
scores["question_density"] = self._calculate_question_density(user_messages)
# Calculate weighted final score
final_score = 0.0
for factor, weight in self.weights.items():
final_score += scores.get(factor, 0.0) * weight
# Normalize to 0-1 range
final_score = max(0.0, min(1.0, final_score))
self.logger.debug(
f"Importance score for {conversation.get('id')}: {final_score:.3f}"
)
return final_score
except Exception as e:
self.logger.error(f"Failed to calculate importance score: {e}")
return 0.5 # Default to neutral

    def _calculate_topic_diversity(self, messages: List[Dict[str, Any]]) -> float:
        """Calculate topic diversity score from messages."""
try:
# Simple topic-based diversity using keyword categories
topic_keywords = {
"technical": [
"code",
"programming",
"algorithm",
"function",
"bug",
"debug",
"api",
"database",
],
"personal": [
"feel",
"think",
"opinion",
"prefer",
"like",
"personal",
"life",
],
"work": [
"project",
"task",
"deadline",
"meeting",
"team",
"work",
"job",
],
"learning": [
"learn",
"study",
"understand",
"explain",
"tutorial",
"help",
],
"planning": ["plan", "schedule", "organize", "goal", "strategy"],
"creative": ["design", "create", "write", "art", "music", "story"],
}
topic_counts = defaultdict(int)
total_content = ""
for message in messages:
if message["role"] in ["user", "assistant"]:
content = message["content"].lower()
total_content += content + " "
# Count topic occurrences
for topic, keywords in topic_keywords.items():
for keyword in keywords:
if keyword in content:
topic_counts[topic] += 1
# Diversity = number of topics with significant presence
significant_topics = sum(1 for count in topic_counts.values() if count >= 2)
diversity_score = min(significant_topics / len(topic_keywords), 1.0)
return diversity_score
except Exception as e:
self.logger.error(f"Failed to calculate topic diversity: {e}")
return 0.5
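
    # Worked example (hypothetical messages): a conversation that mentions
    # "code" and "debug" (technical bucket) plus "project" and "deadline"
    # (work bucket) has two buckets with count >= 2, so diversity is
    # 2 / 6 ≈ 0.33.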

    def _calculate_time_span_score(self, messages: List[Dict[str, Any]]) -> float:
        """Calculate time span score based on conversation duration."""
try:
timestamps = []
for message in messages:
if "timestamp" in message:
try:
ts = datetime.fromisoformat(message["timestamp"])
timestamps.append(ts)
                    except (ValueError, TypeError):
                        # Skip malformed or non-string timestamps
                        continue
if len(timestamps) < 2:
return 0.1 # Very short conversation
duration = max(timestamps) - min(timestamps)
duration_hours = duration.total_seconds() / 3600
# Score based on duration (24 hours = full score)
return min(duration_hours / 24, 1.0)
except Exception as e:
self.logger.error(f"Failed to calculate time span: {e}")
return 0.5

    def _calculate_question_density(self, user_messages: List[Dict[str, Any]]) -> float:
        """Calculate question density from user messages."""
try:
if not user_messages:
return 0.0
question_count = 0
total_words = 0
for message in user_messages:
content = message["content"]
# Count questions
question_marks = content.count("?")
question_words = len(
re.findall(
r"\b(how|what|when|where|why|which|who|can|could|would|should|is|are|do|does)\b",
content,
re.IGNORECASE,
)
)
question_count += question_marks + question_words
# Count words
words = len(content.split())
total_words += words
if total_words == 0:
return 0.0
            question_ratio = question_count / total_words
            # Scale so that roughly 20% question tokens saturates the score
            return min(question_ratio * 5, 1.0)
except Exception as e:
self.logger.error(f"Failed to calculate question density: {e}")
return 0.5
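
    # Example (illustrative): "How do I sort a list in Python?" has 8 words,
    # one "?", and two regex hits ("how", "do"), so the ratio is 3/8 = 0.375
    # and the scaled score 0.375 * 5 caps out at 1.0.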

    def should_retain_full(
        self, conversation: Dict[str, Any], importance_score: Optional[float] = None
    ) -> bool:
        """
        Determine if a conversation should be retained in full form.

        Args:
            conversation: Conversation data
            importance_score: Pre-calculated importance score (optional)

        Returns:
            True if the conversation should be retained in full
        """
if importance_score is None:
importance_score = self.calculate_importance_score(conversation)
        # Conversations the user explicitly marked important are always retained
metadata = conversation.get("metadata", {})
if metadata.get("user_marked_important", False):
return True
# High importance score
if importance_score >= self.important_threshold:
return True
# Recent important conversations (within 30 days)
created_at = conversation.get("created_at")
if created_at:
try:
conv_date = datetime.fromisoformat(created_at)
if (datetime.now() - conv_date).days <= 30 and importance_score >= 0.5:
return True
            except (ValueError, TypeError):
                pass
return False

    def should_retain_compressed(
        self, conversation: Dict[str, Any], importance_score: Optional[float] = None
    ) -> Tuple[bool, str]:
        """
        Determine if conversation should be compressed and to what level.

        Args:
            conversation: Conversation data
            importance_score: Pre-calculated importance score (optional)

        Returns:
            Tuple of (should_compress, recommended_compression_level)
        """
if importance_score is None:
importance_score = self.calculate_importance_score(conversation)
# Check if should retain full
if self.should_retain_full(conversation, importance_score):
return False, "full"
        # Determine compression level from the importance tiers
        if importance_score >= self.preserve_threshold:
            # Moderately important: lighter compression (key points)
            return True, "key_points"
        elif importance_score >= 0.2:
            # Low importance: summary compression
            return True, "summary"
        else:
            # Minimal importance: metadata only
            return True, "metadata"
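
    # With the default thresholds, scores map to tiers roughly as:
    #   0.70 and up  -> retained in full
    #   0.40 - 0.69  -> "key_points"
    #   0.20 - 0.39  -> "summary"
    #   below 0.20   -> "metadata"
    # (user-marked conversations, and recent ones scoring 0.5+, stay full).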

    def update_retention_policy(self, policy_settings: Dict[str, Any]) -> None:
        """
        Update retention policy parameters.

        Args:
            policy_settings: Dictionary of policy parameter updates
        """
try:
if "important_threshold" in policy_settings:
self.important_threshold = float(policy_settings["important_threshold"])
if "preserve_threshold" in policy_settings:
self.preserve_threshold = float(policy_settings["preserve_threshold"])
if "user_marked_multiplier" in policy_settings:
self.user_marked_multiplier = float(
policy_settings["user_marked_multiplier"]
)
if "weights" in policy_settings:
self.weights.update(policy_settings["weights"])
self.logger.info(f"Updated retention policy: {policy_settings}")
except Exception as e:
self.logger.error(f"Failed to update retention policy: {e}")
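
    # A hypothetical call, tightening retention and reweighting questions:
    #   policy.update_retention_policy({
    #       "important_threshold": 0.8,
    #       "weights": {"question_density": 0.15},
    #   })
    # Partial weight updates are merged in, so the remaining weights are
    # unchanged and the total may drift away from 1.0.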

    def get_retention_recommendations(
        self, conversations: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get retention recommendations for multiple conversations.

        Args:
            conversations: List of conversations to analyze

        Returns:
            List of recommendations with scores and actions
        """
recommendations = []
for conversation in conversations:
try:
importance_score = self.calculate_importance_score(conversation)
should_compress, compression_level = self.should_retain_compressed(
conversation, importance_score
)
recommendation = {
"conversation_id": conversation.get("id"),
"title": conversation.get("title"),
"created_at": conversation.get("created_at"),
"importance_score": importance_score,
"should_compress": should_compress,
"recommended_level": compression_level,
"user_marked_important": conversation.get("metadata", {}).get(
"user_marked_important", False
),
"message_count": len(conversation.get("messages", [])),
"retention_reason": self._get_retention_reason(
importance_score, compression_level
),
}
recommendations.append(recommendation)
except Exception as e:
self.logger.error(
f"Failed to analyze conversation {conversation.get('id')}: {e}"
)
continue
# Sort by importance score (highest first)
recommendations.sort(key=lambda x: x["importance_score"], reverse=True)
return recommendations
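
    # Sketch of a downstream archival pass (archive_conversation() is a
    # hypothetical callback, not part of this module):
    #   for rec in policy.get_retention_recommendations(conversations):
    #       if rec["recommended_level"] == "metadata":
    #           archive_conversation(rec["conversation_id"])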

    def _get_retention_reason(
        self, importance_score: float, compression_level: str
    ) -> str:
        """Get human-readable reason for retention decision."""
if compression_level == "full":
if importance_score >= self.important_threshold:
return "High importance - retained full"
else:
return "Recent conversation - retained full"
        elif compression_level == "key_points":
            return f"Moderate importance ({importance_score:.2f}) - key points retained"
        elif compression_level == "summary":
            return f"Low importance ({importance_score:.2f}) - summary compression"
        else:
            return f"Minimal importance ({importance_score:.2f}) - metadata only"

    def mark_conversation_important(
        self, conversation_id: str, important: bool = True
    ) -> bool:
        """
        Mark a conversation as user-important.

        Args:
            conversation_id: ID of conversation to mark
            important: Whether to mark as important (True) or not important (False)

        Returns:
            True if marked successfully
        """
try:
conversation = self.db_manager.get_conversation(
conversation_id, include_messages=False
)
if not conversation:
self.logger.error(f"Conversation {conversation_id} not found")
return False
# Update metadata
metadata = conversation.get("metadata", {})
metadata["user_marked_important"] = important
metadata["marked_important_at"] = datetime.now().isoformat()
self.db_manager.update_conversation_metadata(conversation_id, metadata)
self.logger.info(
f"Marked conversation {conversation_id} as {'important' if important else 'not important'}"
)
return True
except Exception as e:
self.logger.error(
f"Failed to mark conversation {conversation_id} important: {e}"
)
return False

    def get_important_conversations(self) -> List[Dict[str, Any]]:
        """
        Get all user-marked important conversations.

        Returns:
            List of important conversations
        """
        try:
            # Scans only the most recent 1000 conversations; user-marked items
            # older than that window will not be returned.
            recent_conversations = self.db_manager.get_recent_conversations(limit=1000)
            important_conversations = []
for conversation in recent_conversations:
full_conversation = self.db_manager.get_conversation(
conversation["id"], include_messages=True
)
if full_conversation:
metadata = full_conversation.get("metadata", {})
if metadata.get("user_marked_important", False):
important_conversations.append(full_conversation)
return important_conversations
except Exception as e:
self.logger.error(f"Failed to get important conversations: {e}")
return []

    def get_retention_stats(self) -> Dict[str, Any]:
        """
        Get retention policy statistics.

        Returns:
            Dictionary with retention statistics
        """
try:
recent_conversations = self.db_manager.get_recent_conversations(limit=500)
stats = {
"total_conversations": len(recent_conversations),
"important_marked": 0,
"importance_distribution": {"high": 0, "medium": 0, "low": 0},
"average_importance": 0.0,
"compression_recommendations": {
"full": 0,
"key_points": 0,
"summary": 0,
"metadata": 0,
},
}
importance_scores = []
for conv_data in recent_conversations:
conversation = self.db_manager.get_conversation(
conv_data["id"], include_messages=True
)
if not conversation:
continue
importance_score = self.calculate_importance_score(conversation)
importance_scores.append(importance_score)
# Check if user marked important
metadata = conversation.get("metadata", {})
if metadata.get("user_marked_important", False):
stats["important_marked"] += 1
# Categorize importance
if importance_score >= self.important_threshold:
stats["importance_distribution"]["high"] += 1
elif importance_score >= self.preserve_threshold:
stats["importance_distribution"]["medium"] += 1
else:
stats["importance_distribution"]["low"] += 1
# Compression recommendations
should_compress, level = self.should_retain_compressed(
conversation, importance_score
)
if level in stats["compression_recommendations"]:
stats["compression_recommendations"][level] += 1
else:
stats["compression_recommendations"]["full"] += 1
if importance_scores:
stats["average_importance"] = statistics.mean(importance_scores)
return stats
except Exception as e:
self.logger.error(f"Failed to get retention stats: {e}")
return {}
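

if __name__ == "__main__":
    # Minimal smoke test, illustrative only. calculate_importance_score() and
    # should_retain_compressed() never touch the database, so passing None in
    # place of a real SQLiteManager is enough to exercise the scoring path.
    logging.basicConfig(level=logging.DEBUG)
    policy = RetentionPolicy(None)  # type: ignore[arg-type]

    now = datetime.now()
    sample = {
        "id": "demo-1",  # hypothetical conversation
        "created_at": now.isoformat(),
        "metadata": {"user_marked_important": False},
        "messages": [
            {
                "role": "user",
                "content": "How do I debug this function?",
                "timestamp": now.isoformat(),
            },
            {
                "role": "assistant",
                "content": "Start by logging the inputs around the failing call.",
                "timestamp": (now + timedelta(hours=2)).isoformat(),
            },
        ],
    }

    score = policy.calculate_importance_score(sample)
    compress, level = policy.should_retain_compressed(sample, score)
    print(f"score={score:.3f} compress={compress} level={level}")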