feat(04-03): create JSON archival and smart retention systems
- Added ArchivalManager for JSON export/import with gzip compression
- Implemented organized directory structure by year/month
- Added batch archival operations and restore functionality
- Created RetentionPolicy with importance-based scoring
- Smart retention considers engagement, topics, and user-marked importance
- MemoryManager integrates compression and archival automatically
- Added automatic compression triggering and archival scheduling
- Comprehensive archival statistics and retention recommendations
- Support for backup integration and restore verification
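A minimal usage sketch of the retention side of this commit (the conversation schema matches the code below; the SQLiteManager constructor argument is an assumption, since its signature is not part of this diff):

    from memory.storage.sqlite_manager import SQLiteManager
    from memory.backup.retention import RetentionPolicy

    # Assumes SQLiteManager accepts a database path; adjust to its real API.
    policy = RetentionPolicy(SQLiteManager("memory.db"))

    conversation = {
        "id": "conv-1",
        "created_at": "2024-01-01T10:00:00",
        "metadata": {},
        "messages": [
            {"role": "user", "content": "How do I debug this function?",
             "timestamp": "2024-01-01T10:00:00"},
            {"role": "assistant", "content": "Start by adding logging around the call.",
             "timestamp": "2024-01-01T10:05:00"},
        ],
    }

    score = policy.calculate_importance_score(conversation)
    should_compress, level = policy.should_retain_compressed(conversation, score)
    print(f"score={score:.2f}, compress={should_compress}, level={level}")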
src/memory/backup/retention.py (new file, 540 lines)
@@ -0,0 +1,540 @@
"""
Smart retention policies for conversation preservation.

Implements value-based retention scoring that keeps important
conversations longer while efficiently managing storage usage.
"""

import logging
import re
import statistics
from collections import defaultdict
from datetime import datetime
from typing import Dict, Any, List, Optional, Tuple

import sys
import os

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from memory.storage.sqlite_manager import SQLiteManager


class RetentionPolicy:
    """
    Smart retention policy engine.

    Calculates conversation importance scores and determines
    which conversations should be retained or compressed.
    """

    def __init__(self, sqlite_manager: SQLiteManager):
        """
        Initialize retention policy.

        Args:
            sqlite_manager: SQLite manager instance for data access
        """
        self.db_manager = sqlite_manager
        self.logger = logging.getLogger(__name__)

        # Retention policy parameters
        self.important_threshold = 0.7  # Above this = retain in full
        self.preserve_threshold = 0.4  # Above this = lighter compression
        self.user_marked_multiplier = 1.5  # Boost for user-marked important

        # Engagement scoring weights
        self.weights = {
            "message_count": 0.2,  # More messages = higher engagement
            "response_quality": 0.25,  # Back-and-forth conversation
            "topic_diversity": 0.15,  # Multiple topics = important
            "time_span": 0.1,  # Longer duration = important
            "user_marked": 0.2,  # User explicitly marked important
            "question_density": 0.1,  # Questions = seeking information
        }
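
        # Hypothetical worked example of the weighted sum: a 10-message
        # conversation (message_count 0.5) with balanced user/assistant
        # turns (response_quality 1.0), two significant topics
        # (topic_diversity ~0.33), a ~5-hour span (time_span ~0.21),
        # not user-marked (0.0), and one question cue per 25 words
        # (question_density 0.2) scores roughly
        # 0.5*0.2 + 1.0*0.25 + 0.33*0.15 + 0.21*0.1 + 0.0*0.2 + 0.2*0.1 ~= 0.44,
        # which falls in the "key_points" band under the default thresholds.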

    def calculate_importance_score(self, conversation: Dict[str, Any]) -> float:
        """
        Calculate importance score for a conversation.

        Args:
            conversation: Conversation data with messages and metadata

        Returns:
            Importance score between 0.0 and 1.0
        """
        try:
            messages = conversation.get("messages", [])
            if not messages:
                return 0.0

            # Extract basic metrics
            message_count = len(messages)
            user_messages = [m for m in messages if m["role"] == "user"]
            assistant_messages = [m for m in messages if m["role"] == "assistant"]

            # Calculate engagement metrics
            scores = {}

            # 1. Message count score (20 messages = full score)
            scores["message_count"] = min(message_count / 20, 1.0)

            # 2. Response quality (back-and-forth ratio; close to 1.0 = balanced)
            if user_messages and assistant_messages:
                ratio = min(len(assistant_messages), len(user_messages)) / max(
                    len(assistant_messages), len(user_messages)
                )
                scores["response_quality"] = ratio
            else:
                scores["response_quality"] = 0.5

            # 3. Topic diversity (variety in content)
            scores["topic_diversity"] = self._calculate_topic_diversity(messages)

            # 4. Time span (conversation duration)
            scores["time_span"] = self._calculate_time_span_score(messages)

            # 5. User marked important (full weight boosted by multiplier;
            #    0.0 if unmarked, so the factor only rewards marked conversations)
            metadata = conversation.get("metadata", {})
            user_marked = metadata.get("user_marked_important", False)
            scores["user_marked"] = self.user_marked_multiplier if user_marked else 0.0

            # 6. Question density (information seeking)
            scores["question_density"] = self._calculate_question_density(user_messages)

            # Calculate weighted final score
            final_score = 0.0
            for factor, weight in self.weights.items():
                final_score += scores.get(factor, 0.0) * weight

            # Clamp to 0-1 range (the user-marked boost can push the raw sum above 1.0)
            final_score = max(0.0, min(1.0, final_score))

            self.logger.debug(
                f"Importance score for {conversation.get('id')}: {final_score:.3f}"
            )
            return final_score

        except Exception as e:
            self.logger.error(f"Failed to calculate importance score: {e}")
            return 0.5  # Default to neutral

    def _calculate_topic_diversity(self, messages: List[Dict[str, Any]]) -> float:
        """Calculate topic diversity score from messages."""
        try:
            # Simple topic-based diversity using keyword categories
            topic_keywords = {
                "technical": [
                    "code", "programming", "algorithm", "function",
                    "bug", "debug", "api", "database",
                ],
                "personal": [
                    "feel", "think", "opinion", "prefer", "like", "personal", "life",
                ],
                "work": ["project", "task", "deadline", "meeting", "team", "work", "job"],
                "learning": ["learn", "study", "understand", "explain", "tutorial", "help"],
                "planning": ["plan", "schedule", "organize", "goal", "strategy"],
                "creative": ["design", "create", "write", "art", "music", "story"],
            }

            topic_counts = defaultdict(int)

            for message in messages:
                if message["role"] in ["user", "assistant"]:
                    content = message["content"].lower()

                    # Count topic keyword occurrences
                    for topic, keywords in topic_keywords.items():
                        for keyword in keywords:
                            if keyword in content:
                                topic_counts[topic] += 1

            # Diversity = fraction of topics with significant presence
            significant_topics = sum(1 for count in topic_counts.values() if count >= 2)
            diversity_score = min(significant_topics / len(topic_keywords), 1.0)

            return diversity_score

        except Exception as e:
            self.logger.error(f"Failed to calculate topic diversity: {e}")
            return 0.5

    def _calculate_time_span_score(self, messages: List[Dict[str, Any]]) -> float:
        """Calculate time span score based on conversation duration."""
        try:
            timestamps = []
            for message in messages:
                if "timestamp" in message:
                    try:
                        ts = datetime.fromisoformat(message["timestamp"])
                        timestamps.append(ts)
                    except (ValueError, TypeError):
                        continue

            if len(timestamps) < 2:
                return 0.1  # Very short conversation

            duration = max(timestamps) - min(timestamps)
            duration_hours = duration.total_seconds() / 3600

            # Score based on duration (24 hours = full score)
            return min(duration_hours / 24, 1.0)

        except Exception as e:
            self.logger.error(f"Failed to calculate time span: {e}")
            return 0.5

    def _calculate_question_density(self, user_messages: List[Dict[str, Any]]) -> float:
        """Calculate question density from user messages."""
        try:
            if not user_messages:
                return 0.0

            question_count = 0
            total_words = 0

            for message in user_messages:
                content = message["content"]

                # Count question marks and interrogative words
                question_marks = content.count("?")
                question_words = len(
                    re.findall(
                        r"\b(how|what|when|where|why|which|who|can|could|would|should|is|are|do|does)\b",
                        content,
                        re.IGNORECASE,
                    )
                )
                question_count += question_marks + question_words

                # Count words
                total_words += len(content.split())

            if total_words == 0:
                return 0.0

            question_ratio = question_count / total_words
            return min(question_ratio * 5, 1.0)  # Normalize

        except Exception as e:
            self.logger.error(f"Failed to calculate question density: {e}")
            return 0.5
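
    # Hypothetical example of the normalization above: 10 question cues
    # across 100 words gives a ratio of 0.1, scaled by 5 to a score of 0.5;
    # one cue per 5 words or more saturates the score at 1.0.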

    def should_retain_full(
        self, conversation: Dict[str, Any], importance_score: Optional[float] = None
    ) -> bool:
        """
        Determine if a conversation should be retained in full form.

        Args:
            conversation: Conversation data
            importance_score: Pre-calculated importance score (optional)

        Returns:
            True if the conversation should be retained in full
        """
        if importance_score is None:
            importance_score = self.calculate_importance_score(conversation)

        # Conversations the user explicitly marked important are always retained
        metadata = conversation.get("metadata", {})
        if metadata.get("user_marked_important", False):
            return True

        # High importance score
        if importance_score >= self.important_threshold:
            return True

        # Recent, reasonably important conversations (within 30 days)
        created_at = conversation.get("created_at")
        if created_at:
            try:
                conv_date = datetime.fromisoformat(created_at)
                if (datetime.now() - conv_date).days <= 30 and importance_score >= 0.5:
                    return True
            except (ValueError, TypeError):
                pass

        return False

    def should_retain_compressed(
        self, conversation: Dict[str, Any], importance_score: Optional[float] = None
    ) -> Tuple[bool, str]:
        """
        Determine whether a conversation should be compressed, and to what level.

        Args:
            conversation: Conversation data
            importance_score: Pre-calculated importance score (optional)

        Returns:
            Tuple of (should_compress, recommended_compression_level)
        """
        if importance_score is None:
            importance_score = self.calculate_importance_score(conversation)

        # Check whether it should be retained in full
        if self.should_retain_full(conversation, importance_score):
            return False, "full"

        # Determine compression level based on importance
        if importance_score >= self.preserve_threshold:
            # Important: lighter compression (key points)
            return True, "key_points"
        elif importance_score >= 0.2:
            # Moderately important: summary compression
            return True, "summary"
        else:
            # Low importance: metadata only
            return True, "metadata"
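
    # Decision bands with the default thresholds (user-marked conversations
    # and recent ones scoring >= 0.5 short-circuit to "full" via
    # should_retain_full):
    #   score >= 0.7        -> retain "full"
    #   0.4 <= score < 0.7  -> compress to "key_points"
    #   0.2 <= score < 0.4  -> compress to "summary"
    #   score < 0.2         -> keep "metadata" only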

    def update_retention_policy(self, policy_settings: Dict[str, Any]) -> None:
        """
        Update retention policy parameters.

        Args:
            policy_settings: Dictionary of policy parameter updates
        """
        try:
            if "important_threshold" in policy_settings:
                self.important_threshold = float(policy_settings["important_threshold"])
            if "preserve_threshold" in policy_settings:
                self.preserve_threshold = float(policy_settings["preserve_threshold"])
            if "user_marked_multiplier" in policy_settings:
                self.user_marked_multiplier = float(
                    policy_settings["user_marked_multiplier"]
                )
            if "weights" in policy_settings:
                self.weights.update(policy_settings["weights"])

            self.logger.info(f"Updated retention policy: {policy_settings}")

        except Exception as e:
            self.logger.error(f"Failed to update retention policy: {e}")

    def get_retention_recommendations(
        self, conversations: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get retention recommendations for multiple conversations.

        Args:
            conversations: List of conversations to analyze

        Returns:
            List of recommendations with scores and actions
        """
        recommendations = []

        for conversation in conversations:
            try:
                importance_score = self.calculate_importance_score(conversation)
                should_compress, compression_level = self.should_retain_compressed(
                    conversation, importance_score
                )

                recommendation = {
                    "conversation_id": conversation.get("id"),
                    "title": conversation.get("title"),
                    "created_at": conversation.get("created_at"),
                    "importance_score": importance_score,
                    "should_compress": should_compress,
                    "recommended_level": compression_level,
                    "user_marked_important": conversation.get("metadata", {}).get(
                        "user_marked_important", False
                    ),
                    "message_count": len(conversation.get("messages", [])),
                    "retention_reason": self._get_retention_reason(
                        importance_score, compression_level
                    ),
                }

                recommendations.append(recommendation)

            except Exception as e:
                self.logger.error(
                    f"Failed to analyze conversation {conversation.get('id')}: {e}"
                )
                continue

        # Sort by importance score (highest first)
        recommendations.sort(key=lambda x: x["importance_score"], reverse=True)
        return recommendations

    def _get_retention_reason(
        self, importance_score: float, compression_level: str
    ) -> str:
        """Get a human-readable reason for the retention decision."""
        if compression_level == "full":
            if importance_score >= self.important_threshold:
                return "High importance - retained full"
            else:
                return "Recent conversation - retained full"
        elif compression_level == "key_points":
            return f"Moderate importance ({importance_score:.2f}) - key points retained"
        elif compression_level == "summary":
            return f"Standard importance ({importance_score:.2f}) - summary compression"
        else:
            return f"Low importance ({importance_score:.2f}) - metadata only"

    def mark_conversation_important(
        self, conversation_id: str, important: bool = True
    ) -> bool:
        """
        Mark a conversation as user-important.

        Args:
            conversation_id: ID of the conversation to mark
            important: Whether to mark as important (True) or not important (False)

        Returns:
            True if marked successfully
        """
        try:
            conversation = self.db_manager.get_conversation(
                conversation_id, include_messages=False
            )
            if not conversation:
                self.logger.error(f"Conversation {conversation_id} not found")
                return False

            # Update metadata
            metadata = conversation.get("metadata", {})
            metadata["user_marked_important"] = important
            metadata["marked_important_at"] = datetime.now().isoformat()

            self.db_manager.update_conversation_metadata(conversation_id, metadata)

            self.logger.info(
                f"Marked conversation {conversation_id} as "
                f"{'important' if important else 'not important'}"
            )
            return True

        except Exception as e:
            self.logger.error(
                f"Failed to mark conversation {conversation_id} important: {e}"
            )
            return False

    def get_important_conversations(self) -> List[Dict[str, Any]]:
        """
        Get all user-marked important conversations.

        Returns:
            List of important conversations
        """
        try:
            recent_conversations = self.db_manager.get_recent_conversations(limit=1000)

            important_conversations = []
            for conversation in recent_conversations:
                # Fetch the full record to inspect its metadata
                full_conversation = self.db_manager.get_conversation(
                    conversation["id"], include_messages=True
                )
                if full_conversation:
                    metadata = full_conversation.get("metadata", {})
                    if metadata.get("user_marked_important", False):
                        important_conversations.append(full_conversation)

            return important_conversations

        except Exception as e:
            self.logger.error(f"Failed to get important conversations: {e}")
            return []

    def get_retention_stats(self) -> Dict[str, Any]:
        """
        Get retention policy statistics.

        Returns:
            Dictionary with retention statistics
        """
        try:
            recent_conversations = self.db_manager.get_recent_conversations(limit=500)

            stats = {
                "total_conversations": len(recent_conversations),
                "important_marked": 0,
                "importance_distribution": {"high": 0, "medium": 0, "low": 0},
                "average_importance": 0.0,
                "compression_recommendations": {
                    "full": 0,
                    "key_points": 0,
                    "summary": 0,
                    "metadata": 0,
                },
            }

            importance_scores = []

            for conv_data in recent_conversations:
                conversation = self.db_manager.get_conversation(
                    conv_data["id"], include_messages=True
                )
                if not conversation:
                    continue

                importance_score = self.calculate_importance_score(conversation)
                importance_scores.append(importance_score)

                # Check if user marked important
                metadata = conversation.get("metadata", {})
                if metadata.get("user_marked_important", False):
                    stats["important_marked"] += 1

                # Categorize importance
                if importance_score >= self.important_threshold:
                    stats["importance_distribution"]["high"] += 1
                elif importance_score >= self.preserve_threshold:
                    stats["importance_distribution"]["medium"] += 1
                else:
                    stats["importance_distribution"]["low"] += 1

                # Compression recommendations
                _, level = self.should_retain_compressed(
                    conversation, importance_score
                )
                if level in stats["compression_recommendations"]:
                    stats["compression_recommendations"][level] += 1
                else:
                    stats["compression_recommendations"]["full"] += 1

            if importance_scores:
                stats["average_importance"] = statistics.mean(importance_scores)

            return stats

        except Exception as e:
            self.logger.error(f"Failed to get retention stats: {e}")
            return {}