- Added ArchivalManager for JSON export/import with gzip compression
- Implemented organized directory structure by year/month
- Added batch archival operations and restore functionality
- Created RetentionPolicy with importance-based scoring
- Smart retention considers engagement, topics, and user-marked importance
- MemoryManager integrates compression and archival automatically
- Added automatic compression triggering and archival scheduling
- Comprehensive archival statistics and retention recommendations
- Support for backup integration and restore verification
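A minimal usage sketch of the retention flow below (the import path for `RetentionPolicy` and the `SQLiteManager("memory.db")` constructor argument are assumptions, not confirmed by this change; the `SQLiteManager` methods used are the ones the module itself calls):

```python
from memory.storage.sqlite_manager import SQLiteManager
from memory.retention.retention_policy import RetentionPolicy  # module path assumed

db = SQLiteManager("memory.db")  # constructor argument assumed
policy = RetentionPolicy(db)

# Rank recent conversations and print the recommended compression level
conversations = [
    db.get_conversation(c["id"], include_messages=True)
    for c in db.get_recent_conversations(limit=50)
]
for rec in policy.get_retention_recommendations([c for c in conversations if c]):
    print(rec["importance_score"], rec["recommended_level"], rec["title"])
```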
"""
|
|
Smart retention policies for conversation preservation.
|
|
|
|
Implements value-based retention scoring that keeps important
|
|
conversations longer while efficiently managing storage usage.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from collections import defaultdict
|
|
import statistics
|
|
|
|
import sys
|
|
import os
|
|
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
|
|
|
from memory.storage.sqlite_manager import SQLiteManager
|
|
|
|
|
|

class RetentionPolicy:
    """
    Smart retention policy engine.

    Calculates conversation importance scores and determines
    which conversations should be retained or compressed.
    """

    def __init__(self, sqlite_manager: SQLiteManager):
        """
        Initialize retention policy.

        Args:
            sqlite_manager: SQLite manager instance for data access
        """
        self.db_manager = sqlite_manager
        self.logger = logging.getLogger(__name__)

        # Retention policy parameters
        self.important_threshold = 0.7  # Above this = retain full
        self.preserve_threshold = 0.4  # Above this = lighter compression
        self.user_marked_multiplier = 1.5  # Boost for user-marked important

        # Engagement scoring weights
        self.weights = {
            "message_count": 0.2,  # More messages = higher engagement
            "response_quality": 0.25,  # Back-and-forth conversation
            "topic_diversity": 0.15,  # Multiple topics = important
            "time_span": 0.1,  # Longer duration = important
            "user_marked": 0.2,  # User explicitly marked important
            "question_density": 0.1,  # Questions = seeking information
        }

    def calculate_importance_score(self, conversation: Dict[str, Any]) -> float:
        """
        Calculate importance score for a conversation.

        Args:
            conversation: Conversation data with messages and metadata

        Returns:
            Importance score between 0.0 and 1.0
        """
        try:
            messages = conversation.get("messages", [])
            if not messages:
                return 0.0

            # Extract basic metrics
            message_count = len(messages)
            user_messages = [m for m in messages if m["role"] == "user"]
            assistant_messages = [m for m in messages if m["role"] == "assistant"]

            # Calculate engagement metrics
            scores = {}

            # 1. Message count score (normalized; 20 messages = full score)
            scores["message_count"] = min(message_count / 20, 1.0)

            # 2. Response quality (back-and-forth ratio; close to 1.0 = balanced)
            if len(user_messages) > 0 and len(assistant_messages) > 0:
                ratio = min(len(assistant_messages), len(user_messages)) / max(
                    len(assistant_messages), len(user_messages)
                )
                scores["response_quality"] = ratio
            else:
                scores["response_quality"] = 0.5

            # 3. Topic diversity (variety in content)
            scores["topic_diversity"] = self._calculate_topic_diversity(messages)

            # 4. Time span (conversation duration)
            scores["time_span"] = self._calculate_time_span_score(messages)

            # 5. User marked important
            metadata = conversation.get("metadata", {})
            user_marked = metadata.get("user_marked_important", False)
            scores["user_marked"] = self.user_marked_multiplier if user_marked else 1.0

            # 6. Question density (information seeking)
            scores["question_density"] = self._calculate_question_density(user_messages)

            # Calculate weighted final score
            final_score = 0.0
            for factor, weight in self.weights.items():
                final_score += scores.get(factor, 0.0) * weight

            # Clamp to the 0-1 range
            final_score = max(0.0, min(1.0, final_score))

            self.logger.debug(
                f"Importance score for {conversation.get('id')}: {final_score:.3f}"
            )
            return final_score

        except Exception as e:
            self.logger.error(f"Failed to calculate importance score: {e}")
            return 0.5  # Default to neutral
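
    # A minimal worked example of the weighted combination above (hypothetical
    # factor values; the weights are the defaults from __init__):
    #   scores = {"message_count": 0.5, "response_quality": 1.0,
    #             "topic_diversity": 0.33, "time_span": 0.1,
    #             "user_marked": 1.0, "question_density": 0.4}
    #   final = 0.5*0.2 + 1.0*0.25 + 0.33*0.15 + 0.1*0.1 + 1.0*0.2 + 0.4*0.1
    #         = 0.10 + 0.25 + 0.05 + 0.01 + 0.20 + 0.04 ≈ 0.65
    #   0.65 is below important_threshold (0.7) but above preserve_threshold
    #   (0.4), so this conversation would land in the "key_points" tier.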

    def _calculate_topic_diversity(self, messages: List[Dict[str, Any]]) -> float:
        """Calculate topic diversity score from messages."""
        try:
            # Simple topic-based diversity using keyword categories
            topic_keywords = {
                "technical": [
                    "code", "programming", "algorithm", "function",
                    "bug", "debug", "api", "database",
                ],
                "personal": [
                    "feel", "think", "opinion", "prefer", "like",
                    "personal", "life",
                ],
                "work": [
                    "project", "task", "deadline", "meeting", "team",
                    "work", "job",
                ],
                "learning": [
                    "learn", "study", "understand", "explain",
                    "tutorial", "help",
                ],
                "planning": ["plan", "schedule", "organize", "goal", "strategy"],
                "creative": ["design", "create", "write", "art", "music", "story"],
            }

            topic_counts = defaultdict(int)

            for message in messages:
                if message["role"] in ["user", "assistant"]:
                    content = message["content"].lower()

                    # Count topic occurrences
                    for topic, keywords in topic_keywords.items():
                        for keyword in keywords:
                            if keyword in content:
                                topic_counts[topic] += 1

            # Diversity = number of topics with significant presence
            significant_topics = sum(1 for count in topic_counts.values() if count >= 2)
            diversity_score = min(significant_topics / len(topic_keywords), 1.0)

            return diversity_score

        except Exception as e:
            self.logger.error(f"Failed to calculate topic diversity: {e}")
            return 0.5
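
    # Worked example (hypothetical): if a conversation mentions "code" in one
    # message and "debug" in another (technical: 2), plus "project" and
    # "deadline" (work: 2), two of the six topics reach the significance
    # threshold of 2, giving a diversity score of 2 / 6 ≈ 0.33.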

    def _calculate_time_span_score(self, messages: List[Dict[str, Any]]) -> float:
        """Calculate time span score based on conversation duration."""
        try:
            timestamps = []
            for message in messages:
                if "timestamp" in message:
                    try:
                        ts = datetime.fromisoformat(message["timestamp"])
                        timestamps.append(ts)
                    except (ValueError, TypeError):
                        # Skip malformed timestamps
                        continue

            if len(timestamps) < 2:
                return 0.1  # Very short conversation

            duration = max(timestamps) - min(timestamps)
            duration_hours = duration.total_seconds() / 3600

            # Score based on duration (24 hours = full score)
            return min(duration_hours / 24, 1.0)

        except Exception as e:
            self.logger.error(f"Failed to calculate time span: {e}")
            return 0.5

    def _calculate_question_density(self, user_messages: List[Dict[str, Any]]) -> float:
        """Calculate question density from user messages."""
        try:
            if not user_messages:
                return 0.0

            question_count = 0
            total_words = 0

            for message in user_messages:
                content = message["content"]
                # Count question marks and interrogative words
                question_marks = content.count("?")
                question_words = len(
                    re.findall(
                        r"\b(how|what|when|where|why|which|who|can|could|would|should|is|are|do|does)\b",
                        content,
                        re.IGNORECASE,
                    )
                )
                question_count += question_marks + question_words

                # Count words
                total_words += len(content.split())

            if total_words == 0:
                return 0.0

            question_ratio = question_count / total_words
            return min(question_ratio * 5, 1.0)  # Normalize

        except Exception as e:
            self.logger.error(f"Failed to calculate question density: {e}")
            return 0.5
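
    # Worked example (hypothetical): for the message "How does this work?",
    # question_count = 1 question mark + 2 interrogative words ("how", "does")
    # = 3, and total_words = 4, so the ratio is 0.75 and the score is
    # min(0.75 * 5, 1.0) = 1.0.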

    def should_retain_full(
        self, conversation: Dict[str, Any], importance_score: Optional[float] = None
    ) -> bool:
        """
        Determine if a conversation should be retained in full form.

        Args:
            conversation: Conversation data
            importance_score: Pre-calculated importance score (optional)

        Returns:
            True if the conversation should be retained in full
        """
        if importance_score is None:
            importance_score = self.calculate_importance_score(conversation)

        # Conversations the user explicitly marked important are always retained
        metadata = conversation.get("metadata", {})
        if metadata.get("user_marked_important", False):
            return True

        # High importance score
        if importance_score >= self.important_threshold:
            return True

        # Recent, reasonably important conversations (within 30 days)
        created_at = conversation.get("created_at")
        if created_at:
            try:
                conv_date = datetime.fromisoformat(created_at)
                if (datetime.now() - conv_date).days <= 30 and importance_score >= 0.5:
                    return True
            except (ValueError, TypeError):
                pass

        return False

    def should_retain_compressed(
        self, conversation: Dict[str, Any], importance_score: Optional[float] = None
    ) -> Tuple[bool, str]:
        """
        Determine whether a conversation should be compressed, and to what level.

        Args:
            conversation: Conversation data
            importance_score: Pre-calculated importance score (optional)

        Returns:
            Tuple of (should_compress, recommended_compression_level)
        """
        if importance_score is None:
            importance_score = self.calculate_importance_score(conversation)

        # Check whether the conversation should be retained in full
        if self.should_retain_full(conversation, importance_score):
            return False, "full"

        # Determine compression level based on importance
        if importance_score >= self.preserve_threshold:
            # Important: lighter compression (key points)
            return True, "key_points"
        elif importance_score >= 0.2:
            # Moderately important: summary compression
            return True, "summary"
        else:
            # Low importance: metadata only
            return True, "metadata"
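
    # Score-to-level mapping with the default thresholds:
    #   >= 0.7 (or user-marked, or <= 30 days old with >= 0.5)  -> "full"
    #   >= 0.4                                                  -> "key_points"
    #   >= 0.2                                                  -> "summary"
    #   <  0.2                                                  -> "metadata"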

    def update_retention_policy(self, policy_settings: Dict[str, Any]) -> None:
        """
        Update retention policy parameters.

        Args:
            policy_settings: Dictionary of policy parameter updates
        """
        try:
            if "important_threshold" in policy_settings:
                self.important_threshold = float(policy_settings["important_threshold"])
            if "preserve_threshold" in policy_settings:
                self.preserve_threshold = float(policy_settings["preserve_threshold"])
            if "user_marked_multiplier" in policy_settings:
                self.user_marked_multiplier = float(
                    policy_settings["user_marked_multiplier"]
                )
            if "weights" in policy_settings:
                self.weights.update(policy_settings["weights"])

            self.logger.info(f"Updated retention policy: {policy_settings}")

        except Exception as e:
            self.logger.error(f"Failed to update retention policy: {e}")
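
    # Usage sketch (hypothetical settings): raise the bar for full retention
    # and reweight engagement. Note that weights.update() merges keys, so the
    # example below keeps the weight sum at 1.0:
    #   policy.update_retention_policy({
    #       "important_threshold": 0.8,
    #       "weights": {"message_count": 0.3, "time_span": 0.0},
    #   })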

    def get_retention_recommendations(
        self, conversations: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get retention recommendations for multiple conversations.

        Args:
            conversations: List of conversations to analyze

        Returns:
            List of recommendations with scores and actions
        """
        recommendations = []

        for conversation in conversations:
            try:
                importance_score = self.calculate_importance_score(conversation)
                should_compress, compression_level = self.should_retain_compressed(
                    conversation, importance_score
                )

                recommendation = {
                    "conversation_id": conversation.get("id"),
                    "title": conversation.get("title"),
                    "created_at": conversation.get("created_at"),
                    "importance_score": importance_score,
                    "should_compress": should_compress,
                    "recommended_level": compression_level,
                    "user_marked_important": conversation.get("metadata", {}).get(
                        "user_marked_important", False
                    ),
                    "message_count": len(conversation.get("messages", [])),
                    "retention_reason": self._get_retention_reason(
                        importance_score, compression_level
                    ),
                }

                recommendations.append(recommendation)

            except Exception as e:
                self.logger.error(
                    f"Failed to analyze conversation {conversation.get('id')}: {e}"
                )
                continue

        # Sort by importance score (highest first)
        recommendations.sort(key=lambda x: x["importance_score"], reverse=True)
        return recommendations
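
    # Each recommendation looks roughly like this (hypothetical values):
    #   {"conversation_id": "abc123", "title": "Debugging session",
    #    "created_at": "2024-01-01T10:00:00", "importance_score": 0.82,
    #    "should_compress": False, "recommended_level": "full",
    #    "user_marked_important": False, "message_count": 18,
    #    "retention_reason": "High importance - retained full"}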

    def _get_retention_reason(
        self, importance_score: float, compression_level: str
    ) -> str:
        """Get a human-readable reason for the retention decision."""
        if compression_level == "full":
            if importance_score >= self.important_threshold:
                return "High importance - retained full"
            else:
                return "Recent conversation - retained full"
        elif compression_level == "key_points":
            return f"Moderate importance ({importance_score:.2f}) - key points retained"
        elif compression_level == "summary":
            return f"Standard importance ({importance_score:.2f}) - summary compression"
        else:
            return f"Low importance ({importance_score:.2f}) - metadata only"

    def mark_conversation_important(
        self, conversation_id: str, important: bool = True
    ) -> bool:
        """
        Mark a conversation as user-important.

        Args:
            conversation_id: ID of the conversation to mark
            important: Whether to mark as important (True) or not important (False)

        Returns:
            True if marked successfully
        """
        try:
            conversation = self.db_manager.get_conversation(
                conversation_id, include_messages=False
            )
            if not conversation:
                self.logger.error(f"Conversation {conversation_id} not found")
                return False

            # Update metadata
            metadata = conversation.get("metadata", {})
            metadata["user_marked_important"] = important
            metadata["marked_important_at"] = datetime.now().isoformat()

            self.db_manager.update_conversation_metadata(conversation_id, metadata)

            self.logger.info(
                f"Marked conversation {conversation_id} as "
                f"{'important' if important else 'not important'}"
            )
            return True

        except Exception as e:
            self.logger.error(
                f"Failed to mark conversation {conversation_id} important: {e}"
            )
            return False

    def get_important_conversations(self) -> List[Dict[str, Any]]:
        """
        Get all user-marked important conversations.

        Returns:
            List of important conversations
        """
        try:
            recent_conversations = self.db_manager.get_recent_conversations(limit=1000)

            important_conversations = []
            for conversation in recent_conversations:
                full_conversation = self.db_manager.get_conversation(
                    conversation["id"], include_messages=True
                )
                if full_conversation:
                    metadata = full_conversation.get("metadata", {})
                    if metadata.get("user_marked_important", False):
                        important_conversations.append(full_conversation)

            return important_conversations

        except Exception as e:
            self.logger.error(f"Failed to get important conversations: {e}")
            return []

    def get_retention_stats(self) -> Dict[str, Any]:
        """
        Get retention policy statistics.

        Returns:
            Dictionary with retention statistics
        """
        try:
            recent_conversations = self.db_manager.get_recent_conversations(limit=500)

            stats = {
                "total_conversations": len(recent_conversations),
                "important_marked": 0,
                "importance_distribution": {"high": 0, "medium": 0, "low": 0},
                "average_importance": 0.0,
                "compression_recommendations": {
                    "full": 0,
                    "key_points": 0,
                    "summary": 0,
                    "metadata": 0,
                },
            }

            importance_scores = []

            for conv_data in recent_conversations:
                conversation = self.db_manager.get_conversation(
                    conv_data["id"], include_messages=True
                )
                if not conversation:
                    continue

                importance_score = self.calculate_importance_score(conversation)
                importance_scores.append(importance_score)

                # Check if the user marked it important
                metadata = conversation.get("metadata", {})
                if metadata.get("user_marked_important", False):
                    stats["important_marked"] += 1

                # Categorize importance
                if importance_score >= self.important_threshold:
                    stats["importance_distribution"]["high"] += 1
                elif importance_score >= self.preserve_threshold:
                    stats["importance_distribution"]["medium"] += 1
                else:
                    stats["importance_distribution"]["low"] += 1

                # Compression recommendations
                should_compress, level = self.should_retain_compressed(
                    conversation, importance_score
                )
                if level in stats["compression_recommendations"]:
                    stats["compression_recommendations"][level] += 1
                else:
                    stats["compression_recommendations"]["full"] += 1

            if importance_scores:
                stats["average_importance"] = statistics.mean(importance_scores)

            return stats

        except Exception as e:
            self.logger.error(f"Failed to get retention stats: {e}")
            return {}
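

# A minimal smoke-test sketch, not part of the original module. It assumes
# SQLiteManager can be constructed from a database path; adjust to the real
# constructor signature.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    db = SQLiteManager("memory.db")  # hypothetical constructor argument
    policy = RetentionPolicy(db)

    # Score an in-memory conversation without touching the database
    conversation = {
        "id": "demo-1",
        "messages": [
            {"role": "user", "content": "How do I debug this function?",
             "timestamp": "2024-01-01T10:00:00"},
            {"role": "assistant", "content": "Start by adding logging around the call.",
             "timestamp": "2024-01-01T10:05:00"},
        ],
        "metadata": {},
    }
    score = policy.calculate_importance_score(conversation)
    compress, level = policy.should_retain_compressed(conversation, score)
    print(f"score={score:.3f} compress={compress} level={level}")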