Mai/src/models/model_manager.py

"""Model manager for intelligent model selection and switching."""

import asyncio
import time
from typing import Dict, List, Optional, Any, Tuple
import logging
import yaml
from pathlib import Path

from .lmstudio_adapter import LMStudioAdapter
from .resource_monitor import ResourceMonitor
from .context_manager import ContextManager

# Fix circular imports by importing within functions
ProactiveScaler = None
ScalingDecision = None
HardwareTierDetector = None
ResourcePersonality = None
ResourceType = None


def _get_scaling_components():
    global ProactiveScaler, ScalingDecision
    if ProactiveScaler is None:
        from resource.scaling import ProactiveScaler, ScalingDecision
    return ProactiveScaler, ScalingDecision


def _get_tier_components():
    global HardwareTierDetector
    if HardwareTierDetector is None:
        from resource.tiers import HardwareTierDetector
    return HardwareTierDetector


def _get_personality_components():
    global ResourcePersonality, ResourceType
    if ResourcePersonality is None:
        from resource.personality import ResourcePersonality, ResourceType
    return ResourcePersonality, ResourceType


class ModelManager:
    """
    Intelligent model selection and switching system.

    Coordinates between LM Studio adapter, resource monitoring, and context
    management to provide optimal model selection and seamless switching.
    """

    def __init__(self, config_path: Optional[str] = None):
        """Initialize ModelManager with configuration.

        Args:
            config_path: Path to models configuration file
        """
        self.logger = logging.getLogger(__name__)

        # Load configuration
        self.config_path = (
            config_path
            or Path(__file__).parent.parent.parent / "config" / "models.yaml"
        )
        self.config = self._load_config()

        # Initialize subsystems
        self.lm_adapter = LMStudioAdapter()
        self.resource_monitor = ResourceMonitor()
        self.context_manager = ContextManager()

        # Get components safely
        tier_components = _get_tier_components()
        if tier_components is not None:
            HardwareTierDetector = tier_components
        else:
            # Fallback to direct import if lazy loading fails
            from resource.tiers import HardwareTierDetector

            self.tier_detector = HardwareTierDetector()

        # Initialize proactive scaler
        scaling_components = _get_scaling_components()
        if scaling_components is not None:
            ProactiveScaler, ScalingDecision = scaling_components
        else:
            # Fallback to direct import if lazy loading fails
            from resource.scaling import ProactiveScaler, ScalingDecision

        self._proactive_scaler = ProactiveScaler(
            resource_monitor=self.resource_monitor,
            tier_detector=self.tier_detector,
            upgrade_threshold=0.8,
            downgrade_threshold=0.9,
            stabilization_minutes=5,
            monitoring_interval=2.0,
            trend_window_minutes=10,
        )

        # Set callback for scaling decisions
        self._proactive_scaler.set_scaling_callback(
            self._handle_proactive_scaling_decision
        )

        # Start continuous monitoring
        self._proactive_scaler.start_continuous_monitoring()

        # Initialize personality system
        # Get personality components safely
        personality_components = _get_personality_components()
        if personality_components is not None:
            ResourcePersonality, ResourceType = personality_components
        else:
            # Fallback to direct import if lazy loading fails
            from resource.personality import ResourcePersonality, ResourceType

        self._personality = ResourcePersonality(sarcasm_level=0.7, gremlin_hunger=0.8)

        # Current model state
        self.current_model_key: Optional[str] = None
        self.current_model_instance: Optional[Any] = None
        self.available_models: List[Dict[str, Any]] = []
        self.model_configurations: Dict[str, Dict[str, Any]] = {}

        # Switching state
        self._switching_lock = asyncio.Lock()
        self._failure_count = {}
        self._last_switch_time = 0

        # Load initial configuration
        self._load_model_configurations()
        self._refresh_available_models()

        self.logger.info("ModelManager initialized with intelligent switching enabled")

    def _load_config(self) -> Dict[str, Any]:
        """Load models configuration from YAML file."""
        try:
            with open(self.config_path, "r") as f:
                return yaml.safe_load(f)
        except Exception as e:
            self.logger.error(f"Failed to load config from {self.config_path}: {e}")
            # Return minimal default config
            return {
                "models": [],
                "selection_rules": {
                    "resource_thresholds": {
                        "memory_available_gb": {"small": 2, "medium": 4, "large": 8}
                    },
                    "cpu_threshold_percent": 80,
                    "gpu_required_for_large": True,
                },
                "performance": {
                    "load_timeout_seconds": {"small": 30, "medium": 60, "large": 120},
                    "switching_triggers": {
                        "cpu_threshold": 85,
                        "memory_threshold": 85,
                        "response_time_threshold_ms": 5000,
                        "consecutive_failures": 3,
                    },
                },
            }

    def _load_model_configurations(self) -> None:
        """Load model configurations from config."""
        self.model_configurations = {}

        for model in self.config.get("models", []):
            self.model_configurations[model["key"]] = model

        self.logger.info(
            f"Loaded {len(self.model_configurations)} model configurations"
        )

    def _refresh_available_models(self) -> None:
        """Refresh list of available models from LM Studio."""
        try:
            model_list = self.lm_adapter.list_models()
            self.available_models = []

            for model_key, display_name, size_gb in model_list:
                if model_key in self.model_configurations:
                    model_info = self.model_configurations[model_key].copy()
                    model_info.update(
                        {
                            "display_name": display_name,
                            "estimated_size_gb": size_gb,
                            "available": True,
                        }
                    )
                    self.available_models.append(model_info)
                else:
                    # Create minimal config for unknown models
                    self.available_models.append(
                        {
                            "key": model_key,
                            "display_name": display_name,
                            "estimated_size_gb": size_gb,
                            "available": True,
                            "category": "unknown",
                        }
                    )

        except Exception as e:
            self.logger.error(f"Failed to refresh available models: {e}")
            self.available_models = []

    def select_best_model(
        self, conversation_context: Optional[Dict[str, Any]] = None
    ) -> Optional[str]:
        """Select the best model based on current resources and context.

        Args:
            conversation_context: Optional context about the current conversation

        Returns:
            Selected model key or None if no suitable model found
        """
        try:
            # Get current resources and scaling recommendations
            resources = self.resource_monitor.get_current_resources()
            scaling_status = self._proactive_scaler.get_scaling_status()

            # Apply proactive scaling recommendations
            if scaling_status.get("degradation_needed", False):
                # Prefer smaller models if degradation is needed
                self.logger.debug("Degradation needed, prioritizing smaller models")
            elif scaling_status.get("upgrade_available", False):
                # Consider larger models if upgrade is available
                self.logger.debug("Upgrade available, considering larger models")

            # Filter models that can fit current resources
            suitable_models = []

            for model in self.available_models:
                if not model.get("available", False):
                    continue

                # Check resource requirements
                required_memory = model.get("min_memory_gb", 2)
                required_vram = model.get("min_vram_gb", 1)

                available_memory = resources["available_memory_gb"]
                available_vram = resources.get("gpu_vram_gb", 0)

                # Check memory with safety margin
                if available_memory < required_memory * 1.5:
                    continue

                # Check VRAM if required for this model size
                if (
                    model.get("category") in ["large"]
                    and required_vram > available_vram
                ):
                    continue

                suitable_models.append(model)

            if not suitable_models:
                self.logger.warning("No models fit current resource constraints")
                return None

            # Sort by preference (large preferred if resources allow)
            selection_rules = self.config.get("selection_rules", {})

            # Apply preference scoring
            scored_models = []
            for model in suitable_models:
                score = 0.0

                # Category preference (large > medium > small)
                category = model.get("category", "unknown")
                if category == "large" and resources["available_memory_gb"] >= 8:
                    score += 100
                elif category == "medium" and resources["available_memory_gb"] >= 4:
                    score += 70
                elif category == "small":
                    score += 40

                # Preference rules from config
                preferred_when = model.get("preferred_when")
                if preferred_when:
                    if "memory" in preferred_when:
                        required_mem = int(
                            preferred_when.split("memory >= ")[1].split("GB")[0]
                        )
                        if resources["available_memory_gb"] >= required_mem:
                            score += 20

                # Factor in recent failures (penalize frequently failing models)
                failure_count = self._failure_count.get(model["key"], 0)
                score -= failure_count * 10

                # Factor in conversation complexity if provided
                if conversation_context:
                    task_type = conversation_context.get("task_type", "simple_chat")
                    model_capabilities = model.get("capabilities", [])

                    if task_type == "reasoning" and "reasoning" in model_capabilities:
                        score += 30
                    elif task_type == "analysis" and "analysis" in model_capabilities:
                        score += 30
                    elif (
                        task_type == "code_generation"
                        and "reasoning" in model_capabilities
                    ):
                        score += 20

                scored_models.append((score, model))

            # Sort by score and return best
            scored_models.sort(key=lambda x: x[0], reverse=True)

            if scored_models:
                best_model = scored_models[0][1]
                self.logger.info(
                    f"Selected model: {best_model['display_name']} (score: {scored_models[0][0]:.1f})"
                )
                return best_model["key"]

        except Exception as e:
            self.logger.error(f"Error in model selection: {e}")

        return None

    async def switch_model(self, target_model_key: str) -> bool:
        """Switch to a different model with proper resource cleanup.

        Args:
            target_model_key: Model key to switch to

        Returns:
            True if switch successful, False otherwise
        """
        async with self._switching_lock:
            try:
                if target_model_key == self.current_model_key:
                    self.logger.debug(f"Already using model {target_model_key}")
                    return True

                # Don't switch too frequently
                current_time = time.time()
                if current_time - self._last_switch_time < 30:  # 30 second cooldown
                    self.logger.warning(
                        "Model switch requested too frequently, ignoring"
                    )
                    return False

                self.logger.info(
                    f"Switching model: {self.current_model_key} -> {target_model_key}"
                )

                # Unload current model (silent - no user notification per CONTEXT.md)
                if self.current_model_instance and self.current_model_key:
                    try:
                        self.lm_adapter.unload_model(self.current_model_key)
                    except Exception as e:
                        self.logger.warning(f"Error unloading current model: {e}")

                # Load new model
                target_config = self.model_configurations.get(target_model_key)
                if not target_config:
                    target_config = {
                        "category": "unknown"
                    }  # Fallback for unknown models

                timeout = self.config.get("performance", {}).get(
                    "load_timeout_seconds", {}
                )
                timeout_seconds = timeout.get(
                    target_config.get("category", "medium"), 60
                )

                new_model = self.lm_adapter.load_model(
                    target_model_key, timeout_seconds
                )

                if new_model:
                    self.current_model_key = target_model_key
                    self.current_model_instance = new_model
                    self._last_switch_time = current_time

                    # Reset failure count for successful load
                    self._failure_count[target_model_key] = 0

                    self.logger.info(f"Successfully switched to {target_model_key}")
                    return True
                else:
                    # Increment failure count
                    self._failure_count[target_model_key] = (
                        self._failure_count.get(target_model_key, 0) + 1
                    )
                    self.logger.error(f"Failed to load model {target_model_key}")
                    return False

            except Exception as e:
                self.logger.error(f"Error during model switch: {e}")
                return False

    async def personality_aware_model_switch(
        self,
        target_model_key: str,
        switch_reason: str = "resource optimization",
        notify_user: bool = True,
    ) -> Tuple[bool, Optional[str]]:
        """Switch models with personality-driven communication.

        Args:
            target_model_key: Model to switch to
            switch_reason: Reason for the switch
            notify_user: Whether to notify user (only for downgrades)

        Returns:
            Tuple of (success, user_message_or_None)
        """
        try:
            # Get model categories for capability comparison
            old_config = self.model_configurations.get(self.current_model_key or "", {})
            new_config = self.model_configurations.get(target_model_key, {})

            old_capability = str(old_config.get("category", "unknown"))
            new_capability = str(new_config.get("category", "unknown"))

            # Determine if this is a downgrade
            is_downgrade = self._is_capability_downgrade(old_capability, new_capability)

            # Perform the actual switch
            success = await self.switch_model(target_model_key)

            if success and is_downgrade and notify_user:
                # Generate personality-driven degradation notice
                context = {
                    "old_capability": old_capability,
                    "new_capability": new_capability,
                    "reason": switch_reason,
                }

                message, technical_tip = self._personality.generate_resource_message(
                    ResourceType.DEGRADATION_NOTICE, context, include_technical_tip=True
                )

                # Combine message and optional tip
                if technical_tip:
                    full_message = f"{message}\n\n💡 *Technical tip*: {technical_tip}"
                else:
                    full_message = message

                self.logger.info(f"Personality degradation notice: {full_message}")
                return True, full_message

            elif success:
                # Silent upgrade - no notification per requirements
                self.logger.debug(f"Silent upgrade to {target_model_key} completed")
                return True, None

            else:
                # Failed switch - generate resource request message
                context = {
                    "resource": "model capability",
                    "current_usage": 95,  # High usage when switches fail
                    "threshold": 80,
                }

                message, technical_tip = self._personality.generate_resource_message(
                    ResourceType.RESOURCE_REQUEST, context, include_technical_tip=True
                )

                if technical_tip:
                    full_message = f"{message}\n\n💡 *Technical tip*: {technical_tip}"
                else:
                    full_message = message

                return False, full_message

        except Exception as e:
            self.logger.error(f"Error in personality_aware_model_switch: {e}")
            return False, "I'm... having trouble switching models right now..."

    def _is_capability_downgrade(
        self, old_capability: str, new_capability: str
    ) -> bool:
        """Check if switch represents a capability downgrade.

        Args:
            old_capability: Current model capability
            new_capability: Target model capability

        Returns:
            True if this is a downgrade
        """
        capability_order = {"large": 3, "medium": 2, "small": 1, "unknown": 0}

        old_level = capability_order.get(old_capability, 0)
        new_level = capability_order.get(new_capability, 0)

        return new_level < old_level

    async def generate_response(
        self,
        message: str,
        conversation_id: str = "default",
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Generate response with automatic model switching if needed.

        Args:
            message: User message to respond to
            conversation_id: Conversation ID for context
            conversation_context: Optional context for model selection

        Returns:
            Generated response text
        """
        try:
            # Pre-flight resource check
            can_proceed, reason = self._proactive_scaler.check_preflight_resources(
                "model_inference"
            )
            if not can_proceed:
                # Handle resource constraints gracefully
                degradation_target = (
                    self._proactive_scaler.initiate_graceful_degradation(
                        f"Pre-flight check failed: {reason}", immediate=True
                    )
                )
                if degradation_target:
                    # Switch to smaller model with personality notification
                    smaller_model_key = self._find_model_by_size(degradation_target)
                    if (
                        smaller_model_key
                        and smaller_model_key != self.current_model_key
                    ):
                        (
                            success,
                            personality_message,
                        ) = await self.personality_aware_model_switch(
                            smaller_model_key,
                            f"Pre-flight check failed: {reason}",
                            notify_user=True,
                        )

                        # If personality message generated, include it in response
                        if personality_message:
                            return f"{personality_message}\n\nI'll try to help anyway with what I have..."
                        else:
                            return "Switching to a lighter model due to resource constraints..."
                    else:
                        return "I'm experiencing resource constraints and cannot generate a response right now."

            # Ensure we have a model loaded
            if not self.current_model_instance:
                await self._ensure_model_loaded(conversation_context)

            if not self.current_model_instance:
                return "I'm sorry, I'm unable to load any models at the moment."

            # Get conversation context
            context_messages = self.context_manager.get_context_for_model(
                conversation_id
            )

            # Format messages for model (LM Studio uses OpenAI-like format)
            formatted_context = self._format_context_for_model(context_messages)

            # Attempt to generate response
            start_time = time.time()
            try:
                response = self.current_model_instance.respond(
                    f"{formatted_context}\n\nUser: {message}\n\nAssistant:",
                    max_tokens=1024,  # Reasonable default
                )

                response_time_ms = (time.time() - start_time) * 1000

                # Check if response is adequate
                if not response or len(response.strip()) < 10:
                    raise ValueError("Model returned empty or inadequate response")

                # Add messages to context
                from models.conversation import MessageRole

                self.context_manager.add_message(
                    conversation_id, MessageRole.USER, message
                )
                self.context_manager.add_message(
                    conversation_id, MessageRole.ASSISTANT, response
                )

                # Update performance metrics for proactive scaling
                self._proactive_scaler.update_performance_metrics(
                    operation_type="model_inference",
                    duration_ms=response_time_ms,
                    success=True,
                )

                # Check if we should consider switching (slow response or struggling)
                if await self._should_consider_switching(response_time_ms, response):
                    await self._proactive_model_switch(conversation_context)

                return response

            except Exception as e:
                response_time_ms = (time.time() - start_time) * 1000
                self.logger.warning(f"Model generation failed: {e}")

                # Update performance metrics for failure
                self._proactive_scaler.update_performance_metrics(
                    operation_type="model_inference",
                    duration_ms=response_time_ms,
                    success=False,
                )

                # Try switching to a different model
                if await self._handle_model_failure(conversation_context):
                    # Retry with new model
                    return await self.generate_response(
                        message, conversation_id, conversation_context
                    )

                # Generate personality message for repeated failures
                resources = self.resource_monitor.get_current_resources()
                context = {
                    "resource": "model stability",
                    "current_usage": resources.get("memory_percent", 90),
                    "threshold": 80,
                }

                personality_message, technical_tip = (
                    self._personality.generate_resource_message(
                        ResourceType.RESOURCE_REQUEST,
                        context,
                        include_technical_tip=True,
                    )
                )

                if technical_tip:
                    return f"{personality_message}\n\n💡 *Technical tip*: {technical_tip}\n\nPlease try again in a moment."
                else:
                    return f"{personality_message}\n\nPlease try again in a moment."

        except Exception as e:
            self.logger.error(f"Error in generate_response: {e}")
            return "An error occurred while processing your request."

    def get_current_model_status(self) -> Dict[str, Any]:
        """Get status of currently loaded model and resource usage.

        Returns:
            Dictionary with model status and resource information
        """
        status = {
            "current_model_key": self.current_model_key,
            "model_loaded": self.current_model_instance is not None,
            "resources": self.resource_monitor.get_current_resources(),
            "available_models": len(self.available_models),
            "recent_failures": dict(self._failure_count),
            "scaling": self._proactive_scaler.get_scaling_status()
            if hasattr(self, "_proactive_scaler")
            else {},
        }

        if (
            self.current_model_key
            and self.current_model_key in self.model_configurations
        ):
            config = self.model_configurations[self.current_model_key]
            status.update(
                {
                    "model_display_name": config.get(
                        "display_name", self.current_model_key
                    ),
                    "model_category": config.get("category", "unknown"),
                    "context_window": config.get("context_window", 4096),
                }
            )

        return status

    async def preload_model(self, model_key: str) -> bool:
        """Preload a model in background for faster switching.

        Args:
            model_key: Model to preload

        Returns:
            True if preload successful, False otherwise
        """
        try:
            if model_key not in self.model_configurations:
                self.logger.warning(f"Cannot preload unknown model: {model_key}")
                return False

            # Check if already loaded
            if model_key == self.current_model_key:
                return True

            self.logger.info(f"Preloading model: {model_key}")
            # For now, just attempt to load it
            # In a full implementation, this would use background loading
            model = self.lm_adapter.load_model(model_key, timeout=120)

            if model:
                self.logger.info(f"Successfully preloaded {model_key}")
                # Immediately unload to free resources
                self.lm_adapter.unload_model(model_key)
                return True
            else:
                self.logger.warning(f"Failed to preload {model_key}")
                return False

        except Exception as e:
            self.logger.error(f"Error preloading model {model_key}: {e}")
            return False

    async def _ensure_model_loaded(
        self, conversation_context: Optional[Dict[str, Any]] = None
    ) -> None:
        """Ensure we have a model loaded, selecting one if needed."""
        if not self.current_model_instance:
            # Get scaling recommendations for initial load
            scaling_status = self._proactive_scaler.get_scaling_status()

            # Select best model considering scaling constraints
            best_model = self.select_best_model(conversation_context)
            if best_model:
                # Set current model size in proactive scaler
                model_config = self.model_configurations.get(best_model, {})
                model_size = model_config.get("category", "unknown")
                self._proactive_scaler._current_model_size = model_size

                await self.switch_model(best_model)

    async def _should_consider_switching(
        self, response_time_ms: float, response: str
    ) -> bool:
        """Check if we should consider switching models based on performance.

        Args:
            response_time_ms: Response generation time in milliseconds
            response: Generated response content

        Returns:
            True if switching should be considered
        """
        triggers = self.config.get("performance", {}).get("switching_triggers", {})

        # Check response time threshold
        if response_time_ms > triggers.get("response_time_threshold_ms", 5000):
            return True

        # Check system resource thresholds
        resources = self.resource_monitor.get_current_resources()

        if resources["memory_percent"] > triggers.get("memory_threshold", 85):
            return True

        if resources["cpu_percent"] > triggers.get("cpu_threshold", 85):
            return True

        # Check for poor quality responses
        if len(response.strip()) < 20 or response.count("I don't know") > 0:
            return True

        return False

    async def _proactive_model_switch(
        self, conversation_context: Optional[Dict[str, Any]] = None
    ) -> None:
        """Perform proactive model switching without user notification (silent switching)."""
        try:
            best_model = self.select_best_model(conversation_context)
            if best_model and best_model != self.current_model_key:
                self.logger.info(
                    f"Proactively switching from {self.current_model_key} to {best_model}"
                )
                await self.switch_model(best_model)
        except Exception as e:
            self.logger.error(f"Error in proactive switch: {e}")

    async def _handle_model_failure(
        self, conversation_context: Optional[Dict[str, Any]] = None
    ) -> bool:
        """Handle model failure by trying fallback models.

        Args:
            conversation_context: Context for selecting fallback model

        Returns:
            True if fallback was successful, False otherwise
        """
        if not self.current_model_key:
            return False

        # Increment failure count
        self._failure_count[self.current_model_key] = (
            self._failure_count.get(self.current_model_key, 0) + 1
        )

        # Get fallback chain from config
        fallback_chains = self.config.get("selection_rules", {}).get(
            "fallback_chains", {}
        )

        # Find appropriate fallback
        fallback_model = None
        current_config = self.model_configurations.get(self.current_model_key, {})
        current_category = current_config.get("category")

        if current_category == "large":
            for large_to_medium in fallback_chains.get("large_to_medium", []):
                if self.current_model_key in large_to_medium:
                    fallback_model = large_to_medium[self.current_model_key]
                    break
        elif current_category == "medium":
            for medium_to_small in fallback_chains.get("medium_to_small", []):
                if self.current_model_key in medium_to_small:
                    fallback_model = medium_to_small[self.current_model_key]
                    break

        if fallback_model:
            self.logger.info(
                f"Attempting fallback: {self.current_model_key} -> {fallback_model}"
            )
            return await self.switch_model(fallback_model)

        # If no specific fallback, try any smaller model
        smaller_models = [
            model["key"]
            for model in self.available_models
            if model.get("category") in ["small", "medium"]
            and model["key"] != self.current_model_key
        ]

        if smaller_models:
            self.logger.info(f"Falling back to smaller model: {smaller_models[0]}")
            return await self.switch_model(smaller_models[0])

        return False

    def _format_context_for_model(self, messages: List[Any]) -> str:
        """Format context messages for LM Studio model."""
        if not messages:
            return ""

        formatted_parts = []
        for msg in messages:
            role_str = getattr(msg, "role", "user")
            content_str = getattr(msg, "content", str(msg))

            if role_str == "user":
                formatted_parts.append(f"User: {content_str}")
            elif role_str == "assistant":
                formatted_parts.append(f"Assistant: {content_str}")
            elif role_str == "system":
                formatted_parts.append(f"System: {content_str}")

        return "\n".join(formatted_parts)

    def _handle_proactive_scaling_decision(self, scaling_event) -> None:
        """Handle proactive scaling decision from ProactiveScaler.

        Args:
            scaling_event: ScalingEvent from ProactiveScaler
        """
        try:
            if scaling_event.decision == ScalingDecision.UPGRADE:
                # Proactive upgrade to larger model
                target_model_key = self._find_model_by_size(
                    scaling_event.new_model_size
                )
                if target_model_key and target_model_key != self.current_model_key:
                    self.logger.info(
                        f"Executing proactive upgrade to {target_model_key}"
                    )
                    # Schedule personality-aware upgrade (no notification)
                    asyncio.create_task(
                        self.personality_aware_model_switch(
                            target_model_key,
                            "proactive scaling detected available resources",
                            notify_user=False,
                        )
                    )

            elif scaling_event.decision == ScalingDecision.DOWNGRADE:
                # Immediate degradation to smaller model with personality notification
                target_model_key = self._find_model_by_size(
                    scaling_event.new_model_size
                )
                if target_model_key:
                    self.logger.warning(
                        f"Executing degradation to {target_model_key}: {scaling_event.reason}"
                    )
                    # Use personality-aware switching for degradation
                    asyncio.create_task(
                        self.personality_aware_model_switch(
                            target_model_key, scaling_event.reason, notify_user=True
                        )
                    )

        except Exception as e:
            self.logger.error(f"Error handling scaling decision: {e}")

    def _find_model_by_size(self, target_size: str) -> Optional[str]:
        """Find model key by size category.

        Args:
            target_size: Target model size ("small", "medium", "large")

        Returns:
            Model key or None if not found
        """
        try:
            # First, try to match by category in configurations
            for model_key, config in self.model_configurations.items():
                if config.get("category") == target_size:
                    # Check if model is available
                    for available_model in self.available_models:
                        if available_model["key"] == model_key and available_model.get(
                            "available", False
                        ):
                            return model_key

            # If no exact match, use preferred models from tier detector
            current_tier = self.tier_detector.detect_current_tier()
            preferred_models = self.tier_detector.get_preferred_models(current_tier)

            # Find model of target size in preferred list
            for preferred_model in preferred_models:
                if preferred_model in self.model_configurations:
                    config = self.model_configurations[preferred_model]
                    if config.get("category") == target_size:
                        return preferred_model

            return None

        except Exception as e:
            self.logger.error(f"Error finding model by size {target_size}: {e}")
            return None

    async def _execute_proactive_upgrade(self, target_model_key: str) -> None:
        """Execute proactive model upgrade with proper timing.

        Args:
            target_model_key: Model to upgrade to
        """
        try:
            # Only upgrade if not currently switching and enough time has passed
            if hasattr(self, "_upgrade_in_progress") and self._upgrade_in_progress:
                return

            self._upgrade_in_progress = True

            success = await self.switch_model(target_model_key)
            if success:
                self.logger.info(f"Proactive upgrade completed: {target_model_key}")
            else:
                self.logger.warning(f"Proactive upgrade failed: {target_model_key}")

        except Exception as e:
            self.logger.error(f"Error executing proactive upgrade: {e}")
        finally:
            self._upgrade_in_progress = False

    def shutdown(self) -> None:
        """Clean up resources and unload models."""
        try:
            # Stop proactive scaling monitoring
            if hasattr(self, "_proactive_scaler"):
                self._proactive_scaler.stop_continuous_monitoring()

            if self.current_model_instance and self.current_model_key:
                self.lm_adapter.unload_model(self.current_model_key)
                self.current_model_key = None
                self.current_model_instance = None

            self.logger.info("ModelManager shutdown complete")

        except Exception as e:
            self.logger.error(f"Error during shutdown: {e}")