"""Model manager for intelligent model selection and switching.""" import asyncio import time from typing import Dict, List, Optional, Any, Tuple import logging import yaml from pathlib import Path from .lmstudio_adapter import LMStudioAdapter from .resource_monitor import ResourceMonitor from .context_manager import ContextManager # Fix circular imports by importing within functions ProactiveScaler = None ScalingDecision = None HardwareTierDetector = None ResourcePersonality = None ResourceType = None def _get_scaling_components(): global ProactiveScaler, ScalingDecision if ProactiveScaler is None: from resource.scaling import ProactiveScaler, ScalingDecision return ProactiveScaler, ScalingDecision def _get_tier_components(): global HardwareTierDetector if HardwareTierDetector is None: from resource.tiers import HardwareTierDetector return HardwareTierDetector def _get_personality_components(): global ResourcePersonality, ResourceType if ResourcePersonality is None: from resource.personality import ResourcePersonality, ResourceType return ResourcePersonality, ResourceType class ModelManager: """ Intelligent model selection and switching system. Coordinates between LM Studio adapter, resource monitoring, and context management to provide optimal model selection and seamless switching. """ def __init__(self, config_path: Optional[str] = None): """Initialize ModelManager with configuration. Args: config_path: Path to models configuration file """ self.logger = logging.getLogger(__name__) # Load configuration self.config_path = ( config_path or Path(__file__).parent.parent.parent / "config" / "models.yaml" ) self.config = self._load_config() # Initialize subsystems self.lm_adapter = LMStudioAdapter() self.resource_monitor = ResourceMonitor() self.context_manager = ContextManager() # Get components safely tier_components = _get_tier_components() if tier_components is not None: HardwareTierDetector = tier_components else: # Fallback to direct import if lazy loading fails from resource.tiers import HardwareTierDetector self.tier_detector = HardwareTierDetector() # Initialize proactive scaler scaling_components = _get_scaling_components() if scaling_components is not None: ProactiveScaler, ScalingDecision = scaling_components else: # Fallback to direct import if lazy loading fails from resource.scaling import ProactiveScaler, ScalingDecision self._proactive_scaler = ProactiveScaler( resource_monitor=self.resource_monitor, tier_detector=self.tier_detector, upgrade_threshold=0.8, downgrade_threshold=0.9, stabilization_minutes=5, monitoring_interval=2.0, trend_window_minutes=10, ) # Set callback for scaling decisions self._proactive_scaler.set_scaling_callback( self._handle_proactive_scaling_decision ) # Start continuous monitoring self._proactive_scaler.start_continuous_monitoring() # Initialize personality system # Get personality components safely personality_components = _get_personality_components() if personality_components is not None: ResourcePersonality, ResourceType = personality_components else: # Fallback to direct import if lazy loading fails from resource.personality import ResourcePersonality, ResourceType self._personality = ResourcePersonality(sarcasm_level=0.7, gremlin_hunger=0.8) # Current model state self.current_model_key: Optional[str] = None self.current_model_instance: Optional[Any] = None self.available_models: List[Dict[str, Any]] = [] self.model_configurations: Dict[str, Dict[str, Any]] = {} # Switching state self._switching_lock = asyncio.Lock() self._failure_count = {} 
        self._last_switch_time = 0

        # Load initial configuration
        self._load_model_configurations()
        self._refresh_available_models()

        self.logger.info("ModelManager initialized with intelligent switching enabled")

    def _load_config(self) -> Dict[str, Any]:
        """Load models configuration from YAML file."""
        try:
            with open(self.config_path, "r") as f:
                return yaml.safe_load(f)
        except Exception as e:
            self.logger.error(f"Failed to load config from {self.config_path}: {e}")
            # Return minimal default config
            return {
                "models": [],
                "selection_rules": {
                    "resource_thresholds": {
                        "memory_available_gb": {"small": 2, "medium": 4, "large": 8}
                    },
                    "cpu_threshold_percent": 80,
                    "gpu_required_for_large": True,
                },
                "performance": {
                    "load_timeout_seconds": {"small": 30, "medium": 60, "large": 120},
                    "switching_triggers": {
                        "cpu_threshold": 85,
                        "memory_threshold": 85,
                        "response_time_threshold_ms": 5000,
                        "consecutive_failures": 3,
                    },
                },
            }

    def _load_model_configurations(self) -> None:
        """Load model configurations from config."""
        self.model_configurations = {}

        for model in self.config.get("models", []):
            self.model_configurations[model["key"]] = model

        self.logger.info(
            f"Loaded {len(self.model_configurations)} model configurations"
        )

    def _refresh_available_models(self) -> None:
        """Refresh list of available models from LM Studio."""
        try:
            model_list = self.lm_adapter.list_models()
            self.available_models = []

            for model_key, display_name, size_gb in model_list:
                if model_key in self.model_configurations:
                    model_info = self.model_configurations[model_key].copy()
                    model_info.update(
                        {
                            "display_name": display_name,
                            "estimated_size_gb": size_gb,
                            "available": True,
                        }
                    )
                    self.available_models.append(model_info)
                else:
                    # Create minimal config for unknown models
                    self.available_models.append(
                        {
                            "key": model_key,
                            "display_name": display_name,
                            "estimated_size_gb": size_gb,
                            "available": True,
                            "category": "unknown",
                        }
                    )

        except Exception as e:
            self.logger.error(f"Failed to refresh available models: {e}")
            self.available_models = []
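    # Worked example of the scoring in select_best_model below (assuming roughly
    # 8 GB of free memory): a "large" model gets +100 for category fit, +20 if its
    # preferred_when memory rule is satisfied, and -10 per recent failure, so one
    # prior failure yields 100 + 20 - 10 = 110; a "small" model with no failures
    # scores 40. Task-type bonuses from conversation_context (+20 to +30) can
    # shift the ranking between closely scored models.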
    def select_best_model(
        self, conversation_context: Optional[Dict[str, Any]] = None
    ) -> Optional[str]:
        """Select the best model based on current resources and context.

        Args:
            conversation_context: Optional context about the current conversation

        Returns:
            Selected model key or None if no suitable model found
        """
        try:
            # Get current resources and scaling recommendations
            resources = self.resource_monitor.get_current_resources()
            scaling_status = self._proactive_scaler.get_scaling_status()

            # Apply proactive scaling recommendations
            if scaling_status.get("degradation_needed", False):
                # Prefer smaller models if degradation is needed
                self.logger.debug("Degradation needed, prioritizing smaller models")
            elif scaling_status.get("upgrade_available", False):
                # Consider larger models if upgrade is available
                self.logger.debug("Upgrade available, considering larger models")

            # Filter models that can fit current resources
            suitable_models = []
            for model in self.available_models:
                if not model.get("available", False):
                    continue

                # Check resource requirements
                required_memory = model.get("min_memory_gb", 2)
                required_vram = model.get("min_vram_gb", 1)
                available_memory = resources["available_memory_gb"]
                available_vram = resources.get("gpu_vram_gb", 0)

                # Check memory with safety margin
                if available_memory < required_memory * 1.5:
                    continue

                # Check VRAM if required for this model size
                if (
                    model.get("category") in ["large"]
                    and required_vram > available_vram
                ):
                    continue

                suitable_models.append(model)

            if not suitable_models:
                self.logger.warning("No models fit current resource constraints")
                return None

            # Sort by preference (large preferred if resources allow)
            selection_rules = self.config.get("selection_rules", {})

            # Apply preference scoring
            scored_models = []
            for model in suitable_models:
                score = 0.0

                # Category preference (large > medium > small)
                category = model.get("category", "unknown")
                if category == "large" and resources["available_memory_gb"] >= 8:
                    score += 100
                elif category == "medium" and resources["available_memory_gb"] >= 4:
                    score += 70
                elif category == "small":
                    score += 40

                # Preference rules from config
                preferred_when = model.get("preferred_when")
                if preferred_when:
                    if "memory" in preferred_when:
                        required_mem = int(
                            preferred_when.split("memory >= ")[1].split("GB")[0]
                        )
                        if resources["available_memory_gb"] >= required_mem:
                            score += 20

                # Factor in recent failures (penalize frequently failing models)
                failure_count = self._failure_count.get(model["key"], 0)
                score -= failure_count * 10

                # Factor in conversation complexity if provided
                if conversation_context:
                    task_type = conversation_context.get("task_type", "simple_chat")
                    model_capabilities = model.get("capabilities", [])

                    if task_type == "reasoning" and "reasoning" in model_capabilities:
                        score += 30
                    elif task_type == "analysis" and "analysis" in model_capabilities:
                        score += 30
                    elif (
                        task_type == "code_generation"
                        and "reasoning" in model_capabilities
                    ):
                        score += 20

                scored_models.append((score, model))

            # Sort by score and return best
            scored_models.sort(key=lambda x: x[0], reverse=True)

            if scored_models:
                best_model = scored_models[0][1]
                self.logger.info(
                    f"Selected model: {best_model['display_name']} (score: {scored_models[0][0]:.1f})"
                )
                return best_model["key"]

        except Exception as e:
            self.logger.error(f"Error in model selection: {e}")
            return None
    async def switch_model(self, target_model_key: str) -> bool:
        """Switch to a different model with proper resource cleanup.

        Args:
            target_model_key: Model key to switch to

        Returns:
            True if switch successful, False otherwise
        """
        async with self._switching_lock:
            try:
                if target_model_key == self.current_model_key:
                    self.logger.debug(f"Already using model {target_model_key}")
                    return True

                # Don't switch too frequently
                current_time = time.time()
                if current_time - self._last_switch_time < 30:  # 30 second cooldown
                    self.logger.warning(
                        "Model switch requested too frequently, ignoring"
                    )
                    return False

                self.logger.info(
                    f"Switching model: {self.current_model_key} -> {target_model_key}"
                )

                # Unload current model (silent - no user notification per CONTEXT.md)
                if self.current_model_instance and self.current_model_key:
                    try:
                        self.lm_adapter.unload_model(self.current_model_key)
                    except Exception as e:
                        self.logger.warning(f"Error unloading current model: {e}")

                # Load new model
                target_config = self.model_configurations.get(target_model_key)
                if not target_config:
                    target_config = {
                        "category": "unknown"
                    }  # Fallback for unknown models

                timeout = self.config.get("performance", {}).get(
                    "load_timeout_seconds", {}
                )
                timeout_seconds = timeout.get(
                    target_config.get("category", "medium"), 60
                )

                new_model = self.lm_adapter.load_model(
                    target_model_key, timeout_seconds
                )

                if new_model:
                    self.current_model_key = target_model_key
                    self.current_model_instance = new_model
                    self._last_switch_time = current_time

                    # Reset failure count for successful load
                    self._failure_count[target_model_key] = 0

                    self.logger.info(f"Successfully switched to {target_model_key}")
                    return True
                else:
                    # Increment failure count
                    self._failure_count[target_model_key] = (
                        self._failure_count.get(target_model_key, 0) + 1
                    )
                    self.logger.error(f"Failed to load model {target_model_key}")
                    return False

            except Exception as e:
                self.logger.error(f"Error during model switch: {e}")
                return False
    async def personality_aware_model_switch(
        self,
        target_model_key: str,
        switch_reason: str = "resource optimization",
        notify_user: bool = True,
    ) -> Tuple[bool, Optional[str]]:
        """Switch models with personality-driven communication.

        Args:
            target_model_key: Model to switch to
            switch_reason: Reason for the switch
            notify_user: Whether to notify user (only for downgrades)

        Returns:
            Tuple of (success, user_message_or_None)
        """
        try:
            # Get model categories for capability comparison
            old_config = self.model_configurations.get(
                self.current_model_key or "", {}
            )
            new_config = self.model_configurations.get(target_model_key, {})

            old_capability = str(old_config.get("category", "unknown"))
            new_capability = str(new_config.get("category", "unknown"))

            # Determine if this is a downgrade
            is_downgrade = self._is_capability_downgrade(old_capability, new_capability)

            # Perform the actual switch
            success = await self.switch_model(target_model_key)

            if success and is_downgrade and notify_user:
                # Generate personality-driven degradation notice
                context = {
                    "old_capability": old_capability,
                    "new_capability": new_capability,
                    "reason": switch_reason,
                }

                message, technical_tip = self._personality.generate_resource_message(
                    ResourceType.DEGRADATION_NOTICE, context, include_technical_tip=True
                )

                # Combine message and optional tip
                if technical_tip:
                    full_message = f"{message}\n\n💡 *Technical tip*: {technical_tip}"
                else:
                    full_message = message

                self.logger.info(f"Personality degradation notice: {full_message}")
                return True, full_message

            elif success:
                # Silent upgrade - no notification per requirements
                self.logger.debug(f"Silent upgrade to {target_model_key} completed")
                return True, None

            else:
                # Failed switch - generate resource request message
                context = {
                    "resource": "model capability",
                    "current_usage": 95,  # High usage when switches fail
                    "threshold": 80,
                }

                message, technical_tip = self._personality.generate_resource_message(
                    ResourceType.RESOURCE_REQUEST, context, include_technical_tip=True
                )

                if technical_tip:
                    full_message = f"{message}\n\n💡 *Technical tip*: {technical_tip}"
                else:
                    full_message = message

                return False, full_message

        except Exception as e:
            self.logger.error(f"Error in personality_aware_model_switch: {e}")
            return False, "I'm... having trouble switching models right now..."

    def _is_capability_downgrade(
        self, old_capability: str, new_capability: str
    ) -> bool:
        """Check if switch represents a capability downgrade.

        Args:
            old_capability: Current model capability
            new_capability: Target model capability

        Returns:
            True if this is a downgrade
        """
        capability_order = {"large": 3, "medium": 2, "small": 1, "unknown": 0}

        old_level = capability_order.get(old_capability, 0)
        new_level = capability_order.get(new_capability, 0)

        return new_level < old_level
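    # Quick illustration of the ordering above:
    #   _is_capability_downgrade("large", "small")   -> True
    #   _is_capability_downgrade("small", "medium")  -> False  (upgrade)
    #   _is_capability_downgrade("medium", "medium") -> False  (same tier)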
    async def generate_response(
        self,
        message: str,
        conversation_id: str = "default",
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Generate response with automatic model switching if needed.

        Args:
            message: User message to respond to
            conversation_id: Conversation ID for context
            conversation_context: Optional context for model selection

        Returns:
            Generated response text
        """
        try:
            # Pre-flight resource check
            can_proceed, reason = self._proactive_scaler.check_preflight_resources(
                "model_inference"
            )

            if not can_proceed:
                # Handle resource constraints gracefully
                degradation_target = (
                    self._proactive_scaler.initiate_graceful_degradation(
                        f"Pre-flight check failed: {reason}", immediate=True
                    )
                )

                if degradation_target:
                    # Switch to smaller model with personality notification
                    smaller_model_key = self._find_model_by_size(degradation_target)
                    if (
                        smaller_model_key
                        and smaller_model_key != self.current_model_key
                    ):
                        (
                            success,
                            personality_message,
                        ) = await self.personality_aware_model_switch(
                            smaller_model_key,
                            f"Pre-flight check failed: {reason}",
                            notify_user=True,
                        )

                        # If personality message generated, include it in response
                        if personality_message:
                            return f"{personality_message}\n\nI'll try to help anyway with what I have..."
                        else:
                            return "Switching to a lighter model due to resource constraints..."
                else:
                    return "I'm experiencing resource constraints and cannot generate a response right now."

            # Ensure we have a model loaded
            if not self.current_model_instance:
                await self._ensure_model_loaded(conversation_context)

            if not self.current_model_instance:
                return "I'm sorry, I'm unable to load any models at the moment."

            # Get conversation context
            context_messages = self.context_manager.get_context_for_model(
                conversation_id
            )

            # Format messages for model (LM Studio uses OpenAI-like format)
            formatted_context = self._format_context_for_model(context_messages)

            # Attempt to generate response
            start_time = time.time()

            try:
                response = self.current_model_instance.respond(
                    f"{formatted_context}\n\nUser: {message}\n\nAssistant:",
                    max_tokens=1024,  # Reasonable default
                )

                response_time_ms = (time.time() - start_time) * 1000

                # Check if response is adequate
                if not response or len(response.strip()) < 10:
                    raise ValueError("Model returned empty or inadequate response")

                # Add messages to context
                from models.conversation import MessageRole

                self.context_manager.add_message(
                    conversation_id, MessageRole.USER, message
                )
                self.context_manager.add_message(
                    conversation_id, MessageRole.ASSISTANT, response
                )

                # Update performance metrics for proactive scaling
                self._proactive_scaler.update_performance_metrics(
                    operation_type="model_inference",
                    duration_ms=response_time_ms,
                    success=True,
                )

                # Check if we should consider switching (slow response or struggling)
                if await self._should_consider_switching(response_time_ms, response):
                    await self._proactive_model_switch(conversation_context)

                return response

            except Exception as e:
                response_time_ms = (time.time() - start_time) * 1000
                self.logger.warning(f"Model generation failed: {e}")

                # Update performance metrics for failure
                self._proactive_scaler.update_performance_metrics(
                    operation_type="model_inference",
                    duration_ms=response_time_ms,
                    success=False,
                )

                # Try switching to a different model
                if await self._handle_model_failure(conversation_context):
                    # Retry with new model
                    return await self.generate_response(
                        message, conversation_id, conversation_context
                    )

                # Generate personality message for repeated failures
                resources = self.resource_monitor.get_current_resources()
                context = {
                    "resource": "model stability",
                    "current_usage": resources.get("memory_percent", 90),
                    "threshold": 80,
                }
                personality_message, technical_tip = (
                    self._personality.generate_resource_message(
                        ResourceType.RESOURCE_REQUEST,
                        context,
                        include_technical_tip=True,
                    )
                )

                if technical_tip:
                    return f"{personality_message}\n\n💡 *Technical tip*: {technical_tip}\n\nPlease try again in a moment."
                else:
                    return f"{personality_message}\n\nPlease try again in a moment."

        except Exception as e:
            self.logger.error(f"Error in generate_response: {e}")
            return "An error occurred while processing your request."

    def get_current_model_status(self) -> Dict[str, Any]:
        """Get status of currently loaded model and resource usage.

        Returns:
            Dictionary with model status and resource information
        """
        status = {
            "current_model_key": self.current_model_key,
            "model_loaded": self.current_model_instance is not None,
            "resources": self.resource_monitor.get_current_resources(),
            "available_models": len(self.available_models),
            "recent_failures": dict(self._failure_count),
            "scaling": self._proactive_scaler.get_scaling_status()
            if hasattr(self, "_proactive_scaler")
            else {},
        }

        if (
            self.current_model_key
            and self.current_model_key in self.model_configurations
        ):
            config = self.model_configurations[self.current_model_key]
            status.update(
                {
                    "model_display_name": config.get(
                        "display_name", self.current_model_key
                    ),
                    "model_category": config.get("category", "unknown"),
                    "context_window": config.get("context_window", 4096),
                }
            )

        return status

    async def preload_model(self, model_key: str) -> bool:
        """Preload a model in background for faster switching.

        Args:
            model_key: Model to preload

        Returns:
            True if preload successful, False otherwise
        """
        try:
            if model_key not in self.model_configurations:
                self.logger.warning(f"Cannot preload unknown model: {model_key}")
                return False

            # Check if already loaded
            if model_key == self.current_model_key:
                return True

            self.logger.info(f"Preloading model: {model_key}")

            # For now, just attempt to load it
            # In a full implementation, this would use background loading
            model = self.lm_adapter.load_model(model_key, timeout=120)

            if model:
                self.logger.info(f"Successfully preloaded {model_key}")
                # Immediately unload to free resources
                self.lm_adapter.unload_model(model_key)
                return True
            else:
                self.logger.warning(f"Failed to preload {model_key}")
                return False

        except Exception as e:
            self.logger.error(f"Error preloading model {model_key}: {e}")
            return False

    async def _ensure_model_loaded(
        self, conversation_context: Optional[Dict[str, Any]] = None
    ) -> None:
        """Ensure we have a model loaded, selecting one if needed."""
        if not self.current_model_instance:
            # Get scaling recommendations for initial load
            scaling_status = self._proactive_scaler.get_scaling_status()

            # Select best model considering scaling constraints
            best_model = self.select_best_model(conversation_context)
            if best_model:
                # Set current model size in proactive scaler
                model_config = self.model_configurations.get(best_model, {})
                model_size = model_config.get("category", "unknown")
                self._proactive_scaler._current_model_size = model_size

                await self.switch_model(best_model)
    async def _should_consider_switching(
        self, response_time_ms: float, response: str
    ) -> bool:
        """Check if we should consider switching models based on performance.

        Args:
            response_time_ms: Response generation time in milliseconds
            response: Generated response content

        Returns:
            True if switching should be considered
        """
        triggers = self.config.get("performance", {}).get("switching_triggers", {})

        # Check response time threshold
        if response_time_ms > triggers.get("response_time_threshold_ms", 5000):
            return True

        # Check system resource thresholds
        resources = self.resource_monitor.get_current_resources()
        if resources["memory_percent"] > triggers.get("memory_threshold", 85):
            return True

        if resources["cpu_percent"] > triggers.get("cpu_threshold", 85):
            return True

        # Check for poor quality responses
        if len(response.strip()) < 20 or response.count("I don't know") > 0:
            return True

        return False

    async def _proactive_model_switch(
        self, conversation_context: Optional[Dict[str, Any]] = None
    ) -> None:
        """Perform proactive model switching without user notification (silent switching)."""
        try:
            best_model = self.select_best_model(conversation_context)
            if best_model and best_model != self.current_model_key:
                self.logger.info(
                    f"Proactively switching from {self.current_model_key} to {best_model}"
                )
                await self.switch_model(best_model)
        except Exception as e:
            self.logger.error(f"Error in proactive switch: {e}")

    async def _handle_model_failure(
        self, conversation_context: Optional[Dict[str, Any]] = None
    ) -> bool:
        """Handle model failure by trying fallback models.

        Args:
            conversation_context: Context for selecting fallback model

        Returns:
            True if fallback was successful, False otherwise
        """
        if not self.current_model_key:
            return False

        # Increment failure count
        self._failure_count[self.current_model_key] = (
            self._failure_count.get(self.current_model_key, 0) + 1
        )

        # Get fallback chain from config
        fallback_chains = self.config.get("selection_rules", {}).get(
            "fallback_chains", {}
        )

        # Find appropriate fallback
        fallback_model = None
        current_config = self.model_configurations.get(self.current_model_key, {})
        current_category = current_config.get("category")

        if current_category == "large":
            for large_to_medium in fallback_chains.get("large_to_medium", []):
                if self.current_model_key in large_to_medium:
                    fallback_model = large_to_medium[self.current_model_key]
                    break
        elif current_category == "medium":
            for medium_to_small in fallback_chains.get("medium_to_small", []):
                if self.current_model_key in medium_to_small:
                    fallback_model = medium_to_small[self.current_model_key]
                    break

        if fallback_model:
            self.logger.info(
                f"Attempting fallback: {self.current_model_key} -> {fallback_model}"
            )
            return await self.switch_model(fallback_model)

        # If no specific fallback, try any smaller model
        smaller_models = [
            model["key"]
            for model in self.available_models
            if model.get("category") in ["small", "medium"]
            and model["key"] != self.current_model_key
        ]

        if smaller_models:
            self.logger.info(f"Falling back to smaller model: {smaller_models[0]}")
            return await self.switch_model(smaller_models[0])

        return False

    def _format_context_for_model(self, messages: List[Any]) -> str:
        """Format context messages for LM Studio model."""
        if not messages:
            return ""

        formatted_parts = []
        for msg in messages:
            role_str = getattr(msg, "role", "user")
            content_str = getattr(msg, "content", str(msg))

            if role_str == "user":
                formatted_parts.append(f"User: {content_str}")
            elif role_str == "assistant":
                formatted_parts.append(f"Assistant: {content_str}")
            elif role_str == "system":
                formatted_parts.append(f"System: {content_str}")

        return "\n".join(formatted_parts)
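    # For reference, the prompt assembled in generate_response from the formatted
    # context above looks roughly like this (message content is illustrative):
    #
    #   System: <system prompt>
    #   User: <earlier message>
    #   Assistant: <earlier reply>
    #
    #   User: <new message>
    #
    #   Assistant: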
    def _handle_proactive_scaling_decision(self, scaling_event) -> None:
        """Handle proactive scaling decision from ProactiveScaler.

        Args:
            scaling_event: ScalingEvent from ProactiveScaler
        """
        try:
            if scaling_event.decision == ScalingDecision.UPGRADE:
                # Proactive upgrade to larger model
                target_model_key = self._find_model_by_size(
                    scaling_event.new_model_size
                )
                if target_model_key and target_model_key != self.current_model_key:
                    self.logger.info(
                        f"Executing proactive upgrade to {target_model_key}"
                    )
                    # Schedule personality-aware upgrade (no notification)
                    asyncio.create_task(
                        self.personality_aware_model_switch(
                            target_model_key,
                            "proactive scaling detected available resources",
                            notify_user=False,
                        )
                    )

            elif scaling_event.decision == ScalingDecision.DOWNGRADE:
                # Immediate degradation to smaller model with personality notification
                target_model_key = self._find_model_by_size(
                    scaling_event.new_model_size
                )
                if target_model_key:
                    self.logger.warning(
                        f"Executing degradation to {target_model_key}: {scaling_event.reason}"
                    )
                    # Use personality-aware switching for degradation
                    asyncio.create_task(
                        self.personality_aware_model_switch(
                            target_model_key, scaling_event.reason, notify_user=True
                        )
                    )

        except Exception as e:
            self.logger.error(f"Error handling scaling decision: {e}")

    def _find_model_by_size(self, target_size: str) -> Optional[str]:
        """Find model key by size category.

        Args:
            target_size: Target model size ("small", "medium", "large")

        Returns:
            Model key or None if not found
        """
        try:
            # First, try to match by category in configurations
            for model_key, config in self.model_configurations.items():
                if config.get("category") == target_size:
                    # Check if model is available
                    for available_model in self.available_models:
                        if available_model["key"] == model_key and available_model.get(
                            "available", False
                        ):
                            return model_key

            # If no exact match, use preferred models from tier detector
            current_tier = self.tier_detector.detect_current_tier()
            preferred_models = self.tier_detector.get_preferred_models(current_tier)

            # Find model of target size in preferred list
            for preferred_model in preferred_models:
                if preferred_model in self.model_configurations:
                    config = self.model_configurations[preferred_model]
                    if config.get("category") == target_size:
                        return preferred_model

            return None

        except Exception as e:
            self.logger.error(f"Error finding model by size {target_size}: {e}")
            return None

    async def _execute_proactive_upgrade(self, target_model_key: str) -> None:
        """Execute proactive model upgrade with proper timing.

        Args:
            target_model_key: Model to upgrade to
        """
        try:
            # Only upgrade if not currently switching and enough time has passed
            if hasattr(self, "_upgrade_in_progress") and self._upgrade_in_progress:
                return

            self._upgrade_in_progress = True
            success = await self.switch_model(target_model_key)

            if success:
                self.logger.info(f"Proactive upgrade completed: {target_model_key}")
            else:
                self.logger.warning(f"Proactive upgrade failed: {target_model_key}")

        except Exception as e:
            self.logger.error(f"Error executing proactive upgrade: {e}")
        finally:
            self._upgrade_in_progress = False

    def shutdown(self) -> None:
        """Clean up resources and unload models."""
        try:
            # Stop proactive scaling monitoring
            if hasattr(self, "_proactive_scaler"):
                self._proactive_scaler.stop_continuous_monitoring()

            if self.current_model_instance and self.current_model_key:
                self.lm_adapter.unload_model(self.current_model_key)
                self.current_model_key = None
                self.current_model_instance = None

            self.logger.info("ModelManager shutdown complete")

        except Exception as e:
            self.logger.error(f"Error during shutdown: {e}")
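
# Minimal usage sketch (illustrative, not part of the class): how a caller might
# drive the manager end to end. It assumes a running LM Studio instance and a
# config/models.yaml like the commented example above; the conversation id and
# message below are placeholders.
if __name__ == "__main__":

    async def _demo() -> None:
        manager = ModelManager()
        try:
            # Pick and load a model, then generate one response.
            reply = await manager.generate_response(
                "Hello there", conversation_id="demo"
            )
            print(manager.get_current_model_status())
            print(reply)
        finally:
            manager.shutdown()

    asyncio.run(_demo())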