Verify PersonalityLearner instantiation works correctly after AdaptationRate import fix. Tests confirm no NameError occurs. Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
979 lines
38 KiB
Python
979 lines
38 KiB
Python
"""Model manager for intelligent model selection and switching."""
|
|
|
|
import asyncio
|
|
import time
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
import logging
|
|
import yaml
|
|
from pathlib import Path
|
|
|
|
from .lmstudio_adapter import LMStudioAdapter
|
|
from .resource_monitor import ResourceMonitor
|
|
from .context_manager import ContextManager
|
|
|
|
# Placeholders for names that would cause a circular import if they were
# imported at module load time.  Each one stays ``None`` until the matching
# ``_get_*_components`` helper below performs the real import.
ProactiveScaler = ScalingDecision = None
HardwareTierDetector = None
ResourcePersonality = ResourceType = None
|
|
|
|
|
|
def _get_scaling_components():
    """Return ``(ProactiveScaler, ScalingDecision)``, importing them on first use.

    The import is deferred to call time and its result is stored in the
    module-level globals of the same names, so later calls skip the import.

    NOTE(review): ``resource`` is also the name of a Unix standard-library
    module -- confirm the project package is the one that resolves here.
    """
    global ProactiveScaler, ScalingDecision
    if ProactiveScaler is not None:
        return ProactiveScaler, ScalingDecision

    from resource.scaling import ProactiveScaler, ScalingDecision

    return ProactiveScaler, ScalingDecision
|
|
|
|
|
|
def _get_tier_components():
    """Return ``HardwareTierDetector``, importing it on first use.

    The imported class is stored in the module-level global of the same name
    so that the import only happens once.
    """
    global HardwareTierDetector
    if HardwareTierDetector is not None:
        return HardwareTierDetector

    from resource.tiers import HardwareTierDetector

    return HardwareTierDetector
|
|
|
|
|
|
def _get_personality_components():
    """Return ``(ResourcePersonality, ResourceType)``, importing them on first use.

    Both names are stored in the module-level globals of the same names, which
    is how the rest of this module sees ``ResourceType`` after the first call.
    """
    global ResourcePersonality, ResourceType
    if ResourcePersonality is not None:
        return ResourcePersonality, ResourceType

    from resource.personality import ResourcePersonality, ResourceType

    return ResourcePersonality, ResourceType
|
|
|
|
|
|
class ModelManager:
    """
    Intelligent model selection and switching system.

    Coordinates between LM Studio adapter, resource monitoring, and context
    management to provide optimal model selection and seamless switching.

    The manager also owns a proactive scaler, whose scaling decisions are
    delivered to ``_handle_proactive_scaling_decision``, and a resource
    personality that words the messages returned when a switch is a downgrade
    or fails.
    """
|
|
|
|
def __init__(self, config_path: Optional[str] = None):
    """Initialize ModelManager with configuration.

    Args:
        config_path: Path to models configuration file.  When omitted,
            ``config/models.yaml`` three directories above this module is
            used.
    """
    self.logger = logging.getLogger(__name__)

    # Load configuration
    self.config_path = (
        config_path
        or Path(__file__).parent.parent.parent / "config" / "models.yaml"
    )
    self.config = self._load_config()

    # Initialize subsystems
    self.lm_adapter = LMStudioAdapter()
    self.resource_monitor = ResourceMonitor()
    self.context_manager = ContextManager()

    # Current model state.  This is set up *before* the proactive scaler is
    # created so that a scaling callback can never observe a partially
    # initialised manager.
    self.current_model_key: Optional[str] = None
    self.current_model_instance: Optional[Any] = None
    self.available_models: List[Dict[str, Any]] = []
    self.model_configurations: Dict[str, Dict[str, Any]] = {}

    # Switching state
    self._switching_lock = asyncio.Lock()
    self._failure_count = {}
    self._last_switch_time = 0

    # Load initial configuration
    self._load_model_configurations()
    self._refresh_available_models()

    # The lazy-import helpers either return the imported objects or raise,
    # so no ``None`` fallback is needed here.
    tier_detector_cls = _get_tier_components()
    self.tier_detector = tier_detector_cls()

    # Initialize proactive scaler
    scaler_cls, _ = _get_scaling_components()
    self._proactive_scaler = scaler_cls(
        resource_monitor=self.resource_monitor,
        tier_detector=self.tier_detector,
        upgrade_threshold=0.8,
        downgrade_threshold=0.9,
        stabilization_minutes=5,
        monitoring_interval=2.0,
        trend_window_minutes=10,
    )

    # Initialize personality system.  The helper also publishes
    # ``ResourceType`` as a module global, which other methods rely on.
    personality_cls, _ = _get_personality_components()
    self._personality = personality_cls(sarcasm_level=0.7, gremlin_hunger=0.8)

    # Set callback for scaling decisions
    self._proactive_scaler.set_scaling_callback(
        self._handle_proactive_scaling_decision
    )

    # Start continuous monitoring last: everything the callback touches
    # exists by now, and an earlier failure cannot leave the monitor running.
    self._proactive_scaler.start_continuous_monitoring()

    self.logger.info("ModelManager initialized with intelligent switching enabled")
|
|
|
|
def _load_config(self) -> Dict[str, Any]:
    """Load models configuration from YAML file.

    Returns:
        The parsed configuration mapping.  A minimal built-in default is
        returned when the file cannot be read or parsed, and also when its
        top level is not a mapping (``yaml.safe_load`` yields ``None`` for an
        empty file), so callers can always use ``self.config.get``.
    """
    try:
        with open(self.config_path, "r", encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
    except Exception as e:
        self.logger.error(f"Failed to load config from {self.config_path}: {e}")
    else:
        if isinstance(loaded, dict):
            return loaded
        self.logger.error(
            f"Failed to load config from {self.config_path}: "
            f"expected a mapping, got {type(loaded).__name__}"
        )

    # Return minimal default config
    return {
        "models": [],
        "selection_rules": {
            "resource_thresholds": {
                "memory_available_gb": {"small": 2, "medium": 4, "large": 8}
            },
            "cpu_threshold_percent": 80,
            "gpu_required_for_large": True,
        },
        "performance": {
            "load_timeout_seconds": {"small": 30, "medium": 60, "large": 120},
            "switching_triggers": {
                "cpu_threshold": 85,
                "memory_threshold": 85,
                "response_time_threshold_ms": 5000,
                "consecutive_failures": 3,
            },
        },
    }
|
|
|
|
def _load_model_configurations(self) -> None:
    """Index the ``models`` entries of the loaded config by their ``key``.

    A missing or null ``models`` section is treated as empty.  Entries that
    are not mappings, or that have no ``key``, are skipped with a warning
    instead of aborting the whole load.
    """
    configurations: Dict[str, Dict[str, Any]] = {}

    for entry in self.config.get("models") or []:
        key = entry.get("key") if isinstance(entry, dict) else None
        if key is None:
            self.logger.warning(f"Skipping model entry without a key: {entry!r}")
            continue
        configurations[key] = entry

    self.model_configurations = configurations
    self.logger.info(
        f"Loaded {len(self.model_configurations)} model configurations"
    )
|
|
|
|
def _refresh_available_models(self) -> None:
    """Rebuild ``self.available_models`` from what the LM Studio adapter lists.

    Each listed model becomes one dict.  Models with a stored configuration
    start from a copy of it; others get a minimal entry with category
    ``"unknown"``.  If anything fails the list is left empty.
    """
    try:
        reported = self.lm_adapter.list_models()
        refreshed = self.available_models = []

        for key, name, size in reported:
            live_fields = {
                "display_name": name,
                "estimated_size_gb": size,
                "available": True,
            }
            if key in self.model_configurations:
                # Live values override whatever the stored config says.
                entry = {**self.model_configurations[key], **live_fields}
            else:
                # Create minimal config for unknown models
                entry = {"key": key, **live_fields, "category": "unknown"}
            refreshed.append(entry)

    except Exception as e:
        self.logger.error(f"Failed to refresh available models: {e}")
        self.available_models = []
|
|
|
|
def select_best_model(
    self, conversation_context: Optional[Dict[str, Any]] = None
) -> Optional[str]:
    """Select the best model based on current resources and context.

    Models are first filtered by memory (with a 1.5x safety margin) and, for
    the ``large`` category, by VRAM.  The survivors are scored on category,
    their ``preferred_when`` memory rule, recorded failures and the task type
    in ``conversation_context``; the highest score wins, with ties going to
    the model listed first.

    The scaler's status is fetched and logged but does not change the score.

    Args:
        conversation_context: Optional context about the current conversation

    Returns:
        Selected model key or None if no suitable model found
    """
    try:
        # Get current resources and scaling recommendations
        resources = self.resource_monitor.get_current_resources()
        scaling_status = self._proactive_scaler.get_scaling_status()

        if scaling_status.get("degradation_needed", False):
            self.logger.debug("Degradation needed, prioritizing smaller models")
        elif scaling_status.get("upgrade_available", False):
            self.logger.debug("Upgrade available, considering larger models")

        available_memory = resources["available_memory_gb"]
        available_vram = resources.get("gpu_vram_gb", 0)

        # Filter models that can fit current resources
        suitable_models = []
        for model in self.available_models:
            if not model.get("available", False):
                continue
            # Check memory with safety margin
            if available_memory < model.get("min_memory_gb", 2) * 1.5:
                continue
            # VRAM is only enforced for large models
            if (
                model.get("category") == "large"
                and model.get("min_vram_gb", 1) > available_vram
            ):
                continue
            suitable_models.append(model)

        if not suitable_models:
            self.logger.warning("No models fit current resource constraints")
            return None

        task_type = None
        if conversation_context:
            task_type = conversation_context.get("task_type", "simple_chat")

        scored_models = []
        for model in suitable_models:
            score = 0.0

            # Category preference (large > medium > small)
            category = model.get("category", "unknown")
            if category == "large" and available_memory >= 8:
                score += 100
            elif category == "medium" and available_memory >= 4:
                score += 70
            elif category == "small":
                score += 40

            # Preference rule from config, expected as "memory >= <N>GB".
            # A rule in any other shape is ignored rather than allowed to
            # abort the whole selection.
            preferred_when = model.get("preferred_when")
            if isinstance(preferred_when, str):
                _, marker, tail = preferred_when.partition("memory >= ")
                if marker:
                    try:
                        required_mem = float(tail.split("GB")[0])
                    except ValueError:
                        self.logger.warning(
                            f"Ignoring unparseable preferred_when rule: {preferred_when!r}"
                        )
                    else:
                        if available_memory >= required_mem:
                            score += 20

            # Factor in recent failures (penalize frequently failing models)
            score -= self._failure_count.get(model["key"], 0) * 10

            # Factor in conversation complexity if provided
            if task_type is not None:
                capabilities = model.get("capabilities", [])
                if task_type == "reasoning" and "reasoning" in capabilities:
                    score += 30
                elif task_type == "analysis" and "analysis" in capabilities:
                    score += 30
                elif task_type == "code_generation" and "reasoning" in capabilities:
                    score += 20

            scored_models.append((score, model))

        # ``max`` returns the first of several equal scores, i.e. the model
        # that appears earliest in ``available_models``.
        best_score, best_model = max(scored_models, key=lambda pair: pair[0])
        display_name = best_model.get("display_name", best_model["key"])
        self.logger.info(f"Selected model: {display_name} (score: {best_score:.1f})")
        return best_model["key"]

    except Exception as e:
        self.logger.error(f"Error in model selection: {e}")

    return None
|
|
|
|
async def switch_model(self, target_model_key: str) -> bool:
    """Switch to a different model with proper resource cleanup.

    The whole operation runs under ``self._switching_lock``.  A switch away
    from a loaded model is refused if the previous successful switch was less
    than 30 seconds ago; the cooldown does not apply when no model is loaded.

    Once the previous model has been unloaded its handle and key are cleared,
    so a failed load leaves the manager reporting "no model loaded" rather
    than holding a stale handle.

    Args:
        target_model_key: Model key to switch to

    Returns:
        True if switch successful, False otherwise
    """
    async with self._switching_lock:
        try:
            if target_model_key == self.current_model_key:
                self.logger.debug(f"Already using model {target_model_key}")
                return True

            # Don't switch too frequently.  The cooldown guards against
            # thrashing between models; it must not stop us from loading
            # one when nothing is loaded.
            current_time = time.time()
            if (
                self.current_model_instance
                and current_time - self._last_switch_time < 30  # 30 second cooldown
            ):
                self.logger.warning(
                    "Model switch requested too frequently, ignoring"
                )
                return False

            self.logger.info(
                f"Switching model: {self.current_model_key} -> {target_model_key}"
            )

            # Unload current model (silent - no user notification per CONTEXT.md)
            if self.current_model_instance and self.current_model_key:
                try:
                    self.lm_adapter.unload_model(self.current_model_key)
                except Exception as e:
                    self.logger.warning(f"Error unloading current model: {e}")
                else:
                    # The old handle is presumably unusable after a
                    # successful unload; forget it before the load below
                    # gets a chance to fail.
                    self.current_model_instance = None
                    self.current_model_key = None

            # Load new model; unknown models fall back to the default timeout
            target_config = self.model_configurations.get(target_model_key) or {
                "category": "unknown"
            }
            timeouts = self.config.get("performance", {}).get(
                "load_timeout_seconds", {}
            )
            timeout_seconds = timeouts.get(
                target_config.get("category", "medium"), 60
            )

            new_model = self.lm_adapter.load_model(target_model_key, timeout_seconds)

            if not new_model:
                # Increment failure count
                self._failure_count[target_model_key] = (
                    self._failure_count.get(target_model_key, 0) + 1
                )
                self.logger.error(f"Failed to load model {target_model_key}")
                return False

            self.current_model_key = target_model_key
            self.current_model_instance = new_model
            self._last_switch_time = current_time

            # Reset failure count for successful load
            self._failure_count[target_model_key] = 0

            self.logger.info(f"Successfully switched to {target_model_key}")
            return True

        except Exception as e:
            self.logger.error(f"Error during model switch: {e}")
            return False
|
|
|
|
async def personality_aware_model_switch(
    self,
    target_model_key: str,
    switch_reason: str = "resource optimization",
    notify_user: bool = True,
) -> Tuple[bool, Optional[str]]:
    """Switch models with personality-driven communication.

    Args:
        target_model_key: Model to switch to
        switch_reason: Reason for the switch
        notify_user: Whether to notify user (only for downgrades)

    Returns:
        Tuple of (success, user_message_or_None).  A message is produced for
        a successful downgrade when ``notify_user`` is set, and for every
        failed switch; any other successful switch returns ``None``.
    """

    def _with_tip(text, tip):
        # Append the optional technical tip to the personality message.
        return f"{text}\n\n💡 *Technical tip*: {tip}" if tip else text

    try:
        # Compare the categories of the current and the target model
        previous = self.model_configurations.get(self.current_model_key or "", {})
        upcoming = self.model_configurations.get(target_model_key, {})
        old_capability = str(previous.get("category", "unknown"))
        new_capability = str(upcoming.get("category", "unknown"))
        is_downgrade = self._is_capability_downgrade(old_capability, new_capability)

        # Perform the actual switch
        switched = await self.switch_model(target_model_key)

        if not switched:
            # Failed switch - generate resource request message
            text, tip = self._personality.generate_resource_message(
                ResourceType.RESOURCE_REQUEST,
                {
                    "resource": "model capability",
                    "current_usage": 95,  # High usage when switches fail
                    "threshold": 80,
                },
                include_technical_tip=True,
            )
            return False, _with_tip(text, tip)

        if not (is_downgrade and notify_user):
            # Silent upgrade - no notification per requirements
            self.logger.debug(f"Silent upgrade to {target_model_key} completed")
            return True, None

        # Generate personality-driven degradation notice
        text, tip = self._personality.generate_resource_message(
            ResourceType.DEGRADATION_NOTICE,
            {
                "old_capability": old_capability,
                "new_capability": new_capability,
                "reason": switch_reason,
            },
            include_technical_tip=True,
        )
        full_message = _with_tip(text, tip)
        self.logger.info(f"Personality degradation notice: {full_message}")
        return True, full_message

    except Exception as e:
        self.logger.error(f"Error in personality_aware_model_switch: {e}")
        return False, "I'm... having trouble switching models right now..."
|
|
|
|
def _is_capability_downgrade(
    self, old_capability: str, new_capability: str
) -> bool:
    """Check if switch represents a capability downgrade.

    Categories rank ``large`` > ``medium`` > ``small`` > ``unknown``; any
    unrecognised name ranks the same as ``unknown``.

    Args:
        old_capability: Current model capability
        new_capability: Target model capability

    Returns:
        True if the target ranks strictly below the current capability
    """
    ranks = {"large": 3, "medium": 2, "small": 1, "unknown": 0}
    return ranks.get(new_capability, 0) < ranks.get(old_capability, 0)
|
|
|
|
async def generate_response(
    self,
    message: str,
    conversation_id: str = "default",
    conversation_context: Optional[Dict[str, Any]] = None,
) -> str:
    """Generate response with automatic model switching if needed.

    Flow: pre-flight resource check (which may end the call early with a
    notice after switching to a smaller model), make sure a model is loaded,
    build the prompt from stored context, call the model, then record the
    exchange and performance metrics.  When generation fails a fallback model
    is tried and the call is retried; if no fallback can be loaded a
    personality-worded "try again" message is returned.

    Args:
        message: User message to respond to
        conversation_id: Conversation ID for context
        conversation_context: Optional context for model selection

    Returns:
        Generated response text, or a user-facing notice/error string
    """
    try:
        # Pre-flight resource check
        can_proceed, reason = self._proactive_scaler.check_preflight_resources(
            "model_inference"
        )
        if not can_proceed:
            # Handle resource constraints gracefully
            degradation_target = self._proactive_scaler.initiate_graceful_degradation(
                f"Pre-flight check failed: {reason}", immediate=True
            )
            if not degradation_target:
                return "I'm experiencing resource constraints and cannot generate a response right now."

            # Switch to smaller model with personality notification
            smaller_model_key = self._find_model_by_size(degradation_target)
            if smaller_model_key and smaller_model_key != self.current_model_key:
                _, personality_message = await self.personality_aware_model_switch(
                    smaller_model_key,
                    f"Pre-flight check failed: {reason}",
                    notify_user=True,
                )
                # If personality message generated, include it in response
                if personality_message:
                    return f"{personality_message}\n\nI'll try to help anyway with what I have..."
                return "Switching to a lighter model due to resource constraints..."
            # No different smaller model: carry on with the current one.

        # Ensure we have a model loaded
        if not self.current_model_instance:
            await self._ensure_model_loaded(conversation_context)
        if not self.current_model_instance:
            return "I'm sorry, I'm unable to load any models at the moment."

        # Get conversation context and format it for the model
        # (LM Studio uses OpenAI-like format)
        history = self.context_manager.get_context_for_model(conversation_id)
        formatted_context = self._format_context_for_model(history)

        # Attempt to generate response
        start_time = time.time()
        try:
            prompt = f"{formatted_context}\n\nUser: {message}\n\nAssistant:"
            response = self.current_model_instance.respond(
                prompt,
                max_tokens=1024,  # Reasonable default
            )
            response_time_ms = (time.time() - start_time) * 1000

            # Anything under 10 characters is treated as a failed generation
            if not response or len(response.strip()) < 10:
                raise ValueError("Model returned empty or inadequate response")

            # Add messages to context
            from models.conversation import MessageRole

            self.context_manager.add_message(conversation_id, MessageRole.USER, message)
            self.context_manager.add_message(
                conversation_id, MessageRole.ASSISTANT, response
            )

            # Update performance metrics for proactive scaling
            self._proactive_scaler.update_performance_metrics(
                operation_type="model_inference",
                duration_ms=response_time_ms,
                success=True,
            )

            # Check if we should consider switching (slow response or struggling)
            if await self._should_consider_switching(response_time_ms, response):
                await self._proactive_model_switch(conversation_context)

            return response

        except Exception as e:
            response_time_ms = (time.time() - start_time) * 1000
            self.logger.warning(f"Model generation failed: {e}")

            # Update performance metrics for failure
            self._proactive_scaler.update_performance_metrics(
                operation_type="model_inference",
                duration_ms=response_time_ms,
                success=False,
            )

            # Try switching to a different model, then retry with it
            if await self._handle_model_failure(conversation_context):
                return await self.generate_response(
                    message, conversation_id, conversation_context
                )

            # Generate personality message for repeated failures
            resources = self.resource_monitor.get_current_resources()
            personality_message, technical_tip = (
                self._personality.generate_resource_message(
                    ResourceType.RESOURCE_REQUEST,
                    {
                        "resource": "model stability",
                        "current_usage": resources.get("memory_percent", 90),
                        "threshold": 80,
                    },
                    include_technical_tip=True,
                )
            )

            tip_block = (
                f"\n\n💡 *Technical tip*: {technical_tip}" if technical_tip else ""
            )
            return f"{personality_message}{tip_block}\n\nPlease try again in a moment."

    except Exception as e:
        self.logger.error(f"Error in generate_response: {e}")
        return "An error occurred while processing your request."
|
|
|
|
def get_current_model_status(self) -> Dict[str, Any]:
    """Get status of currently loaded model and resource usage.

    Returns:
        Dictionary with model status and resource information.  The
        ``model_display_name``, ``model_category`` and ``context_window``
        entries are only present when the current model has a stored
        configuration.
    """
    status: Dict[str, Any] = {
        "current_model_key": self.current_model_key,
        "model_loaded": self.current_model_instance is not None,
        "resources": self.resource_monitor.get_current_resources(),
        "available_models": len(self.available_models),
        "recent_failures": dict(self._failure_count),
    }
    # The scaler may be absent if construction did not get that far.
    if hasattr(self, "_proactive_scaler"):
        status["scaling"] = self._proactive_scaler.get_scaling_status()
    else:
        status["scaling"] = {}

    key = self.current_model_key
    if not key or key not in self.model_configurations:
        return status

    config = self.model_configurations[key]
    status["model_display_name"] = config.get("display_name", key)
    status["model_category"] = config.get("category", "unknown")
    status["context_window"] = config.get("context_window", 4096)
    return status
|
|
|
|
async def preload_model(self, model_key: str) -> bool:
    """Preload a model in background for faster switching.

    At present this loads the model through the adapter and unloads it again
    straight away; the current model is left as it is.

    Args:
        model_key: Model to preload

    Returns:
        True if preload successful, False otherwise
    """
    try:
        if model_key not in self.model_configurations:
            self.logger.warning(f"Cannot preload unknown model: {model_key}")
            return False

        # Nothing to do when it is already the active model
        if model_key == self.current_model_key:
            return True

        self.logger.info(f"Preloading model: {model_key}")
        # For now, just attempt to load it
        # In a full implementation, this would use background loading
        handle = self.lm_adapter.load_model(model_key, timeout=120)
        if not handle:
            self.logger.warning(f"Failed to preload {model_key}")
            return False

        self.logger.info(f"Successfully preloaded {model_key}")
        # Immediately unload to free resources
        self.lm_adapter.unload_model(model_key)
        return True

    except Exception as e:
        self.logger.error(f"Error preloading model {model_key}: {e}")
        return False
|
|
|
|
async def _ensure_model_loaded(
    self, conversation_context: Optional[Dict[str, Any]] = None
) -> None:
    """Ensure we have a model loaded, selecting one if needed.

    Does nothing when a model is already loaded or when no model can be
    selected.  The proactive scaler is told the new model size only after the
    switch has succeeded, so a failed load does not leave it tracking a model
    that is not running.

    Args:
        conversation_context: Optional context passed on to model selection
    """
    if self.current_model_instance:
        return

    best_model = self.select_best_model(conversation_context)
    if not best_model:
        return

    if await self.switch_model(best_model):
        # Set current model size in proactive scaler
        model_config = self.model_configurations.get(best_model, {})
        self._proactive_scaler._current_model_size = model_config.get(
            "category", "unknown"
        )
|
|
|
|
async def _should_consider_switching(
    self, response_time_ms: float, response: str
) -> bool:
    """Check if we should consider switching models based on performance.

    Thresholds come from ``performance.switching_triggers`` in the config,
    defaulting to 5000 ms response time and 85 % for memory and CPU.

    Args:
        response_time_ms: Response generation time in milliseconds
        response: Generated response content

    Returns:
        True if switching should be considered
    """
    triggers = self.config.get("performance", {}).get("switching_triggers", {})

    # Slow response
    if response_time_ms > triggers.get("response_time_threshold_ms", 5000):
        return True

    # System under pressure (memory is checked before CPU)
    resources = self.resource_monitor.get_current_resources()
    under_pressure = (
        resources["memory_percent"] > triggers.get("memory_threshold", 85)
        or resources["cpu_percent"] > triggers.get("cpu_threshold", 85)
    )
    if under_pressure:
        return True

    # Poor quality: very short, or the model admits it does not know
    return len(response.strip()) < 20 or "I don't know" in response
|
|
|
|
async def _proactive_model_switch(
    self, conversation_context: Optional[Dict[str, Any]] = None
) -> None:
    """Perform proactive model switching without user notification (silent switching).

    Re-runs model selection and switches only when the winner differs from
    the current model.  Errors are logged and never propagate.
    """
    try:
        candidate = self.select_best_model(conversation_context)
        if not candidate or candidate == self.current_model_key:
            return

        self.logger.info(
            f"Proactively switching from {self.current_model_key} to {candidate}"
        )
        await self.switch_model(candidate)
    except Exception as e:
        self.logger.error(f"Error in proactive switch: {e}")
|
|
|
|
async def _handle_model_failure(
    self, conversation_context: Optional[Dict[str, Any]] = None
) -> bool:
    """Handle model failure by trying fallback models.

    Records one failure against the current model, then tries the fallback
    configured for it under ``selection_rules.fallback_chains``
    (``large_to_medium`` or ``medium_to_small``, depending on its category).
    Without a configured fallback, the first other available model in the
    ``small`` or ``medium`` category is tried.

    Args:
        conversation_context: Context for selecting fallback model

    Returns:
        True if fallback was successful, False otherwise
    """
    failed_key = self.current_model_key
    if not failed_key:
        return False

    # Increment failure count
    self._failure_count[failed_key] = self._failure_count.get(failed_key, 0) + 1

    # Get fallback chain from config
    fallback_chains = self.config.get("selection_rules", {}).get(
        "fallback_chains", {}
    )
    current_category = self.model_configurations.get(failed_key, {}).get("category")

    # Only large and medium models have a chain to step down through
    if current_category == "large":
        chain_name = "large_to_medium"
    elif current_category == "medium":
        chain_name = "medium_to_small"
    else:
        chain_name = None

    fallback_model = None
    if chain_name is not None:
        # Each chain entry maps a model key to its fallback; first hit wins
        fallback_model = next(
            (
                mapping[failed_key]
                for mapping in fallback_chains.get(chain_name, [])
                if failed_key in mapping
            ),
            None,
        )

    if fallback_model:
        self.logger.info(f"Attempting fallback: {failed_key} -> {fallback_model}")
        return await self.switch_model(fallback_model)

    # If no specific fallback, try any smaller model
    candidates = [
        entry["key"]
        for entry in self.available_models
        if entry.get("category") in ["small", "medium"] and entry["key"] != failed_key
    ]
    if not candidates:
        return False

    self.logger.info(f"Falling back to smaller model: {candidates[0]}")
    return await self.switch_model(candidates[0])
|
|
|
|
def _format_context_for_model(self, messages: List[Any]) -> str:
    """Format context messages for LM Studio model.

    Each message becomes one ``"<Speaker>: <content>"`` line.  The role may
    be a plain string or an enum member whose ``value`` is that string, so
    that enum-typed roles are not silently dropped.  Objects without a
    ``role`` are treated as user messages and objects without ``content`` are
    rendered with ``str()``.  Messages with any other role are skipped.

    Args:
        messages: Context messages, oldest first

    Returns:
        The formatted lines joined by newlines, or ``""`` for no messages
    """
    if not messages:
        return ""

    speakers = {"user": "User", "assistant": "Assistant", "system": "System"}

    formatted_parts = []
    for msg in messages:
        role = getattr(msg, "role", "user")
        # Unwrap enum members; plain strings have no ``value`` and pass through
        role = getattr(role, "value", role)
        content_str = getattr(msg, "content", str(msg))

        speaker = speakers.get(role) if isinstance(role, str) else None
        if speaker is not None:
            formatted_parts.append(f"{speaker}: {content_str}")

    return "\n".join(formatted_parts)
|
|
|
|
def _handle_proactive_scaling_decision(self, scaling_event) -> None:
    """Handle proactive scaling decision from ProactiveScaler.

    An UPGRADE schedules a silent switch to a model of the new size (skipped
    when that is already the current model); a DOWNGRADE schedules a switch
    that notifies the user.  Switches run as asyncio tasks, so this must be
    called while an event loop is running in the calling thread; otherwise
    the error is logged and nothing is scheduled.

    Args:
        scaling_event: ScalingEvent from ProactiveScaler
    """

    def _schedule(coro) -> None:
        # The event loop only keeps weak references to tasks, so hold a
        # strong one until the task finishes.
        tasks = getattr(self, "_background_tasks", None)
        if tasks is None:
            tasks = self._background_tasks = set()
        try:
            task = asyncio.create_task(coro)
        except RuntimeError:
            # No running loop: close the coroutine so it is not leaked
            # un-awaited, then let the handler below log the failure.
            coro.close()
            raise
        tasks.add(task)
        task.add_done_callback(tasks.discard)

    try:
        if scaling_event.decision == ScalingDecision.UPGRADE:
            # Proactive upgrade to larger model
            target_model_key = self._find_model_by_size(scaling_event.new_model_size)
            if target_model_key and target_model_key != self.current_model_key:
                self.logger.info(
                    f"Executing proactive upgrade to {target_model_key}"
                )
                # Schedule personality-aware upgrade (no notification)
                _schedule(
                    self.personality_aware_model_switch(
                        target_model_key,
                        "proactive scaling detected available resources",
                        notify_user=False,
                    )
                )

        elif scaling_event.decision == ScalingDecision.DOWNGRADE:
            # Immediate degradation to smaller model with personality notification
            target_model_key = self._find_model_by_size(scaling_event.new_model_size)
            if target_model_key:
                self.logger.warning(
                    f"Executing degradation to {target_model_key}: {scaling_event.reason}"
                )
                # Use personality-aware switching for degradation
                _schedule(
                    self.personality_aware_model_switch(
                        target_model_key, scaling_event.reason, notify_user=True
                    )
                )

    except Exception as e:
        self.logger.error(f"Error handling scaling decision: {e}")
|
|
|
|
def _find_model_by_size(self, target_size: str) -> Optional[str]:
    """Find model key by size category.

    Configured models of the requested category are tried first, in
    configuration order, and must be flagged available.  Failing that, the
    tier detector's preferred models for the current tier are scanned for a
    configured model of that category; availability is not checked there.

    Args:
        target_size: Target model size ("small", "medium", "large")

    Returns:
        Model key or None if not found
    """
    try:
        # First, try to match by category in configurations
        for model_key, config in self.model_configurations.items():
            if config.get("category") != target_size:
                continue
            is_available = any(
                entry["key"] == model_key and entry.get("available", False)
                for entry in self.available_models
            )
            if is_available:
                return model_key

        # If no exact match, use preferred models from tier detector
        tier = self.tier_detector.detect_current_tier()
        for candidate in self.tier_detector.get_preferred_models(tier):
            if candidate not in self.model_configurations:
                continue
            if self.model_configurations[candidate].get("category") == target_size:
                return candidate

        return None

    except Exception as e:
        self.logger.error(f"Error finding model by size {target_size}: {e}")
        return None
|
|
|
|
async def _execute_proactive_upgrade(self, target_model_key: str) -> None:
    """Execute proactive model upgrade with proper timing.

    Only one upgrade runs at a time: a call made while another upgrade is in
    progress returns immediately and leaves that upgrade's in-progress flag
    untouched.

    Args:
        target_model_key: Model to upgrade to
    """
    # This guard sits outside the ``try`` on purpose: the ``finally`` below
    # must only clear the flag for the call that actually set it.
    if getattr(self, "_upgrade_in_progress", False):
        return

    self._upgrade_in_progress = True
    try:
        success = await self.switch_model(target_model_key)
        if success:
            self.logger.info(f"Proactive upgrade completed: {target_model_key}")
        else:
            self.logger.warning(f"Proactive upgrade failed: {target_model_key}")
    except Exception as e:
        self.logger.error(f"Error executing proactive upgrade: {e}")
    finally:
        self._upgrade_in_progress = False
|
|
|
|
def shutdown(self) -> None:
    """Clean up resources and unload models.

    The two clean-up steps are attempted independently, so a failure to stop
    the proactive monitor does not prevent the current model from being
    unloaded.  Errors are logged, never raised.  The current-model state is
    cleared only when the unload succeeded.
    """
    clean = True

    # Stop proactive scaling monitoring
    if hasattr(self, "_proactive_scaler"):
        try:
            self._proactive_scaler.stop_continuous_monitoring()
        except Exception as e:
            clean = False
            self.logger.error(f"Error during shutdown: {e}")

    # ``getattr`` keeps this usable on an instance whose construction failed
    # before the model state was assigned.
    model_key = getattr(self, "current_model_key", None)
    try:
        if getattr(self, "current_model_instance", None) and model_key:
            self.lm_adapter.unload_model(model_key)
            self.current_model_key = None
            self.current_model_instance = None
    except Exception as e:
        clean = False
        self.logger.error(f"Error during shutdown: {e}")

    if clean:
        self.logger.info("ModelManager shutdown complete")
|