feat(01-01): create model configuration system
Some checks failed
Discord Webhook / git (push) Has been cancelled
- Created comprehensive model definitions in config/models.yaml
- Defined model categories: small, medium, large
- Specified resource requirements for each model
- Added context window sizes and capability lists
- Configured fallback chains for graceful degradation
- Included selection rules and switching triggers
- Added context management compression settings
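As a minimal sketch of how a consumer might read these definitions, assuming a Python selector with PyYAML and psutil available; load_config, pick_model, and fallback_for are illustrative names, not part of this commit:

# Minimal sketch of a consumer for config/models.yaml. Assumes PyYAML
# and psutil; all function names here are illustrative, not part of
# this commit.
import psutil
import yaml

def load_config(path="config/models.yaml"):
    with open(path) as f:
        return yaml.safe_load(f)

def pick_model(config, task="simple_chat"):
    """Pick the most capable model whose requirements fit current resources."""
    mem_gb = psutil.virtual_memory().available / 1024 ** 3
    needed_ctx = config["selection_rules"]["task_requirements"].get(task, 2048)
    fits = [
        m for m in config["models"]
        if m["min_memory_gb"] <= mem_gb and m["context_window"] >= needed_ctx
    ]
    if fits:
        return max(fits, key=lambda m: m["min_memory_gb"])
    # Nothing fits: degrade to the smallest model rather than fail.
    return min(config["models"], key=lambda m: m["min_memory_gb"])

def fallback_for(config, model_key, chain="large_to_small"):
    # Each chain entry is a YAML list item holding a one-key mapping of
    # source model -> fallback model, so we look the key up per entry.
    for entry in config["fallback_chains"][chain]:
        if model_key in entry:
            return entry[model_key]
    return None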
config/models.yaml (new file, 131 lines)
@@ -0,0 +1,131 @@
# Model configuration for Mai
# Defines available models, resource requirements, and switching behavior

models:
  # Small models - for resource-constrained environments
  - key: "microsoft/DialoGPT-medium"
    display_name: "DialoGPT Medium"
    category: "small"
    min_memory_gb: 2
    min_vram_gb: 1
    context_window: 1024
    capabilities: ["chat"]
    fallback_for: ["large", "medium"]

  - key: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    display_name: "TinyLlama 1.1B Chat"
    category: "small"
    min_memory_gb: 2
    min_vram_gb: 1
    context_window: 2048
    capabilities: ["chat"]
    fallback_for: ["large", "medium"]

  # Medium models - balance of capability and efficiency
  - key: "qwen/qwen3-4b-2507"
    display_name: "Qwen3 4B"
    category: "medium"
    min_memory_gb: 4
    min_vram_gb: 2
    context_window: 8192
    capabilities: ["chat", "reasoning"]
    fallback_for: ["large"]
    preferred_when: "memory >= 4GB and CPU < 80%"

  - key: "microsoft/DialoGPT-large"
    display_name: "DialoGPT Large"
    category: "medium"
    min_memory_gb: 6
    min_vram_gb: 3
    context_window: 2048
    capabilities: ["chat"]
    fallback_for: ["large"]

  # Large models - maximum capability, require resources
  - key: "qwen/qwen2.5-7b-instruct"
    display_name: "Qwen2.5 7B Instruct"
    category: "large"
    min_memory_gb: 8
    min_vram_gb: 4
    context_window: 32768
    capabilities: ["chat", "reasoning", "analysis"]
    preferred_when: "memory >= 8GB and GPU available"

  - key: "meta-llama/Llama-2-13b-chat-hf"
    display_name: "Llama2 13B Chat"
    category: "large"
    min_memory_gb: 10
    min_vram_gb: 6
    context_window: 4096
    capabilities: ["chat", "reasoning", "analysis"]
    preferred_when: "memory >= 10GB and GPU available"

# Model selection rules
selection_rules:
  # Resource-based selection criteria
  resource_thresholds:
    memory_available_gb:
      small: 2
      medium: 4
      large: 8
    cpu_threshold_percent: 80
    gpu_required_for_large: true

  # Context window requirements per task type
  task_requirements:
    simple_chat: 2048
    reasoning: 8192
    analysis: 16384
    code_generation: 4096

# Fallback chains when resources are constrained
fallback_chains:
  large_to_medium:
    - "qwen/qwen2.5-7b-instruct": "qwen/qwen3-4b-2507"
    - "meta-llama/Llama-2-13b-chat-hf": "microsoft/DialoGPT-large"
  medium_to_small:
    - "qwen/qwen3-4b-2507": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    - "microsoft/DialoGPT-large": "microsoft/DialoGPT-medium"
  large_to_small:
    - "qwen/qwen2.5-7b-instruct": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    - "meta-llama/Llama-2-13b-chat-hf": "microsoft/DialoGPT-medium"

# Context management settings
context_management:
  # When to trigger context compression (percentage of context window)
  compression_threshold: 70

  # Minimum context to preserve
  min_context_tokens: 512

  # Hybrid compression strategy
  compression_strategy:
    # Summarize messages older than this ratio
    summarize_older_than: 0.5
    # Keep some messages from middle intact
    keep_middle_percentage: 0.3
    # Always preserve most recent messages
    keep_recent_percentage: 0.2
    # Priority during compression
    always_preserve: ["user_instructions", "explicit_requests"]

# Performance settings
performance:
  # Model loading timeouts
  load_timeout_seconds:
    small: 30
    medium: 60
    large: 120

  # Resource monitoring frequency
  monitoring_interval_seconds: 5

  # Trend analysis window
  trend_window_minutes: 5

  # When to consider model switching
  switching_triggers:
    cpu_threshold: 85
    memory_threshold: 85
    response_time_threshold_ms: 5000
    consecutive_failures: 3
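The compression_strategy ratios partition the conversation: the oldest half is summarized, the next 30% is kept intact, and the newest 20% is always preserved. A minimal sketch of that split, assuming a Python consumer; summarize_messages is a hypothetical stand-in for Mai's real summarizer, and always_preserve handling is omitted for brevity:

# Sketch of the hybrid compression split from context_management.
def summarize_messages(msgs):
    # Hypothetical stand-in for the real summarizer.
    return {"role": "system", "content": f"[summary of {len(msgs)} older messages]"}

def compress_history(messages, strategy):
    n = len(messages)
    old_cut = int(n * strategy["summarize_older_than"])           # oldest 50%
    recent_cut = n - int(n * strategy["keep_recent_percentage"])  # newest 20%
    summary = summarize_messages(messages[:old_cut])
    middle = messages[old_cut:recent_cut]  # keep_middle_percentage, kept intact
    recent = messages[recent_cut:]         # always preserved
    return [summary] + middle + recent

strategy = {"summarize_older_than": 0.5,
            "keep_middle_percentage": 0.3,
            "keep_recent_percentage": 0.2}
history = [{"role": "user", "content": f"msg {i}"} for i in range(10)]
print(len(compress_history(history, strategy)))  # 1 summary + 3 middle + 2 recent = 6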