feat(01-01): create model configuration system
Some checks failed
Discord Webhook / git (push) Has been cancelled

- Created comprehensive model definitions in config/models.yaml
- Defined model categories: small, medium, large
- Specified resource requirements for each model
- Added context window sizes and capability lists
- Configured fallback chains for graceful degradation
- Included selection rules and switching triggers
- Added context management compression settings
This commit is contained in:
Mai Development
2026-01-27 12:00:30 -05:00
parent e6f072a6c7
commit 446b9baca6

131
config/models.yaml Normal file
View File

@@ -0,0 +1,131 @@
---
# Model configuration for Mai
# Defines available models, resource requirements, and switching behavior
models:
  # Small models - for resource-constrained environments
  - key: "microsoft/DialoGPT-medium"
    display_name: "DialoGPT Medium"
    category: "small"
    min_memory_gb: 2
    min_vram_gb: 1
    context_window: 1024
    capabilities: ["chat"]
    # Categories this model can substitute for under resource pressure
    fallback_for: ["large", "medium"]
  - key: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    display_name: "TinyLlama 1.1B Chat"
    category: "small"
    min_memory_gb: 2
    min_vram_gb: 1
    context_window: 2048
    capabilities: ["chat"]
    fallback_for: ["large", "medium"]
  # Medium models - balance of capability and efficiency
  - key: "qwen/qwen3-4b-2507"
    display_name: "Qwen3 4B"
    category: "medium"
    min_memory_gb: 4
    min_vram_gb: 2
    context_window: 8192
    capabilities: ["chat", "reasoning"]
    fallback_for: ["large"]
    # Free-form condition string; presumably evaluated by the selection
    # engine — TODO confirm the expression grammar with the consumer
    preferred_when: "memory >= 4GB and CPU < 80%"
  - key: "microsoft/DialoGPT-large"
    display_name: "DialoGPT Large"
    category: "medium"
    min_memory_gb: 6
    min_vram_gb: 3
    context_window: 2048
    capabilities: ["chat"]
    fallback_for: ["large"]
  # Large models - maximum capability, require resources
  - key: "qwen/qwen2.5-7b-instruct"
    display_name: "Qwen2.5 7B Instruct"
    category: "large"
    min_memory_gb: 8
    min_vram_gb: 4
    context_window: 32768
    capabilities: ["chat", "reasoning", "analysis"]
    preferred_when: "memory >= 8GB and GPU available"
  - key: "meta-llama/Llama-2-13b-chat-hf"
    display_name: "Llama2 13B Chat"
    category: "large"
    min_memory_gb: 10
    min_vram_gb: 6
    context_window: 4096
    capabilities: ["chat", "reasoning", "analysis"]
    preferred_when: "memory >= 10GB and GPU available"
# Model selection rules
selection_rules:
  # Resource-based selection criteria
  resource_thresholds:
    # Minimum available memory (GB) required to select each model category
    memory_available_gb:
      small: 2
      medium: 4
      large: 8
    cpu_threshold_percent: 80
    gpu_required_for_large: true
  # Context window requirements per task type (tokens)
  task_requirements:
    simple_chat: 2048
    reasoning: 8192
    analysis: 16384
    code_generation: 4096
  # Fallback chains when resources are constrained.
  # Each entry is a single-pair mapping of {current_model: fallback_model};
  # the list-of-pairs shape is preserved as-is for existing consumers.
  fallback_chains:
    large_to_medium:
      - "qwen/qwen2.5-7b-instruct": "qwen/qwen3-4b-2507"
      - "meta-llama/Llama-2-13b-chat-hf": "microsoft/DialoGPT-large"
    medium_to_small:
      - "qwen/qwen3-4b-2507": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
      - "microsoft/DialoGPT-large": "microsoft/DialoGPT-medium"
    large_to_small:
      - "qwen/qwen2.5-7b-instruct": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
      - "meta-llama/Llama-2-13b-chat-hf": "microsoft/DialoGPT-medium"
# Context management settings
context_management:
  # When to trigger context compression (percentage of context window)
  compression_threshold: 70
  # Minimum context to preserve (tokens)
  min_context_tokens: 512
  # Hybrid compression strategy.
  # The three ratios below sum to 1.0, splitting the conversation into
  # summarized / kept-intact-middle / kept-recent portions.
  compression_strategy:
    # Summarize messages older than this ratio of the conversation
    summarize_older_than: 0.5
    # Keep some messages from the middle intact
    keep_middle_percentage: 0.3
    # Always preserve the most recent messages
    keep_recent_percentage: 0.2
  # Priority during compression: message kinds never dropped or summarized
  always_preserve: ["user_instructions", "explicit_requests"]
# Performance settings
performance:
  # Model loading timeouts per model category (seconds)
  load_timeout_seconds:
    small: 30
    medium: 60
    large: 120
  # Resource monitoring frequency
  monitoring_interval_seconds: 5
  # Trend analysis window
  trend_window_minutes: 5
# When to consider model switching.
# NOTE(review): nesting was lost in transit; emitted here as a top-level
# key — confirm it is not meant to sit under `performance`.
switching_triggers:
  cpu_threshold: 85
  memory_threshold: 85
  response_time_threshold_ms: 5000
  consecutive_failures: 3