Comprehensive guide to downloading, organizing, and managing AI models in Inferno.
Inferno supports multiple model formats and sources:
# Download specific model
inferno models download llama-2-7b-chat
# Download from specific repo
inferno models download --source huggingface TheBloke/Llama-2-7B-Chat-GGUF
# Download specific file
inferno models download --file llama-2-7b-chat.Q4_K_M.gguf TheBloke/Llama-2-7B-Chat-GGUF
# List available models
inferno models search llama
# Use local GGUF file
inferno run --model-path /path/to/model.gguf --prompt "Hello"
# Copy to the models directory (the filename, minus .gguf, becomes the model name)
cp /path/to/model.gguf ~/.local/share/inferno/models/my-model.gguf
# Then use by name
inferno run --model my-model
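If you have several local GGUF files, a short loop imports them all at once (a sketch; each file keeps its current name):
for f in /path/to/*.gguf; do
    cp "$f" ~/.local/share/inferno/models/
done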
# Download with progress bar
inferno models download llama-2-7b-chat --progress
# Resume interrupted download
inferno models download llama-2-7b-chat --resume
# Verify checksum after download
inferno models download llama-2-7b-chat --verify
# Download to specific directory
inferno models download llama-2-7b-chat --output /data/models/
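These download flags can be combined in a single invocation, assuming they compose as documented above:
inferno models download llama-2-7b-chat --progress --resume --verify --output /data/models/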
# List all models
inferno models list
# List with details
inferno models list --detailed
# List specific format
inferno models list --format gguf
# Show model sizes
inferno models list --show-size
Example output:
Available Models:
NAME                      FORMAT  SIZE   DATE
llama-2-7b-chat           GGUF    4.1GB  2024-01-15
llama-2-13b-chat          GGUF    8.5GB  2024-01-15
mistral-7b-instruct-v0.2  GGUF    4.4GB  2024-02-01
codellama-7b              GGUF    4.2GB  2024-01-20
# Show detailed model info
inferno models info llama-2-7b-chat
# Show model configuration
inferno models info llama-2-7b-chat --config
# Show model layers
inferno models info llama-2-7b-chat --layers
# Show model parameters
inferno models info llama-2-7b-chat --params
Example output:
Model: llama-2-7b-chat
Format: GGUF
Size: 4.1GB
Parameters: 7B
Quantization: Q4_K_M
Context Length: 4096
Architecture: Llama
Layers: 32
Vocabulary Size: 32000
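Because the info output is line-oriented, individual fields are easy to extract in scripts; a small sketch using the field names shown above:
# Pull out the quantization and context length
inferno models info llama-2-7b-chat | grep -E '^(Quantization|Context Length):'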
Default structure:
~/.local/share/inferno/
├── models/
│   ├── llama-2-7b-chat.gguf
│   ├── mistral-7b-instruct.gguf
│   └── codellama-7b.gguf
├── cache/
│   └── model_cache/
└── config/
    └── models.json
# Set custom models directory
export INFERNO_MODELS_DIR=/data/models
# Or in configuration
[models]
models_dir = "/data/models"
# Use with command
inferno serve --models-dir /data/models
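To make the environment variable persist across sessions, append it to your shell profile (bash shown; adapt for other shells):
echo 'export INFERNO_MODELS_DIR=/data/models' >> ~/.bashrc
source ~/.bashrc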
# Create subdirectories
mkdir -p ~/.local/share/inferno/models/{chat,code,instruct}
# Organize models
mv llama-2-7b-chat.gguf ~/.local/share/inferno/models/chat/
mv codellama-7b.gguf ~/.local/share/inferno/models/code/
mv mistral-7b-instruct.gguf ~/.local/share/inferno/models/instruct/
# Use with full path
inferno run --model-path ~/.local/share/inferno/models/chat/llama-2-7b-chat.gguf
# Create alias for long model names
inferno models alias llama-2-7b-chat.Q4_K_M.gguf llama2
# Use alias
inferno run --model llama2 --prompt "Hello"
# List aliases
inferno models alias list
# Remove alias
inferno models alias remove llama2
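Aliases can also be created in bulk; a sketch that derives short names by stripping the quantization suffix, assuming files follow the common NAME.Q4_K_M.gguf pattern:
for f in ~/.local/share/inferno/models/*.Q4_K_M.gguf; do
    file=$(basename "$f")
    inferno models alias "$file" "$(basename "$file" .Q4_K_M.gguf)"
done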
# Preload model into memory
inferno models preload llama-2-7b-chat
# Preload multiple models
inferno models preload llama-2-7b-chat mistral-7b-instruct
# Check preloaded models
inferno models preload --list
# Unload model
inferno models unload llama-2-7b-chat
# In config.toml
[models]
preload_models = [
    "llama-2-7b-chat",
    "mistral-7b-instruct",
]
Quantization reduces model size and memory usage at the cost of some output quality:
# Download 4-bit quantized (recommended)
inferno models download llama-2-7b-chat-q4
# Download specific quantization
inferno models download --quantization Q5_K_M llama-2-7b-chat
# List available quantizations
inferno models search llama-2-7b --show-quants
# Convert existing model to GGUF
inferno models convert --input model.safetensors --output model.gguf
# Quantize model
inferno models quantize --input model.gguf --output model-q4.gguf --quant Q4_K_M
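The two commands chain naturally into a conversion pipeline; the file names here are illustrative:
# Convert to GGUF at full precision, then quantize
inferno models convert --input model.safetensors --output model-f16.gguf
inferno models quantize --input model-f16.gguf --output model-q4.gguf --quant Q4_K_M
# The quantized file should be roughly 3-4x smaller
ls -lh model-f16.gguf model-q4.gguf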
# Verify model file
inferno models verify llama-2-7b-chat
# Verify checksum
inferno models verify llama-2-7b-chat --checksum
# Test model loading
inferno models test llama-2-7b-chat
# Remove single model
inferno models remove llama-2-7b-chat
# Remove with confirmation
inferno models remove llama-2-7b-chat --confirm
# Remove multiple models
inferno models remove llama-2-7b-chat mistral-7b-instruct
# Remove all unused models
inferno models clean
# Dry run (show what would be removed)
inferno models clean --dry-run
Inferno caches model data for faster loading:
# View cache size
inferno cache info
# Clear cache
inferno cache clear
# Clear specific model cache
inferno cache clear --model llama-2-7b-chat
# Set cache size limit
inferno cache set-limit 50GB
[models]
cache_enabled = true
cache_dir = "/data/cache"
cache_size_limit = "50GB"
Create a model registry for team use:
// models-registry.json
{
  "models": [
    {
      "name": "llama2-production",
      "source": "TheBloke/Llama-2-7B-Chat-GGUF",
      "file": "llama-2-7b-chat.Q4_K_M.gguf",
      "quantization": "Q4_K_M",
      "recommended_for": ["chat", "general"]
    },
    {
      "name": "codellama-production",
      "source": "TheBloke/CodeLlama-7B-GGUF",
      "file": "codellama-7b.Q4_K_M.gguf",
      "quantization": "Q4_K_M",
      "recommended_for": ["code", "programming"]
    }
  ]
}
Load from registry:
inferno models import --registry models-registry.json
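Before importing, it is worth sanity-checking that the registry parses; with jq installed:
# Lists the registered model names if the JSON is valid
jq -r '.models[].name' models-registry.json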
# Download multiple models
inferno models batch-download models-list.txt
# Remove multiple models
inferno models batch-remove models-to-remove.txt
# Verify multiple models
inferno models batch-verify
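The list files passed to batch-download and batch-remove are assumed here to contain one model name per line:
# models-list.txt
llama-2-7b-chat
mistral-7b-instruct-v0.2
codellama-7b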
Chat/Conversation:
# 7B - Fast, good quality
inferno models download llama-2-7b-chat
# 13B - Better quality, slower
inferno models download llama-2-13b-chat
# Alternative
inferno models download mistral-7b-instruct-v0.2
Code Generation:
# Code-specific model
inferno models download codellama-7b
# Instruct variant
inferno models download codellama-7b-instruct
Instruction Following:
# Mistral (excellent instruction following)
inferno models download mistral-7b-instruct-v0.2
# Llama 2 instruct
inferno models download llama-2-7b-instruct
Multilingual:
# Good multilingual support
inferno models download llama-2-7b-chat
# Specialized multilingual
inferno models download bloom-7b
By available RAM:
Limited RAM (8-16GB): prefer 7B models at Q4 quantization and enable memory mapping (--mmap)
Moderate RAM (16-32GB): run 7B models at Q5/Q6 quantization, or 13B models at Q4
High RAM (32GB+): run 13B and larger models, or higher-precision quantizations
By use case:
Development/Testing: small, fast models (7B at Q4) for quick iteration
Production: a balance of quality and latency (7B-13B at Q4_K_M or Q5_K_M)
Research/Quality: the largest model and highest-precision quantization your hardware allows
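These guidelines can be scripted; a sketch that picks a quantization from total system RAM (Linux-only, reads /proc/meminfo; the thresholds mirror the tiers above):
#!/bin/bash
# choose-quant.sh - pick a quantization level from total RAM
TOTAL_GB=$(awk '/MemTotal/ {printf "%d", $2/1024/1024}' /proc/meminfo)
if [ "$TOTAL_GB" -lt 16 ]; then
    QUANT=Q4_K_M   # limited RAM: smallest practical quantization
elif [ "$TOTAL_GB" -lt 32 ]; then
    QUANT=Q5_K_M   # moderate RAM: better quality, still compact
else
    QUANT=Q6_K     # high RAM: near-full quality
fi
inferno models download --quantization "$QUANT" llama-2-7b-chat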
#!/bin/bash
# update-models.sh
# Update all models
for model in $(inferno models list --names-only); do
    echo "Updating $model..."
    inferno models download "$model" --update
done
# Clean old versions
inferno models clean --keep-latest
#!/bin/bash
# check-models.sh
# Verify all models
inferno models batch-verify
# Check disk space
df -h ~/.local/share/inferno/models/
# Alert if space low
USAGE=$(df -h ~/.local/share/inferno/models/ | awk 'NR==2 {print $5}' | tr -d '%')
if [ "$USAGE" -gt 90 ]; then
    echo "Warning: models directory is ${USAGE}% full"
    inferno models clean --dry-run
fi
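Both maintenance scripts lend themselves to cron; a sample crontab, with install paths assumed:
# m h dom mon dow  command
0 3 * * 0  /usr/local/bin/update-models.sh   # weekly update, Sunday 03:00
0 6 * * *  /usr/local/bin/check-models.sh    # daily health check, 06:00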