A comprehensive guide for deploying Inferno AI in production environments.
This guide covers enterprise-grade deployment strategies, security hardening, monitoring, and high availability configurations for Inferno AI.
Create a docker-compose.yml for production:
```yaml
version: '3.8'

services:
  inferno:
    image: ringo380/inferno:latest
    container_name: inferno-prod
    restart: unless-stopped
    ports:
      - "8080:8080"
    volumes:
      - ./inferno-data/models:/data/models
      - ./inferno-data/cache:/data/cache
      - ./inferno-data/logs:/var/log/inferno
      - ./config/config.toml:/etc/inferno/config.toml:ro
    environment:
      - INFERNO_HOST=0.0.0.0
      - INFERNO_PORT=8080
      - INFERNO_LOG_LEVEL=info
      - INFERNO_GPU_BACKEND=cuda
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 32G
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    logging:
      driver: "json-file"
      options:
        max-size: "100m"
        max-file: "10"

  # Reverse proxy (nginx)
  nginx:
    image: nginx:alpine
    container_name: inferno-nginx
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
    depends_on:
      - inferno

  # Monitoring (Prometheus)
  prometheus:
    image: prom/prometheus:latest
    container_name: inferno-prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'

  # Visualization (Grafana)
  grafana:
    image: grafana/grafana:latest
    container_name: inferno-grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/dashboards:ro
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}

volumes:
  prometheus-data:
  grafana-data:
```
Manage the stack with the usual Compose commands:

```bash
# Start all services
docker-compose up -d

# View logs
docker-compose logs -f inferno

# Check status
docker-compose ps

# Restart services
docker-compose restart

# Stop all services
docker-compose down
```
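Once the stack is up, a quick smoke test confirms everything is answering. The `/health` and `/metrics` paths below are the ones used by the compose healthcheck and Prometheus scrape config in this guide; the Prometheus and Grafana readiness endpoints are standard for those projects.

```bash
#!/bin/bash
# Smoke-test the running stack
set -euo pipefail

# Inferno health endpoint (same URL the compose healthcheck polls)
curl -fsS http://localhost:8080/health && echo "inferno: OK"

# Metrics endpoint that Prometheus scrapes
curl -fsS http://localhost:8080/metrics | head -n 5

# Prometheus and Grafana readiness (standard endpoints for each project)
curl -fsS -o /dev/null http://localhost:9090/-/ready && echo "prometheus: ready"
curl -fsS -o /dev/null http://localhost:3000/api/health && echo "grafana: OK"
```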
Create inferno-deployment.yaml:
```yaml
apiVersion: v1
kind: Namespace
metadata:
  name: inferno
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: inferno-config
  namespace: inferno
data:
  config.toml: |
    [server]
    host = "0.0.0.0"
    port = 8080

    [models]
    models_dir = "/data/models"
    cache_dir = "/data/cache"

    [gpu]
    backend = "cuda"
    gpu_layers = -1

    [logging]
    level = "info"
    file = "/var/log/inferno/inferno.log"
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: inferno-models-pvc
  namespace: inferno
spec:
  accessModes:
    # With replicas > 1 spread across nodes, switch to ReadWriteMany
    # (requires a storage class that supports it), since ReadWriteOnce
    # volumes can only be mounted by a single node at a time.
    - ReadWriteOnce
  resources:
    requests:
      storage: 500Gi
  storageClassName: fast-ssd
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inferno
  namespace: inferno
spec:
  replicas: 3
  selector:
    matchLabels:
      app: inferno
  template:
    metadata:
      labels:
        app: inferno
    spec:
      containers:
        - name: inferno
          image: ringo380/inferno:latest
          ports:
            - containerPort: 8080
              name: http
          resources:
            requests:
              memory: "16Gi"
              cpu: "4"
              nvidia.com/gpu: 1
            limits:
              memory: "32Gi"
              cpu: "8"
              nvidia.com/gpu: 1
          volumeMounts:
            - name: config
              mountPath: /etc/inferno
              readOnly: true
            - name: models
              mountPath: /data/models
            - name: cache
              mountPath: /data/cache
          env:
            - name: INFERNO_HOST
              value: "0.0.0.0"
            - name: INFERNO_PORT
              value: "8080"
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 15
            periodSeconds: 5
      volumes:
        - name: config
          configMap:
            name: inferno-config
        - name: models
          persistentVolumeClaim:
            claimName: inferno-models-pvc
        - name: cache
          emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  name: inferno-service
  namespace: inferno
spec:
  selector:
    app: inferno
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8080
  type: LoadBalancer
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: inferno-hpa
  namespace: inferno
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: inferno
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
```
Deploy to Kubernetes:
```bash
# Apply manifests
kubectl apply -f inferno-deployment.yaml

# Check deployment status
kubectl get pods -n inferno

# View logs
kubectl logs -f -n inferno -l app=inferno

# Get service endpoint
kubectl get svc -n inferno
```
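Upgrades can then ride the Deployment's rolling-update machinery. These are standard kubectl commands against the names defined above; the `v1.2.3` tag is a placeholder for whatever release you are deploying.

```bash
# Roll out a new image version without downtime (tag is a placeholder)
kubectl set image deployment/inferno inferno=ringo380/inferno:v1.2.3 -n inferno

# Watch the rollout; roll back if it stalls
kubectl rollout status deployment/inferno -n inferno
kubectl rollout undo deployment/inferno -n inferno  # only if the rollout fails

# Confirm the HPA is tracking its CPU/memory targets
kubectl get hpa inferno-hpa -n inferno
```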
For non-containerized deployments on Linux:
Create /etc/systemd/system/inferno.service:
```ini
[Unit]
Description=Inferno AI Inference Server
# network-online.target must be listed in both After= and Wants= to take effect
After=network.target network-online.target
Wants=network-online.target

[Service]
Type=simple
User=inferno
Group=inferno
WorkingDirectory=/opt/inferno

# Environment variables
Environment="INFERNO_CONFIG=/etc/inferno/config.toml"
Environment="INFERNO_MODELS_DIR=/var/lib/inferno/models"

# Start command
ExecStart=/usr/local/bin/inferno serve --config /etc/inferno/config.toml

# Restart policy
Restart=always
RestartSec=10s

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/lib/inferno /var/log/inferno

# Resource limits
LimitNOFILE=65536
LimitNPROC=4096

[Install]
WantedBy=multi-user.target
```
Enable and start the service:
```bash
# Create user
sudo useradd -r -s /bin/false inferno

# Create directories
sudo mkdir -p /var/lib/inferno/models
sudo mkdir -p /var/log/inferno
sudo chown -R inferno:inferno /var/lib/inferno /var/log/inferno

# Reload systemd
sudo systemctl daemon-reload

# Enable service
sudo systemctl enable inferno

# Start service
sudo systemctl start inferno

# Check status
sudo systemctl status inferno

# View logs
sudo journalctl -u inferno -f
```
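Two quick checks are worth running after the first start: `systemd-analyze security` scores the sandboxing directives set in the unit above, and a local curl confirms the server is answering.

```bash
# Score the unit's sandboxing (a lower exposure score is better)
systemd-analyze security inferno

# Confirm the server answers on its configured port
curl -fsS http://localhost:8080/health && echo "inferno: OK"
```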
Configure nginx as a reverse proxy with SSL:
Create /etc/nginx/sites-available/inferno:
```nginx
upstream inferno_backend {
    least_conn;
    server 127.0.0.1:8080 max_fails=3 fail_timeout=30s;

    # Add more backends for load balancing
    # server 127.0.0.1:8081 max_fails=3 fail_timeout=30s;
    # server 127.0.0.1:8082 max_fails=3 fail_timeout=30s;
}

# Rate limiting
limit_req_zone $binary_remote_addr zone=inferno_limit:10m rate=10r/s;

server {
    listen 80;
    listen [::]:80;
    server_name api.infernoai.example.com;

    # Redirect HTTP to HTTPS
    return 301 https://$server_name$request_uri;
}

server {
    listen 443 ssl http2;
    listen [::]:443 ssl http2;
    server_name api.infernoai.example.com;

    # SSL configuration
    ssl_certificate /etc/nginx/ssl/cert.pem;
    ssl_certificate_key /etc/nginx/ssl/key.pem;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
    ssl_prefer_server_ciphers on;

    # Security headers
    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
    add_header X-Frame-Options "SAMEORIGIN" always;
    add_header X-Content-Type-Options "nosniff" always;

    # Logging
    access_log /var/log/nginx/inferno-access.log;
    error_log /var/log/nginx/inferno-error.log;

    # Client max body size (for large requests)
    client_max_body_size 100M;

    # Timeouts
    proxy_connect_timeout 300s;
    proxy_send_timeout 300s;
    proxy_read_timeout 300s;

    location / {
        # Rate limiting
        limit_req zone=inferno_limit burst=20 nodelay;

        # Proxy to Inferno backend
        proxy_pass http://inferno_backend;
        proxy_http_version 1.1;

        # Headers
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # WebSocket support (for streaming)
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";

        # Buffering
        proxy_buffering off;
        proxy_request_buffering off;
    }

    # Health check endpoint
    location /health {
        access_log off;
        proxy_pass http://inferno_backend/health;
    }
}
```
Enable the configuration:
```bash
# Create symbolic link
sudo ln -s /etc/nginx/sites-available/inferno /etc/nginx/sites-enabled/

# Test configuration
sudo nginx -t

# Reload nginx
sudo systemctl reload nginx
```
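After the reload, verify the redirect, the negotiated TLS version, and the proxied health endpoint (hostname as configured in `server_name` above):

```bash
# HTTP should answer with a 301 redirect to HTTPS
curl -sI http://api.infernoai.example.com/ | head -n 1

# The handshake should negotiate TLSv1.2 or TLSv1.3 only
openssl s_client -connect api.infernoai.example.com:443 \
  -servername api.infernoai.example.com </dev/null 2>/dev/null \
  | grep -E 'Protocol|Cipher'

# Health check proxied through nginx
curl -fsS https://api.infernoai.example.com/health
```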
Create prometheus.yml:
```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'inferno'
    static_configs:
      - targets: ['localhost:8080']
    metrics_path: '/metrics'
```
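To confirm Prometheus is actually scraping Inferno, query its HTTP API; these are standard Prometheus endpoints, not Inferno-specific (the `-g` flag stops curl from globbing the braces in the PromQL query).

```bash
# List scrape targets and their health
curl -s http://localhost:9090/api/v1/targets \
  | python3 -m json.tool | grep -E '"job"|"health"'

# The up series should report 1 for the inferno job
curl -sg 'http://localhost:9090/api/v1/query?query=up{job="inferno"}'
```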
Import the Inferno dashboard into Grafana. The compose file above mounts ./grafana/dashboards into the Grafana container for this purpose; log in with the admin password set via GRAFANA_PASSWORD and add Prometheus (http://prometheus:9090 on the compose network) as a data source.
Enable API key authentication:

```toml
[security]
auth_enabled = true
api_keys = ["${INFERNO_API_KEY_1}", "${INFERNO_API_KEY_2}"]
```

Configure rate limiting (these keys belong in the same [security] table; a TOML file may not declare the table twice):

```toml
[security]
rate_limit_enabled = true
rate_limit_requests = 100
rate_limit_window = 60
```
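Clients then have to present one of the configured keys on every request. This guide does not show which header Inferno expects, so the sketch below assumes a conventional `Authorization: Bearer` header; adjust it to the scheme your Inferno version documents.

```bash
# Hypothetical client call, assuming Bearer-token style auth
export INFERNO_API_KEY_1="replace-with-a-long-random-string"
curl -fsS http://localhost:8080/health \
  -H "Authorization: Bearer ${INFERNO_API_KEY_1}"
```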
Always use HTTPS in production:
```bash
# Generate SSL certificate with Let's Encrypt
sudo certbot --nginx -d api.infernoai.example.com
```
Enable comprehensive logging:
```toml
[logging]
level = "info"
file = "/var/log/inferno/inferno.log"
audit_log_enabled = true
```
Deploy multiple Inferno instances behind a load balancer:
```nginx
upstream inferno_cluster {
    least_conn;
    server 10.0.1.10:8080 weight=3 max_fails=2 fail_timeout=30s;
    server 10.0.1.11:8080 weight=3 max_fails=2 fail_timeout=30s;
    server 10.0.1.12:8080 weight=2 max_fails=2 fail_timeout=30s;
}
```
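A quick loop over the upstream's backend addresses confirms each node is serving before nginx routes traffic to it:

```bash
# Probe every backend in the inferno_cluster upstream directly
for ip in 10.0.1.10 10.0.1.11 10.0.1.12; do
  code=$(curl -s -o /dev/null -w "%{http_code}" "http://${ip}:8080/health")
  echo "${ip}: HTTP ${code}"
done
```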
Implement health check endpoints:
```bash
#!/bin/bash
# Health check script: exits 0 when Inferno reports healthy
response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health)
if [ "$response" -eq 200 ]; then
    exit 0
else
    exit 1
fi
```
Implement backup strategies:
```bash
#!/bin/bash
# Backup models and configuration
tar -czf /backup/inferno-$(date +%Y%m%d).tar.gz \
    /var/lib/inferno/models \
    /etc/inferno/config.toml
```
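Scheduling and retention complete the strategy. A sketch, assuming the script above is installed as /usr/local/bin/inferno-backup.sh (a hypothetical path):

```bash
# Nightly backup at 02:00, pruning archives older than 14 days
( sudo crontab -l 2>/dev/null; \
  echo '0 2 * * * /usr/local/bin/inferno-backup.sh && find /backup -name "inferno-*.tar.gz" -mtime +14 -delete' ) \
  | sudo crontab -
```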
Enable GPU acceleration and tune batching:

```toml
[gpu]
backend = "cuda"
gpu_layers = -1  # Offload all layers
device_id = 0

[performance]
batch_size = 512  # Adjust based on GPU memory
```
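Before tuning batch_size, confirm the GPU is actually visible to the server. `nvidia-smi` is the standard check; the `docker exec` variant applies if you are running the Compose stack above (container name inferno-prod).

```bash
# Host side: driver loaded and GPU visible?
nvidia-smi

# Container side: GPU passed through to the inferno container?
docker exec inferno-prod nvidia-smi

# Watch GPU utilization while sending test traffic
nvidia-smi dmon -s u
```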
Implement model caching:
```toml
[models]
cache_enabled = true
cache_dir = "/var/cache/inferno"
```
Configure connection limits:
```toml
[server]
max_connections = 1000
keep_alive_timeout = 60
```
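A short load test shows whether the server sustains concurrency near these limits. This sketch uses ApacheBench (`ab`), which may need to be installed separately, and stays well below max_connections:

```bash
# 5000 requests, 200 concurrent, against the health endpoint
ab -n 5000 -c 200 http://localhost:8080/health

# In another terminal, watch socket counts while the test runs
ss -s
```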