7.1 容器化部署 (Docker/Kubernetes)
🎯 学习目标:掌握MCP服务器的容器化部署和Kubernetes编排
⏱️ 预计时间:60分钟
📊 难度等级:⭐⭐⭐⭐
🐳 Docker最佳实践
📦 多阶段构建优化
MCP服务器的容器化需要考虑安全性、性能和大小优化。以下是一个完整的多阶段构建示例:
dockerfile
# Dockerfile.production
# Multi-stage build. The "builder" stage compiles dependencies into a virtual
# environment and runs the test suite; the "production" stage ships only that
# virtual environment plus the application sources.

# ---------- Build stage ----------
FROM python:3.11-slim AS builder

# Compilers and headers needed to build wheels. They stay in this stage only.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        pkg-config \
        libffi-dev \
        libssl-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build

# Copy the dependency manifests before the sources so the dependency layers
# are reused from cache when only application code changes.
COPY requirements.txt requirements-dev.txt ./

# Runtime dependencies live in a virtual environment that is later copied
# wholesale into the production stage.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Application sources and packaging metadata.
COPY src/ ./src/
COPY tests/ ./tests/
COPY setup.py pyproject.toml ./

# Regular (non-editable) install: an editable install only records a link to
# /build, and /build does not exist in the production stage.
RUN pip install --no-cache-dir .

# Run the test suite. The dev/test requirements are installed into a
# throw-away directory (not into /opt/venv) so they never reach production.
RUN pip install --no-cache-dir --target /tmp/test-deps -r requirements-dev.txt && \
    PYTHONPATH=/tmp/test-deps python -m pytest tests/ -v && \
    rm -rf /tmp/test-deps

# ---------- Production stage ----------
FROM python:3.11-slim AS production

# Unprivileged user with a fixed UID/GID of 1001, matching Dockerfile.secure
# and the runAsUser/runAsGroup/fsGroup values in k8s/deployment.yaml.
RUN groupadd -r -g 1001 mcp && useradd -r -u 1001 -g mcp mcp

# Runtime libraries only; curl is required by the HEALTHCHECK below.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libmagic1 \
        ffmpeg \
        libsndfile1 \
        curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Bring over the fully built virtual environment.
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

WORKDIR /app

# Application code and runtime assets.
COPY --from=builder /build/src ./src
COPY --chown=mcp:mcp config/ ./config/
COPY --chown=mcp:mcp scripts/ ./scripts/

# Writable directories used at runtime; hand the whole tree to the app user.
RUN mkdir -p /app/logs /app/data /app/models && \
    chown -R mcp:mcp /app

# Container-level health probe against the HTTP health endpoint.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Drop root privileges for everything that follows.
USER mcp

EXPOSE 8000

ENV PYTHONPATH=/app/src
ENV PYTHONUNBUFFERED=1
ENV MCP_LOG_LEVEL=INFO

CMD ["python", "-m", "mcp_server.main"]
🔒 安全配置最佳实践
dockerfile
# Dockerfile.secure
# Hardened single-stage image: patched base system, dedicated unprivileged
# user with a fixed UID/GID, and no package-manager caches left behind.
FROM python:3.11-slim

# Image metadata (OCI annotations plus a marker for the security scanner).
LABEL \
    org.opencontainers.image.title="MCP Server" \
    org.opencontainers.image.description="Production MCP Server" \
    org.opencontainers.image.version="1.0.0" \
    org.opencontainers.image.vendor="Your Company" \
    security.scan="enabled"

# Apply pending security updates and install the minimal runtime tools.
RUN apt-get update && apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Dedicated user and group: fixed IDs, no login shell, home set to /app.
RUN groupadd -r -g 1001 mcp && \
    useradd -r -u 1001 -g mcp -d /app -s /sbin/nologin -c "MCP Server" mcp

# Application files are owned by the unprivileged user.
# NOTE(review): this copies the entire build context; make sure a
# .dockerignore keeps secrets, VCS data and local artifacts out of the image.
# NOTE(review): no Python dependencies are installed in this file - confirm
# how they are expected to get into the image.
WORKDIR /app
COPY --chown=mcp:mcp . .

# Remove package-manager caches. apt and dpkg themselves are essential
# packages: "apt-get purge apt dpkg" is refused by apt and aborts the build,
# so only their caches are cleaned here. For an image without any package
# manager, switch to a distroless/minimal base image instead.
RUN apt-get clean && \
    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives

# Scratch directories with the sticky bit, so they can be backed by tmpfs
# mounts when the container runs with a read-only root filesystem.
RUN mkdir -p /tmp /var/tmp && \
    chmod 1777 /tmp /var/tmp

# Drop root privileges.
USER mcp

EXPOSE 8000
CMD ["python", "-m", "mcp_server.main"]

# Security scan command (run in CI):
# docker run --rm -v /var/run/docker.sock:/var/run/docker.sock \
#   -v $(pwd):/app anchore/grype:latest /app
📊 Docker Compose生产配置
yaml
# docker-compose.prod.yml
# Production stack: MCP server, PostgreSQL, Redis, Nginx reverse proxy,
# Prometheus and Grafana.
#
# Required environment variables (for example via an .env file):
#   POSTGRES_PASSWORD  - must equal the content of ./secrets/database_password.txt
#                        (URL-encode it if it contains reserved characters)
#   REDIS_PASSWORD     - password enforced by Redis and used in REDIS_URL
#   GRAFANA_PASSWORD   - Grafana admin password
#
# The top-level "version" key is intentionally omitted: the Compose
# Specification treats it as obsolete, and the legacy 3.x schema does not
# accept the long "depends_on: condition:" form used below.

services:
  mcp-server:
    build:
      context: .
      dockerfile: Dockerfile.production
      target: production
    image: mcp-server:latest
    container_name: mcp-server-prod
    restart: unless-stopped

    # Resource limits
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '1.0'
          memory: 2G

    # Environment variables. The database user matches POSTGRES_USER of the
    # postgres service, and the Redis URL carries the password that the
    # redis service enforces with --requirepass.
    environment:
      - NODE_ENV=production
      - MCP_LOG_LEVEL=INFO
      - MCP_WORKERS=4
      - "DATABASE_URL=postgresql://mcpuser:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}@postgres:5432/mcpdb"
      - "REDIS_URL=redis://:${REDIS_PASSWORD:?set REDIS_PASSWORD}@redis:6379"
      - SECRET_KEY_FILE=/run/secrets/secret_key

    # Secrets mounted under /run/secrets
    secrets:
      - secret_key
      - database_password

    # Port mapping
    ports:
      - "8000:8000"

    # Volume mounts
    volumes:
      - mcp_logs:/app/logs
      - mcp_data:/app/data
      - mcp_models:/app/models:ro

    # Network
    networks:
      - mcp_network

    # Health check
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

    # Start only after the backing services are available
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_started

    # Log rotation
    logging:
      driver: "json-file"
      options:
        max-size: "100m"
        max-file: "3"

  # PostgreSQL database
  postgres:
    image: postgres:15-alpine
    container_name: mcp-postgres-prod
    restart: unless-stopped
    environment:
      - POSTGRES_DB=mcpdb
      - POSTGRES_USER=mcpuser
      - POSTGRES_PASSWORD_FILE=/run/secrets/database_password
    secrets:
      - database_password
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./init-scripts:/docker-entrypoint-initdb.d:ro
    networks:
      - mcp_network
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U mcpuser -d mcpdb"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Redis cache (append-only persistence, password protected)
  redis:
    image: redis:7-alpine
    container_name: mcp-redis-prod
    restart: unless-stopped
    command: redis-server --appendonly yes --requirepass ${REDIS_PASSWORD:?set REDIS_PASSWORD}
    volumes:
      - redis_data:/data
    networks:
      - mcp_network

  # Nginx reverse proxy
  nginx:
    image: nginx:alpine
    container_name: mcp-nginx-prod
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx_logs:/var/log/nginx
    networks:
      - mcp_network
    depends_on:
      - mcp-server

  # Monitoring and dashboards
  prometheus:
    image: prom/prometheus:latest
    container_name: mcp-prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    networks:
      - mcp_network

  grafana:
    image: grafana/grafana:latest
    container_name: mcp-grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      - ./grafana/datasources:/etc/grafana/provisioning/datasources:ro
    networks:
      - mcp_network

# Networks
networks:
  mcp_network:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16

# Named volumes
volumes:
  postgres_data:
    driver: local
  redis_data:
    driver: local
  mcp_logs:
    driver: local
  mcp_data:
    driver: local
  mcp_models:
    driver: local
  nginx_logs:
    driver: local
  prometheus_data:
    driver: local
  grafana_data:
    driver: local

# File-backed secrets
secrets:
  secret_key:
    file: ./secrets/secret_key.txt
  database_password:
    file: ./secrets/database_password.txt
☸️ Kubernetes部署
📋 基础部署配置
yaml
# k8s/namespace.yaml
# Namespace that holds all MCP server resources.
apiVersion: v1
kind: Namespace
metadata:
  name: mcp-system
  labels:
    name: mcp-system
    app.kubernetes.io/name: mcp-server
    app.kubernetes.io/instance: production
yaml
# k8s/configmap.yaml
# Non-secret configuration: application properties and an Nginx proxy
# configuration that forwards traffic to the MCP server Service.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mcp-server-config
  namespace: mcp-system
data:
  app.properties: |
    log_level=INFO
    workers=4
    max_connections=1000
    timeout=30
  nginx.conf: |
    upstream mcp_backend {
        least_conn;
        server mcp-server-service:8000 max_fails=3 fail_timeout=30s;
    }
    server {
        listen 80;
        server_name _;
        location /health {
            access_log off;
            # Single backslash: nginx turns \n into a newline. A doubled
            # backslash would send a literal "\n" in the response body.
            return 200 "healthy\n";
        }
        location / {
            proxy_pass http://mcp_backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }
    }
yaml
# k8s/secret.yaml
# Sensitive settings consumed by the Deployment through secretKeyRef.
# WARNING: every value below is a placeholder. Replace them before applying
# and never commit real credentials to version control.
apiVersion: v1
kind: Secret
metadata:
  name: mcp-server-secrets
  namespace: mcp-system
type: Opaque
stringData:
  database-url: "postgresql://user:password@postgres:5432/mcpdb"
  redis-url: "redis://redis:6379"
  secret-key: "your-super-secret-key-here"
  openai-api-key: "your-openai-api-key"
🚀 应用部署
yaml
# k8s/deployment.yaml
# ServiceAccount referenced by the pod template below. Without it the pods
# cannot be created, because serviceAccountName must name an existing account.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: mcp-server-sa
  namespace: mcp-system
  labels:
    app: mcp-server
---
# MCP server Deployment: three replicas, rolling updates, a database
# migration init container and a hardened main container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mcp-server
  namespace: mcp-system
  labels:
    app: mcp-server
    version: v1.0.0
spec:
  replicas: 3
  selector:
    matchLabels:
      app: mcp-server

  # Update strategy
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
      maxSurge: 1

  template:
    metadata:
      labels:
        app: mcp-server
        version: v1.0.0
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8000"
        prometheus.io/path: "/metrics"
    spec:
      # Service account
      serviceAccountName: mcp-server-sa

      # Pod-level security context
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        runAsGroup: 1001
        fsGroup: 1001

      # Init container: runs the migration module before the server starts
      initContainers:
        - name: migration
          image: mcp-server:latest
          command: ['python', '-m', 'mcp_server.migrate']
          env:
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: mcp-server-secrets
                  key: database-url
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "200m"

      # Main container
      containers:
        - name: mcp-server
          image: mcp-server:latest
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP

          # Environment variables
          env:
            - name: NODE_ENV
              value: "production"
            - name: MCP_LOG_LEVEL
              value: "INFO"
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: mcp-server-secrets
                  key: database-url
            - name: REDIS_URL
              valueFrom:
                secretKeyRef:
                  name: mcp-server-secrets
                  key: redis-url
            - name: SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: mcp-server-secrets
                  key: secret-key

          # Volume mounts. /tmp is an emptyDir because the root filesystem
          # is mounted read-only (see securityContext below).
          # NOTE(review): /app/data is not mounted here and is therefore
          # read-only at runtime - confirm whether the app writes to it.
          volumeMounts:
            - name: config
              mountPath: /app/config
              readOnly: true
            - name: logs
              mountPath: /app/logs
            - name: models
              mountPath: /app/models
              readOnly: true
            - name: tmp
              mountPath: /tmp

          # Resource requests and limits
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1000m"

          # Liveness probe
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3

          # Readiness probe
          readinessProbe:
            httpGet:
              path: /ready
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 2

          # Startup probe
          startupProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 30

          # Container-level security context
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL

      # Volumes
      volumes:
        - name: config
          configMap:
            name: mcp-server-config
        - name: logs
          emptyDir: {}
        - name: models
          persistentVolumeClaim:
            claimName: mcp-models-pvc
        - name: tmp
          emptyDir: {}

      # Node selection
      nodeSelector:
        kubernetes.io/arch: amd64
        node-type: application

      # Tolerations
      tolerations:
        - key: "application"
          operator: "Equal"
          value: "mcp"
          effect: "NoSchedule"

      # Prefer spreading replicas across different nodes
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: mcp-server
                topologyKey: kubernetes.io/hostname
🌐 服务配置
yaml
# k8s/service.yaml
# ClusterIP Service in front of the MCP server pods, plus the Ingress that
# exposes it over HTTPS.
apiVersion: v1
kind: Service
metadata:
  name: mcp-server-service
  namespace: mcp-system
  labels:
    app: mcp-server
spec:
  type: ClusterIP
  ports:
    - port: 8000
      targetPort: 8000
      protocol: TCP
      name: http
  selector:
    app: mcp-server
---
# Ingress
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: mcp-server-ingress
  namespace: mcp-system
  annotations:
    # No rewrite-target here: combined with the "/" prefix path it would
    # rewrite every request URI to "/" and break all non-root routes.
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    # Requests per second allowed per client IP.
    # NOTE(review): the previous "rate-limit" key is not an ingress-nginx
    # annotation; confirm that 100 was meant as a per-second limit.
    nginx.ingress.kubernetes.io/limit-rps: "100"
spec:
  # Replaces the deprecated kubernetes.io/ingress.class annotation.
  ingressClassName: nginx
  tls:
    - hosts:
        - mcp.yourdomain.com
      secretName: mcp-tls-secret
  rules:
    - host: mcp.yourdomain.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: mcp-server-service
                port:
                  number: 8000
📊 自动扩缩容配置
yaml
# k8s/hpa.yaml
# Horizontal autoscaling for the mcp-server Deployment on CPU, memory and a
# per-pod request-rate metric.
# NOTE(review): the "http_requests_per_second" pods metric is only available
# when a custom-metrics adapter serves it - confirm one is installed.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: mcp-server-hpa
  namespace: mcp-system
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: mcp-server
  minReplicas: 3
  maxReplicas: 20
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
    - type: Pods
      pods:
        metric:
          name: http_requests_per_second
        target:
          type: AverageValue
          averageValue: "100"
  behavior:
    # Scale down slowly: at most 10% of the pods per minute, after a
    # five-minute stabilization window.
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
    # Scale up faster: whichever of +50% or +2 pods per minute is larger.
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Max
---
# VPA (vertical autoscaling).
# The HPA above already scales this Deployment on CPU and memory. Letting the
# VPA rewrite the same requests automatically makes the two controllers work
# against each other, so the VPA runs in recommendation-only mode ("Off").
# Applying this document requires the VerticalPodAutoscaler CRD in the cluster.
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: mcp-server-vpa
  namespace: mcp-system
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: mcp-server
  updatePolicy:
    updateMode: "Off"
  resourcePolicy:
    containerPolicies:
      - containerName: mcp-server
        maxAllowed:
          cpu: 2
          memory: 4Gi
        minAllowed:
          cpu: 100m
          memory: 128Mi
💾 存储配置
yaml
# k8s/storage.yaml
# Block storage class backed by encrypted gp3 EBS volumes. gp3 tuning
# (iops/throughput) is provided by the EBS CSI driver, hence the CSI
# provisioner. EBS volumes attach to a single node (ReadWriteOnce only).
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: mcp-ssd
provisioner: ebs.csi.aws.com
parameters:
  type: gp3
  iops: "3000"
  throughput: "125"
  encrypted: "true"
allowVolumeExpansion: true
reclaimPolicy: Retain
volumeBindingMode: WaitForFirstConsumer
---
# Shared file storage class for volumes mounted by several pods at once.
# The claims below need multi-node access, which EBS cannot provide, so they
# use EFS instead.
# NOTE(review): requires the AWS EFS CSI driver; replace fileSystemId with
# the ID of an existing EFS file system.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: mcp-shared
provisioner: efs.csi.aws.com
parameters:
  provisioningMode: efs-ap
  fileSystemId: fs-REPLACE_ME
  directoryPerms: "750"
reclaimPolicy: Retain
---
# Model files, shared by all replicas. The Deployment mounts this claim with
# readOnly: true; the claim itself is ReadWriteMany so that the models can be
# written to the volume in the first place.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: mcp-models-pvc
  namespace: mcp-system
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: mcp-shared
  resources:
    requests:
      storage: 50Gi
---
# Application data shared between replicas.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: mcp-data-pvc
  namespace: mcp-system
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: mcp-shared
  resources:
    requests:
      storage: 100Gi
🔄 部署自动化脚本
🚀 部署脚本
bash
#!/bin/bash
# deploy.sh
# Builds, pushes and rolls out the MCP server.
# Usage: deploy.sh [IMAGE_TAG] [ENVIRONMENT]
set -euo pipefail

# Settings: target namespace, image tag (first argument, default "latest")
# and environment name (second argument, default "production").
NAMESPACE="mcp-system"
IMAGE_TAG="${1:-latest}"
ENVIRONMENT="${2:-production}"

# Start-up banner.
printf '%s\n' \
  "🚀 开始部署MCP服务器..." \
  "📦 镜像标签: $IMAGE_TAG" \
  "🌍 环境: $ENVIRONMENT"
#######################################
# Verify that the required CLIs are installed and that the Kubernetes
# cluster is reachable.
# Outputs: progress on stdout, failure reason on stderr.
# Exits:   1 when kubectl or docker is missing, or the cluster is unreachable.
#######################################
check_dependencies() {
  echo "🔍 检查依赖..."

  # Both tools get the same "not installed" message, so check them in a loop.
  local tool
  for tool in kubectl docker; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      echo "❌ $tool 未安装" >&2
      exit 1
    fi
  done

  # Cluster connectivity check.
  if ! kubectl cluster-info >/dev/null 2>&1; then
    echo "❌ 无法连接到Kubernetes集群" >&2
    exit 1
  fi

  echo "✅ 依赖检查通过"
}
#######################################
# Build the production image and tag it mcp-server:$IMAGE_TAG.
# Globals: IMAGE_TAG (read)
# The build date (UTC) and the short git commit hash are passed as build args.
#######################################
build_image() {
  echo "🔨 构建Docker镜像..."

  # Assemble the docker arguments as an array: one element per argument.
  local -a build_cmd=(
    build
    -f Dockerfile.production
    -t "mcp-server:$IMAGE_TAG"
    --build-arg "BUILD_DATE=$(date -u +'%Y-%m-%dT%H:%M:%SZ')"
    --build-arg "VCS_REF=$(git rev-parse --short HEAD)"
    .
  )
  docker "${build_cmd[@]}"

  echo "✅ 镜像构建完成"
}
#######################################
# Tag the locally built image for the registry and push it.
# Globals: IMAGE_TAG (read)
#######################################
push_image() {
  echo "📤 推送镜像到仓库..."

  local local_ref="mcp-server:$IMAGE_TAG"
  local remote_ref="your-registry/$local_ref"

  # Tag for the registry, then push that tag.
  docker tag "$local_ref" "$remote_ref"
  docker push "$remote_ref"

  echo "✅ 镜像推送完成"
}
#######################################
# Apply the namespace manifest.
#######################################
create_namespace() {
  local manifest="k8s/namespace.yaml"

  printf '%s\n' "📁 创建命名空间..."
  kubectl apply -f "$manifest"
  printf '%s\n' "✅ 命名空间创建完成"
}
#######################################
# Apply the ConfigMap, the Secret (only when its manifest exists) and the
# storage manifests, in that order.
#######################################
deploy_configs() {
  echo "⚙️ 部署配置..."

  local manifest
  for manifest in k8s/configmap.yaml k8s/secret.yaml k8s/storage.yaml; do
    # The Secret manifest is optional; the other two are always applied.
    if [[ "$manifest" == "k8s/secret.yaml" && ! -f "$manifest" ]]; then
      continue
    fi
    kubectl apply -f "$manifest"
  done

  echo "✅ 配置部署完成"
}
#######################################
# Deploy the application: render the Deployment manifest with the requested
# image tag, apply it together with the Service, and apply the Ingress
# manifest in production.
# Globals: IMAGE_TAG (read), ENVIRONMENT (read)
# Outputs: progress on stdout, warnings on stderr.
#######################################
deploy_application() {
  echo "🚢 部署应用..."

  local manifest="k8s/deployment.yaml"
  local image="your-registry/mcp-server:$IMAGE_TAG"

  # Escape the characters that are special in a sed replacement
  # (backslash, "&" and the "|" delimiter).
  local image_escaped
  image_escaped=$(printf '%s' "$image" | sed 's/[\\&|]/\\&/g')

  # Render the manifest on the fly and pipe it to kubectl instead of editing
  # the file in place: the checked-in manifest stays untouched, no .bak file
  # is left behind, and the pattern also matches an image that already
  # carries the registry prefix, so repeated deployments keep working.
  sed "s|image: \(your-registry/\)\{0,1\}mcp-server:.*|image: ${image_escaped}|g" "$manifest" \
    | kubectl apply -f -

  kubectl apply -f k8s/service.yaml

  # In production, apply the dedicated Ingress manifest when there is one.
  # A missing file is not an error: the Ingress may be defined in another
  # manifest (for example alongside the Service).
  if [[ "$ENVIRONMENT" == "production" ]]; then
    if [[ -f k8s/ingress.yaml ]]; then
      kubectl apply -f k8s/ingress.yaml
    else
      echo "⚠️ 未找到 k8s/ingress.yaml, 跳过Ingress部署" >&2
    fi
  fi

  echo "✅ 应用部署完成"
}
#######################################
# Apply the autoscaling manifest.
#######################################
deploy_autoscaling() {
  local manifest="k8s/hpa.yaml"

  printf '%s\n' "📈 配置自动扩缩容..."
  kubectl apply -f "$manifest"
  printf '%s\n' "✅ 自动扩缩容配置完成"
}
#######################################
# Block until the mcp-server rollout finishes, for at most 600 seconds.
# Globals: NAMESPACE (read)
#######################################
wait_for_deployment() {
  echo "⏳ 等待部署完成..."
  # NAMESPACE is quoted so it always reaches kubectl as a single argument.
  kubectl rollout status deployment/mcp-server -n "$NAMESPACE" --timeout=600s
  echo "✅ 部署完成"
}
#######################################
# Show pod and Service status, then call the health endpoint from inside a
# pod of the Deployment.
# Globals: NAMESPACE (read)
# NOTE: the health check executes curl inside the container, so the image
# must ship curl.
#######################################
verify_deployment() {
  echo "🔍 验证部署..."

  # Pod status
  echo "📋 Pod状态:"
  kubectl get pods -n "$NAMESPACE" -l app=mcp-server

  # Service status
  echo "🌐 Service状态:"
  kubectl get svc -n "$NAMESPACE"

  # Health endpoint, queried from within the pod
  echo "💓 健康检查:"
  kubectl exec -n "$NAMESPACE" deployment/mcp-server -- \
    curl -f http://localhost:8000/health

  echo "✅ 部署验证完成"
}
#######################################
# Entry point: run every deployment step in order. The image build and push
# steps are skipped when BUILD_IMAGE is set to anything other than "true".
# Globals: BUILD_IMAGE (read, defaults to "true")
#######################################
main() {
  # Assemble the ordered list of steps, then run them one by one.
  local -a steps=(check_dependencies)
  if [[ "${BUILD_IMAGE:-true}" == "true" ]]; then
    steps+=(build_image push_image)
  fi
  steps+=(
    create_namespace
    deploy_configs
    deploy_application
    deploy_autoscaling
    wait_for_deployment
    verify_deployment
  )

  local step
  for step in "${steps[@]}"; do
    "$step"
  done

  echo "🎉 MCP服务器部署成功!"
  echo "🌍 访问地址: https://mcp.yourdomain.com"
}
# 执行主函数
main "$@"
🔄 回滚脚本
bash
#!/bin/bash
# rollback.sh
# Rolls the mcp-server Deployment back to a previous revision.
# Usage: rollback.sh [REVISION]
set -euo pipefail

# Target namespace and optional revision (empty means "previous revision").
NAMESPACE="mcp-system"
REVISION="${1:-}"

printf '%s\n' "🔄 开始回滚MCP服务器..."
#######################################
# Print the rollout history of the mcp-server Deployment.
# Globals: NAMESPACE (read)
#######################################
show_history() {
  echo "📚 部署历史:"
  # NAMESPACE is quoted so it always reaches kubectl as a single argument.
  kubectl rollout history deployment/mcp-server -n "$NAMESPACE"
}
#######################################
# Roll the Deployment back, either to REVISION or, when REVISION is empty,
# to the previous revision.
# Globals: NAMESPACE (read), REVISION (read)
# Returns: non-zero when REVISION is not a plain number or kubectl fails.
#######################################
perform_rollback() {
  local revision="${REVISION:-}"

  if [[ -z "$revision" ]]; then
    echo "🔄 回滚到上一个版本..."
    kubectl rollout undo deployment/mcp-server -n "$NAMESPACE"
    return
  fi

  # Reject anything that is not a plain revision number before it reaches
  # the kubectl command line.
  if [[ ! "$revision" =~ ^[0-9]+$ ]]; then
    echo "❌ 无效的版本号: $revision" >&2
    return 1
  fi

  echo "🔄 回滚到版本 $revision..."
  kubectl rollout undo deployment/mcp-server -n "$NAMESPACE" \
    --to-revision="$revision"
}
#######################################
# Block until the rollback rollout finishes, for at most 300 seconds.
# Globals: NAMESPACE (read)
#######################################
wait_for_rollback() {
  echo "⏳ 等待回滚完成..."
  # NAMESPACE is quoted so it always reaches kubectl as a single argument.
  kubectl rollout status deployment/mcp-server -n "$NAMESPACE" --timeout=300s
}
#######################################
# List the pods and call the health endpoint from inside a pod of the
# Deployment.
# Globals: NAMESPACE (read)
#######################################
verify_rollback() {
  echo "🔍 验证回滚..."
  kubectl get pods -n "$NAMESPACE" -l app=mcp-server
  kubectl exec -n "$NAMESPACE" deployment/mcp-server -- \
    curl -f http://localhost:8000/health
}
#######################################
# Entry point: show the rollout history, ask for confirmation and, when the
# answer is "y" or "Y", roll back, wait for completion and verify.
# Any other answer - including no input at all - cancels with exit status 0.
#######################################
main() {
  show_history

  # read returns non-zero at end of input (for example when stdin is not a
  # terminal). Under "set -e" that would end the script without any message,
  # so a failed read is treated as an empty answer, i.e. "no".
  local answer=""
  read -p "确定要执行回滚吗?(y/N): " -n 1 -r answer || answer=""
  echo

  if [[ "$answer" =~ ^[Yy]$ ]]; then
    perform_rollback
    wait_for_rollback
    verify_rollback
    echo "✅ 回滚完成"
  else
    echo "❌ 回滚已取消"
    exit 0
  fi
}
main "$@"
🔍 故障排查指南
🐛 常见问题诊断
bash
#!/bin/bash
# troubleshoot.sh
# Collects diagnostics for the MCP server: pods, services, configuration,
# resource usage and events.
# Usage: troubleshoot.sh [NAMESPACE]
#
# The shebang has to be the very first line of the file to take effect.
# "set -e" is deliberately not enabled: individual checks may fail and the
# remaining ones should still run.

# Namespace to inspect (first argument, default "mcp-system").
NAMESPACE="${1:-mcp-system}"

echo "🔍 MCP服务器故障排查..."
#######################################
# List the MCP server pods and, for each pod whose phase is not Running,
# print its description and the last 50 log lines.
# Globals: NAMESPACE (read)
# NOTE: the selection is based on the pod phase only, so a pod whose phase is
# still Running while its containers keep restarting is not reported here.
#######################################
check_pods() {
  echo "📋 检查Pod状态..."
  kubectl get pods -n "$NAMESPACE" -l app=mcp-server

  # Collect the names of pods that are not in the Running phase into a local
  # array: nothing leaks into the global scope and names are never re-split.
  local -a failed_pods=()
  local pod
  while IFS= read -r pod; do
    if [[ -n "$pod" ]]; then
      failed_pods+=("$pod")
    fi
  done < <(
    kubectl get pods -n "$NAMESPACE" -l app=mcp-server \
      --field-selector=status.phase!=Running --no-headers \
      | awk '{print $1}'
  )

  if (( ${#failed_pods[@]} == 0 )); then
    echo "✅ 所有Pod运行正常"
    return 0
  fi

  echo "❌ 发现异常Pod:"
  for pod in "${failed_pods[@]}"; do
    echo " - $pod"
    kubectl describe pod "$pod" -n "$NAMESPACE"
    echo "日志:"
    kubectl logs "$pod" -n "$NAMESPACE" --tail=50
  done
}
#######################################
# List the Services and their endpoints in the namespace.
# Globals: NAMESPACE (read)
#######################################
check_services() {
  echo "🌐 检查Service状态..."
  kubectl get svc -n "$NAMESPACE"
  # Endpoints
  kubectl get endpoints -n "$NAMESPACE"
}
#######################################
# List the ConfigMaps and Secrets in the namespace.
# Globals: NAMESPACE (read)
#######################################
check_configs() {
  echo "⚙️ 检查配置..."
  kubectl get configmap -n "$NAMESPACE"
  kubectl get secret -n "$NAMESPACE"
}
#######################################
# Show resource usage of the pods in the namespace and of the cluster nodes.
# Globals: NAMESPACE (read)
# NOTE: "kubectl top" depends on the cluster's metrics API being available.
#######################################
check_resources() {
  echo "📊 检查资源使用..."
  kubectl top pods -n "$NAMESPACE"
  kubectl top nodes
}
#######################################
# List the events in the namespace, sorted by creation timestamp.
# Globals: NAMESPACE (read)
#######################################
check_events() {
  echo "📰 检查集群事件..."
  kubectl get events -n "$NAMESPACE" --sort-by='.metadata.creationTimestamp'
}
#######################################
# Entry point: run every diagnostic check in order.
#######################################
main() {
  local check
  for check in \
    check_pods \
    check_services \
    check_configs \
    check_resources \
    check_events; do
    "$check"
  done
}
main
🎯 本节小结
通过本节学习,你已经掌握了:
✅ Docker最佳实践:多阶段构建、安全配置、镜像优化
✅ Kubernetes部署:完整的K8s资源配置和编排
✅ 自动扩缩容:HPA和VPA配置与调优
✅ 存储管理:PV/PVC配置和存储类选择
✅ 部署自动化:完整的部署和回滚脚本
✅ 故障排查:系统化的问题诊断方法
🤔 思考题
- 资源规划:如何根据业务需求合理规划Pod的资源请求和限制?
- 高可用设计:如何设计跨可用区的高可用部署架构?
- 安全加固:除了现有配置,还有哪些Kubernetes安全最佳实践?
🔗 相关资源
恭喜! 🎉 你已经掌握了MCP服务器的容器化部署。下一节我们将学习如何构建完整的CI/CD流水线。