# Register NVIDIA's libnvidia-container apt repository: the signing key is
# dearmored into a dedicated keyring and every "deb" entry of the published
# source list is pinned to that keyring through [signed-by=...].
# -f makes curl fail on an HTTP error instead of piping an error page into
# gpg / the apt source list.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
  | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
  && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
  | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
# Refresh the package index so the new repository is visible, then install.
# These need the same privileges as the keyring/source-list writes above.
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
vi /etc/docker/daemon.json
{
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "nvidia-container-runtime",
      "runtimeArgs": []
    }
  },
  "exec-opts": ["native.cgroupdriver=systemd"],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "1000m"
  },
  "max-concurrent-downloads": 500,
  "max-concurrent-uploads": 500,
  "live-restore": true
}
# Reload systemd unit files, then restart dockerd so that it re-reads
# /etc/docker/daemon.json (a bare `systemctl` would only list units).
systemctl daemon-reload && systemctl restart docker
# Pull the NeMo framework image from the Aliyun mirror registry.
docker pull registry.cn-beijing.aliyuncs.com/duanshuaixing-dockerhub/infra:nemo-infra-24.01.01.framework
```yaml
---
# TLS credentials that sla.sh hands to etcdctl through --cert / --key.
# The etcd-sla Deployment mounts this Secret read-only at /opt/etcd-key.
apiVersion: v1
kind: Secret
metadata:
  name: kubernetes-etcd-sla
  # Same namespace as the etcd-sla Deployment: a pod can only mount Secrets
  # that live in its own namespace.
  namespace: sre
data:
  # Placeholders: fill in the base64-encoded certificate and private key.
  server.crt:
  server.key:
---
# Cron schedule and probe script for the etcd SLA checker. The etcd-sla
# Deployment mounts crontab-list at /etc/cron.d/crontab-list and sla.sh at
# /tmp/sla.sh.
kind: ConfigMap
apiVersion: v1
metadata:
  name: etcd-sla-configmap
  # Same namespace as the etcd-sla Deployment: a pod can only mount ConfigMaps
  # that live in its own namespace.
  namespace: sre
data:
  # Runs the probe every minute with full tracing (-vx); the log is overwritten
  # on each run.
  # NOTE(review): files under /etc/cron.d normally carry a user field before
  # the command and this entry has none - confirm how the image loads it.
  # NOTE(review): a run can take longer than a minute in the worst case (four
  # etcdctl calls, each capped at 30s + 5s), so runs may overlap.
  crontab-list: |
    * * * * * bash -vx /tmp/sla.sh >/opt/sla.log 2>&1
  sla.sh: |
    #!/bin/bash
    # **********************************************************
    # * Author        : duanshuaixing
    # * Email         : duanshuaixing@gmail.com
    # * Last modified : 2021-10-30 20:48
    # * Filename      : sla.sh
    # * Description   : Check the SLA of the etcd cluster
    # **********************************************************

    ETCD_CLUSTER_NAME=kubernetes-etcd
    ETCD_MEMBER=https://10.10.12.139:2379,https://10.10.12.141:2379,https://10.10.12.142:2379
    ETCD_CERT=/opt/etcd-key/server.crt
    ETCD_CERT_KEY=/opt/etcd-key/server.key
    METRICS_FILE=/opt/sla/metrics
    # Not read anywhere below; it is only visible when the script is traced.
    Check_time=$(date +%Y/%m/%d-%T)

    mkdir -p /opt/sla
    # Make sure the file exists before the first check finishes, but do not
    # empty an existing one: the previous result stays readable while the
    # (potentially slow) etcdctl calls run.
    [ -e "$METRICS_FILE" ] || : > "$METRICS_FILE"

    # Post the alert text to a DingTalk robot as an actionCard message.
    # Globals: context (read) - message text.
    send_message_dingtalk() {
      local proxy_enabled=false
      local proxy_url=httpproxy:3128
      # Used verbatim as the request URL, so it must hold the full webhook URL.
      local webhook_url=xxxxxxxx
      # No trailing comma after the "text" member: that is invalid JSON.
      local payload="{\"actionCard\": {\"text\": \"$context\"},\"msgtype\": \"actionCard\"}"
      local -a proxy_args=()

      if [ "$proxy_enabled" = true ]; then
        proxy_args=(--proxy "$proxy_url")
      fi
      curl "${proxy_args[@]}" "$webhook_url" -H 'Content-Type: application/json' -d "$payload"
    }

    # Post the alert text to a Feishu robot as a plain text message.
    # Globals: context (read) - message text.
    #          FEISHU_TOKEN (read) - used verbatim as the request URL.
    # Returns: 1 without calling curl when FEISHU_TOKEN is empty or unset.
    send_message_feishu() {
      local proxy_enabled=false
      local proxy_url=httpproxy:3128
      #FEISHU_TOKEN=
      local webhook_url=${FEISHU_TOKEN:-}
      local payload="{\"msg_type\":\"text\", \"content\":{\"text\":\"$context\"}}"
      local -a proxy_args=()

      if [ -z "$webhook_url" ]; then
        echo "send_message_feishu: FEISHU_TOKEN is not set" >&2
        return 1
      fi
      if [ "$proxy_enabled" = true ]; then
        proxy_args=(--proxy "$proxy_url")
      fi
      curl "${proxy_args[@]}" "$webhook_url" -X POST -H "Content-Type: application/json" -d "$payload"
    }

    # Probe the etcd cluster and publish SLA_etcd_avail (1 = ok, 0 = failed).
    # The member/status/health tables are informational output only; the
    # metric is decided solely by the exit status of the final `get`.
    # Globals: ETCD_* and METRICS_FILE (read), context (written on failure).
    check_sla() {
      local avail
      local tmp_file="${METRICS_FILE}.tmp"
      # Every call is bounded: TERM after 30s, KILL 5s later.
      local -a etcdctl_cmd=(
        timeout -k 5s 30s /usr/local/bin/etcdctl
        --cert "$ETCD_CERT" --key "$ETCD_CERT_KEY"
        --endpoints "$ETCD_MEMBER" --insecure-skip-tls-verify
      )

      "${etcdctl_cmd[@]}" member list --write-out=table
      "${etcdctl_cmd[@]}" endpoint status --write-out=table
      "${etcdctl_cmd[@]}" endpoint health --write-out=table
      #"${etcdctl_cmd[@]}" put /infrae2e/e2e "infra e2e test"
      if "${etcdctl_cmd[@]}" get /infrae2e/e2e; then
        avail=1
      else
        avail=0
      fi

      # Write to a scratch file and rename it into place so a reader never
      # sees an empty or half-written metrics file.
      echo "SLA_etcd_avail{cluster=\"$ETCD_CLUSTER_NAME\"} $avail" > "$tmp_file" \
        && mv -f "$tmp_file" "$METRICS_FILE"

      if [ "$avail" -eq 0 ]; then
        context="SLA SLA掉落。集群名称$ETCD_CLUSTER_NAME 请及时处理!"
        #send_message_dingtalk
        #send_message_feishu
      fi
    }

    check_sla
---
# Scrapes the etcd-sla Service's /metrics endpoint every 90s.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app.kubernetes.io/managed-by: Helm
    team: infra
  name: etcd-sla
  # Same namespace as the etcd-sla Service: with no namespaceSelector set, a
  # ServiceMonitor only discovers Services in its own namespace.
  namespace: sre
spec:
  endpoints:
    - interval: 90s
      # Refers to the Service port by name, not by number.
      port: etcd-sla
      path: /metrics
  # NOTE(review): jobLabel names a label key on the target Service; the
  # etcd-sla Service carries no label with the key "etcd-sla" - confirm intent.
  jobLabel: etcd-sla
  selector:
    matchLabels:
      app: etcd-sla
---
# Exposes port 80 of the etcd-sla pods under the port name "etcd-sla", which
# is the name the etcd-sla ServiceMonitor's endpoint refers to.
apiVersion: v1
kind: Service
metadata:
  labels:
    app: etcd-sla
    cluster: etcd-sla
  name: etcd-sla
  namespace: sre
spec:
  ports:
    - port: 80
      protocol: TCP
      targetPort: 80
      name: etcd-sla
  selector:
    app: etcd-sla
---
# etcd SLA checker: three replicas, each with two containers that share the
# "sla-log" emptyDir mounted at /opt/sla.
#   - "server"  gets sla.sh, the crontab entry and the etcd certificates.
#   - "metrics" serves the content of /opt/sla over HTTP on port 80.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: etcd-sla
    cluster: etcd-sla
  name: etcd-sla
  namespace: sre
spec:
  replicas: 3
  selector:
    matchLabels:
      app: etcd-sla
      cluster: etcd-sla
  template:
    metadata:
      labels:
        app: etcd-sla
        cluster: etcd-sla
    spec:
      # Allow scheduling onto nodes tainted as control-plane.
      tolerations:
        - key: "node-role.kubernetes.io/control-plane"
          operator: "Exists"
          effect: "NoSchedule"
      affinity:
        # Only nodes that carry the control-plane role label are eligible.
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/control-plane
                    operator: Exists
        # At most one etcd-sla pod per node (hostname topology).
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: cluster
                    operator: In
                    values:
                      - etcd-sla
              topologyKey: "kubernetes.io/hostname"
      containers:
        - image: duanshuaixing02/databases-backup:v002
          name: metrics
          imagePullPolicy: IfNotPresent
          # Static file server rooted at /opt/sla, so GET /metrics returns the
          # file /opt/sla/metrics. SimpleHTTPServer and SocketServer are the
          # Python 2 module names, so "python" must resolve to Python 2.
          command: ["python"]
          args:
            - "-c"
            - |
              import os
              os.chdir('/opt/sla/')
              import SimpleHTTPServer
              import SocketServer
              Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
              httpd = SocketServer.TCPServer(('', 80), Handler)
              httpd.serve_forever()
          ports:
            - containerPort: 80
          volumeMounts:
            - name: sla-log
              mountPath: /opt/sla
        # No command/args here: this container runs whatever the image starts
        # by default.
        # NOTE(review): presumably a cron daemon that picks up
        # /etc/cron.d/crontab-list - confirm against the image.
        - image: duanshuaixing02/databases-backup:v002
          name: server
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - name: etcd-certs
              mountPath: "/opt/etcd-key"
              readOnly: true
            # subPath mounts place a single file instead of shadowing the
            # whole target directory.
            - name: sla-script-volume
              mountPath: /tmp/sla.sh
              subPath: sla.sh
            - name: crontab-list-volume
              mountPath: /etc/cron.d/crontab-list
              subPath: crontab-list
            - name: sla-log
              mountPath: /opt/sla
      volumes:
        - name: etcd-certs
          secret:
            secretName: kubernetes-etcd-sla
        - name: sla-script-volume
          configMap:
            name: etcd-sla-configmap
            items:
              - key: sla.sh
                path: sla.sh
        - name: crontab-list-volume
          configMap:
            name: etcd-sla-configmap
            items:
              - key: crontab-list
                path: crontab-list
        # Scratch space shared by both containers.
        - name: sla-log
          emptyDir:
            sizeLimit: 5Mi
```