nvidia

# Register NVIDIA's libnvidia-container apt repository: store the dearmored
# signing key, then write the source list with a signed-by pointer to that key.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
  | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
  | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
# Refresh the package index and install the toolkit (with sudo, like the
# repository setup above, so the steps also work for a non-root user).
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit

vi /etc/docker/daemon.json
{
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "nvidia-container-runtime",
      "runtimeArgs": []
    }
  },
  "exec-opts": [
    "native.cgroupdriver=systemd"
  ],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "1000m"
  },
  "max-concurrent-downloads": 500,
  "max-concurrent-uploads": 500,
  "live-restore": true
}

# Reload systemd unit files, then restart Docker so the edited
# /etc/docker/daemon.json takes effect.
systemctl daemon-reload && systemctl restart docker

 

 

 

# Pull the "infra" image, tag nemo-infra-24.01.01.framework, from the
# registry.cn-beijing.aliyuncs.com registry.
docker pull registry.cn-beijing.aliyuncs.com/duanshuaixing-dockerhub/infra:nemo-infra-24.01.01.framework

 

 

 

```yaml

---
# Secret holding the etcd client certificate and key. The etcd-sla Deployment
# mounts it read-only at /opt/etcd-key.
apiVersion: v1
kind: Secret
metadata:
  name: kubernetes-etcd-sla
  # Same namespace as the Deployment that references this Secret.
  namespace: sre
data:
  # Placeholders: fill in the base64-encoded certificate and private key.
  server.crt: ""
  server.key: ""

---
# ConfigMap carrying the cron schedule and the etcd SLA probe script. The
# etcd-sla Deployment mounts both keys into its "server" container
# (/etc/cron.d/crontab-list and /tmp/sla.sh).
kind: ConfigMap
apiVersion: v1
metadata:
  name: etcd-sla-configmap
  # Same namespace as the Deployment that mounts this ConfigMap.
  namespace: sre
data:
  # Runs the probe every minute; each run overwrites /opt/sla.log.
  # NOTE(review): this file is mounted under /etc/cron.d, where system crontab
  # entries normally carry a user field before the command - confirm how the
  # image loads this file.
  crontab-list: |
    * * * * * bash -vx /tmp/sla.sh >/opt/sla.log 2>&1
  # Probe script: queries the etcd cluster and writes the SLA_etcd_avail gauge
  # (1 = available, 0 = unavailable) to /opt/sla/metrics.
  sla.sh: |
    #!/bin/bash
    # **********************************************************
    # * Author : duanshuaixing
    # * Email : duanshuaixing@gmail.com
    # * Last modified : 2021-10-30 20:48
    # * Filename : sla.sh
    # * Description : check the SLA of the etcd cluster
    # **********************************************************
    ETCD_CLUSTER_NAME=kubernetes-etcd
    ETCD_MEMBER=https://10.10.12.139:2379,https://10.10.12.141:2379,https://10.10.12.142:2379
    ETCD_CERT=/opt/etcd-key/server.crt
    ETCD_CERT_KEY=/opt/etcd-key/server.key
    Check_time=`date +%Y/%m/%d-%T`

    # Start every run from an empty metrics file.
    mkdir -p /opt/sla
    >/opt/sla/metrics

    # Post $context as an actionCard message; DINGTALK_TOKEN is the URL given to curl.
    send_message_dingtalk(){
        HTTP_PROXY_ENABLED=false
        HTTP_PROXY=httpproxy:3128
        DINGTALK_TOKEN=xxxxxxxx
        if [ "$HTTP_PROXY_ENABLED" = true ];then
            curl --proxy "$HTTP_PROXY" "$DINGTALK_TOKEN" -H 'Content-Type: application/json' -d "{\"actionCard\": {\"text\": \"$context\"},\"msgtype\": \"actionCard\"}"
        else
            curl "$DINGTALK_TOKEN" -H 'Content-Type: application/json' -d "{\"actionCard\": {\"text\": \"$context\"},\"msgtype\": \"actionCard\"}"
        fi
    }

    # Post $context as a text message; FEISHU_TOKEN (the URL given to curl)
    # must be set below before this function is enabled.
    send_message_feishu(){
        HTTP_PROXY_ENABLED=false
        HTTP_PROXY=httpproxy:3128
        #FEISHU_TOKEN=

        if [ "$HTTP_PROXY_ENABLED" = true ];then
            curl --proxy "$HTTP_PROXY" "$FEISHU_TOKEN" -X POST -H "Content-Type: application/json" -d "{\"msg_type\":\"text\", \"content\":{\"text\":\"$context\"}}"
        else
            curl "$FEISHU_TOKEN" -X POST -H "Content-Type: application/json" -d "{\"msg_type\":\"text\", \"content\":{\"text\":\"$context\"}}"
        fi
    }

    # Run one etcdctl subcommand against all members, bounded by a 30s timeout
    # (SIGKILL 5s later if it still has not exited).
    run_etcdctl(){
        timeout -k 5s 30s /usr/local/bin/etcdctl --cert "$ETCD_CERT" --key "$ETCD_CERT_KEY" --endpoints "$ETCD_MEMBER" --insecure-skip-tls-verify "$@"
    }

    check_sla(){
        # Diagnostic output only; these results do not affect the metric.
        run_etcdctl member list --write-out=table
        run_etcdctl endpoint status --write-out=table
        run_etcdctl endpoint health --write-out=table
        #run_etcdctl put /infrae2e/e2e "infra e2e test"

        # The exit status of this read decides the SLA value.
        if run_etcdctl get /infrae2e/e2e; then
            echo "SLA_etcd_avail{cluster=\"$ETCD_CLUSTER_NAME\"} 1" >> /opt/sla/metrics
        else
            echo "SLA_etcd_avail{cluster=\"$ETCD_CLUSTER_NAME\"} 0" >> /opt/sla/metrics
            context="SLA SLA掉落。集群名称$ETCD_CLUSTER_NAME 请及时处理!"
            #send_message_dingtalk
            #send_message_feishu
        fi
    }

    check_sla

---
# ServiceMonitor: scrape the "etcd-sla" port of Services labelled
# app=etcd-sla at /metrics every 90s.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app.kubernetes.io/managed-by: Helm
    team: infra
  name: etcd-sla
  # Same namespace as the etcd-sla Service: with no namespaceSelector set,
  # only Services in the ServiceMonitor's own namespace are selected.
  namespace: sre
spec:
  endpoints:
  - interval: 90s
    port: etcd-sla
    path: /metrics
  # NOTE(review): jobLabel names a label key on the Service, and the Service
  # in this file has no "etcd-sla" label key - confirm the intended key.
  jobLabel: etcd-sla
  selector:
    matchLabels:
      app: etcd-sla

---
# Service exposing port 80 of the pods labelled app=etcd-sla under the named
# port "etcd-sla".
apiVersion: v1
kind: Service
metadata:
  name: etcd-sla
  namespace: sre
  labels:
    app: etcd-sla
    cluster: etcd-sla
spec:
  selector:
    app: etcd-sla
  ports:
  - name: etcd-sla
    protocol: TCP
    port: 80
    targetPort: 80

---
# Deployment for the etcd SLA probe. Each pod has two containers sharing the
# "sla-log" emptyDir at /opt/sla:
#   - "server": gets the etcd certs, the probe script and the cron file mounted.
#   - "metrics": serves the contents of /opt/sla over HTTP on port 80.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: etcd-sla
    cluster: etcd-sla
  name: etcd-sla
  namespace: sre
spec:
  replicas: 3
  selector:
    matchLabels:
      app: etcd-sla
      cluster: etcd-sla
  template:
    metadata:
      labels:
        app: etcd-sla
        cluster: etcd-sla
    spec:
      # Tolerate the control-plane NoSchedule taint.
      tolerations:
      - key: "node-role.kubernetes.io/control-plane"
        operator: "Exists"
        effect: "NoSchedule"
      affinity:
        # Only nodes carrying the control-plane role label are eligible.
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: node-role.kubernetes.io/control-plane
                operator: Exists
        # Never place two pods labelled cluster=etcd-sla on the same host.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: cluster
                operator: In
                values:
                - etcd-sla
            topologyKey: "kubernetes.io/hostname"
      containers:
      - image: duanshuaixing02/databases-backup:v002
        name: metrics
        imagePullPolicy: IfNotPresent
        # Static file server rooted at /opt/sla, so the "metrics" file written
        # by the probe is reachable at /metrics. SimpleHTTPServer and
        # SocketServer are Python 2 module names.
        command: ["python"]
        args:
        - "-c"
        - |
          import os
          os.chdir('/opt/sla/')
          import SimpleHTTPServer
          import SocketServer
          Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
          httpd = SocketServer.TCPServer(('', 80), Handler)
          httpd.serve_forever()
        ports:
        - containerPort: 80
        volumeMounts:
        - name: sla-log
          mountPath: /opt/sla
      # No command is set here, so this container runs the image's default
      # entrypoint. NOTE(review): presumably that starts cron - confirm.
      - image: duanshuaixing02/databases-backup:v002
        name: server
        imagePullPolicy: IfNotPresent
        volumeMounts:
        - name: etcd-certs
          mountPath: "/opt/etcd-key"
          readOnly: true
        - name: sla-script-volume
          mountPath: /tmp/sla.sh
          subPath: sla.sh
        - name: crontab-list-volume
          mountPath: /etc/cron.d/crontab-list
          subPath: crontab-list
        - name: sla-log
          mountPath: /opt/sla
      volumes:
      - name: etcd-certs
        secret:
          secretName: kubernetes-etcd-sla
      - name: sla-script-volume
        configMap:
          name: etcd-sla-configmap
          items:
          - key: sla.sh
            path: sla.sh
      - name: crontab-list-volume
        configMap:
          name: etcd-sla-configmap
          items:
          - key: crontab-list
            path: crontab-list
      # Scratch space shared by both containers for the metrics file.
      - name: sla-log
        emptyDir:
          sizeLimit: 5Mi

```

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值