实验室电脑原来的1080在训练COCO数据集时烧坏了,搞了2周终于放弃治疗,返厂维修。同时换了一台1080ti继续跑。为了随时监控1080ti的温度,我用Python写了一个实时监控GPU温度的脚本,分享如下。
# !/usr/bin/python
# -*- coding: utf-8 -*-
import time
import os, shutil
import smtplib
from email.mime.text import MIMEText
import datetime
# Seconds to sleep between two consecutive temperature polls.
pause = 100
# Addresses that receive the warning e-mail.
mailto_list = ['zcy0016@163.com']
# SMTP server used to send the warning e-mail.
mail_host = "smtp.163.com"
# Local part of the mailbox that sends the alert; also used as the SMTP login.
mail_user = "GPU_Monitor"
# SMTP authorization code of the sending mailbox -- NOT the web login password.
# Read from the GPU_MONITOR_SMTP_PASS environment variable when it is set, so
# the real secret does not have to be written into this file; the original
# placeholder text is kept as the fallback value.
mail_pass = os.environ.get(
    "GPU_MONITOR_SMTP_PASS",
    "自己注册一个吧老哥们,简单的一比吊糟,记得开启STMP服务,设置STMP密码",
)
# Domain appended to mail_user to build the sender address.
mail_postfix = "163.com"
def send_email(to_list, sub, content):
    """Send a plain-text e-mail through the configured SMTP account.

    Args:
        to_list: List of recipient addresses.
        sub: Subject line of the message.
        content: Plain-text body of the message.

    Returns:
        True when the message was handed to the SMTP server; False when
        ``to_list`` is empty or any step of building/sending the message
        failed (the reason is printed).
    """
    if not to_list:
        # Nothing to deliver, so do not open a connection at all.
        print("send error!!! no recipients given")
        return False

    me = "GPU Auto Monitor" + "<" + mail_user + "@" + mail_postfix + ">"
    msg = MIMEText(content, _subtype='plain')
    msg['Subject'] = sub
    msg['From'] = me
    # Address lists in mail headers are comma-separated; ';' is not a valid
    # separator there.
    msg['To'] = ", ".join(to_list)

    server = None
    try:
        server = smtplib.SMTP()
        server.connect(mail_host)  # open the connection to the SMTP server
        server.login(mail_user, mail_pass)  # authenticate
        server.sendmail(me, to_list, msg.as_string())
        return True
    except Exception as exc:
        # Deliberately broad: failure is reported through the return value
        # instead of an exception, but the reason is no longer hidden.
        print("send error!!! " + str(exc))
        return False
    finally:
        # Release the socket on the failure paths as well as on success.
        if server is not None:
            server.close()
def get_gpu_tem():
    """Return the temperature of the hottest GPU reported by ``nvidia-smi``.

    The value comes from nvidia-smi's machine-readable query output
    (``temperature.gpu``, one bare number per GPU) instead of scraping the
    human-readable table, so it does not depend on where a ``%`` or ``C``
    character happens to appear in that table. When several GPUs are
    present the highest reading is returned, so an overheating card is not
    missed just because it is not listed first.

    Returns:
        float: the highest GPU temperature reported.

    Raises:
        ValueError: if ``nvidia-smi`` cannot be executed, exits with an
            error, or prints no numeric temperature.
    """
    # Local import: subprocess is not among this file's top-level imports.
    import subprocess

    cmd = [
        "nvidia-smi",
        "--query-gpu=temperature.gpu",
        "--format=csv,noheader,nounits",
    ]
    try:
        output = subprocess.check_output(cmd, universal_newlines=True)
    except (OSError, subprocess.CalledProcessError) as exc:
        # Keep the single failure type (ValueError) callers already had to
        # expect when the old parsing produced an empty string.
        raise ValueError("cannot read GPU temperature: " + str(exc))

    temps = []
    for line in output.splitlines():
        try:
            temps.append(float(line.strip()))
        except ValueError:
            # Non-numeric line (presumably a placeholder for a GPU without a
            # readable sensor -- confirm against real output); skip it.
            continue

    if not temps:
        raise ValueError(
            "nvidia-smi reported no GPU temperature: " + repr(output))
    return max(temps)
# Alert threshold; the value is unchanged from the original inline literal.
# NOTE(review): 20 looks like a leftover test value -- a warning will be sent
# on practically every poll; confirm the intended limit.
TEMP_THRESHOLD = 20


def main():
    """Poll the GPU temperature forever and e-mail a warning when it is high.

    Every ``pause`` seconds the temperature is read; when it exceeds
    ``TEMP_THRESHOLD`` a timestamped warning is printed and mailed to
    ``mailto_list``. A failed reading is reported and the loop carries on,
    so one bad poll does not terminate the monitor.
    """
    while True:
        try:
            tem_num = get_gpu_tem()
        except Exception as exc:
            # Top-level boundary: report the problem and retry on the next
            # poll. KeyboardInterrupt is not an Exception, so Ctrl-C still
            # stops the script immediately.
            print("failed to read GPU temperature: " + str(exc))
        else:
            if tem_num > TEMP_THRESHOLD:
                nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                warning_str = (nowTime + " Current temperature is "
                               + str(tem_num) + "!!!")
                print(warning_str)
                # Only claim success when the mail was actually sent.
                if send_email(mailto_list, "GPU Warning!!!", warning_str):
                    print("send over")
        time.sleep(pause)


# Run the monitor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()