#!/usr/bin/python3
# encoding: utf-8
#filename: process-check-self-healing.py
#author: gaohaixiang
#writetime:202206141535
import re
import shlex
import subprocess
import time
"""
# 使用注意事项:
涉及多项检测及自愈功能,按实际需求更改脚本
连续多次监测到cpu,内存大于某个值,重启
或者检测到服务不是active,重启服务,根据实际情况进行定制
假如一份服务有多个进程,则将多个进程的使用率加起来,计算该服务的总使用率做为服务的使用率
此脚本适用一个服务多个进程,一个服务一个进程
不适合几个服务几个进程,否则会将多个服务全部杀死重启
脚本中进程与服务区别:
进程需要使用命令进行杀死及启动
服务可以使用 systemctl 命令停止及启动,如 systemctl restart httpd
top取值说明:
top -d 2 -n 3
取值三次,每次间隔2秒,mem和cpu取的值除以3,获取3次得平均值
"""
def TOP_monitor(topCMD):
    """Run a top command line through the shell and return what it printed.

    Args:
        topCMD: Complete shell command string, e.g. "top -n 1".

    Returns:
        The text captured by subprocess.getoutput for that command.
    """
    return subprocess.getoutput(topCMD)
def TOP_MEM_monitor(topCMDresult, PRO_NAME):
    """Sum the %MEM column of every top row that ends with PRO_NAME.

    Rows are expected in top's default column order: PID, USER, PR, NI, VIRT,
    RES, SHR, S, %CPU, %MEM, TIME+, COMMAND. When the top output contains
    several iterations, the readings of all iterations are added together;
    the caller is responsible for averaging.

    Args:
        topCMDresult: Raw text output of a top run.
        PRO_NAME: Process name, matched literally against the end of each
            stripped line.

    Returns:
        The summed %MEM values as a float; 0.0 when no row matches.
    """
    mem_column = 9  # 0-based position of %MEM in the row layout above
    total = 0.0
    for line in topCMDresult.split("\n"):
        # The name is plain text, not a regular expression, so a literal
        # suffix test is used (names such as "g++" must not be parsed as regex).
        if not line.strip().endswith(PRO_NAME):
            continue
        fields = line.split()
        # Skip rows that merely end with the name but are not process rows.
        if len(fields) <= mem_column:
            continue
        try:
            total += float(fields[mem_column])
        except ValueError:
            # Non-numeric cell (e.g. a header row): ignore instead of crashing.
            continue
    return total
def TOP_CPU_monitor(topCMDresult, PRO_NAME):
    """Sum the %CPU column of every top row that ends with PRO_NAME.

    Rows are expected in top's default column order: PID, USER, PR, NI, VIRT,
    RES, SHR, S, %CPU, %MEM, TIME+, COMMAND. When the top output contains
    several iterations, the readings of all iterations are added together;
    the caller is responsible for averaging.

    Args:
        topCMDresult: Raw text output of a top run.
        PRO_NAME: Process name, matched literally against the end of each
            stripped line.

    Returns:
        The summed %CPU values as a float; 0.0 when no row matches.
    """
    cpu_column = 8  # 0-based position of %CPU in the row layout above
    total = 0.0
    for line in topCMDresult.split("\n"):
        # The name is plain text, not a regular expression, so a literal
        # suffix test is used (names such as "g++" must not be parsed as regex).
        if not line.strip().endswith(PRO_NAME):
            continue
        fields = line.split()
        # Skip rows that merely end with the name but are not process rows.
        if len(fields) <= cpu_column:
            continue
        try:
            total += float(fields[cpu_column])
        except ValueError:
            # Non-numeric cell (e.g. a header row): ignore instead of crashing.
            continue
    return total
def PRO_NUM_check(PRO_NAME):
    """Count the `ps -ef` lines that mention PRO_NAME.

    The count comes from `ps -ef | grep NAME | grep -v grep | wc -l`, so every
    process whose ps line contains the name is counted, not only exact command
    matches.

    Args:
        PRO_NAME: Text to look for in the ps listing. It is shell-quoted, so
            spaces or shell metacharacters in the name cannot alter the
            pipeline.

    Returns:
        The number of matching lines as an int.

    Raises:
        ValueError: If the pipeline output is not a plain integer.
    """
    # "--" stops grep from treating a name that starts with "-" as an option.
    PRO_NUM_CMD = "ps -ef |grep -- %s |grep -v grep | wc -l" % shlex.quote(PRO_NAME)
    return int(subprocess.getoutput(PRO_NUM_CMD))
def PRO_system_check(PRO_NAME):
    """Return the state word from the "Active:" line of `systemctl status`.

    The pipeline prints the second whitespace-separated token of the line
    containing "Active:" (for a running unit this is "active").

    Args:
        PRO_NAME: systemd unit name. It is shell-quoted so that unit names
            containing characters special to the shell (for example
            backslash escapes) reach systemctl unchanged.

    Returns:
        The captured pipeline output as a string.
    """
    PRO_system_check_cmd = (
        "systemctl status %s |grep 'Active:'|awk '{print $2}'"
        % shlex.quote(PRO_NAME)
    )
    return subprocess.getoutput(PRO_system_check_cmd)
def PRO_CMD_start(PRO_CMD_start_cmd):
    """Run a start command through the shell.

    Args:
        PRO_CMD_start_cmd: Complete shell command that starts the process.

    Returns:
        The (exit_code, output) tuple from subprocess.getstatusoutput, in
        that order.
    """
    return subprocess.getstatusoutput(PRO_CMD_start_cmd)
def PRO_system_restart(PRO_NAME):
    """Restart a systemd unit with `systemctl restart`.

    Args:
        PRO_NAME: systemd unit name. It is shell-quoted so that unit names
            containing characters special to the shell reach systemctl
            unchanged.

    Returns:
        The (exit_code, output) tuple from subprocess.getstatusoutput, in
        that order. Note this is NOT the unit state; use PRO_system_check
        to read the state after a restart.
    """
    PRO_system_restart_cmd = "systemctl restart %s" % shlex.quote(PRO_NAME)
    exit_code, output = subprocess.getstatusoutput(PRO_system_restart_cmd)
    return exit_code, output
def PRO_CMD_kill(PRO_NAME):
    """Send SIGKILL to every process whose `ps -ef` line mentions PRO_NAME.

    WARNING: matching is by substring of the whole ps line, so any process
    whose command line contains the name is killed - including this script
    itself if its own command line contains PRO_NAME.

    Args:
        PRO_NAME: Text to look for in the ps listing. It is shell-quoted, so
            spaces or shell metacharacters in the name cannot alter the
            pipeline.

    Returns:
        The (exit_code, output) tuple from subprocess.getstatusoutput, in
        that order.
    """
    # "--" stops grep from treating a name that starts with "-" as an option.
    PRO_CMD_kill_cmd = (
        "ps -ef |grep -- %s|grep -v grep|awk '{print $2}'|xargs kill -9"
        % shlex.quote(PRO_NAME)
    )
    exit_code, output = subprocess.getstatusoutput(PRO_CMD_kill_cmd)
    return exit_code, output
def GET_CPU_NUM_total():
    """Count the lines of /proc/cpuinfo that contain "processor".

    Returns:
        The count as the string printed by `wc -l` (callers convert it with
        int()).
    """
    return subprocess.getoutput("cat /proc/cpuinfo |grep processor|wc -l")
def GET_MEM_NUM_total():
    """Read the MemTotal value from /proc/meminfo.

    Returns:
        The second field of the "MemTotal:" line, as the string printed by
        the shell pipeline.
    """
    return subprocess.getoutput(
        "cat /proc/meminfo |grep 'MemTotal:'|awk '{print $2}'"
    )
if __name__ == '__main__':
    # One self-healing pass for a single service:
    #   1. make sure the systemd unit is active,
    #   2. make sure enough processes are running,
    #   3. restart the service if its averaged CPU share exceeds the limit,
    #   4. restart the service if its averaged memory share exceeds the limit.
    starttime = time.time()
    print("Process is running...")

    # Number of samples top takes. The monitors return the SUM over all
    # samples, so the readings are divided by this same number to average.
    top_samples = 3
    # -b (batch mode) makes top write plain text meant for other programs;
    # samples are taken 2 seconds apart.
    topCMD = "top -b -d 2 -n %d" % top_samples
    # Process / systemd unit name
    PRO_NAME = "httpd"
    # Minimum number of processes that must be running
    pro_num = 1
    # Command used to start the process (replace for non-systemd processes)
    PRO_CMD_start_cmd = "systemctl start httpd"
    # Maximum share of total system CPU the service may use (0.9 == 90%)
    Limit_cpu_used_total = 0.9
    # Maximum share of total system memory the service may use (0.9 == 90%)
    Limit_mem_used_total = 0.9

    # Service check: while the unit is not "active", keep restarting it.
    # The state must be read with PRO_system_check; PRO_system_restart only
    # returns an (exit_code, output) tuple, which never equals 'active'.
    while PRO_system_check(PRO_NAME) != 'active':
        PRO_system_restart(PRO_NAME)

    # Process-count check: while fewer than pro_num processes are found,
    # keep issuing the start command.
    while PRO_NUM_check(PRO_NAME) < pro_num:
        PRO_CMD_start(PRO_CMD_start_cmd)

    # CPU check: average %CPU over the samples, normalised by the number of
    # CPUs (top reports up to 100% per CPU).
    topCMDresult = TOP_monitor(topCMD)
    PRO_CPU_imformations = TOP_CPU_monitor(topCMDresult, PRO_NAME) / top_samples
    cpu_num_output = int(GET_CPU_NUM_total())
    if PRO_CPU_imformations / (cpu_num_output * 100) > Limit_cpu_used_total:
        PRO_system_restart(PRO_NAME)
        while PRO_system_check(PRO_NAME) != 'active':
            PRO_system_restart(PRO_NAME)
        # For a plain process (not a systemd unit) use this instead:
        #     PRO_CMD_kill(PRO_NAME)
        #     while PRO_NUM_check(PRO_NAME) < pro_num:
        #         PRO_CMD_start(PRO_CMD_start_cmd)

    # Memory check: top's %MEM is already a percentage of total memory
    # (0-100), so it is divided by 100 before comparing with the 0-1 limit.
    topCMDresult = TOP_monitor(topCMD)
    PRO_MEM_imformations = TOP_MEM_monitor(topCMDresult, PRO_NAME) / top_samples
    if PRO_MEM_imformations / 100 > Limit_mem_used_total:
        PRO_system_restart(PRO_NAME)
        while PRO_system_check(PRO_NAME) != 'active':
            PRO_system_restart(PRO_NAME)
        # For a plain process (not a systemd unit) use this instead:
        #     PRO_CMD_kill(PRO_NAME)
        #     while PRO_NUM_check(PRO_NAME) < pro_num:
        #         PRO_CMD_start(PRO_CMD_start_cmd)

    endtime = time.time()
    # Elapsed wall-clock time of this pass, in seconds
    print(endtime - starttime)
# Process monitoring and resource-limit self-healing
# First published 2022-06-14 15:40:10