今天测试时刚好发现一个网卡软中断不均衡的问题。之前处理过类似情况,但一直没有总结,今天稍作总结,并写了一个自动化绑定脚本。至于什么是软中断、网卡硬件中断队列、RFS、RPS 这些概念,这里不做具体解释,感兴趣可自行 Google。
问题现象:
1. 网卡软中断不平衡,集中在一个CPU核心上(mpstat 查看%soft集中,通常是cpu0)
2. 网卡的硬件中断队列数不足(小于 CPU 核心数),无法一对一绑定,导致部分 CPU 核心的 %soft 较低,CPU 使用不均衡
解决办法:
情况1: 绑定网卡中断,通常是和CPU 一对一绑定
情况2: 开启RFS,RPS
最终效果:
mpstat -P ALL 2
cpu 线程利用率
附上tuneNetSoft.py 脚本内容,主要是用于绑定软中断和开启RPS,RFS
#!/usr/bin/env python
#python version < 3
#net irq_smp_affinity && rps,rfs setting
#version 1.0
#author: pylt
import re
from os import system,popen
from os import walk as walkdir
from optparse import OptionParser
# Value written to every path collected in rps_cpus_list when RPS is
# switched on (see set_rfs_rps).
RPS_CPUS_VALUE = 'ffffffff'
# Value written to every path collected in rps_flow_list when RFS is
# switched on (see set_rfs_rps).
RPS_FLOW_VALUE = '4096'
# Value written to both kinds of path when RPS/RFS are switched off
# (see unset_rfs_rps).
RPS_RFS_DEFAULT = '0'
# File parsed by set_irq_balance() for the CPU count and the NIC IRQ numbers.
interrupts_file = '/proc/interrupts'
# Module-level accumulators filled by get_rfs_rps_file(); nothing in this
# script ever clears them, so they grow with every device that is scanned.
rps_cpus_list = []
rps_flow_list = []
# NOTE(review): unused; presumably intended for
# /proc/sys/net/core/rps_sock_flow_entries -- confirm before enabling.
#ENTRY_VALUE=32768
def get_device():
    """Return the interface names found in the output of ``ifconfig``.

    A name is a run of lowercase letters followed by digits (for example
    ``eth0``) that is followed by whitespace and the word ``Link``.  Names
    without a trailing digit, such as ``lo``, do not match the pattern.

    Returns:
        list of str: the matched names in output order; empty when nothing
        matches or ``ifconfig`` produces no output.
    """
    pipe = popen('ifconfig')
    try:
        output = pipe.read()
    finally:
        # Close explicitly so the pipe's descriptor is released and the
        # child process is reaped instead of lingering until interpreter exit.
        pipe.close()
    return re.findall(r'([a-z]+\d+)\s+Link.*', output)
def get_rfs_rps_file(net_device):
    """Record the RPS/RFS control files of one network device.

    Walks ``/sys/class/net/<net_device>/queues/`` and, for every directory
    that contains both ``rps_cpus`` and ``rps_flow_cnt``, appends the two
    paths to the module-level ``rps_cpus_list`` and ``rps_flow_list``.

    The files are picked by name rather than by their position in the
    directory listing, because the listing order returned by ``os.walk`` is
    arbitrary and a positional pick could swap the two files.  Paths that
    are already recorded are skipped so repeated scans do not pile up
    duplicates.

    Args:
        net_device: interface name, e.g. ``'eth0'``.  A device without a
            ``queues`` directory simply contributes nothing.
    """
    rps_path = '/sys/class/net/' + net_device + '/queues/'
    for dirpath, _dirnames, filenames in walkdir(rps_path):
        if 'rps_cpus' not in filenames or 'rps_flow_cnt' not in filenames:
            continue
        cpus_file = '/'.join([dirpath, 'rps_cpus'])
        flow_file = '/'.join([dirpath, 'rps_flow_cnt'])
        if cpus_file not in rps_cpus_list:
            rps_cpus_list.append(cpus_file)
        if flow_file not in rps_flow_list:
            rps_flow_list.append(flow_file)
def file_hander(TARGET,VALUE='0'):
    """Overwrite the file *TARGET* with the string *VALUE*.

    Args:
        TARGET: path of the file to write; it is opened in ``'w'`` mode, so
            any previous content is replaced.
        VALUE: text to write; defaults to ``'0'``.

    Raises:
        IOError: if *TARGET* cannot be opened or written.
    """
    # ``with`` only closes a handle that was actually opened, so a failing
    # open() reports its own error instead of tripping over an unbound
    # variable in a cleanup clause.
    with open(TARGET, 'w') as f_hander:
        f_hander.write(VALUE)
def set_rfs_rps(net_device):
    """Switch RPS and RFS on for *net_device*.

    Scans the device with ``get_rfs_rps_file`` and then writes
    ``RPS_CPUS_VALUE`` to every path in ``rps_cpus_list`` and
    ``RPS_FLOW_VALUE`` to every path in ``rps_flow_list``.

    Both lists are module-level and cumulative, so paths recorded for
    devices scanned earlier are written again with the same values.

    Args:
        net_device: interface name, e.g. ``'eth0'``.
    """
    get_rfs_rps_file(net_device)
    # Explicit loops: the writes are side effects, and a lazy map() would
    # never perform them.
    for cpus_file in rps_cpus_list:
        file_hander(cpus_file, RPS_CPUS_VALUE)
    for flow_file in rps_flow_list:
        file_hander(flow_file, RPS_FLOW_VALUE)
def unset_rfs_rps(net_device):
    """Switch RPS and RFS off for *net_device*.

    Scans the device with ``get_rfs_rps_file`` and then writes
    ``RPS_RFS_DEFAULT`` to every path in ``rps_cpus_list`` and in
    ``rps_flow_list``.

    Both lists are module-level and cumulative, so paths recorded for
    devices scanned earlier are reset again as well.

    Args:
        net_device: interface name, e.g. ``'eth0'``.
    """
    get_rfs_rps_file(net_device)
    # Explicit loops: the writes are side effects, and a lazy map() would
    # never perform them.
    for cpus_file in rps_cpus_list:
        file_hander(cpus_file, RPS_RFS_DEFAULT)
    for flow_file in rps_flow_list:
        file_hander(flow_file, RPS_RFS_DEFAULT)
def _irq_affinity_mask(cpu_index):
    """Return the hex affinity mask string that selects only CPU *cpu_index*."""
    # '%x' never appends the 'L' suffix that hex() adds to Python 2 longs.
    digits = '%x' % (1 << cpu_index)
    # The kernel's mask format is a comma-separated list of 32-bit words
    # (8 hex digits each), most significant word first; a single word wider
    # than 32 bits is rejected, so wide masks must be split.
    groups = []
    while digits:
        groups.insert(0, digits[-8:])
        digits = digits[:-8]
    return ','.join(groups)


def set_irq_balance():
    """Pin each ``eth?-*`` interrupt to its own CPU, round-robin.

    Stops the irqbalance service, reads ``interrupts_file`` and, for every
    row whose last column is longer than five characters and starts with
    ``eth``, one more character and ``-``, writes a
    single-CPU mask to ``/proc/irq/<irq>/smp_affinity``.  The first matching
    IRQ goes to CPU 0, the next to CPU 1, and so on, wrapping back to CPU 0
    once every CPU listed in the header row has been used.

    Raises:
        IOError: if ``interrupts_file`` cannot be read or an
            ``smp_affinity`` file cannot be written.
    """
    system('service irqbalance stop')
    with open(interrupts_file) as interrupts_ct:
        lines = interrupts_ct.readlines()
    if not lines:
        return
    # The header row holds one column per CPU (CPU0 CPU1 ...).
    cores_nr = len(lines[0].split())
    irq_bit = 0
    for inter_line in lines[1:]:
        js = inter_line.split()
        if not js:
            # Blank line: nothing to index into.
            continue
        irq_name = js[-1]
        if len(irq_name) > 5 and re.match(r'eth.-', irq_name[:5]):
            # First column is "<irq>:"; drop the trailing colon.
            irq_nr = js[0][:-1]
            TARGET = '/proc/irq/%s/smp_affinity' % (irq_nr)
            file_hander(TARGET, _irq_affinity_mask(irq_bit))
            irq_bit += 1
            if irq_bit == cores_nr:
                irq_bit = 0
def unset_irq_balance():
    """Start the irqbalance service via ``service irqbalance start``.

    Only the service is started; this function does not itself rewrite any
    ``smp_affinity`` file.
    """
    system('service irqbalance start')
def usage():
    """Print the command-line help text to standard output."""
    # Built under a separate local name so the function's own name is not
    # shadowed; the trailing empty item keeps the final newline of the text.
    help_text = '\n'.join([
        '=================================================',
        'Description: irq_balance_set && rfs_rps_set tools',
        'Usage:',
        '<script> -i : set irq smp_affinity',
        '-I : unset irq smp_affinity',
        '-r : set rfs && rps',
        '-R : unset rfs && rps',
        '',
    ])
    # Single-argument call form prints identically on Python 2 and 3.
    print(help_text)
if __name__ == '__main__':
    # Four independent on/off switches; each defaults to False.
    parser = OptionParser()
    parser.add_option("-i", action="store_true",
                      dest="irq_true",
                      default=False,
                      help="set irq smp_affinity")
    parser.add_option("-I", action="store_true",
                      dest="irq_false",
                      default=False,
                      help="unset irq smp_affinity")
    parser.add_option("-r", action="store_true",
                      dest="rps_true",
                      default=False,
                      help="set rfs && rps")
    parser.add_option("-R", action="store_true",
                      dest="rps_false",
                      default=False,
                      help="unset rfs && rps")
    (options, args) = parser.parse_args()
    # Only the first matching switch is acted on, in the order -i, -I, -r, -R;
    # with no switch the usage text is shown.
    if options.irq_true:
        set_irq_balance()
        print("irq_balance_set successfully")
    elif options.irq_false:
        unset_irq_balance()
        print("unset irq balance successfully")
    elif options.rps_true:
        # Plain loop: the per-device calls are side effects, which a lazy
        # map() would never execute.
        for device in get_device():
            set_rfs_rps(device)
        print("rfs&&rps configured successfully")
    elif options.rps_false:
        for device in get_device():
            unset_rfs_rps(device)
        print("unconfigured rfs&&rps successfully")
    else:
        usage()
这篇博客总结了处理网卡软中断不均衡问题的经验,包括问题的现象(软中断集中在单个CPU核心,硬件中断队列不足)以及解决办法(中断一对一绑定,启用RFS和RPS)。提供了一个自动化绑定脚本来优化CPU使用平衡。
1521

被折叠的 条评论
为什么被折叠?



