Sequel:
1. Background
Picking up from the previous post: the threading module puts no cap on how many threads it creates, and once too many are spawned at the same time the script starts throwing errors and the collected data ends up incomplete. The fix is to limit how many threads the collector opens at once.
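For illustration, a minimal repro sketch of the failure mode (the loop count is made up and the exact ceiling depends on OS limits, so don't run this on a machine you care about): with no cap, thread creation eventually fails, typically with RuntimeError: can't start new thread, and any work not yet started is simply lost.
import threading
import time

def worker():
    time.sleep(60)              # hold each thread open so they pile up

threads = []
try:
    for i in range(100000):     # far more threads than most systems allow
        t = threading.Thread(target=worker)
        t.start()
        threads.append(t)
except RuntimeError as e:       # typically "can't start new thread"
    print('failed after %d threads: %s' % (len(threads), e))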
2. Solution
Modify the script to enforce a thread-count limit, running at most 300 threads concurrently while collecting the version data.
vim get_version.py
import paramiko
import requests
import datetime
import redis
import json
import os
import subprocess
from threading import Timer
import logging
import threading
import socket
logger = logging.getLogger('ktzlogger')
class GetVersion(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def par_ver(self, host0, app_name):
        # SSH to the group's first host and grep its deploy script for the
        # jfrog artifact path to extract the currently deployed version tag.
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=host0, port=2222, username='dc')
        stdin, stdout, stderr = client.exec_command("awk '/jfrog/' /data/scripts/deploy_%s.sh | tail -1 | awk '{print $4}' | awk -F/ '{print $4}'" % (app_name))
        out = stdout.read().decode('utf-8')
        err = stderr.read().decode('utf-8')
        if out == '':
            out = '0'    # no version found; fall back to a sentinel value
        client.close()
        return out.strip()
    def chaoshi(self, args, timeout):
        # Run a command with a hard timeout: a Timer kills the child process
        # if it is still running after `timeout` seconds; any output the
        # child managed to produce counts as success.
        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        timer = Timer(timeout, lambda process: process.kill(), [p])
        try:
            timer.start()
            stdout, stderr = p.communicate()
            return stdout != b''
        finally:
            timer.cancel()
    def worker(self, host0, app_name, objs, i, center, hosts_str, env, threadmax):
        # Release the semaphore slot on every exit path, including the
        # early return below, so a failed host can never leak a slot.
        try:
            addr = socket.gethostbyname(socket.getfqdn(socket.gethostname()))
            if addr == '192.168.89.133':
                ver = 'v1'    # short-circuit when running on this particular machine
            else:
                # Probe the host's port first; record unreachable hosts and skip them.
                result = self.chaoshi(['telnet', host0, '53742'], 2)
                if result == False:
                    os.system('echo %s >> /tmp/hosts_questions.txt' % (host0))
                    return
                ver = self.par_ver(host0, app_name)
            now = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
            objs.append({'v_id': i, 'v_gps': center, 'v_proj': app_name, 'v_tag': ver, 'v_hosts': hosts_str, 'v_time': now, 'v_env': env})
        finally:
            threadmax.release()
    def main(self):
        os.system('rm -f /tmp/hosts_questions.txt')
        # Pull the full host inventory (Ansible dynamic-inventory style JSON).
        all_keys = requests.get("http://172.16.3.100:10082/assets/inventory/--list/None/")
        all_objs = all_keys.json()
        i = 1
        objs = []
        gps = []
        threads = []
        env_list = []
        # At most 300 workers in flight: the loop below blocks on acquire()
        # once the limit is reached, and each worker release()s when done.
        threadmax = threading.BoundedSemaphore(300)
        for item in all_objs:
            if item == 'all' or item == '_meta':
                continue
            # Derive env / center / app_name from the inventory group name.
            if 'ktz_data_apps' in item:
                env = item.split('_ktz_data_apps_')[0]
                center = 'ktz_data_apps'
                app_name = item.split('_ktz_data_apps_')[-1]
            elif 'ktz_m' in item:
                env = item.split('_ktz_m_')[0]
                center = 'ktz_m'
                app_name = item.split('_ktz_m_')[-1]
            else:
                env = item.split('_')[0]
                center = item.split('_')[1]
                app_name = item.split('_')[-1]
            if env not in env_list:
                env_list.append(env)
            if center not in gps:
                gps.append(center)
            hosts = all_objs[item]['hosts']
            hosts_str = ','.join(hosts) + ','    # comma-joined list with a trailing comma
            host0 = hosts[0]
            # Acquire a slot only for groups that actually start a worker,
            # so skipped entries never consume one.
            threadmax.acquire()
            t = threading.Thread(target=self.worker, args=(host0, app_name, objs, i, center, hosts_str, env, threadmax))
            t.start()
            threads.append(t)
            i += 1
        for t in threads:
            t.join()
        print(objs)
        print(env_list)
        # Store everything as one JSON blob in Redis for the web side to read.
        red = redis.Redis(host='localhost', port=6379, db=1)
        data = {'gps': gps, 'deploys': objs}
        red.set('versions', json.dumps(data))

if __name__ == '__main__':
    gv = GetVersion()
    gv.main()
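A side note on the chaoshi helper: on Python 3.3+ the same effect can be had without a Timer, because Popen.communicate accepts a timeout and raises subprocess.TimeoutExpired when the child overruns. A minimal sketch under that assumption (port_open is a hypothetical name, not part of the script above):
import subprocess

def port_open(args, timeout):
    # Hypothetical replacement for chaoshi: kill the child on timeout,
    # then apply the same "any stdout means success" check.
    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        stdout, stderr = p.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        p.kill()
        stdout, stderr = p.communicate()    # reap the child and collect its buffered output
    return stdout != b''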
A test run finishes in roughly 3 minutes 15 seconds, and no data is lost.
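To double-check that nothing was dropped, the stored blob can be read back from Redis; a quick verification sketch (the key name and db number match the script above):
import json
import redis

red = redis.Redis(host='localhost', port=6379, db=1)
data = json.loads(red.get('versions'))
print(len(data['deploys']), 'deploy records')   # expect one record per inventory group
print(data['gps'])                              # the collected center list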
3. Summary
Multithreading is handy, but mind the thread count: spawn too many and the script errors out, losing the data those threads were collecting. When that happens, restarting celery and supervisord brings the collector back.
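For reference, assuming celery runs as a supervisord-managed program (the program name below is hypothetical; use whatever your supervisord config defines), the restart boils down to:
supervisorctl restart celery    # restart the celery worker program
supervisorctl status            # confirm everything is RUNNING again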