背景:
海量数据任务的并发处理,IO较为频繁,所以采用多线程的方式进行处理
方案:
一个主线程进行任务的分发,另外再独立创建一定数量的并发工作线程
代码:
# -*- coding: utf-8 -*-
__author__ = 'jasonliu'
'''
由于是IO密集型,采用多线程的方式进行处理
注意本版本代码仅仅适用于python3
'''
import hashlib
import json
import time
import datetime
import threading
import requests
import codecs
import queue#和python2不同
from threading import Thread
import re
# Regular expressions applied to the fetched lyric text by loop_worker.
# In each pattern group(1) captures the value that gets written out.
# Lyricist line: an optional "作", then "词", a colon, then the name.
lyricer_pat = r'[作]{0,1}词[::]{1}(.+)'
# Composer line: an optional "作", then "曲", a colon, then the name.
comp_pat = r'[作]{0,1}曲[::]{1}(.+)'
# Signature tag of the form "[mywriting:<content>]".
sign_pat = r'\[mywriting:([^\]]+)\]'
# Locks that serialize writes to the two shared output files.
# mu guards composer_lyrics_file; sign_mu guards sign_file.
mu = threading.Lock()
sign_mu = threading.Lock()
MS = 20  # number of worker threads to start
# Bounded work queue shared by the producer (run) and the workers; with
# maxsize=20 a put() blocks while 20 items are already waiting.
wq = queue.Queue(maxsize=20)
# Output files are opened (and truncated, mode 'w') when the module is loaded.
composer_lyrics_file = codecs.open("composer_lyrics.txt", 'w', 'utf-8')  # lyricist / composer records
# Receives one "krcid<TAB>signature" record per matched signature tag.
sign_file = codecs.open("sign.txt", 'w', 'utf-8')
# Selects the endpoint loop_worker queries: "publish" uses the first URL,
# any other value uses the second.
net_class = "publish"
def loop_worker(timeout=10):
    """Worker loop: take lines from ``wq`` and record lyric metadata.

    Runs until a ``None`` sentinel is taken from the queue. Every other item
    is expected to be a tab-separated line whose first two fields are the
    krcid and the scid. For each line the lyric text is fetched over HTTP,
    then:

    * if the signature pattern matches, ``krcid<TAB>signature`` is appended
      to ``sign_file``;
    * if both the lyricist and the composer patterns match,
      ``krcid<TAB>scid<TAB>lyricist<TAB>composer`` is appended to
      ``composer_lyrics_file``.

    A malformed line or a failed request is reported and skipped, so one bad
    record cannot kill the thread. (A dead worker stops draining the bounded
    queue, which can leave the producer blocked in ``wq.put``.)

    Args:
        timeout: Seconds to wait for the HTTP request before giving up on
            the current line.
    """
    while True:
        line = wq.get()
        try:
            # Only the None sentinel means "shut down". An empty string is
            # just a blank input line and must not stop the worker.
            if line is None:
                break
            _process_line(line, timeout)
        finally:
            # Mark the item done only once it has actually been handled.
            wq.task_done()


def _process_line(line, timeout):
    """Fetch the lyric for one input line and write any extracted fields."""
    fields = line.split("\t")
    if len(fields) < 2:
        print("skip malformed line:", repr(line))
        return
    krcid, scid = fields[0], fields[1]

    if net_class == "publish":
        url = "http://you_ip1/id="
    else:
        url = "http://you_ip2/id="
    try:
        ret = requests.get(url + krcid, timeout=timeout).text
    except requests.RequestException as err:
        # Connection errors and timeouts are expected on a long crawl;
        # skip this record and keep the worker alive.
        print("request failed for krcid", krcid, ":", err)
        return

    lyricer_group = re.search(lyricer_pat, ret)
    com_group = re.search(comp_pat, ret)
    sign_group = re.search(sign_pat, ret)

    if sign_group:
        sign_content = sign_group.group(1).strip()
        # "with" releases the lock even if the write raises.
        with sign_mu:
            sign_file.write(krcid + "\t" + sign_content + "\n")
            sign_file.flush()

    # A record is only useful when both lyricist and composer are present.
    if lyricer_group and com_group:
        lyricer_man = lyricer_group.group(1).strip()
        com_man = com_group.group(1).strip()
        record = "\t".join([krcid, scid, lyricer_man, com_man]) + "\n"
        with mu:
            composer_lyrics_file.write(record)
            composer_lyrics_file.flush()
# Threads created by start_workers(), kept so stop_workers() can join them.
_worker_threads = []


def start_workers():
    """Start ``MS`` daemon threads, each running ``loop_worker``."""
    for _ in range(MS):
        t = Thread(target=loop_worker)
        # Daemon threads do not keep the process alive if the main thread dies.
        t.daemon = True
        t.start()
        _worker_threads.append(t)


def stop_workers():
    """Ask every worker to exit and wait until all of them have finished.

    One ``None`` sentinel is queued per worker. The sentinels sit behind any
    work already in the queue, so pending lines are processed first.
    """
    for _ in range(MS):
        wq.put(None)
    # Join instead of sleeping for a fixed time. The workers are daemon
    # threads, so any of them still busy when the main thread exits would be
    # killed mid-record and its output lost.
    for t in _worker_threads:
        t.join()
    del _worker_threads[:]
def run():
    """Feed every non-blank line of ``all_krcid_scid.txt`` to the workers.

    Starts the worker threads, queues each input line (without its trailing
    newline), then shuts the workers down. Begin and end timestamps are
    printed around the whole run.
    """
    print(datetime.datetime.now(), '===>begin')
    start_workers()
    with open('all_krcid_scid.txt', 'r') as fi:
        for line in fi:
            line = line.strip('\n')
            # Blank lines carry no krcid/scid pair. Never queue an empty
            # string: it is not a valid work item.
            if not line:
                continue
            print(line)
            # put() blocks while the bounded queue is full, which throttles
            # reading to the speed of the workers.
            wq.put(line)
    stop_workers()
    print(datetime.datetime.now(), '===>end')
if __name__ == '__main__':
    # Time the whole run and report the elapsed wall-clock seconds.
    started_at = time.time()
    run()
    finished_at = time.time()
    print('cost=', finished_at - started_at)
注意:在执行过程中,requests 请求有可能出现连接失败的情况,所以需要对这类异常做处理,比如使用 try/except 语句,以加强代码的稳健性。
在程序执行之后,如果是在 Windows 下,我们可以在任务管理器中通过设置显示"线程数"这一列(原文此处为截图),
从而看到进程下的线程数。当然也可以采用 tasklist 命令查看。