import multiprocessing
import time

import dill

def generate_simhash_obj(pname, text_list, que):
    simhash_obj_list = []
    for idx, text in text_list:
        ***
    print(f'process-{pname} done. len:{len(simhash_obj_list)}, first value:{simhash_obj_list[0]}')
    que.put(dill.dumps(simhash_obj_list))  # dill-serialize the batch before putting it on the queue
    print('put into queue.')
if __name__ == '__main__':
    m_k, m_process_num = 3, min(multiprocessing.cpu_count(), 8)
    m_split_text_list = split_data_by_pnum(m_text_list, m_process_num)
    print('split data done.')
    t3 = time.time()
    q = multiprocessing.Queue()
    jobs = []
    for i in range(m_process_num):
        p = multiprocessing.Process(target=generate_simhash_obj, args=(i, m_split_text_list[i], q))
        jobs.append(p)
        p.start()
    print('init pool done.')
    for p in jobs:
        p.join()  # hangs here
    print('join pool done.')
    simhash_objs_result = []
    for i in range(m_process_num):
        simhash_objs_result.extend(dill.loads(q.get()))
    print('get result from queue done.')
Here a Queue collects each worker process's results, which are then merged in the main process.
The result: the log output shows every worker finished normally, and debugging confirmed the data was indeed put into the queue. Yet the program got stuck at the join() step.
Searching turned up this: multiprocess.Queue.get() needs very long time in python - Stack Overflow
The gist is that a child process which has put data onto a Queue will not exit until that data has been consumed: the queue flushes items to the underlying pipe through a background feeder thread, and if the pipe buffer fills up, the child blocks on exit, so join() deadlocks.
So all data must be drained from the queue with queue.get() before calling join(); only then are all children guaranteed to finish and join() can return.
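Putting that advice into code, a minimal sketch of the drain-before-join fix (reusing the same q, jobs, and m_process_num from the first snippet above):

    # Drain the queue first: one get() per worker; only then join.
    simhash_objs_result = []
    for _ in range(m_process_num):
        simhash_objs_result.extend(dill.loads(q.get()))  # blocks until a worker puts its batch
    for p in jobs:
        p.join()  # safe now: every child's queue buffer has been flushed
    print('join pool done.')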
My actual solution, though, was to switch to multiprocessing.Manager().list():
def generate_simhash_obj(pname, text_list, obj_list):
    simhash_obj_list = []
    for idx, text in text_list:
        ***
    print(f'process-{pname} done. len:{len(simhash_obj_list)}, first value:{simhash_obj_list[0]}')
    # dill-serialize each object before appending it to the shared manager list
    obj_list.extend([dill.dumps(so) for so in simhash_obj_list])
if __name__ == '__main__':
    m_k, m_process_num = 3, min(multiprocessing.cpu_count(), 8)
    m_split_text_list = split_data_by_pnum(m_text_list, m_process_num)
    print('split data done.')
    t3 = time.time()
    m = multiprocessing.Manager()
    simhash_objs_result = m.list()
    jobs = []
    for i in range(m_process_num):
        p = multiprocessing.Process(target=generate_simhash_obj, args=(i, m_split_text_list[i], simhash_objs_result))
        jobs.append(p)
        p.start()
    print('init pool done.')
    for p in jobs:
        p.join()
    print('join pool done.')
    # deserialize the dill-dumped bytes collected in the manager list
    simhash_objs_result = [dill.loads(so) for so in simhash_objs_result]
    print('get result from manager list done.')
This way the original flow is preserved, and the main process reaches join() normally.
One more small point: the serialization error raised during multiprocessing happens because the default pickle cannot handle these custom-class objects. As the second code sample above shows, serialize them with dill first and then append the resulting bytes.
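A minimal, self-contained sketch of that dill round-trip; SimhashObj here is a hypothetical stand-in for the real custom class (defined inside a function, a case the default pickle rejects but dill handles):

import dill

def make_simhash_obj(fingerprint):
    # Classes defined inside a function are a classic case the default
    # pickle refuses ("Can't pickle local object") but dill can serialize.
    class SimhashObj:  # hypothetical stand-in for the real custom class
        def __init__(self, fp):
            self.fingerprint = fp
    return SimhashObj(fingerprint)

obj = make_simhash_obj(0x1F2E3D4C)
payload = dill.dumps(obj)        # bytes: safe to send through a Queue or manager list
restored = dill.loads(payload)   # rebuild the object on the receiving side
assert restored.fingerprint == obj.fingerprint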