觉得 Python 的线程池库用着不太顺手,就按自己的意思改了改。
去掉了poll方法,添加了线程池开始、返回结果队列等方法,结果队列的数据结构也改成了字典。
请求传参的方式也改了,原有的传参方式简直迷人眼。
以下,代码
# -*- coding:utf-8 -*-
import threading
try:
import queue
except ImportError:
import Queue as queue
class ParameterTypeError(TypeError):
    """Signal that an argument has the wrong type.

    ThreadPool.make_request raises it when the submitted object is not a
    ThreadPoolRequest.
    """
class ThreadPoolRequestException(Exception):
    """Signal that a request cannot be queued.

    ThreadPool.make_request raises it when the request's ``exception``
    flag is already set, i.e. an earlier run of it failed.
    """
# NOTE(review): not referenced by any code visible in this file; presumably a
# leftover from an earlier parameter-passing scheme -- confirm before removing.
ParameterListLength = 3
class ThreadPoolRequest:
    """A single unit of work to be submitted to a ThreadPool.

    Attributes:
        _callable: the function a worker invokes as ``_callable(*params)``.
        params: positional arguments for ``_callable``; any falsy value
            (``None``, ``()``, ``[]``) is replaced by a new empty list.
        _callback: optional function invoked with the result once
            ``_callable`` has returned, or ``None`` for no callback.
        exception: ``False`` on creation; workers set it to ``True`` when
            ``_callable`` or ``_callback`` raises.
        _id: ``id(self)``, used as the key of this request's result.
    """

    def __init__(self, _callable, params, _callback):
        # No type validation is performed on the two callables.
        self._callable = _callable
        self._callback = _callback
        self.params = params if params else []
        self.exception = False
        self._id = id(self)
class ThreadPool:
    """A resizable pool of worker threads fed from one shared request queue.

    Requests are ThreadPoolRequest objects added with make_request(). The
    workers store each successful result in ``resultsMap`` under the
    request's ``_id``.

    Attributes:
        requestsQueue: queue of pending requests shared by all workers.
        pollTime: seconds a worker blocks on the queue before re-checking
            whether it has been told to stop.
        resultsMap: dict shared with the workers, request ``_id`` -> result.
        threadList: the workers currently belonging to the pool.
        scrappedThreadList: workers removed by reduce_threads().
    """

    def __init__(self, threadSum, pollTime=2):
        """Create a pool of ``threadSum`` workers; they are not started yet."""
        self.requestsQueue = queue.Queue()
        self.pollTime = pollTime
        self.resultsMap = {}
        self.threadList = []
        self.scrappedThreadList = []
        self.__init(threadSum)

    def __init(self, num):
        """Create ``num`` new (unstarted) workers and add them to the pool."""
        for _ in range(num):
            self.threadList.append(
                Thread(self.requestsQueue, self.resultsMap, self.pollTime))

    def capacity(self):
        """Return the number of workers currently in the pool."""
        return len(self.threadList)

    def start(self):
        """Start every worker of the pool that has not been started yet.

        Safe to call repeatedly, e.g. again after increase_threads(): a
        thread may only be started once, so workers that already have a
        thread identifier are skipped instead of raising RuntimeError.
        """
        for worker in self.threadList:
            if worker.ident is None:
                worker.start()

    def set_daemon(self, flag):
        """Set the daemon flag of every worker in the pool.

        Must be called before start(); changing the flag of a running
        thread raises RuntimeError.
        """
        for worker in self.threadList:
            worker.daemon = flag

    def wait(self):
        """Block until every queued request has been processed."""
        self.requestsQueue.join()

    def reduce_threads(self, num):
        """Remove up to ``num`` workers from the pool and ask them to stop.

        A worker that is executing a request finishes it first, so the
        threads do not necessarily stop at once; use wait_reduce_threads()
        to block until they have.
        """
        for _ in range(min(num, len(self.threadList))):
            worker = self.threadList.pop()
            worker.delete()
            self.scrappedThreadList.append(worker)

    def wait_reduce_threads(self):
        """Block until all workers removed by reduce_threads() have stopped.

        Workers that were scrapped before ever being started are skipped,
        because joining an unstarted thread raises RuntimeError.
        """
        for worker in self.scrappedThreadList:
            if worker.ident is not None:
                worker.join()

    def increase_threads(self, num):
        """Add ``num`` new workers to the pool.

        The new workers are not started automatically; call start() again
        to launch them.
        """
        self.__init(num)

    def get_result_set(self, waitAllResuls=True):
        """Return the results dict (the live object, not a copy).

        Args:
            waitAllResuls: when true, block until every queued request has
                been processed before returning.
        """
        if waitAllResuls:
            self.requestsQueue.join()
        return self.resultsMap

    def make_request(self, target):
        """Queue ``target`` for execution by the workers.

        Raises:
            ParameterTypeError: ``target`` is not a ThreadPoolRequest.
            ThreadPoolRequestException: ``target`` is already flagged as
                having caused an exception.
        """
        if not isinstance(target, ThreadPoolRequest):
            raise ParameterTypeError('param must be ThreadPoolRequest')
        if target.exception:
            raise ThreadPoolRequestException('the request are processed and caused a exception')
        self.requestsQueue.put(target)
class Thread(threading.Thread):
    """Worker thread that executes requests taken from a shared queue.

    Args:
        requestsQueue: queue the worker takes request objects from. Each
            request provides ``_callable``, ``params``, ``_callback``,
            ``_id`` and a writable ``exception`` attribute.
        resultsMap: dict shared with the pool; successful results are
            stored in it under the request's ``_id``.
        pollTime: seconds to block on the queue before re-checking whether
            delete() has been called.
    """

    def __init__(self, requestsQueue, resultsMap, pollTime=5):
        threading.Thread.__init__(self)
        # Daemon by default, so idle workers do not keep the process alive.
        self.daemon = True
        self.requestsQueue = requestsQueue
        self.resultsMap = resultsMap
        # Set by delete() to ask the worker to stop.
        self.scrapped = threading.Event()
        self.pollTime = pollTime

    def run(self):
        """Process requests until delete() is called.

        For each request ``_callable(*params)`` is invoked. If it raises,
        the request's ``exception`` flag is set and no result is stored.
        Otherwise ``_callback`` (when not None) is called -- without
        arguments if the result is None, else with the result -- and the
        result is stored in ``resultsMap``. An exception raised by the
        callback only sets the ``exception`` flag; the result is stored
        anyway.
        """
        while not self.scrapped.is_set():
            try:
                target = self.requestsQueue.get(True, self.pollTime)
            except queue.Empty:
                # Nothing to do; loop to re-check the stop flag.
                continue
            if self.scrapped.is_set():
                # Told to stop while waiting: hand the request back to the
                # queue. put() counts it as a new unfinished task, so the
                # get() above must be balanced with task_done(), otherwise
                # the queue's join() would block forever.
                self.requestsQueue.put(target)
                self.requestsQueue.task_done()
                break
            try:
                result = target._callable(*target.params)
            except Exception:
                target.exception = True
                self.requestsQueue.task_done()
                continue
            if target._callback is not None:
                try:
                    if result is None:
                        target._callback()
                    else:
                        target._callback(result)
                except Exception:
                    target.exception = True
            # Publish the result BEFORE task_done(): a caller blocked in
            # the queue's join() may read resultsMap as soon as the last
            # task is marked done.
            self.resultsMap[target._id] = result
            self.requestsQueue.task_done()

    def delete(self):
        """Ask the worker to stop once it is no longer executing a request."""
        self.scrapped.set()
# Usage example
# Build one request per page number (1 .. int(sum), inclusive); `sum` is
# presumably the total page count -- it is defined outside this snippet.
for i in range(1, int(sum) + 1):
    taskList.append(
        threadpool.ThreadPoolRequest(
            start_reptile,
            [BEGIN_URL + '&page=' + str(i)],
            None,
        )
    )

# Queue every request first, then start the pool's workers.
for i in taskList:
    thread_pool_collect.make_request(i)

thread_pool_collect.start()