多线程适用于 IO 密集型任务。
多进程适用于 CPU 密集型任务。
join() 方法会阻塞主线程,直到对应的子线程运行结束后主线程才继续执行。
import time
import threading
def get_list_html() -> None:
    """Simulate fetching the list page.

    Prints a start message, sleeps two seconds in place of a real network
    request, then prints a completion message.
    """
    simulated_latency = 2  # seconds; stands in for the network round trip
    print('开始获取列表页')
    time.sleep(simulated_latency)
    print('已经获取到列表页')
def get_detail_html() -> None:
    """Simulate fetching the detail page.

    Prints a start message, sleeps two seconds in place of a real network
    request, then prints a completion message.
    """
    simulated_latency = 2  # seconds; stands in for the network round trip
    print('开始获取详情页')
    time.sleep(simulated_latency)
    print('已经获取到详情页')
# Run both fetchers in parallel threads and time the whole thing.
start_time = time.time()
t1 = threading.Thread(target=get_list_html)
t2 = threading.Thread(target=get_detail_html)
for worker in (t1, t2):
    worker.start()
# join() makes the main thread wait for each worker before going on.
for worker in (t1, t2):
    worker.join()
print(time.time() - start_time)
守护线程:主线程不需要等待守护线程运行结束;只要主线程结束,守护线程就随之结束。
守护线程必须在线程启动(调用 start())之前设置。
# Daemon threads: the daemon flag has to be set before start(). It is passed
# to the constructor because Thread.setDaemon() is deprecated (Python 3.10+).
t1 = threading.Thread(target=get_list_html, daemon=True)
t2 = threading.Thread(target=get_detail_html, daemon=True)
t1.start()
t2.start()
# No join() here: the main thread does not wait for the daemon threads.
print(time.time() - start_time)
继承 threading.Thread 并重写 run() 方法的写法适用于逻辑较复杂的场景。
import time
import threading
def get_list_html() -> None:
    """Pretend to download the list page.

    Emits a start message, blocks for two seconds as a stand-in for the
    network request, and then emits a completion message.
    """
    request_seconds = 2  # simulated duration of the network request
    print('开始获取列表页')
    time.sleep(request_seconds)
    print('已经获取到列表页')
def get_detail_html() -> None:
    """Pretend to download the detail page.

    Emits a start message, blocks for two seconds as a stand-in for the
    network request, and then emits a completion message.
    """
    request_seconds = 2  # simulated duration of the network request
    print('开始获取详情页')
    time.sleep(request_seconds)
    print('已经获取到详情页')
class GetListHtml(threading.Thread):
    """Thread subclass that simulates fetching the list page."""

    def run(self) -> None:
        """Thread body: print, sleep two seconds, print."""
        pause_seconds = 2  # stand-in for the network request
        print('开始获取列表页')
        time.sleep(pause_seconds)
        print('已经获取到列表页')
class GetDetailHtml(threading.Thread):
    """Thread subclass that simulates fetching the detail page.

    Named ``GetDetailHtml`` so it does not redefine (and silently replace)
    the list-page class ``GetListHtml``.
    """

    def run(self) -> None:
        """Thread body: print, sleep two seconds, print."""
        print('开始获取详情页')
        time.sleep(2)  # simulate the network request
        print('已经获取到详情页')


# Run one list-page thread and one detail-page thread in parallel and time them.
start_time = time.time()
t3 = GetListHtml()
t4 = GetDetailHtml()
t3.start()
t4.start()
t3.join()
t4.join()
print(time.time() - start_time)
传参:在多线程中使用类(继承 Thread)时,通过 __init__ 方法传参,这里 url 为参数。
class GetListHtml(threading.Thread):
    """List-page worker thread that is constructed with a URL."""

    def __init__(self, url):
        """Initialise the thread and remember ``url`` on the instance."""
        # Zero-argument super() is bound to this class itself, so it keeps
        # working even if the module-level name ``GetListHtml`` is later
        # rebound to a different class.
        super().__init__()
        self.url = url

    def run(self):
        """Thread body: print, sleep two seconds, print."""
        print('开始获取列表页')
        time.sleep(2)  # simulate the network request
        print('已经获取到列表页')
class GetDetailHtml(threading.Thread):
    """Detail-page worker thread that is constructed with a URL."""

    def __init__(self, url):
        """Initialise the thread and remember ``url`` on the instance."""
        # Zero-argument super() is bound to this class itself, so it keeps
        # working even if the module-level name ``GetDetailHtml`` is later
        # rebound to a different class.
        super().__init__()
        self.url = url

    def run(self):
        """Thread body: print, sleep two seconds, print."""
        print('开始获取详情页')
        time.sleep(2)  # simulate the network request
        print('已经获取到详情页')
多线程间的通信:
通过全局变量来通信:
import time
import threading
# Module-level list shared between the producer and consumer threads.
detail_url_list = []


class GetListHtml(threading.Thread):
    """Producer thread: appends simulated detail-page URLs to a shared list."""

    def __init__(self, url_list):
        """Initialise the thread and keep a reference to the shared ``url_list``."""
        # Zero-argument super() is bound to this class itself, so it keeps
        # working even if the module-level name ``GetListHtml`` is later
        # rebound to a different class.
        super().__init__()
        self.url_list = url_list

    def run(self):
        """Append 30 URLs to the shared list, one per second."""
        print('开始获取列表页')
        for i in range(30):
            time.sleep(1)  # simulate one request per URL
            url = f'www.baidu.com/{i}'
            self.url_list.append(url)
        print('已经获取到列表页')
class GetDetailHtml(threading.Thread):
    """Consumer thread: takes URLs off a shared list and simulates fetching them.

    ``run`` loops forever; it has no stop condition.
    """

    def __init__(self, url_list):
        """Initialise the thread and keep a reference to the shared ``url_list``."""
        # Zero-argument super() is bound to this class itself, so it keeps
        # working even if the module-level name ``GetDetailHtml`` is later
        # rebound to a different class.
        super().__init__()
        self.url_list = url_list

    def _pop_url(self):
        """Return the last URL in the shared list, or ``None`` if it is empty."""
        # Checking "is the list non-empty?" and then calling pop() as two
        # steps lets another consumer take the last item in between, which
        # would raise IndexError. Attempt the pop and handle the failure.
        try:
            return self.url_list.pop()
        except IndexError:
            return None

    def run(self):
        """Consume URLs from the shared list until the process exits."""
        print('开始获取详情页')
        while True:
            url = self._pop_url()
            if url is None:
                # Nothing to do yet: back off briefly instead of spinning.
                time.sleep(0.1)
                continue
            print(f'已经获取到详情页-{url}')
            time.sleep(2)  # simulate the network request
if __name__ == '__main__':
    # One producer and two consumers all share the module-level URL list.
    t1 = GetListHtml(detail_url_list)
    t2 = GetDetailHtml(detail_url_list)
    # t3 is a second consumer so that detail pages are fetched faster.
    t3 = GetDetailHtml(detail_url_list)
    for worker in (t1, t2, t3):
        worker.start()
Python 中多个线程直接读写全局变量不是线程安全的,
应改用线程安全的队列(queue.Queue):
import time
import threading
from queue import Queue,Empty
# Flag set to True by the producer when its run() finishes (or fails).
list_status = False


class GetListHtml(threading.Thread):
    """Producer thread: puts simulated detail-page URLs on a queue."""

    def __init__(self, url_queue: Queue):
        """Initialise the thread and keep a reference to ``url_queue``."""
        # Zero-argument super() is bound to this class itself, so it keeps
        # working even if the module-level name ``GetListHtml`` is later
        # rebound to a different class.
        super().__init__()
        self.url_queue = url_queue

    def run(self):
        """Put 10 URLs on the queue, one every half second, then set the flag."""
        global list_status
        print('开始获取列表页')
        try:
            for i in range(10):
                time.sleep(0.5)  # simulate one request per URL
                url = f'www.baidu.com/{i}'
                self.url_queue.put(url)
            print('已经获取到列表页')
        finally:
            # Set the flag even if producing failed, so code that polls
            # ``list_status`` does not wait forever.
            list_status = True
class GetDetailHtml(threading.Thread):
    """Consumer thread: takes URLs from a queue and simulates fetching them."""

    def __init__(self, url_queue):
        """Initialise the thread and keep a reference to ``url_queue``."""
        # Zero-argument super() is bound to this class itself, so it keeps
        # working even if the module-level name ``GetDetailHtml`` is later
        # rebound to a different class.
        super().__init__()
        self.url_queue = url_queue

    def run(self):
        """Consume URLs until the queue is empty and ``list_status`` is set."""
        print('开始获取详情页')
        while True:
            try:
                url = self.url_queue.get(timeout=1)
            except Empty:
                # Nothing arrived within the timeout: stop only once the
                # module-level ``list_status`` flag is true.
                print(list_status)
                if list_status:
                    print('详情页已抓完')
                    break
                continue
            try:
                print(f'已经获取到详情页-{url}')
                time.sleep(1)  # simulate the network request
            finally:
                # Every get() must be matched by task_done(); otherwise a
                # caller blocked in url_queue.join() is never released.
                self.url_queue.task_done()
if __name__ == '__main__':
    # Bounded queue (capacity 100) shared by one producer and two consumers.
    url_queue = Queue(100)
    t1 = GetListHtml(url_queue)
    t2 = GetDetailHtml(url_queue)
    t3 = GetDetailHtml(url_queue)
    for worker in (t1, t2, t3):
        worker.start()
join:Queue.join() 会阻塞调用它的线程(这里是主线程)。
队列内部有一个计数器,每次 put() 时加一,
每次调用 task_done() 时减一;当计数器为 0 时解除阻塞。
if __name__ == '__main__':
    url_queue = Queue(100)
    t1 = GetListHtml(url_queue)
    t2 = GetDetailHtml(url_queue)
    t3 = GetDetailHtml(url_queue)
    workers = (t1, t2, t3)
    for worker in workers:
        worker.start()
    # Wait for the producer and both consumers to finish.
    for worker in workers:
        worker.join()
    print(11111)
    # Queue.join() blocks until every item that was put() has been matched
    # by a task_done() call.
    url_queue.join()
在临界区代码的前后分别加上 lock.acquire() 和 lock.release(),通过上锁可以保证线程安全。
# Shared lock and the counter it protects.
lock = threading.Lock()
i = 0


def add_one(lock):
    """Increment the shared counter ``i`` while holding ``lock``."""
    global i
    # ``with lock:`` does lock.acquire() on entry and lock.release() on exit,
    # and still releases the lock if the guarded statement raises.
    with lock:
        i += 1


# The lock is handed to the worker through ``args``.
t1 = threading.Thread(target=add_one, args=(lock,))
RLock:可重入锁,同一个线程可以多次 acquire,但 release 的次数必须与 acquire 的次数相同。