先补充一个小知识,GIL全局解释器锁:
全局解释器锁(Global Interpreter Lock)是计算机程序设计语言解释器用于同步线程的工具,使得在同一进程内任何时刻仅有一个线程在执行。常见例子有CPython(Jython不使用GIL)与Ruby MRI。
简言之:
由于GIL的存在,同一进程内的多个线程无法真正并行:任一时刻只有一个线程在执行字节码,多线程只能利用一个CPU核心。因此CPU密集型任务无法靠多线程提速;但线程在等待网络等I/O时会释放GIL,所以像爬虫这种I/O密集型任务仍然能从多线程中受益。
好了,来看正题。
首先我们先用之前的办法同步下载:
import re
import requests
from lxml import etree
from urllib import request
import os
def parse_page(url):
    """Fetch one doutula.com list page and download every non-gif image on it.

    ``url`` is a list-page URL; images are saved under a local folder using
    the image's ``alt`` text (sanitized) as the file name.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"}
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)  # parse the page into an lxml element tree
    # XPath: every <img> in the content area whose class is not "gif"
    imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
    # urlretrieve sends no User-Agent by default; install an opener so the
    # image host sees a browser-like request and does not reject us.
    opener = request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36')]
    request.install_opener(opener)
    save_dir = r'C:\python38\new project\biaoqing'
    os.makedirs(save_dir, exist_ok=True)  # don't crash when the folder is missing
    for img in imgs:
        img_url = img.get('data-original')
        alt = img.get('alt')
        if not img_url or alt is None:
            # Some <img> tags lack the lazy-load URL or a caption; skip them
            # instead of crashing on None.
            continue
        alt = re.sub(r'[??.。,!!::*]', "", alt)  # strip chars illegal in Windows file names
        suffix = os.path.splitext(img_url)[1]
        filename = alt + suffix
        request.urlretrieve(img_url, os.path.join(save_dir, filename))
        print(filename + '下载成功')
def main():
    """Crawl the configured range of list pages sequentially (single-threaded)."""
    base = 'http://www.doutula.com/photo/list/?page=%d'
    for page in range(1, 2):
        parse_page(base % page)


if __name__ == '__main__':
    main()
发现下载的可慢了,然后来看看多线程吧!!!速度马上就提上来了!
import re
import requests
from lxml import etree
from urllib import request
import os
from queue import Queue
import threading
class Procuder(threading.Thread):
    """Producer thread: pops list-page URLs off ``page_queue``, scrapes each
    page, and pushes ``(img_url, filename)`` pairs onto ``img_queue``.
    """

    # Browser-like UA so the site does not reject the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"}

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; remaining args go to threading.Thread."""
        super(Procuder, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # URLs of list pages still to scrape
        self.img_queue = img_queue    # (img_url, filename) pairs for consumers

    def run(self):
        from queue import Empty
        while True:
            try:
                # Non-blocking get. The old empty()-then-get() pattern is racy
                # with several producer threads: another thread can take the
                # last URL between the check and the get, leaving this thread
                # blocked forever on an empty queue.
                url = self.page_queue.get(block=False)
            except Empty:
                break  # no pages left -> this producer is done
            self.parse_page(url)

    def parse_page(self, url):
        """Scrape one list page and enqueue every non-gif image found on it."""
        response = requests.get(url, headers=self.headers)
        text = response.text
        html = etree.HTML(text)  # parse into an lxml element tree
        # XPath: every <img> in the content area whose class is not "gif"
        imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
        # urlretrieve sends no User-Agent by default; install an opener so the
        # image host sees a browser-like request.
        opener = request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36')]
        request.install_opener(opener)
        for img in imgs:
            img_url = img.get('data-original')
            alt = img.get('alt')
            if not img_url or alt is None:
                # Some <img> tags lack the lazy-load URL or a caption; skip
                # them instead of crashing on None.
                continue
            alt = re.sub(r'[??.。,!!::*]', "", alt)  # strip chars illegal in Windows file names
            suffix = os.path.splitext(img_url)[1]
            filename = alt + suffix
            self.img_queue.put((img_url, filename))
class Consumer(threading.Thread):
    """Consumer thread: pops ``(img_url, filename)`` pairs off ``img_queue``
    and downloads each image to disk.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; remaining args go to threading.Thread."""
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # watched only to decide when to stop
        self.img_queue = img_queue    # work items produced by Procuder threads

    def run(self):
        from queue import Empty
        while True:
            try:
                # Bounded wait instead of the old empty()-then-get(): that
                # pattern could deadlock (another consumer grabs the last item
                # between the check and the blocking get) or exit too early
                # (both queues momentarily empty while producers still work).
                img_url, filename = self.img_queue.get(timeout=5)
            except Empty:
                if self.page_queue.empty():
                    break  # no pages left and nothing arrived -> producers done
                continue  # producers still scraping; keep waiting
            request.urlretrieve(img_url, r'C:\python38\new project\biaoqing/' + filename)
            print(filename + '下载完成')
def main():
    """Seed the page queue, start producer/consumer threads, and wait for them.

    50 list-page slots and 1000 image slots bound memory use; 5 producers
    scrape pages while 5 consumers download images concurrently.
    """
    page_queue = Queue(50)
    img_queue = Queue(1000)
    for x in range(1, 10):
        url = 'http://www.doutula.com/photo/list/?page=%d' % x
        page_queue.put(url)
    threads = []
    for _ in range(5):
        t = Procuder(page_queue, img_queue)
        t.start()
        threads.append(t)
    for _ in range(5):
        t = Consumer(page_queue, img_queue)
        t.start()
        threads.append(t)
    # Join every worker so main() doesn't return while downloads are still
    # in flight (the original never waited on its threads).
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
来看看我保存的海量表情包:

再也不怕斗不过图了!!
想不想像我一样每天提升一点点!快来关注互粉和我一起学爬虫吧!记录我的小白爬虫之路。
此处再次鸣谢B站up主神奇的老黄的爬虫学习视频。
链接在此:
https://www.bilibili.com/video/av44518113?p=73
本文介绍了一种利用多线程技术加速网络爬虫的方法,通过对比单线程与多线程下载效率,展示了如何使用Python实现多线程爬虫,有效提升了数据抓取速度。

被折叠的 条评论
为什么被折叠?



