import os
import re
import threading
from queue import Queue
from urllib import request

import requests
from lxml import etree
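
# Producer threads parse the listing pages and push (image URL, filename)
# pairs into img_queue; consumer threads pull from img_queue and download
# the files into the local images/ directory.
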

class Producer(threading.Thread):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        # Keep references to the shared queues
        self.page_queue = page_queue
        self.img_queue = img_queue

    def parse_page(self, url):
        # Fetch the page and collect all non-gif <img> tags
        response = requests.get(url, headers=Producer.headers)
        response.encoding = response.apparent_encoding
        text = response.text
        html = etree.HTML(text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            # The real image URL sits in the lazy-load attribute
            img_url = img.get('data-original')
            # Build a filename from the alt text, stripping punctuation
            alt = img.get('alt')
            alt = re.sub(r'[./,!*?。,!]', '', alt)
            suffix = os.path.splitext(img_url)[1]
            filename = alt + suffix
            # Hand the (image URL, filename) pair to the consumers
            self.img_queue.put((img_url, filename))

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            # Take a page URL off the queue and parse it
            url = self.page_queue.get()
            self.parse_page(url)


class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        # Keep references to the shared queues
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            # Stop once the producers are done and no images remain; this check
            # is a simple heuristic and can race with a producer that is still
            # parsing its last page.
            if self.page_queue.empty() and self.img_queue.empty():
                break
            # Take an image off the queue and download it
            img_url, filename = self.img_queue.get()
            request.urlretrieve(img_url, 'images/' + filename)
            print(filename + ' downloaded!')


def main():
    # Build the shared queues
    page_queue = Queue(100)
    img_queue = Queue(1000)
    # Make sure the output directory exists before any download starts
    os.makedirs('images', exist_ok=True)
    # Queue up the URLs of the first 100 listing pages
    for i in range(1, 101):
        url = 'https://www.doutula.com/photo/list/?page=%d' % i
        page_queue.put(url)
    # Start the producer and consumer threads
    for i in range(10):
        t = Producer(page_queue, img_queue)
        t.start()
    for i in range(20):
        t = Consumer(page_queue, img_queue)
        t.start()


if __name__ == '__main__':
    main()