# -*- coding: utf-8 -*-
"""
Created on Thu Dec 19 18:59:58 2019
@author: m
"""
import os
import urllib.request  # urllib.request must be imported explicitly for urlretrieve
import requests
from lxml import etree
from multiprocessing import Pool
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
}


def parse_page(url):
    """Fetch one list page and return two parallel lists: image URLs and titles."""
    response = requests.get(url, headers=headers)  # headers must be passed by keyword, not as params
    html = etree.HTML(response.text)
    lists = html.xpath('//div[@id="bqb"]/div[@class="ui segment imghover"]/div[@class="tagbqppdiv"]')
    srcs = []
    titles = []
    for li in lists:
        src = li.xpath('./a/img/@data-original')[0]
        title = li.xpath('./a/@title')[0]
        srcs.append(src)
        titles.append(title)
    return srcs, titles
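

# A minimal robustness sketch, not part of the original flow: adding a timeout and
# a status check keeps worker processes from hanging on slow or missing pages.
# The helper name fetch_html is an assumption, not something the script defines or calls.
def fetch_html(url, timeout=10):
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
    return response.text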


def download(srcs, titles):
    """Save each image in srcs under file_path, named after its title."""
    file_path = 'D:/book/imgews'
    os.makedirs(file_path, exist_ok=True)  # make sure the target folder exists
    r = 0
    try:
        for src in srcs:
            print(src)
            file_suffix = os.path.splitext(src)[1]
            file_name = titles[r][0:3]  # use the first three characters of the title as the file name
            # drop characters that are not allowed in Windows file names
            file_name = file_name.replace('?', '').replace(':', '').replace('*', '').replace('"', '')
            filename = '{}{}{}{}'.format(file_path, os.sep, file_name, file_suffix)
            print(filename)
            urllib.request.urlretrieve(src, filename=filename)
            r += 1
    except IOError as e:
        print("IOError:", e)


if __name__ == "__main__":
    results = []
    p = Pool()
    for i in range(1, 101):  # list pages on the site are numbered from 1
        url = "https://www.fabiaoqing.com/biaoqing/lists/page/{}.html".format(i)
        print("Downloading:", url)
        results.append(p.apply_async(parse_page, (url,)))
    p.close()
    p.join()

    p = Pool()
    for res in results:
        srcs, titles = res.get()  # each result is a (srcs, titles) pair of lists
        p.apply_async(download, (srcs, titles))
    p.close()  # without close/join the script can exit before the downloads finish
    p.join()
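
# Hedged single-page smoke test (assumption, not part of the original script):
# fetching one list page without multiprocessing is easier to debug.
#     srcs, titles = parse_page("https://www.fabiaoqing.com/biaoqing/lists/page/1.html")
#     download(srcs, titles)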