"""
爬取彼岸图网
思路:
1.按用户输入的爬取对象编号构建首页url
2.爬取url,获取页数信息(re)
3.线程池爬取所有页面,获取每个页面上所有图片块的url(xpath)
4.多任务异步协程爬取图片块信息,下载图片
5.所有图片下载完毕后再次询问用户是否继续其他爬取
----------------------------------------------------------------
"""
# Required modules
import os
import re
import requests
import urllib3
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
# Silence the InsecureRequestWarning caused by verify=False below
urllib3.disable_warnings()
# Globals prepared up front
# Directory the images are saved to
path_picture = "C:\\Users\\kangheng\\Desktop\\MyPicture\\"
# Custom User-Agent plus a proxy IP (free proxies like this one die quickly -- swap in a live one or drop proxies)
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0", "Connection": "close"}
proxies = {"http": "http://120.83.108.45:9999"}
# Category number -> [display name, URL slug]
categories = {
    1: ["4K风景", "fengjing"],
    2: ["4K美女", "meinv"],
    3: ["4K游戏", "youxi"],
    4: ["4K动漫", "dongman"],
    5: ["4K影视", "yingshi"],
    6: ["4K明星", "mingxing"],
    7: ["4K汽车", "qiche"],
    8: ["4K动物", "dongwu"],
    9: ["4K人物", "renwu"],
    10: ["4K美食", "meishi"],
    11: ["4K宗教", "zongjiao"],
    12: ["4K背景", "beijing"]
}
# Show the menu, read the user's choice and return the matching URL slug
def Get_home_url():
    for num, (label, _) in categories.items():
        print("{0} {1}".format(num, label))
    print("------------------------")
    n = int(input("Enter the number of the category to scrape: "))
    name = categories[n][1]
    # print("URL to scrape:", f"https://pic.netbian.com/4k{name}/")
    return name
# Send a GET request and return the response, or None on failure
def Get_response(url):
    try:
        response = requests.get(url, headers=headers, proxies=proxies, verify=False, timeout=5)
        return response
    except Exception as error:
        print(error)
        return None
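# A small retry wrapper (my addition, not part of the original script): free
# proxies drop connections often, so retrying a few times before giving up
# helps. Hedged sketch -- adjust `tries` to taste; callers still handle None.
def Get_response_retry(url, tries=3):
    for _ in range(tries):
        response = Get_response(url)
        if response is not None and response.status_code == 200:
            return response
    return None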
# Extract the total page count from the category home page
def Get_page(response, name):
    response.encoding = "gbk"
    pat = rf'<a href="/4k{name}/index_(\d+)\.html"'
    rst = re.compile(pat).findall(response.text)
    page = max((int(i) for i in rst), default=1)  # fall back to 1 if no pagination links
    print(f"Total pages: {page}")
    return page
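# Illustration with hypothetical values: for name = "fengjing" the pattern above
# matches pagination anchors such as
#     <a href="/4kfengjing/index_2.html"> ... <a href="/4kfengjing/index_156.html">
# so findall returns ['2', ..., '156'] and max() yields the last page number.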
# Fetch one picture's detail page, then download the full-size image
def Download_picture(url):
    response = Get_response(url)
    if response is None:
        return
    response.encoding = "gbk"
    tree = etree.HTML(response.text)
    img = tree.xpath('//*[@id="img"]/img/@src')[0]
    title = tree.xpath('//*[@id="img"]/img/@alt')[0]
    url_picture = f"https://pic.netbian.com{img}"
    response_picture = Get_response(url_picture)
    if response_picture is None:
        return
    # Drop characters Windows forbids in file names
    title = re.sub(r'[\\/:*?"<>|]', "", title)
    with open(path_picture + title + ".jpg", "wb") as fp:
        fp.write(response_picture.content)
# Collect the detail-page link of every picture block on one page
def Get_all_part(url):
    response = Get_response(url)
    if response is None:
        return
    response.encoding = "gbk"
    tree = etree.HTML(response.text)
    lis = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in lis:
        part_url = li.xpath('./a/@href')[0]
        Download_picture(f"https://pic.netbian.com{part_url}")
    print("Finished one page!")
# Request every page through a thread pool
def Get_all_response(url, page):
    with ThreadPoolExecutor(30) as t:
        # Page 1 is the bare category URL; only pages 2+ use index_{i}.html
        t.submit(Get_all_part, url)
        for i in range(2, page + 1):
            t.submit(Get_all_part, url + f"index_{i}.html")
def main():
    name = Get_home_url()
    url = f"https://pic.netbian.com/4k{name}/"
    os.makedirs(path_picture, exist_ok=True)  # make sure the save directory exists
    response = Get_response(url)
    if response is None:
        print("Could not reach the category home page, aborting this run!")
        return
    page = Get_page(response, name)
    Get_all_response(url, page)
if __name__ == "__main__":
    while True:
        main()
        choice = input("All done!\nScrape another category? (yes/no) ")
        if choice != "yes":
            print("Program finished!")
            break
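Step 4 of the plan above mentions multi-task async coroutines, but the script ends up downloading inside the page threads instead. For anyone curious, here is a minimal sketch of the coroutine variant. It assumes aiohttp is installed (pip install aiohttp) and reuses headers, path_picture, re and etree from above; treat it as an illustration, not a tested drop-in replacement.

# Hedged sketch: download picture blocks with asyncio + aiohttp instead of threads
import asyncio
import aiohttp

async def fetch_bytes(session, url):
    # GET a URL and return the raw response body
    async with session.get(url) as resp:
        return await resp.read()

async def download_one(session, part_url):
    # Parse one detail page, then fetch and save its image
    html = (await fetch_bytes(session, f"https://pic.netbian.com{part_url}")).decode("gbk", "ignore")
    tree = etree.HTML(html)
    img = tree.xpath('//*[@id="img"]/img/@src')[0]
    title = re.sub(r'[\\/:*?"<>|]', "", tree.xpath('//*[@id="img"]/img/@alt')[0])
    data = await fetch_bytes(session, f"https://pic.netbian.com{img}")
    with open(path_picture + title + ".jpg", "wb") as fp:
        fp.write(data)

async def download_all(part_urls):
    # One coroutine per picture block, all driven by a single session
    async with aiohttp.ClientSession(headers=headers) as session:
        await asyncio.gather(*(download_one(session, u) for u in part_urls))

# Usage: feed it the part_url values collected in Get_all_part, e.g.
#   asyncio.run(download_all(["/tupian/26783.html"]))  # hypothetical path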
If you found this useful as you passed by, a like would satisfy my little vanity; and I warmly welcome the experts to suggest improvements or new approaches. Many thanks!!!