"""
爬取画师通网站,本来想利用xpath,但数据不在源文件中,失败
通过浏览器检查功能找到了搜索结果页面的对应文件
利用线程池提高效率
思路:
1.提示用户输入关键词
2.构建第一个url,发出请求,获取响应,得到页面数
3.构建所有的url,创建队列,进行多进程爬取
4.解析响应数据,下载其中所有图片
"""
from urllib import request
import requests
from UA_IP_Pool import *
import os
import re  # needed for the regex extraction below; missing in the original
import time
from queue import Queue
import threading
from concurrent.futures import ThreadPoolExecutor
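# UA_IP_Pool is the author's own helper module and is not shown in this post.
# A minimal sketch of the interface this script assumes (Get_UA() returning a
# headers dict, Get_http_IP() returning a requests-style proxies dict); the
# user-agent strings and proxy address below are placeholders, not real values:
#
#   import random
#
#   UA_LIST = [
#       "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
#       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
#   ]
#
#   def Get_UA():
#       return {"User-Agent": random.choice(UA_LIST)}
#
#   def Get_http_IP():
#       return {"http": "http://127.0.0.1:8888"}  # placeholder proxy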
# Create the folder the images will be saved into
def Make_file(keyword):
    file_path = "C:\\Users\\kangheng\\Desktop\\MyPicture\\" + request.unquote(keyword)
    # exist_ok replaces the original bare try/except around os.mkdir
    os.makedirs(file_path, exist_ok=True)
    return file_path
# Request the first search-results page and read pageCount from the JSON response
def Get_pageCount(url):
    response = requests.get(url, headers=Get_UA(), proxies=Get_http_IP(), verify=False, timeout=5)
    response.encoding = "utf-8"
    page_Count = response.json()["data"]["pageCount"]
    print("Total number of pages: {}".format(page_Count))
    return page_Count
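# For reference, a sketch of the JSON shape this script relies on. Only
# "data", "pageCount", "title" and "path" are confirmed by the code above and
# the regexes below; the key holding the list of works is a guess:
#
#   {
#       "data": {
#           "pageCount": <int>,
#           <works list>: [ {"title": "...", "path": "images/....jpg", ...}, ... ]
#       }
#   }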
# Shared lock: the original created a new Lock inside every call, so the
# threads never actually shared one; a module-level lock fixes that, and it
# only needs to guard the print, since each thread writes its own files.
print_lock = threading.Lock()
# Parse one results page and download every image it lists
def Get_information(url, mypicture_path):
    response = requests.get(url, headers=Get_UA(), proxies=Get_http_IP(), verify=False, timeout=5)
    response.encoding = "utf-8"
    title_pat = '"title":"(.*?)",'
    picture_part_pat = '"path":"(.*?)",'
    title_rst = re.compile(title_pat).findall(response.text)
    picture_part_rst = re.compile(picture_part_pat).findall(response.text)
    for i in range(len(title_rst)):
        # Normalize the file name: drop the characters Windows forbids
        # (the original truncated the title at the first such character)
        title = re.sub(r'[\\/:*<>|?"]', "", title_rst[i])
        res = requests.get("https://img2.huashi6.com/" + picture_part_rst[i],
                           headers=Get_UA(), proxies=Get_http_IP(), verify=False, timeout=5)
        with open(mypicture_path + "\\" + title + ".jpg", "wb") as fp:
            fp.write(res.content)
        with print_lock:
            print("Image downloaded!")
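# A sketch of an alternative to the regex extraction above: parse the response
# as JSON directly. The key holding the list of works ("datas" here) is an
# assumption not confirmed by the post, so this variant is illustrative only.
def Get_information_json(url, mypicture_path):
    response = requests.get(url, headers=Get_UA(), proxies=Get_http_IP(), verify=False, timeout=5)
    for work in response.json()["data"].get("datas", []):  # "datas" is assumed
        title = re.sub(r'[\\/:*<>|?"]', "", work["title"])
        res = requests.get("https://img2.huashi6.com/" + work["path"],
                           headers=Get_UA(), proxies=Get_http_IP(), verify=False, timeout=5)
        with open(mypicture_path + "\\" + title + ".jpg", "wb") as fp:
            fp.write(res.content)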
def main():
    # User input
    keyword = request.quote(input("Enter the keyword you want to search images for: "))
    # Folder the images will be saved into
    mypicture_path = Make_file(keyword)
    # Timestamp when crawling starts
    start_time = time.time()
    # Get the page count from the first URL
    first_url = f"https://rt.huashi6.com/front/works/search?index=1&title={keyword}"
    page_Count = Get_pageCount(first_url)
    # Build the URL queue
    url_queue = Queue()
    for index in range(1, page_Count + 1):  # push every page URL into the queue
        url = f"https://rt.huashi6.com/front/works/search?index={index}&title={keyword}"
        url_queue.put(url)
    # Fetch every page with the thread pool (the original comment said
    # "process pool", but ThreadPoolExecutor is a pool of threads)
    with ThreadPoolExecutor(10) as t:  # at most 10 worker threads
        while not url_queue.empty():
            # submit hands the function and its arguments to the pool
            t.submit(Get_information, url_queue.get(), mypicture_path)
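    # The queue + submit loop above could also be written with Executor.map,
    # dropping the Queue entirely; a sketch, not used by this script:
    #
    #   urls = [f"https://rt.huashi6.com/front/works/search?index={i}&title={keyword}"
    #           for i in range(1, page_Count + 1)]
    #   with ThreadPoolExecutor(10) as t:
    #       t.map(lambda u: Get_information(u, mypicture_path), urls)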
    # Timestamp when crawling finishes (the with block has already waited
    # for every submitted download to complete)
    finish_time = time.time()
    print("All matching images downloaded! (took {} seconds)".format(finish_time - start_time))
if __name__ == "__main__":
while True:
main()
input_next = input("是否继续爬取?(请输入yes 或 no)")
if input_next == "no":
print("程序终止!!")
break
If you're passing by, I'd appreciate a like to feed my little bit of vanity. Suggestions for improvements or new approaches from more experienced folks are very welcome. Thank you!!!