'''
Crawl thumbnail images from Baidu image search and save them to disk.
'''
import urllib.parse
import requests
import os
import time
from concurrent.futures import ThreadPoolExecutor
# Browser-like User-Agent so Baidu does not reject the requests.
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
}
INPUT = input("要查询的图片")  # search keyword, also used as the folder name
# makedirs creates the intermediate "./图片" directory as well; the original
# os.mkdir raised FileNotFoundError whenever "./图片" did not exist yet.
if not os.path.exists(f"./图片/{INPUT}"):
    os.makedirs(f"./图片/{INPUT}")
    print("已创建文件夹")
PAGES = int(input("要爬取的页数(1页30张)"))
INPUT_str = urllib.parse.quote(INPUT)  # URL-encode the keyword for the query string
start_time = time.time()
# Build one acjson API URL per page of 30 results.  `pn` is the result
# offset: page 0 must start at pn=0 (the original loop started i at 1, so
# the first 30 results were always skipped).  The original URL also carried
# "latest=©right=" — an HTML-entity mangling of "latest=&copyright=" — and
# stray spaces, both of which corrupted the query string; fixed here.
urls = []
for i in range(PAGES):
    url = (
        "https://image.baidu.com/search/acjson?tn=resultjson_com"
        "&logid=5179920884740494226&ipn=rj&ct=201326592&is=&fp=result"
        f"&queryWord={INPUT_str}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid="
        "&st=-1&z=&ic=0&hd=&latest=&copyright="
        f"&word={INPUT_str}&s=&se=&tab=&width=&height=&face=0&istype=2"
        f"&qc=&nc=1&fr=&expermode=&nojc=&pn={i*30}&rn=30&gsm=1e"
    )
    urls.append(url)
def Request(url):
    """Fetch one page of the Baidu image API and save every thumbnail it
    lists into ./图片/<INPUT>/.

    Parameters
    ----------
    url : str
        A fully built acjson search URL (one page of up to 30 results).

    Failures for individual images (missing keys, network errors, file
    errors) are skipped so one bad entry does not abort the whole page.
    """
    # NOTE(review): the original declared `global picture_num`, but that
    # variable is never defined or used anywhere in the file — removed.
    response = requests.get(url=url, headers=headers, timeout=10)
    datas = response.json()['data']
    for data in datas:
        try:
            # The API's 'data' list ends with an empty dict, which raises
            # KeyError here — that is the expected skip case.
            url_every = data['thumbURL']
            url_type = data['type']
            content = requests.get(url=url_every, headers=headers, timeout=10).content
            # Microsecond timestamp as a practically unique filename.
            with open(f"./图片/{INPUT}/{int(time.time()*1000000)}.{url_type}", "wb") as w:
                w.write(content)
        except (KeyError, requests.RequestException, OSError):
            # Skip this entry and keep downloading the rest of the page;
            # narrowed from the original bare `except: pass` so genuine
            # programming errors are no longer silently swallowed.
            continue
# Fan the page downloads out across threads (the work is I/O-bound, so
# threads overlap the network waits).  Cap the pool at 16 so a large PAGES
# value does not spawn hundreds of threads, and floor it at 1 because
# ThreadPoolExecutor(max_workers=0) raises ValueError when urls is empty.
with ThreadPoolExecutor(max_workers=max(1, min(len(urls), 16))) as e:
    for url in urls:
        e.submit(Request, url)
end_time = time.time()
print("用时:", round(end_time - start_time, 2))