介绍
因为页面内容是动态加载的,所以使用 Selenium 驱动 Edge 浏览器模拟真实访问来获取页面源码。
代码
# 爬取图库的图片
import os
import time
import requests
from lxml import etree
from selenium import webdriver
# HTTP headers sent with every `requests` call in this script; the
# User-Agent string identifies the client as desktop Firefox 86.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
# Shared Edge WebDriver session, started as soon as this module is loaded.
# The argument is the path to the Edge driver executable, relative to the
# current working directory.
# NOTE(review): passing the driver path positionally presumably targets an
# older Selenium release; newer releases expect a Service object - confirm
# against the installed Selenium version.
bro = webdriver.Edge('./msedgedriver.exe')
def parse_label():
    """Download the image behind every result link on the search page.

    Reads two module-level names: ``tree`` (the lxml tree built from the
    browser's page source) and ``headers`` (the HTTP request headers).
    For each detail-page link found in ``tree`` the detail page is
    fetched, the first image URL under its ``srcPic`` element is looked
    up, and the image bytes are written to ``./pic/<n>.jpg``, where ``n``
    is the zero-based position of the link. The 1-based position is
    printed as progress output.

    Links whose detail page has no matching image, or whose requests
    fail or time out, are skipped so one bad link does not abort the run.

    :return: None
    """
    from urllib.parse import urljoin

    save_dir = "./pic"
    # The output directory is not guaranteed to exist; without this the
    # first open(..., "wb") fails with FileNotFoundError.
    os.makedirs(save_dir, exist_ok=True)

    # Collect the detail-page link of every result item on the page.
    detail_links = tree.xpath('//*[@id="imgid"]/div/ul/li/a/@href')
    for index, link in enumerate(detail_links):
        # urljoin copes with hrefs that start with "/" (no doubled slash)
        # as well as with plain relative or already absolute hrefs.
        href = urljoin("https://image.baidu.com/", link)
        try:
            # A timeout keeps a single stalled connection from hanging
            # the whole crawl.
            response = requests.get(url=href, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as err:
            print("skip {}: {}".format(href, err))
            continue

        pic_tree = etree.HTML(response.text)
        pic_href = pic_tree.xpath('//*[@id="srcPic"]/div/img/@src')
        print(index + 1)
        if not pic_href:
            continue

        try:
            pic_response = requests.get(
                url=pic_href[0], headers=headers, timeout=10
            )
            # Do not save an HTTP error page under a .jpg name.
            pic_response.raise_for_status()
        except requests.RequestException as err:
            print("skip {}: {}".format(pic_href[0], err))
            continue

        title = str(index) + ".jpg"
        with open(os.path.join(save_dir, title), "wb") as fp:
            fp.write(pic_response.content)
if __name__ == "__main__":
    # Search term and the upper bound of the scroll loop below.
    keyword = "雪景"
    page = 100
    url = 'https://image.baidu.com/search/index?tn=baiduimage&word=' + keyword
    try:
        bro.maximize_window()
        bro.get(url)
        # Give the first batch of results time to render.
        time.sleep(2)
        # Scroll repeatedly so the page keeps loading more results. The
        # loop runs page - 1 times; the target offset is at least the full
        # document height, so the browser presumably clamps it to the
        # bottom of the page.
        for i in range(1, page):
            bro.execute_script(
                "document.documentElement.scrollTop = document.documentElement.scrollHeight * " + str(i)
            )
            time.sleep(0.5)
        # Grab the page source after all the scrolling.
        page_text = bro.page_source
        # Parse it into an lxml tree. `tree` must stay a module-level name
        # because parse_label() reads it as a global.
        tree = etree.HTML(page_text)
        # Download every image found in the parsed page.
        parse_label()
    finally:
        # Always shut the browser down, even when scrolling, parsing or a
        # download raised, so no browser/driver process is left running.
        bro.quit()