已加载的图片
未加载的图片
通过观察发现,img 标签下的 data-original 属性是固定不变的,因此可以通过抓取这个属性的值来得到图片的信息。
import os

import requests
from fake_useragent import UserAgent
from lxml import etree
# Request headers shared by the crawler: a Chrome User-Agent string supplied
# by fake_useragent.
headers = {"User-Agent": UserAgent().chrome}
def get_page():
    """Prompt for a page count and crawl that many listing pages.

    Reads the number of pages from standard input, builds the listing URLs
    (the first page has no numeric suffix; page ``i`` >= 2 uses
    ``meinvtupian_{i}.html``), calls ``get_img`` on each URL in order and
    prints progress messages followed by a final summary.

    Raises:
        ValueError: If the entered text cannot be parsed as an integer.
    """
    page = int(input("要爬取几页数据:"))
    if page < 1:
        # Without this guard the first page would still be crawled, because
        # it is always present in the URL list.
        print("页数必须大于0")
        return

    urls = ["https://sc.chinaz.com/tupian/meinvtupian.html"]
    urls.extend(
        f"https://sc.chinaz.com/tupian/meinvtupian_{i}.html"
        for i in range(2, page + 1)
    )

    for j, url in enumerate(urls, start=1):
        print(f"-----第{j}页开始爬取-----")
        get_img(url)
        print(f"-----第{j}页爬取完成-----")

    # The image total is an estimate: 40 per page is hard-coded here, the
    # images actually saved are not counted.
    total_pages = len(urls)
    print(f"爬取共{total_pages}页{total_pages*40}张图片")
def get_img(url):
    """Download the images referenced on one listing page into ``imgs/``.

    Fetches ``url`` with the module-level ``headers``, reads the
    ``data-original`` attribute of every ``<img>`` matched by the XPath
    below, and writes each image to ``imgs/<file name>``. Images whose
    download returns an error status are reported and skipped.

    Args:
        url: Address of the listing page to scrape.
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params`` (the query string), not the request headers.
    resp = requests.get(url, headers=headers)
    # The raw bytes are handed to lxml, so no text encoding is set on resp.
    e = etree.HTML(resp.content)
    imgs_src = e.xpath("//div[@class='container']/div[2]/div/img/@data-original")

    # open() does not create missing directories.
    os.makedirs("imgs", exist_ok=True)

    for src in imgs_src:
        # NOTE(review): assumes data-original is protocol-relative
        # ("//host/path"); confirm against the live page markup.
        img_src = "https:" + src
        pic_name = src.split("imgs")[-1].split("/")[-1]
        response = requests.get(img_src, headers=headers)
        if not response.ok:
            # Do not save an error page under an image file name.
            print(f"下载失败,已跳过:{img_src}")
            continue
        with open(f"imgs/{pic_name}", "wb") as f:
            f.write(response.content)
if __name__ == "__main__":
get_page()