import requests
import re
import os
# Qiushibaike "hot images" listing page
url = 'https://www.qiushibaike.com/imgrank/page/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
piclist = []  # holds the link of every image found
html = ''     # holds the combined source of all pages
def get_html(url):  # fetch the source of the requested number of pages
    pages = 3  # number of listing pages to scrape
    html_txt = ''
    for page in range(pages):
        new_url = url + str(page + 1)
        response = requests.get(new_url, headers=headers)
        response.encoding = response.apparent_encoding  # let requests guess the real charset
        html_txt = html_txt + response.text
    return html_txt
# Match the image paths with a regular expression and collect them in piclist
def find_pic(html, piclist):
    ex = '<div class="thumb">.*?src="(.*?)" alt.*?</div>'
    res = re.findall(ex, html, re.S)  # re.S lets '.' match newlines, so tags split across lines still match
    piclist.extend(res)
    return piclist
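# Illustration (a small sketch added here, not part of the original script): applying
# find_pic to a hand-written fragment shaped like the qiushibaike listing markup.
# The sample HTML and the pic.example.com URL are assumptions for demonstration only.
_sample = '<div class="thumb"><a href="/article/1"><img src="//pic.example.com/demo.jpg" alt="demo"></a></div>'
assert find_pic(_sample, []) == ['//pic.example.com/demo.jpg']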
# Download the image data and save it to a local folder
def store_pic(src_list):
    mkdir(picpath)
    for src in src_list:  # walk the list of image links and save each image locally
        src = 'http:' + src  # the page uses protocol-relative links ('//...'), so prepend a scheme
        img_data = requests.get(url=src, headers=headers).content  # fetch the binary image data
        path = './pic/' + src.split('/')[-1]
        # write the file to disk
        with open(path, 'wb') as fp:
            fp.write(img_data)
            print(src[-20:], 'downloaded successfully')
def mkdir(path):
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)
        print("--- new folder... ---")
        print("--- OK ---")
    else:
        print("--- There is this folder! ---")
picpath = './pic'
html = get_html(url)
src_list = find_pic(html, piclist)
store_pic(src_list)
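# A minimal sketch (assumption, not from the original post): the same driver wrapped in a
# main guard, so the functions above could be imported without triggering a crawl.
#
# if __name__ == '__main__':
#     picpath = './pic'
#     html = get_html(url)
#     store_pic(find_pic(html, piclist))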