Starting from scraping a single image, then a whole series, and finally as many index pages as I like, the past few days have been full of setbacks. The code is clearly rough and leaves plenty of room for optimization; once my skills improve and I understand Python more deeply, I'll come back and revise it. Keep going!
Below is the complete scraping code:
import requests
import re

def getHtmlText(url, header={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}):  # fetch a page
    try:
        html = requests.get(url, headers=header)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        return html
    except Exception:
        print("Request failed")
        return None

def getetaddress(nums):  # collect the address of every gallery and return them in one list, addlist
    pagelist = []
    for i in range(1, nums + 1):
        url1 = "https://www.mzitu.com/page/" + str(i)
        html1 = getHtmlText(url1)
        pagelist.append(re.findall(r'<li><a href="(.*?)"', html1.text))  # the gallery URLs of each index page form one list; those lists are collected into pagelist
        # re.findall() returns a list; with r"(...)" its elements are whatever the parentheses capture,
        # and without a group the whole match is returned. Remember to escape special characters with '\'.
    addlist = []
    for sublist in pagelist:
        for address in sublist:
            addlist.append(address)
    return addlist  # flattened into a single, non-nested list

def gettoalpage(urladd):  # given one gallery address "urladd", return every image URL of that series in the list ealist
    html2 = getHtmlText(urladd)
    p = re.findall(r'…</span><a href=\'.*?\'><span>([\d]*?)</span></a>', html2.text)  # number of pages in the gallery, needed to build the image URLs
    p = int(p[0])
    # build the URL of every image
    ad = re.findall(r'<img src="(https://i.meizitu.net/.*?)\.jpg" alt=".*?" width="[\d]*?" height="[\d]*?" /></a></p>', html2.text)
    ad = ad[0][:-2]  # drop the trailing two-digit index, keeping the common URL prefix
    ealist = []
    for i in range(1, p + 1):
        url2 = ad + str(i).zfill(2) + ".jpg"
        ealist.append(url2)
    return ealist

def downpics(ealist):  # given the image URLs of one series, save every picture
    for line in ealist:
        try:
            html3 = getHtmlText(line, header={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', "Referer": "https://www.mzitu.com/"})  # adding Referer to the headers defeats the hotlink protection
            h = line[22:-4].split('/')
            with open("F://meizitu//" + h[0] + h[1] + h[2] + ".jpg", "wb") as f:  # file names must not contain '/'; I created a 'meizitu' folder on my F drive first
                f.write(html3.content)
            print("Saving %s..." % line)
        except Exception:
            continue  # an error on one picture should not stop the rest of the scrape

def main():
    nums = int(input("Enter the number of index pages to download: "))
    print("OK\n\nHang on, starting now...")
    addlist = getetaddress(nums)
    for urls in addlist:
        ealist = gettoalpage(urls)
        downpics(ealist)
    print("Congratulations, the scrape finished successfully")

main()
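To make the re.findall() comment inside getetaddress concrete, here is a minimal sketch with a made-up HTML snippet (the tag below is an illustration, not real mzitu markup):

import re

sample = '<li><a href="https://www.mzitu.com/12345" target="_blank">'  # made-up example markup

# with a capture group, findall returns only what the parentheses matched
print(re.findall(r'<li><a href="(.*?)"', sample))  # ['https://www.mzitu.com/12345']

# without a group, findall returns the whole match
print(re.findall(r'<li><a href=".*?"', sample))  # ['<li><a href="https://www.mzitu.com/12345"']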
Note: you need to create a folder called meizitu on the F drive first.
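As a small optional improvement (not part of the script above), the folder could also be created automatically so the manual step goes away:

import os

# exist_ok=True makes this a no-op when the folder already exists
os.makedirs("F://meizitu//", exist_ok=True)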
That is the end of the complete scraping code.
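One detail worth underlining is the Referer trick in downpics. Assuming the image server really rejects requests that carry no Referer (which is what the comment there implies, though I have not re-verified it), a quick before/after check might look like this; the image URL is only an example:

import requests

pic_url = "https://i.meizitu.net/2018/11/26d01.jpg"  # example image URL, may no longer exist
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

# expected to be rejected (e.g. 403) if the server enforces hotlink protection
print(requests.get(pic_url, headers=ua).status_code)

# the same request with a Referer should be accepted
print(requests.get(pic_url, headers={**ua, "Referer": "https://www.mzitu.com/"}).status_code)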
Here is also the code I wrote earlier for scraping a single series:
import requests
import re

def getHtmlText(url):
    try:
        html = requests.get(url, headers=header)  # uses the global 'header' defined below
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        return html
    except Exception:
        print("Request failed")
        return None

def savepics(html, pic):
    with open("F://meizitu//" + str(pic) + ".jpg", "wb") as f:
        f.write(html.content)
    print("Saving picture %d..." % pic)

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', "Referer": "http://www.mzitu.com/", 'Host': 'i.meizitu.net'}

for pic in range(1, 41):
    url = "https://i.meizitu.net/2018/11/26d" + str(pic).zfill(2) + ".jpg"
    html = getHtmlText(url)
    if html is not None:  # skip a picture whose request failed instead of crashing on html.content
        savepics(html, pic)
print("Scraping finished")