# python3: scrape MM images from a site using requests + bs4
# Approach:
# Save the titles and URLs from the index pages into a dict, then iterate the
# dict and scrape every paginated image page under each title.
import requests
from bs4 import BeautifulSoup
import urllib
import os
import re
# Fetch a URL and return its HTML text
def getHTMLText(url):
    """Fetch *url* and return its decoded HTML text, or "" on failure.

    Uses apparent_encoding because the target site's charset headers are
    unreliable. Callers treat an empty string as "page unavailable, skip".
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # trust content sniffing over headers
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only swallow network/HTTP errors,
        # so programming errors (NameError, etc.) still surface.
        return ""
# Create a folder for the image and save it there
def get_img(path, url, base_dir="D://e//"):
    """Download the image at *url* into base_dir/path/, creating the folder.

    The file name is the last "/"-separated segment of *url*. *base_dir*
    defaults to the original hard-coded location for backward compatibility.
    """
    folder = os.path.join(base_dir, path)
    os.makedirs(folder, exist_ok=True)
    # Equivalent to the original backwards while-loop search for the last "/".
    filename = url.rsplit("/", 1)[-1]
    img = requests.get(url)
    # 'wb' instead of the original 'ab': append mode would concatenate image
    # bytes on a re-run and corrupt the file. 'with' guarantees the handle
    # is closed even if write() raises.
    with open(os.path.join(folder, filename), "wb") as f:
        f.write(img.content)
    print("文件保存成功")
def _page_img_url(soup):
    """Return the src of the main picture inside the picsbox container."""
    return soup.find(class_="picsbox picsboxcenter").img["src"]

def get_in_html(url):
    """Scrape every paginated image belonging to the gallery page *url*.

    Page 1 is *url* itself; page i is *url* with ".html" replaced by
    "_i.html". The total page count is parsed from the "itempage" element.
    """
    start_url = url
    soup = BeautifulSoup(getHTMLText(start_url), "html.parser")
    name = soup.h1.text  # gallery title, reused as the save-folder name
    # First child of the pager looks like fixed-width markup whose chars
    # [4:-7] hold the page count.
    # NOTE(review): this slice is layout-dependent — verify against live markup.
    page_count = int(str(soup.find(class_="itempage").contents[0])[4:-7])
    print("第1页")
    get_img(name, _page_img_url(soup))
    for i in range(2, page_count + 1):
        # ".../foo.html" -> ".../foo_2.html", ".../foo_3.html", ...
        temp_url = start_url[:-5] + "_" + str(i) + ".html"
        soup = BeautifulSoup(getHTMLText(temp_url), "html.parser")
        print("第" + str(i) + "页")
        get_img(name, _page_img_url(soup))
def main():
    """Collect {detail_url: title} from 20 index pages, then scrape each gallery."""
    info = {}
    for pa in range(1, 21):  # index pages list_6_1.html .. list_6_20.html
        start_url = "https://******xiaohua/list_6_%d.html" % pa  # 7160
        soup = BeautifulSoup(getHTMLText(start_url), "html.parser")
        for a in soup.select("body > div > div.center > div > div.news_bom > div.news_bom-left > ul > ul > li > a"):
            # Key: absolute detail URL (site root up to ".com" + relative href);
            # value: gallery title. Dict also de-duplicates repeated links.
            info[start_url[0:start_url.find(".com") + 4] + a['href']] = a['title']
    print(len(info))
    for detail_url in info:
        print("开始网页:" + detail_url)
        get_in_html(detail_url)

# Guarded so importing this module no longer triggers a full crawl.
if __name__ == "__main__":
    main()