# A very simple crawler demo: downloaded images are saved into the ./pictures folder of the current directory.
import requests
import json
import re
from bs4 import BeautifulSoup
headers={ # request headers copied from a real browser session so the site does not flag us as a bot
    # NOTE(review): Host/Referer are pinned to www.doutula.com — this header set only works for that site
    'Host': 'www.doutula.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'https://www.doutula.com/photo/list/?page=2',
    'Connection': 'close',
    # NOTE(review): session cookie captured from one browsing session — presumably expired by now; refresh if requests start failing
    'Cookie': 'XSRF-TOKEN=eyJpdiI6IkNlbXM5ZnBQbWJ2VlBkcUZPcm52M0E9PSIsInZhbHVlIjoibkd6NHhBeFdCMmRYT2FjNndrWUJDTUgyVnQrSTNhM0h3cnAyRnVHS2xSbDJqdVJIZVphalJybVduMCs1WXBncSIsIm1hYyI6IjkzMGUxZTc1N2JjNTE2Y2FlNjE5MWVjNjJiYjhkMWRjOWYyNjUzMzc5MzE1NGVhZDExYzM0MzliZTE2ODNhMWMifQ%3D%3D; doutula_session=eyJpdiI6IktGbU94cWYybEtpUE5RbDNXOGsyZVE9PSIsInZhbHVlIjoib2k4OXhQZEU1SnJOZGpZc0pNVWNEV1FHdUMyRGMrVFVXQ1hjUDdqNnRGalFSY01VYlQ2VHRQMlwvUXI3aWxRcnciLCJtYWMiOiIxNTA4YzdkY2ViZDI5ZDIyMjFkZWZkMjlhZmE0MGZlODY2OTZiMTZhMzU5Y2RjOGQ4ZGM4NjMxZTg3NTY5N2YxIn0%3D; BAIDU_SSP_lcr=https://www.baidu.com/link?url=FY7QRNvGuTh2Ucy_-PyLctkXGQTiyG1LTCOV2g0_TXbF0vch-vSJJHZgroRdyDWt&wd=&eqid=efd82af400005022000000065f6d92fe; Hm_lvt_2fc12699c699441729d4b335ce117f40=1601016583; Hm_lpvt_2fc12699c699441729d4b335ce117f40=1601016807; _agep=1601016584; _agfp=9c49ff87bc62714594565f0d0ded4814; _agtk=263d685c60048ca1eb91bd372d276c4d',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
}
def getPicture_url(url):
    """Fetch one list page and download every meme image found on it.

    Args:
        url: URL of a doutula photo-list page (e.g. .../photo/list/?page=2).

    Side effects: one HTTP GET per page plus one per image via download();
    prints a completion message when the page is done.
    """
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "lxml")  # parse the HTML document
    for item in soup.find_all('img'):  # walk every <img> tag on the page
        name = item.get("alt")
        img_url = item.get("data-backup")
        # Skip decorative images: real memes carry both an alt caption and a
        # data-backup source.  The original passed a None URL to download()
        # when data-backup was absent, which made requests.get() blow up.
        if name is not None and img_url is not None:
            download(name, img_url)
    print("已完成单个页面下载")
def download(name, url):
    """Download a single image and save it as pictures/<name>.png.

    Args:
        name: caption of the image (taken from the img tag's alt text),
            used as the file name after sanitization.
        url: direct URL of the image file.
    """
    import os
    # Ensure the target folder exists — the original crashed with
    # FileNotFoundError when ./pictures was missing.
    os.makedirs('pictures', exist_ok=True)
    # Alt text is arbitrary; strip characters that are illegal in file
    # names (e.g. '/', '?', '*') so open() cannot fail on them.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
    r = requests.get(url)
    # BUG FIX: the original called `f.close` without parentheses, so the
    # file handle was never explicitly closed; `with` guarantees closure.
    with open('pictures/' + safe_name + '.png', 'wb') as f:
        f.write(r.content)
def A():
    """Interactively download a single list page chosen by the user."""
    page_no = int(input("请输入你想下载的页面"))
    print("...............................................")
    target = 'https://www.doutula.com/photo/list/?page=%s' % (page_no,)
    getPicture_url(target)
def B():
    """Interactively download a consecutive range of list pages.

    Reads "m,n" from stdin and fetches pages m through n inclusive.
    """
    # BUG FIX: the original did int(input(...).split(',')) — int() of a
    # list raises TypeError, so this function could never run.  map()
    # converts each bound separately (int() tolerates spaces, so "2, 3"
    # also parses).
    m, n = map(int, input("请输入你想下载的页面").split(','))
    print("...............................................")
    # BUG FIX: range(m+1, n+2) fetched pages m+1..n+1 instead of the
    # promised m..n.
    for page in range(m, n + 1):
        url = 'https://www.doutula.com/photo/list/?page=%s' % (page)
        getPicture_url(url)
def dosomeselect():
    """Text menu entry point: A = download one page, B = download a range.

    Prints an error message for any other input.
    """
    select = input("请选择你需要的功能:A.下载指定页面 eg :3\n B.下载m~n个页面 eg: 2,3\n")
    # Tolerate surrounding whitespace and lowercase letters instead of
    # rejecting e.g. "a" or "B " outright.
    choice = select.strip().upper()
    if choice == 'A':
        A()
    elif choice == 'B':
        B()
    else:
        print("请正确输入")

# Guard the entry call so importing this module does not start an
# interactive session; running it as a script behaves as before.
if __name__ == "__main__":
    dosomeselect()