First, set up the folders. Define a folder-creation function:
def makdir(path):
    try:
        # Check whether the folder already exists
        isExists = os.path.exists(path)
        if not isExists:
            # Create it if it does not exist
            os.makedirs(path)
            print(path + ' folder created!')
        else:
            print(path + ' folder already exists!')
    except Exception as e:
        print(e)
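A side note: since Python 3.2 the explicit exists check can be collapsed into a single call with the exist_ok flag of os.makedirs. A minimal alternative sketch (equivalent behavior, not used in the code below):

import os

def makdir(path):
    # exist_ok=True turns the call into a no-op when the folder already exists
    os.makedirs(path, exist_ok=True)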
Next, the function that scrapes and saves the images:
def pic_download(paths, url):
    # Request headers (a single User-Agent; a real rotating pool is sketched after this function)
    headers = {
        'user-agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
    }
    web = requests.get(url, headers=headers).content.decode()
    data = etree.HTML(web)
    lists = data.xpath('//div[@class="text_left text_lefts"]/div[@id="container"]/div')
    for lis in lists:
        new_url = 'https:' + lis.xpath('./p/a/@href')[0]
        name = lis.xpath('./p/a/@alt')[0]
        # Create a folder for this picture
        makdir(paths + '/' + name)
        new_web = requests.get(new_url, headers=headers).content.decode()
        new_data = etree.HTML(new_web)
        final_url = 'https:' + new_data.xpath('//div[@class="imga"]/a/@href')[0]
        # Write the image to disk in binary mode
        try:
            r = requests.get(final_url, headers=headers).content
            with open(paths + '/' + name + '/' + name + '.jpg', 'wb') as f:
                f.write(r)
            print('%s downloaded' % name)
            time.sleep(0.5)
        except Exception:
            print('%s download failed' % name)
            time.sleep(0.5)
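The comment above says "header pool", but only one User-Agent is actually defined. If you want a real pool, a minimal sketch follows; the UA strings are illustrative placeholders and pick_headers is a hypothetical helper, not part of the original code:

import random

# Illustrative User-Agent strings; substitute any real browser UAs
UA_POOL = [
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]

def pick_headers():
    # Rotate the UA per request so consecutive requests look less uniform
    return {'user-agent': random.choice(UA_POOL)}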
Call the functions:
if __name__ == '__main__':
    path = 'D:/spider_downloads'
    # Scrape pages 5 through 7
    for i in range(5, 8):
        print('========== Starting page {} =========='.format(i))
        url = 'https://sc.chinaz.com/tupian/renwutupian_{}.html'.format(i)
        pic_download(path + '/' + str(i), url)
        print('---------- Finished page {} ----------'.format(i))
        time.sleep(1)
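The bare try/except in pic_download treats every failure the same way. If the failures turn out to be transient network errors, adding a timeout and a simple retry helps; a sketch under that assumption (fetch_bytes is a hypothetical helper, and the retry/timeout values are arbitrary):

import time
import requests

def fetch_bytes(url, headers, retries=3, timeout=10):
    # requests.exceptions.RequestException covers timeouts, connection
    # errors, and (via raise_for_status) non-2xx HTTP responses
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            return r.content
        except requests.exceptions.RequestException:
            time.sleep(1 + attempt)  # back off a little more each retry
    return None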
Full code:
from lxml import etree
import time
import requests
import os
# Create a folder
def makdir(path):
    try:
        # Check whether the folder already exists
        isExists = os.path.exists(path)
        if not isExists:
            # Create it if it does not exist
            os.makedirs(path)
            print(path + ' folder created!')
        else:
            print(path + ' folder already exists!')
    except Exception as e:
        print(e)
# Scrape and save the images
def pic_download(paths, url):
    # Request headers (a single User-Agent; see the pool sketch above)
    headers = {
        'user-agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
    }
    web = requests.get(url, headers=headers).content.decode()
    data = etree.HTML(web)
    lists = data.xpath('//div[@class="text_left text_lefts"]/div[@id="container"]/div')
    for lis in lists:
        new_url = 'https:' + lis.xpath('./p/a/@href')[0]
        name = lis.xpath('./p/a/@alt')[0]
        # Create a folder for this picture
        makdir(paths + '/' + name)
        new_web = requests.get(new_url, headers=headers).content.decode()
        new_data = etree.HTML(new_web)
        final_url = 'https:' + new_data.xpath('//div[@class="imga"]/a/@href')[0]
        # Write the image to disk in binary mode
        try:
            r = requests.get(final_url, headers=headers).content
            with open(paths + '/' + name + '/' + name + '.jpg', 'wb') as f:
                f.write(r)
            print('%s downloaded' % name)
            time.sleep(0.5)
        except Exception:
            print('%s download failed' % name)
            time.sleep(0.5)
if __name__ == '__main__':
    path = 'D:/spider_downloads'
    # Scrape pages 5 through 7
    for i in range(5, 8):
        print('========== Starting page {} =========='.format(i))
        url = 'https://sc.chinaz.com/tupian/renwutupian_{}.html'.format(i)
        pic_download(path + '/' + str(i), url)
        print('---------- Finished page {} ----------'.format(i))
        time.sleep(1)
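One last polish the code above skips: paths are built by string concatenation (paths + '/' + name + ...), which works here but is easy to get wrong. os.path.join is the portable spelling; a small sketch with hypothetical values:

import os

paths = 'D:/spider_downloads/5'   # per-page folder, as in the main loop
name = 'example'                  # hypothetical picture name
folder = os.path.join(paths, name)            # folder for this picture
target = os.path.join(folder, name + '.jpg')  # file path for the saved image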