I wrote a simple image downloader that keeps following pages until there is no "Next Page" link left.
Note in particular that the request headers must include a Cookie, otherwise what you scrape is fake data. You can use the browser's developer tools to find your Cookie.
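For example, the raw Cookie header value copied from the developer tools can also be split into a dict and passed through requests' cookies parameter, instead of being pasted into the headers. A minimal sketch (the cookie value below is just a placeholder, and simple "key=value; key=value" splitting is assumed):

import requests

raw_cookie = 'bid=xxxx; dbcl2=xxxx'  # placeholder: paste your own Cookie header value here
# split 'k1=v1; k2=v2; ...' into a dict that requests can send
cookies = dict(pair.split('=', 1) for pair in raw_cookie.split('; '))
resp = requests.get('https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4', cookies=cookies, timeout=5)
print(resp.status_code)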
A random time delay is also added between requests, to avoid being blocked.
The complete code is as follows:
# -*- coding: utf-8 -*-
'''
A simple image download script:
downloads the book covers from Douban's fiction tag listing.
'''
import requests
import time
import random
from lxml import etree
file_dir = r'd:/tmp/img/'  # directory where the images are saved
text_html = r'd:/tmp/img/img_html.txt'  # file for saving the page HTML
class Download:
    def __init__(self):
        # Douban fiction list, first page (the tag is the URL-encoded "小说")
        self.url = r'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4'
    def request_method(self):
        # impersonate a browser; the Cookie is required, otherwise the scraped data is fake
headers = {
'Cache-Control':'no-cache',
'Connection': 'Keep-Alive',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Host':'book.douban.com',
'Pragma':'no-cache',
'Upgrade-Insecure-Requests':'1',
            'Cookie': 'xxxx'  # paste your own cookie here; find it with the browser's developer tools (F12)
}
        reqs = requests.get(self.url, headers=headers, timeout=5)
return reqs
    def get_html(self):
        # fetch the page source
req_handle = self.request_method()
html = req_handle.text
return html
    def save_html(self, ip_html):
        # save the page source to disk (the with block closes the file automatically)
        with open(text_html, 'w', encoding='utf-8') as f:
            f.write(ip_html)
    def save_img(self, img_url, img_name):
        # download one cover image and save it under the book's title
        img_info = requests.get(img_url, timeout=10)
        with open(file_dir + str(img_name) + '.jpg', 'wb') as f:
            f.write(img_info.content)
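    # Aside (an assumption, not part of the original behaviour): book titles can
    # contain characters such as ':' or '?' that are invalid in Windows file names;
    # a defensive variant could strip them before saving, e.g.
    #     img_name = re.sub(r'[\\/:*?"<>|]', '_', str(img_name))
    # (this would need an "import re" at the top of the file).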
    def parse_html(self, img_html):
        # parse the page, save each cover image, then follow the "next page" link
        res_html = etree.HTML(img_html)
        lis = res_html.xpath('//ul[@class="subject-list"]/li')
        for li in lis:
            # cover image URL and book title, with whitespace stripped out
            img_url = li.xpath('div[1]/a/img/@src')[0].replace(' ', '').replace('\n', '')
            img_name = li.xpath('div[2]/h2/a/text()')[0].replace(' ', '').replace('\n', '')
            self.save_img(img_url, img_name)  # save the cover image
        next_url = res_html.xpath('//div[@class="paginator"]/span[last()]/a/@href')  # "next page" link
        if isinstance(next_url, list) and next_url != []:  # check that a "next page" link was found
            next_url = next_url[0]
            # rebuild the next URL: keep the base of self.url, swap in the new query string
            next_uri = next_url[next_url.find('?'):]
            pos = self.url.find('?')
            if pos == -1:
                next_url = self.url + next_uri
            else:
                next_url = self.url[0:pos] + next_uri
            self.url = next_url
            time.sleep(random.randint(15, 25))  # random delay between pages, to avoid being blocked
            self.crawler()
    def crawler(self):
        print(self.url)
        html = self.get_html()
        #self.save_html(html)  # uncomment to also keep a copy of the page source
        self.parse_html(html)
def download_img():
img = Download()
img.crawler()
if __name__ == '__main__':
download_img()
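Since each "next page" is followed by a recursive crawler() call, a tag with very many pages could in principle hit Python's default recursion limit. The same pagination logic also fits a plain loop; below is a minimal sketch of that variant (it assumes the same page structure and the same headers dict as above), using a requests.Session so the headers are set once and the connection is reused:

import random
import time
import requests
from lxml import etree

def crawl_pages(start_url, headers, save_img):
    # Loop-based version of the same crawl: fetch a page, save the covers,
    # then follow the "next page" link until there is none left.
    session = requests.Session()
    session.headers.update(headers)
    url = start_url
    while url:
        print(url)
        res_html = etree.HTML(session.get(url, timeout=5).text)
        for li in res_html.xpath('//ul[@class="subject-list"]/li'):
            img_url = li.xpath('div[1]/a/img/@src')[0].strip()
            img_name = li.xpath('div[2]/h2/a/text()')[0].strip()
            save_img(img_url, img_name)
        nxt = res_html.xpath('//div[@class="paginator"]/span[last()]/a/@href')
        if nxt:  # rebuild the next URL from the base plus the new query string
            url = url.split('?')[0] + nxt[0][nxt[0].find('?'):]
            time.sleep(random.randint(15, 25))  # same anti-blocking delay
        else:
            url = None  # no "next page" link: the last page has been reached

Here save_img is any callable with the same (img_url, img_name) signature as Download.save_img above.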