一、代码如下
import os
import re

import requests


class Spi_doutu(object):
    """Scraper that downloads images linked from doutula.com article pages.

    Typical flow: ``qingqiu`` fetches one listing page, ``guolv_taotuwangzhi``
    extracts the article detail URLs from it, ``guolv_photo`` visits each
    article and collects the image URLs, and ``save`` downloads the images.
    """

    # Browser-like User-Agent sent with every request.
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36'}

    # Seconds to wait on the server before giving up; without a timeout
    # requests.get() can block forever on a stalled connection.
    timeout = 10

    # Directory used by save() when the caller does not pass one.
    save_dir = r'C:\Users\wangyl\daima\doutu'

    # Anchor tags pointing at article detail pages (7-digit article id).
    # Raw strings with escaped dots so '.' only matches a literal dot.
    _detail_patt = re.compile(
        r'<a href="(http://www\.doutula\.com/article/detail/[0-9]{7})"')

    # <img> tags served from wsN.sinaimg.cn/large/. Two capture groups, so
    # findall() yields (full_url, extension) tuples.
    _photo_patt = re.compile(
        r'<img src="(https://ws[0-9]\.sinaimg\.cn/large/\w+\.(jpg|gif))"')

    def qingqiu(self, page):
        """Fetch one article listing page and return its HTML as text.

        Args:
            page: Page number substituted into the listing URL.

        Returns:
            The response body decoded as UTF-8.
        """
        res = requests.get(
            'http://www.doutula.com/article/list/?page={}'.format(page),
            headers=self.head,
            timeout=self.timeout,
        )
        return res.content.decode('UTF-8')

    def guolv_taotuwangzhi(self, html):
        """Extract article detail URLs from listing-page HTML.

        Args:
            html: HTML text of a listing page.

        Returns:
            List of detail-page URL strings, in document order.
        """
        return self._detail_patt.findall(html)

    def guolv_photo(self, items):
        """Visit each article URL and collect the image URLs it contains.

        Args:
            items: Iterable of article detail-page URLs.

        Returns:
            List of ``(image_url, extension)`` tuples, where ``extension`` is
            ``'jpg'`` or ``'gif'``.
        """
        tupian = []
        for article_url in items:
            res = requests.get(
                url=article_url, headers=self.head, timeout=self.timeout)
            article_html = res.content.decode('UTF-8')
            tupian.extend(self._photo_patt.findall(article_html))
        return tupian

    def save(self, tupian, save_dir=None):
        """Download every image and write it to disk as ``<index>.<ext>``.

        Args:
            tupian: Sequence of entries whose first element is the image URL
                (the tuples returned by ``guolv_photo``).
            save_dir: Target directory; defaults to ``self.save_dir``. It is
                created if it does not exist.
        """
        if save_dir is None:
            save_dir = self.save_dir
        os.makedirs(save_dir, exist_ok=True)

        for index, entry in enumerate(tupian):
            url = entry[0]
            res = requests.get(
                url=url, headers=self.head, timeout=self.timeout)
            # Name the file after the real image type. The matched URLs end
            # in either .jpg or .gif, so anything that is not .jpg is a gif.
            ext = 'jpg' if url.endswith('.jpg') else 'gif'
            path = os.path.join(save_dir, '{}.{}'.format(index, ext))
            with open(path, 'wb') as f:
                f.write(res.content)


# Run the scrape only when executed as a script, not when imported.
if __name__ == '__main__':
    spider = Spi_doutu()
    html = spider.qingqiu(2)
    url_list = spider.guolv_taotuwangzhi(html)
    tp = spider.guolv_photo(url_list)
    spider.save(tp)