使用 Python 爬取“斗图”

一、代码如下 

import re
import requests

class Spi_doutu(object):
    """Small scraper that downloads images from doutula.com article pages.

    Typical flow: ``qingqiu`` fetches one listing page, ``guolv_taotuwangzhi``
    extracts the article detail links from it, ``guolv_photo`` visits every
    detail page and collects the image URLs, and ``save`` downloads the
    images to disk.
    """

    # Browser-like User-Agent header sent with every request.
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36'}

    # Seconds to wait for a response; without a timeout a stalled connection
    # would block the whole run indefinitely.
    timeout = 10

    # Directory used by ``save`` when the caller does not supply one.
    save_dir = r'C:\Users\wangyl\daima\doutu'

    # Patterns are compiled once as raw strings, with literal dots escaped so
    # that "." cannot match an arbitrary character.
    _detail_patt = re.compile(
        r'<a href="(http://www\.doutula\.com/article/detail/[0-9]{7})"')
    _photo_patt = re.compile(
        r'<img src="(https://ws[0-9]\.sinaimg\.cn/large/\w+\.(jpg|gif))"')

    def qingqiu(self, page):
        """Fetch listing page number ``page`` and return its HTML as text."""
        res = requests.get(
            'http://www.doutula.com/article/list/?page={}'.format(page),
            headers=self.head,
            timeout=self.timeout,
        )
        return res.content.decode('UTF-8')

    def guolv_taotuwangzhi(self, html):
        """Return the list of article detail URLs found in ``html``."""
        return self._detail_patt.findall(html)

    def _guolv_tupian(self, html):
        """Return ``(image_url, extension)`` tuples found in one detail page."""
        return self._photo_patt.findall(html)

    def guolv_photo(self, items):
        """Visit every detail URL in ``items`` and collect the image matches.

        Returns a list of ``(image_url, extension)`` tuples, where
        ``extension`` is ``'jpg'`` or ``'gif'``.
        """
        tupian = []
        for detail_url in items:
            res = requests.get(
                url=detail_url, headers=self.head, timeout=self.timeout)
            tupian.extend(self._guolv_tupian(res.content.decode('UTF-8')))
        return tupian

    def _wenjianming(self, index, item):
        """Build the file name for ``item`` (a ``(url, extension)`` tuple)."""
        # Use the extension captured by the regex so a jpg is saved as .jpg
        # and a gif as .gif.
        return '{}.{}'.format(index, item[1])

    def save(self, tupian, save_dir=None):
        """Download every image in ``tupian`` into ``save_dir``.

        ``tupian`` is the list of ``(image_url, extension)`` tuples returned
        by ``guolv_photo``. Files are named ``<index>.<extension>``.
        ``save_dir`` defaults to the class attribute of the same name and is
        created if it does not exist yet.
        """
        import os

        if save_dir is None:
            save_dir = self.save_dir
        os.makedirs(save_dir, exist_ok=True)

        for index, item in enumerate(tupian):
            res = requests.get(
                url=item[0], headers=self.head, timeout=self.timeout)
            target = os.path.join(save_dir, self._wenjianming(index, item))
            with open(target, 'wb') as f:
                f.write(res.content)

# Driver: crawl listing page 2, follow every article link found on it,
# collect the image URLs from those articles and download them all.
spider = Spi_doutu()
html = spider.qingqiu(page=2)
url_list = spider.guolv_taotuwangzhi(html=html)
tp = spider.guolv_photo(items=url_list)
spider.save(tupian=tp)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值