Scrapy basics

Supplementary Scrapy methods

URL joining

      response.urljoin(incomplete_link)  # join a relative link onto the URL of the current response
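A minimal sketch of using it inside a parse callback; the spider name, the XPath and the example href are purely illustrative:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'

    def parse(self, response):
        href = response.xpath('//a/@href').extract_first()  # e.g. a relative link such as '/chuzu/pn2/'
        full_url = response.urljoin(href)                    # joined onto response.url to form an absolute URL
        yield scrapy.Request(url=full_url, callback=self.parse)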

When using multiple item classes

     In the pipeline, check whether the item belongs to a given item class (see the sketch below):

           if isinstance(item, ItemClassName):
                   # save this kind of item here
           return item
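A minimal sketch of a pipeline that tells the two item classes defined later (Demo58Item_zufang and Demo58Item_ershou) apart and writes each to its own file; the class name SplitPipeline and the file names are illustrative:

import json
from demo_58.items import Demo58Item_zufang, Demo58Item_ershou

class SplitPipeline(object):
    def open_spider(self, spider):
        self.zufang_file = open('zufang.json', 'w', encoding='utf-8')
        self.ershou_file = open('ershou.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # route each item to its own file based on its class
        if isinstance(item, Demo58Item_zufang):
            self.zufang_file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        elif isinstance(item, Demo58Item_ershou):
            self.ershou_file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.zufang_file.close()
        self.ershou_file.close()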

Check the scheme of the request URL so the proxy uses the matching scheme

        if request.url.startswith('http://'):
                request.meta['proxy'] = 'http://' + ip
        elif request.url.startswith('https://'):
                request.meta['proxy'] = 'https://' + ip
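A minimal sketch of this check inside a downloader middleware; the class name SchemeAwareProxy and the proxy addresses are placeholders:

import random

class SchemeAwareProxy:
    http_proxies = ['http://124.116.116.13:4228']     # placeholder proxies
    https_proxies = ['https://122.194.194.139:4212']  # placeholder proxies

    def process_request(self, request, spider):
        # pick a proxy whose scheme matches the scheme of the request URL
        if request.url.startswith('https://'):
            request.meta['proxy'] = random.choice(self.https_proxies)
        elif request.url.startswith('http://'):
            request.meta['proxy'] = random.choice(self.http_proxies)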

Pipeline (pipelines.py)

      import json

      class PipelineName(object):

  1.             def __init__(self):
                        # option 1: open the output file when the pipeline object is created
                        self.file = open("filename.json", 'w', encoding="utf-8")

  1.             def open_spider(self, spider):
                        # option 1 (alternative): open the output file when the spider starts
                        self.file = open("filename.json", 'w', encoding="utf-8")

     2.             def process_item(self, item, spider):
                            # check whether the item belongs to a given item class, save it, then return it
                            if isinstance(item, ItemClassName):
                                data = dict(item)
                                self.file.write(json.dumps(data, ensure_ascii=False) + ',\n')
                                # ensure_ascii=False: json.dumps escapes non-ASCII characters by default,
                                # so Chinese text only appears readable in the file when ensure_ascii=False is passed
                            return item

    3.              def close_spider(self, spider):
                            self.file.close()  # close the file when the spider finishes

    3.              def __del__(self):
                            self.file.close()  # option 3 (alternative): close the file when the pipeline object is destroyed
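A pipeline only runs after it is enabled in settings.py. A minimal sketch, assuming the project is demo_58 and the pipeline class is named Demo58Pipeline (with several pipelines enabled, items pass through the lower-numbered ones first):

ITEM_PIPELINES = {
    'demo_58.pipelines.Demo58Pipeline': 300,
}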

Settings file (settings.py)

Proxy and User-Agent middlewares

DOWNLOADER_MIDDLEWARES = {
    # 'demo_58.middlewares.Demo58DownloaderMiddleware': 543,
    # random User-Agent
    'demo_58.middlewares.UserAgentDownloadMiddleware': 543,
    # random proxy IP
    'demo_58.middlewares.RandomProxy': 542,
}

Default request headers

DEFAULT_REQUEST_HEADERS = {
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'referer': 'https://cs.58.com/chuzu/?PGTID=0d100000-0019-e694-7c7e-9295d99e15c1&ClickID=2'
}

Middlewares file (middlewares.py)

Random proxy IP

import random

class RandomProxy:
    ip_list = [
        '124.116.116.13:4228',
        '122.194.194.139:4212',
        '36.42.248.45:4215',
        '1.83.250.183:4228',
        '49.85.43.175:4223',
        '121.205.229.70:4231',

    ]

    # the method name is fixed by Scrapy (downloader-middleware hook)

    def process_request(self, request, spider):
        proxy = random.choice(self.ip_list)

        # modify the request's meta dict
        # when the proxies are stored as bare "ip:port" strings, the scheme prefix ('https://') must be added here, otherwise the request fails
        request.meta['proxy'] = 'https://' + proxy

        # print the meta so the proxy that was applied can be checked
        print('IP:', request.meta)

Random User-Agent

class UserAgentDownloadMiddleware:
    user_agent = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16.2'
    ]

    # the method name is fixed by Scrapy (downloader-middleware hook)
    # every request handed to the downloader passes through this method
    def process_request(self, request, spider):
        # pick a random User-Agent
        u_a = random.choice(self.user_agent)
        # set it on the request's headers
        request.headers['User-Agent'] = u_a
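A quick way to confirm the middleware took effect is to log the User-Agent that Scrapy actually sent; a minimal sketch of a parse callback, purely illustrative:

def parse(self, response):
    # response.request is the request that produced this response,
    # so its headers carry the User-Agent set by the middleware
    self.logger.info(response.request.headers.get('User-Agent'))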

Item classes (items.py)

import scrapy

# rental listings
class Demo58Item_zufang(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()  # rental title
    price = scrapy.Field()  # rental price

# second-hand housing
class Demo58Item_ershou(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()  # second-hand title
    price = scrapy.Field()  # second-hand price

Spider file

import scrapy
# import the item classes
from demo_58.items import Demo58Item_zufang,Demo58Item_ershou

class SpiderSpider(scrapy.Spider):
    name = 'spider'

    # restrict crawling to this domain
    allowed_domains = ['58.com']

    # the start URL
    start_urls = ['http://58.com/']

    def parse(self, response):
        # extract the category links from the homepage
        links = response.xpath('//div[@class="board"]//span[@class="contentAdTilRt"]/a/@href').extract()

        for link in links:
            # response.urljoin() joins the relative link onto the current page's URL
            href = response.urljoin(link)

            # only follow the sub-sections we want to crawl
            if '58.com/chuzu/' in href:
                yield scrapy.Request(url=href,callback=self.get_zufang_data)
            if '58.com/ershoufang/' in href:
                yield scrapy.Request(url=href, callback=self.get_ershoufang_data)

    '''parse the rental listings'''
    def get_zufang_data(self, response):
        titles = response.xpath('//div[@class="des"]/h2/a/text()').extract()  # listing titles
        prices = response.xpath('//div[@class="list-li-right"]//div[@class="money"]/b/text()').extract()  # listing prices

        for title, price in zip(titles, prices):
            # instantiate the rental item class
            zufang_item = Demo58Item_zufang()
            zufang_item['title'] = title
            zufang_item['price'] = price
            # hand the item back to the engine
            yield zufang_item

        # build the pagination requests
        for i in range(2, 12):
            print('Downloading rental listings page {}'.format(i))
            z_next_url = 'https://cs.58.com/chuzu/pn{}/'.format(i)
            # wrap the next-page URL in a Request and hand it to the engine
            # callback = the method that will parse the response
            yield scrapy.Request(url=z_next_url, callback=self.get_zufang_data)

    '''parse the second-hand listings'''
    def get_ershoufang_data(self, response):
        titles = response.xpath('//div[@class="property-content-detail"]/div[@class="property-content-title"]/h3/text()').extract()  # listing titles
        prices = response.xpath('//p[@class="property-price-total"]/span[@class="property-price-total-num"]/text()').extract()  # listing prices

        for title, price in zip(titles, prices):
            # instantiate the second-hand item class
            ershou_item = Demo58Item_ershou()
            ershou_item['title'] = title.strip()
            ershou_item['price'] = price
            # hand the item back to the engine
            yield ershou_item

        # second-hand pagination
        for i in range(2, 12):
            print('Downloading second-hand listings page {}'.format(i))
            next_url = 'https://cs.58.com/ershoufang/p{}/'.format(i)
            # wrap the next-page URL in a Request and hand it to the engine
            yield scrapy.Request(url=next_url, callback=self.get_ershoufang_data)

if __name__ == '__main__':
    # launcher: run the spider from this file
    from scrapy import cmdline
    cmdline.execute(['scrapy', 'crawl', 'spider'])
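As an alternative to the JSON-writing pipeline, Scrapy's built-in feed export can save the items directly; a minimal sketch, where result.json is an arbitrary output file name:

from scrapy import cmdline
# the -o flag appends the scraped items to result.json via the feed exporter
cmdline.execute(['scrapy', 'crawl', 'spider', '-o', 'result.json'])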
