Using Scrapy in Python

This article walks through building a Mafengwo (马蜂窝) crawler with Python's Scrapy framework: installing Scrapy and pywin32, creating the project, and writing the code. The spider routes requests through a proxy and stores the scraped data in MongoDB.




Environment: Python 2.7



1. Install Scrapy

pip install scrapy
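
Note that Scrapy 2.0 and later only support Python 3. For a Python 2.7 environment like this one, pin a 1.x release, for example:

pip install "scrapy<2.0"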

2. Install pywin32 (Windows): download the installer and click Next through the wizard.
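
Newer pywin32 builds can also be installed straight from PyPI, which skips the standalone installer (on Python 2.7 you may need an older build):

pip install pywin32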

3. Create the Scrapy project

scrapy startproject mafengwo
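
startproject generates a skeleton like the following (the exact files vary slightly across Scrapy versions):

mafengwo/
    scrapy.cfg            # deploy configuration
    mafengwo/
        __init__.py
        items.py          # item definitions (QAItem goes here)
        middlewares.py    # downloader middlewares (UserAgentMiddleware goes here)
        pipelines.py      # item pipelines (the MongoDB pipeline goes here)
        settings.py       # project settings, shown below
        spiders/
            __init__.py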

4. Write the code

5. Code walkthrough:

This project crawls Q&A data from Mafengwo. It is built on Scrapy, sends requests through a proxy, and stores the results in MongoDB. The original post does not include the middleware or pipeline files that the settings reference, so minimal sketches of both are given right after the settings block below.

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for mafengwo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'mafengwo'

SPIDER_MODULES = ['mafengwo.spiders']
NEWSPIDER_MODULE = 'mafengwo.spiders'


DOWNLOADER_MIDDLEWARES = {
    'mafengwo.middlewares.UserAgentMiddleware': 401,
    # 'mafengwo.middlewares.CookiesMiddleware': 402,
}

ITEM_PIPELINES = {
    'mafengwo.pipelines.MongoDBPipeline': 10,
}

DOWNLOAD_DELAY = 10  # delay (in seconds) between requests
# CONCURRENT_ITEMS = 1000
# CONCURRENT_REQUESTS = 100
# REDIRECT_ENABLED = False
# CONCURRENT_REQUESTS_PER_DOMAIN = 100
# CONCURRENT_REQUESTS_PER_IP = 0
# CONCURRENT_REQUESTS_PER_SPIDER=100
# DNSCACHE_ENABLED = True
# LOG_LEVEL = 'INFO'    # log level
# CONCURRENT_REQUESTS = 70
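
The settings enable a custom UserAgentMiddleware from middlewares.py, which the post does not include. Below is a minimal sketch, assuming a hand-picked user-agent list and a placeholder proxy URL; Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy'], which is the usual way to wire in the proxy mentioned above.

middlewares.py (sketch)

# -*- coding: utf-8 -*-
import random

# assumption: any reasonably current desktop user agents will do
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
]

PROXY_URL = 'http://127.0.0.1:8888'  # placeholder -- substitute a working proxy


class UserAgentMiddleware(object):
    """Rotate the User-Agent header and route every request through a proxy."""

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(USER_AGENTS)
        # Scrapy's built-in HttpProxyMiddleware picks this up automatically
        request.meta['proxy'] = PROXY_URL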
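
ITEM_PIPELINES points at a MongoDB pipeline in pipelines.py, which is also not shown in the post. A minimal sketch with pymongo, assuming a local MongoDB instance and hypothetical database/collection names mafengwo and qa:

pipelines.py (sketch)

# -*- coding: utf-8 -*-
import pymongo


class MongoDBPipeline(object):
    """Store every scraped item as a MongoDB document."""

    def open_spider(self, spider):
        # assumed local instance; adjust host/port for your setup
        self.client = pymongo.MongoClient('localhost', 27017)
        self.collection = self.client['mafengwo']['qa']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item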

Core code (the spider):

# -*- coding: utf-8 -*-
import json
import re
from urlparse import urljoin  # Python 2.7

import scrapy
from scrapy import Request, Selector
from html2text import html2text

from mafengwo.items import QAItem  # defined in items.py (see the sketch after the spider)


class MafengwoQaSpider(scrapy.Spider):
    name = 'mafengwo-qa'

    def parse(self, response):
        # the index endpoint returns JSON; 'list_html' holds an HTML fragment
        html_text = json.loads(response.body)['payload']['list_html']
        for href in Selector(text=html_text).xpath(
                '//li//div[@class="title"]/a[@href]/@href').extract():
            url = urljoin(response.url, href)
            self.logger.debug('question url: %s', url)
            yield Request(url=url, callback=self.parse_question)

    def start_requests(self):
        # question index pages: 20 questions per page, first 500 questions
        for start_idx in xrange(0, 500, 20):
            yield Request(url='http://www.mafengwo.cn/qa/ajax_pager.php?action=question_index&start=%d' % start_idx)

    def parse_question(self, response):
        # follow links to related questions
        for related_href in response.selector.xpath(
                '//div[@class="q-relate"]/ul[@class="bd"]/li/a[@href]/@href').extract():
            url = urljoin(response.url, related_href)
            yield Request(url=url, callback=self.parse_question)

        q_item = self.retrieve_question(response)
        yield q_item

        # fetch the answers via the paged AJAX endpoint
        qid = q_item['qid']
        page = 0
        page_size = 50
        url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
              % (qid, page * page_size)
        yield Request(url=url, callback=self.parse_answer_list, meta={'qid': qid, 'page': page, 'page_size': page_size})

    def retrieve_question(self, response):
        """
        Parse the response into a question item.
        """
        tmp = response.selector.xpath('//div[@class="q-detail"]//div[@class="pub-bar fr"]//a[@href]')
        try:
            user_href = tmp[0].xpath('./@href').extract()[0]
        except IndexError:
            self.logger.warning('Invalid response: %s' % response.url)
            #self.logger.warning(response.body)
            raise
        m = re.search(r'/wenda/u/(\d+)', user_href)
        author_id = int(m.group(1))
        tmp = tmp[0].xpath('./img/@src').extract()[0]
        author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp)
        if author_avatar.endswith('pp48.gif'):
            author_avatar = None
        author_name = response.selector.xpath(
                '//div[@class="q-content"]//div[@class="pub-bar fr"]/a[@class="name"]/text()').extract()[0]

        title = response.selector.xpath('//div[@class="q-content"]/div[@class="q-title"]/h1/text()').extract()[0]

        raw_contents = \
            response.selector.xpath('//div[@class="q-content"]//div[@class="q-desc"]').extract()[0]
        contents = html2text(raw_contents)

        topic = response.selector.xpath(
                '//div[@class="q-content"]/div[@class="q-title"]//a[@class="location"]/text()').extract()[0]
        #print("tmp="+tmp)
        #topic=re.search('<i></i>*',tmp).group(0);
        #print("topic="+topic)
        #view_cnt = int(re.search(ur'(\d+)\s*浏览', tmp).group(1))
        temp=response.selector.xpath(
                '//div[@class="q-detail"]//div[@class="fr"]//span[@class="atten-num"]/text()').extract()[0]
        print("temp="+temp)
        view_cnt = news=int(re.sub('浏览'.decode("utf8"),'',temp))
        print("view_cnt="+str(view_cnt))
        timestamp = response.selector.xpath(
                '//div[@class="q-content"]//div[@class="pub-bar fr"]//span[@class="time"]//span/text()').extract()[0]

        raw_tags = response.selector.xpath(
                '//div[@class="q-content"]//a[@class="a-tag"]/text()').extract()
        tags = [tag.strip() for tag in raw_tags if tag.strip()]

        # question id from the detail-page URL, e.g. .../wenda/detail-12345.html
        match = re.search(r'detail-(\d+)\.html', response.url)
        qid = int(match.group(1))

        item = QAItem()
        item['source'] = 'mafengwo'
        item['type'] = 'question'
        item['qid'] = qid
        item['title'] = title
        item['author_nickname'] = author_name
        item['author_id'] = author_id
        if author_avatar:
            item['author_avatar'] = author_avatar
            item['file_urls'] = [author_avatar]
        item['timestamp'] = timestamp
        if topic:
            item['topic'] = topic
        item['contents'] = contents
        item['tags'] = tags
        item['view_cnt'] = view_cnt

        return item

    def parse_answer_list(self, response):
        meta = response.meta
        qid = meta['qid']
        page = meta['page']
        page_size = meta['page_size']
        self.logger.debug('answer page: %s', response.url)
        sel = Selector(text=json.loads(response.body)['payload']['list_html'])
        answer_nodes = sel.xpath('//li[contains(@class, "answer-item")]')
        if not answer_nodes:
            return

        # a full page implies there may be a next page
        if len(answer_nodes) == page_size:
            next_page = page + 1
            url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
                  % (qid, next_page * page_size)
            yield Request(url=url, callback=self.parse_answer_list,
                          meta={'qid': qid, 'page': next_page, 'page_size': page_size})

        for answer_node in sel.xpath('//li[contains(@class, "answer-item") and @data-aid]'):
            aid = int(answer_node.xpath('./@data-aid').extract()[0])

            # NOTE: the inner XPaths must start with "." so they are relative to
            # answer_node; absolute "//" paths would always hit the first answer
            author_node = answer_node.xpath('.//a[@class="_j_filter_click avatar"]')[0]
            temp = author_node.xpath('./@href').extract()[0]
            author_id = int(re.search(r'(\d+)', temp).group(1))

            tmp = author_node.xpath('./img/@src').extract()[0]
            author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp)
            author_name = answer_node.xpath('.//a[@class="name"]/text()').extract()[0]
            if author_avatar.endswith('pp48.gif'):
                author_avatar = None

            content_node = answer_node.xpath('.//div[contains(@class,"_j_answer_html")]').extract()[0]

            timestamp = answer_node.xpath('.//div[@class="a-operate _js_operate clearfix"]//div[@class="pub-time"]//span/text()').extract()[0]

            contents = html2text(content_node)

            try:
                vote_cnt = int(answer_node.xpath('.//a[@class="btn-ding _js_zan"]//span/text()').extract()[0])
            except (IndexError, ValueError):
                self.logger.debug(u'Invalid vote count: %s' % answer_node.extract())
                vote_cnt = 0

            item = QAItem()
            item['type'] = 'answer'
            item['source'] = 'mafengwo'
            item['qid'] = qid
            item['aid'] = aid
            item['author_nickname'] = author_name
            item['author_id'] = author_id
            if author_avatar:
                item['author_avatar'] = author_avatar
                item['file_urls'] = [author_avatar]
            item['timestamp'] = timestamp
            item['contents'] = contents
            item['vote_cnt'] = vote_cnt

            yield item
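
The spider fills in a QAItem whose definition the post omits. Reconstructed from the fields the spider assigns, items.py would look roughly like this:

items.py (sketch, reconstructed from usage)

# -*- coding: utf-8 -*-
import scrapy


class QAItem(scrapy.Item):
    source = scrapy.Field()            # always 'mafengwo'
    type = scrapy.Field()              # 'question' or 'answer'
    qid = scrapy.Field()
    aid = scrapy.Field()               # answers only
    title = scrapy.Field()             # questions only
    author_nickname = scrapy.Field()
    author_id = scrapy.Field()
    author_avatar = scrapy.Field()
    file_urls = scrapy.Field()         # follows Scrapy's FilesPipeline convention
    timestamp = scrapy.Field()
    topic = scrapy.Field()
    contents = scrapy.Field()
    tags = scrapy.Field()
    view_cnt = scrapy.Field()          # questions only
    vote_cnt = scrapy.Field()          # answers only

With everything in place, run the spider from the project root:

scrapy crawl mafengwo-qa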

