运行环境:Python 2.7
1、安装scrapy
pip install scrapy
2、安装pywin32 下载安装 一路next
3、创建scrapy项目
scrapy startproject mafengwo
4、编写代码 代码下载 点击打开链接
5、代码说明:
本项目是一个用 Python 爬取马蜂窝网站问答数据的项目:以 scrapy 作为爬虫框架,使用代理防止封禁,并用 MongoDB 存储爬取结果。
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for mafengwo111 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Project identity.
BOT_NAME = 'mafengwo111'

# Where Scrapy discovers existing spiders and places newly generated ones.
SPIDER_MODULES = ['mafengwo111.spiders']
NEWSPIDER_MODULE = 'mafengwo111.spiders'

# Rotate the User-Agent on every request; cookie rotation is kept disabled.
DOWNLOADER_MIDDLEWARES = {
    "mafengwo111.middlewares.UserAgentMiddleware": 401,
    # "mafengwo111.middlewares.CookiesMiddleware": 402,
}

# Persist scraped items into MongoDB.
ITEM_PIPELINES = {
    'mafengwo111.pipelines.MongoDBPipleline': 10,
}

# Seconds to wait between consecutive requests (politeness / anti-ban).
DOWNLOAD_DELAY = 10

# Tuning knobs kept for reference, all currently disabled:
# CONCURRENT_ITEMS = 1000
# CONCURRENT_REQUESTS = 100
# REDIRECT_ENABLED = False
# CONCURRENT_REQUESTS_PER_DOMAIN = 100
# CONCURRENT_REQUESTS_PER_IP = 0
# CONCURRENT_REQUESTS_PER_SPIDER = 100
# DNSCACHE_ENABLED = True
# LOG_LEVEL = 'INFO'
核心代码:
class MafengwoQaSpider(scrapy.Spider):
    """Crawl questions and answers from mafengwo.cn's Q&A section.

    Flow: start_requests seeds the paginated question index; parse
    schedules each question's detail page; parse_question yields one
    question item, follows related questions, and schedules the paginated
    answer-list endpoint handled by parse_answer_list.
    """

    name = 'mafengwo-qa'

    def start_requests(self):
        """Seed the crawl with the first 25 index pages (20 questions each)."""
        # range (not Py2-only xrange) so the code runs on both Python 2 and 3;
        # the list is only 25 elements long.
        for start_idx in range(0, 500, 20):
            yield Request(
                url='http://www.mafengwo.cn/qa/ajax_pager.php?action=question_index&start=%d'
                    % start_idx)

    def parse(self, response):
        """Parse one page of the question index and schedule each question."""
        # The endpoint returns JSON whose payload carries an HTML fragment.
        html_text = json.loads(response.body)['payload']['list_html']
        for href in Selector(text=html_text).xpath(
                '//li//div[@class="title"]/a[@href]/@href').extract():
            url = urljoin(response.url, href)
            yield Request(url=url, callback=self.parse_question)

    def parse_question(self, response):
        """Yield the question item, follow related questions, and schedule
        the first page of this question's answers."""
        # Follow "related question" links; Scrapy's dupe filter prevents
        # the mutual recursion from looping forever.
        for related_href in response.selector.xpath(
                '//div[@class="q-relate"]/ul[@class="bd"]/li/a[@href]/@href').extract():
            yield Request(url=urljoin(response.url, related_href),
                          callback=self.parse_question)
        q_item = self.retrive_question(response)
        yield q_item
        # Schedule page 0 of the answers for this question.
        qid = q_item['qid']
        page = 0
        page_size = 50
        url = ('http://www.mafengwo.cn/qa/ajax_pager.php'
               '?qid=%d&action=question_detail&start=%d' % (qid, page * page_size))
        yield Request(url=url, callback=self.parse_answer_list,
                      meta={'qid': qid, 'page': page, 'page_size': page_size})

    def retrive_question(self, response):
        """Build and return the question QAItem from a question detail page.

        Re-raises IndexError (after logging the URL) when the expected
        author block is missing, e.g. on anti-bot placeholder pages.
        (Name kept as-is — "retrive" sic — for compatibility with callers.)
        """
        author_links = response.selector.xpath(
            '//div[@class="q-detail"]//div[@class="pub-bar fr"]//a[@href]')
        try:
            user_href = author_links[0].xpath('./@href').extract()[0]
        except IndexError:
            self.logger.warning('Invalid response: %s' % response.url)
            raise
        author_id = int(re.search(r'/wenda/u/(\d+)', user_href).group(1))
        # Strip the thumbnail-size infix (".head.wNN.") for the full-size avatar.
        avatar_src = author_links[0].xpath('./img/@src').extract()[0]
        author_avatar = re.sub(r'\.head\.w\d+\.', '.', avatar_src)
        if author_avatar.endswith('pp48.gif'):
            # pp48.gif is the site's default placeholder avatar — drop it.
            author_avatar = None
        author_name = response.selector.xpath(
            '//div[@class="q-content"]//div[@class="pub-bar fr"]/a[@class="name"]/text()'
        ).extract()[0]
        title = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="q-title"]/h1/text()').extract()[0]
        raw_contents = response.selector.xpath(
            '//div[@class="q-content"]//div[@class="q-desc"]').extract()[0]
        contents = html2text(raw_contents)
        topic = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="q-title"]//a[@class="location"]/text()'
        ).extract()[0]
        # Counter text looks like u'1234浏览' ("1234 views"); keep the digits.
        # u'浏览' (instead of '浏览'.decode("utf8")) works on Python 2 and 3,
        # and the dead chained "news =" assignment is gone.
        view_text = response.selector.xpath(
            '//div[@class="q-detail"]//div[@class="fr"]//span[@class="atten-num"]/text()'
        ).extract()[0]
        view_cnt = int(re.sub(u'浏览', u'', view_text))
        timestamp = response.selector.xpath(
            '//div[@class="q-content"]//div[@class="pub-bar fr"]//span[@class="time"]//span/text()'
        ).extract()[0]
        tags = response.selector.xpath(
            '//div[@class="q-content"]//a[@class="a-tag"]/text()').extract()
        qid = int(re.search(r'detail-(\d+)\.html', response.url).group(1))
        item = QAItem()
        item['source'] = 'mafengwo'
        item['type'] = 'question'
        item['qid'] = qid
        item['title'] = title
        item['author_nickname'] = author_name
        item['author_id'] = author_id
        if author_avatar:
            item['author_avatar'] = author_avatar
            item['file_urls'] = [author_avatar]
        item['timestamp'] = timestamp
        if topic:
            item['topic'] = topic
        item['contents'] = contents
        item['tags'] = tags
        item['view_cnt'] = view_cnt
        return item

    def parse_answer_list(self, response):
        """Parse one page of a question's answers and schedule the next page."""
        meta = response.meta
        qid = meta['qid']
        page = meta['page']
        page_size = meta['page_size']
        sel = Selector(text=json.loads(response.body)['payload']['list_html'])
        answer_nodes = sel.xpath('//li[contains(@class, "answer-item")]')
        if not answer_nodes:
            return
        # A full page means there may be more answers: schedule the next one.
        if len(answer_nodes) == page_size:
            next_page = page + 1
            url = ('http://www.mafengwo.cn/qa/ajax_pager.php'
                   '?qid=%d&action=question_detail&start=%d' % (qid, next_page * page_size))
            yield Request(url=url, callback=self.parse_answer_list,
                          meta={'qid': qid, 'page': next_page, 'page_size': page_size})
        for answer_node in sel.xpath('//li[contains(@class, "answer-item") and @data-aid]'):
            aid = int(answer_node.xpath('./@data-aid').extract()[0])
            # BUG FIX: all per-answer sub-queries must be relative ('.//').
            # The original absolute '//' paths searched the whole fragment, so
            # every answer was credited with the first answer's author,
            # content, timestamp and vote count.
            author_node = answer_node.xpath('.//a[@class="_j_filter_click avatar"]')[0]
            profile_href = author_node.xpath('./@href').extract()[0]
            # int() for consistency with question items' author_id field.
            author_id = int(re.search(r'(\d+)', profile_href).group(1))
            avatar_src = author_node.xpath('./img/@src').extract()[0]
            author_avatar = re.sub(r'\.head\.w\d+\.', '.', avatar_src)
            if author_avatar.endswith('pp48.gif'):
                # Default placeholder avatar — treat as "no avatar".
                author_avatar = None
            author_name = answer_node.xpath('.//a[@class="name"]/text()').extract()[0]
            content_html = answer_node.xpath(
                './/div[contains(@class,"_j_answer_html")]').extract()[0]
            timestamp = answer_node.xpath(
                './/div[@class="a-operate _js_operate clearfix"]'
                '//div[@class="pub-time"]//span/text()').extract()[0]
            contents = html2text(content_html)
            try:
                vote_cnt = int(answer_node.xpath(
                    './/a[@class="btn-ding _js_zan"]//span/text()').extract()[0])
            except (IndexError, ValueError):
                # extract() on a single selector returns a string; the original
                # logged only its first character via extract()[0].
                self.logger.debug(u'Invalid vote count: %s' % answer_node.extract())
                vote_cnt = 0
            item = QAItem()
            item['type'] = 'answer'
            item['source'] = 'mafengwo'
            item['qid'] = qid
            item['aid'] = aid
            item['author_nickname'] = author_name
            item['author_id'] = author_id
            if author_avatar:
                item['author_avatar'] = author_avatar
                item['file_urls'] = [author_avatar]
            item['timestamp'] = timestamp
            item['contents'] = contents
            item['vote_cnt'] = vote_cnt
            yield item