1. System environment setup
sudo apt install python3-pip python3-dev build-essential
sudo python3 -m pip install --upgrade pip
sudo pip3 install virtualenvwrapper
sudo mkdir -p /var/www/EnvRoot
sudo chown $USER /var/www/EnvRoot
export WORKON_HOME=/var/www/EnvRoot
export VIRTUALENVWRAPPER_PYTHON=/usr/bin/python3
source /usr/local/bin/virtualenvwrapper.sh
Append the two export lines and the source line above to ~/.zshrc so they survive new shells, then reload:
source ~/.zshrc
mkvirtualenv scrapy
pip install scrapy
scrapy startproject douban
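For reference, scrapy startproject douban generates a skeleton like this; all of the files edited below live inside the inner douban/ package:

douban/
├── scrapy.cfg
└── douban/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py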
2. Writing the spider
- Edit the spider and the database storage code so the scraped data is saved to a database; the data can later be used for some analysis.
cd douban
vim douban/spiders/douban.py
import re

import scrapy
from bs4 import BeautifulSoup

from douban.items import DoubanItem


class DbSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ["douban.com"]
    start_urls = ["https://www.douban.com/doulist/43430373"]

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'html.parser')
        books = soup.select('.doulist-item')
        selector = scrapy.Selector(response)
        for book in books:
            if len(book.select('.title a')) > 0:
                # One fresh item per book; reusing a single item across
                # requests would make all callbacks share the same object.
                item = DoubanItem()
                title = book.select('.title a')[0].text
                rate = book.select('.rating span')[1].text
                score = book.select('.rating span')[2].text.strip('()人评价')
                author = book.select('.abstract')[0].text
                title = title.replace(' ', '').replace('\n', '')
                author = author.replace('\n\r', '').replace(' ', '')
                aa = re.split('[\n]+', author)
                urlb = book.select('.title a')[0]['href']
                item['title'] = title
                item['rate'] = rate
                item['author'] = aa[1][3:]    # drop the leading "作者:" label
                item['score'] = score
                item['press'] = aa[2][4:]     # drop the leading "出版社:" label
                item['pretime'] = aa[3][4:]   # drop the leading "出版年:" label
                # Follow the book's detail page to pick up the remaining fields.
                yield scrapy.Request(urlb, callback=self.parse_book,
                                     meta={'item': item})
        nextPage = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextPage:
            yield scrapy.Request(nextPage[0], callback=self.parse)

    def parse_book(self, response):
        item = response.meta['item']
        ISBN = response.xpath(u'//span[.//text()[normalize-space(.)="ISBN:"]]/following::text()[1]').extract_first(default='')
        price = response.xpath(u'//span[.//text()[normalize-space(.)="定价:"]]/following::text()[1]').extract_first(default='')
        number = response.xpath(u'//span[.//text()[normalize-space(.)="页数:"]]/following::text()[1]').extract_first(default='')
        item['ISBN'] = ISBN.replace(' ', '')
        item['price'] = price.replace(' ', '')
        item['number'] = number.replace(' ', '')
        yield item
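Before running the full crawl, the selectors can be sanity-checked interactively in scrapy shell; the exact counts and strings depend on the live page, so treat the session below as a sketch:

scrapy shell 'https://www.douban.com/doulist/43430373'
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup(response.text, 'html.parser')
>>> len(soup.select('.doulist-item'))                             # books on this page
>>> soup.select('.doulist-item .title a')[0].text.strip()         # first title
>>> response.xpath('//span[@class="next"]/link/@href').extract()  # next-page URL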
vim douban/items.py

import scrapy


class DoubanItem(scrapy.Item):
    ISBN = scrapy.Field()
    title = scrapy.Field()
    rate = scrapy.Field()
    author = scrapy.Field()
    score = scrapy.Field()
    press = scrapy.Field()
    pretime = scrapy.Field()
    price = scrapy.Field()   # must match the 'price' key used in the spider and pipeline
    number = scrapy.Field()
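Declaring the fields pays off because a scrapy.Item only accepts declared keys, so a misspelled key (for example prise instead of price) raises a KeyError right away instead of silently producing bad rows. The book title here is made up:

>>> item = DoubanItem()
>>> item['title'] = '三体'
>>> item['prise'] = '38.00'
KeyError: 'DoubanItem does not support field: prise'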
vim douban/middlewares.py

from scrapy import signals


class DoubanSpiderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class DoubanDownloaderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
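The two classes above are the unmodified template that startproject generates, and they can stay that way for this crawl. If Douban starts rejecting requests, a common next step is a downloader middleware that rotates the User-Agent per request. A minimal sketch, assuming a hand-picked UA pool (the class name and the list are mine, not part of the generated project):

import random


class RandomUserAgentMiddleware(object):
    # Small pool of desktop browser UAs; extend as needed.
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    ]

    def process_request(self, request, spider):
        # Overwrite the UA header before the downloader sends the request.
        request.headers['User-Agent'] = random.choice(self.user_agents)

To activate it, register the class in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgentMiddleware': 400,
}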
vim douban/pipelines.py

import pymysql


class DoubanPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            db='douban',
            user='root',
            password='root',
            port=3306,
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        try:
            # Use the ISBN as a natural key to skip books already stored.
            self.cursor.execute(
                """select * from dbook where ISBN=%s""", (item['ISBN'],))
            repetition = self.cursor.fetchone()
            if not repetition:
                self.cursor.execute(
                    """insert into dbook(ISBN,title,rate,author,score,press,pretime,price,number)
                       values(%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                    (item['ISBN'], item['title'], item['rate'], item['author'],
                     item['score'], item['press'], item['pretime'],
                     item['price'], item['number']))
                self.connect.commit()
        except Exception as error:
            spider.logger.error(error)
        return item

    def close_spider(self, spider):
        self.connect.close()
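The pipeline assumes the douban database and the dbook table already exist. A one-off setup script along the following lines works; the column types are assumptions based on the fields being stored as plain strings:

import pymysql

connect = pymysql.connect(host='localhost', user='root', password='root',
                          port=3306, charset='utf8')
cursor = connect.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS douban DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS douban.dbook (
        ISBN    VARCHAR(32) PRIMARY KEY,
        title   VARCHAR(255),
        rate    VARCHAR(16),
        author  VARCHAR(255),
        score   VARCHAR(32),
        press   VARCHAR(255),
        pretime VARCHAR(64),
        price   VARCHAR(64),
        number  VARCHAR(32)
    )
""")
connect.commit()
connect.close()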
vim douban/settings.py

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Present a regular browser User-Agent instead of Scrapy's default.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'

ROBOTSTXT_OBEY = True

# Throttle requests and disable cookies to lower the risk of being banned.
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False

ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}
3. Running the spider
Create main.py in the project root (next to scrapy.cfg) so the crawl can be started with a single command:

vim main.py

from scrapy import cmdline

cmdline.execute("scrapy crawl douban".split())
python3 main.py
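Once the crawl finishes, a quick row count confirms that the data landed in MySQL (the connection parameters match the pipeline above):

import pymysql

connect = pymysql.connect(host='localhost', db='douban', user='root',
                          password='root', port=3306, charset='utf8')
cursor = connect.cursor()
cursor.execute("SELECT COUNT(*) FROM dbook")
print(cursor.fetchone()[0], "books stored")
connect.close()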