1.系统环境的配置
# Install the Python 3 toolchain and compiler needed to build Scrapy's dependencies.
sudo apt install python3-pip python3-dev build-essential
# The module is named "pip"; "python3 -m pip3" fails with "No module named pip3".
sudo python3 -m pip install --upgrade pip
sudo pip3 install virtualenvwrapper
# -p keeps this step harmless on re-runs.
# NOTE(review): /var/www is usually root-owned; if this fails with
# "Permission denied", create the directory with sudo and chown it to your user.
mkdir -p /var/www/EnvRoot
# Shell assignments must not have whitespace around "=", otherwise the
# variable is never set and export reports an invalid identifier.
export WORKON_HOME=/var/www/EnvRoot
export VIRTUALENVWRAPPER_PYTHON=/usr/bin/python3
source /usr/local/bin/virtualenvwrapper.sh
source ~/.zshrc
# Create (and activate) the virtualenv, then install into it.
mkvirtualenv scrapy
# beautifulsoup4 provides the "bs4" module that the spider imports.
pip install scrapy beautifulsoup4
scrapy startproject douban
2.爬虫文件的编写
- 编辑爬虫文件以及数据库连接存储文件,将爬取到的数据存到数据库中,后期可以利用这些数据进行一些数据分析
# "scrapy startproject douban" creates douban/scrapy.cfg plus the Python
# package douban/douban/, so from the project root the spiders directory
# is douban/spiders/ (a bare spiders/ directory does not exist here).
cd douban
vim douban/spiders/douban.py
import scrapy
import re
from bs4 import BeautifulSoup
from douban.items import DoubanItem
class DbSpider(scrapy.Spider):
name ='douban'
allowed_domains = ["douban.com"]
start_urls =["https://www.douban.com/doulist/43430373"]
def parse(self,response):
item = DoubanItem()
response.encding='utf-8'
soup = BeautifulSoup(response.text,'html.parser')
books= soup.select('.doulist-item')
selector = scrapy.Selector(response)
for book in books:
if len(book.select('.title a'))>0:
title =book.select('.title a')[0].text
rate =book.select('.rating span')[1].text
score =book.select('.rating span')[2].text.lstrip('(').strip('人评价)')
author =book.select('.abstract')[0]