安装
pip3 install scrapy
生成爬虫项目
默认在当前目录生成爬虫
scrapy startproject 爬虫项目名
生成爬虫文件
scrapy genspider 爬虫名称 域名
运行
在爬虫根目录下运行
scrapy crawl 爬虫文件名称
修改settings.py文件
用户代理
USER_AGENT = ''
是否遵循爬虫协议
ROBOTSTXT_OBEY = False
打开管道
ITEM_PIPELINES = {
'doubanspider.pipelines.DoubanspiderPipeline': 300,
}
#############实例###############
1. 创建doubanspider项目
scrapy startproject doubanspider
2. 创建爬虫文件
cd doubanspider/spiders
scrapy genspider doubanspider movie.douban.com
3. 修改settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
'doubanspider.pipelines.DoubanspiderPipeline': 300,
}
4. 编写doubanspider.py
import scrapy
from ..items import DoubanspiderItem
class DoubanspiderSpider(scrapy.Spider):
    """Spider that scrapes movie name, rating and comment count from the Douban movie chart."""
    name = 'doubanspider'
    # allowed_domains must hold bare domain names, not URLs; a value with a
    # scheme ("https://...") never matches, so the offsite middleware would
    # filter out every request the spider makes.
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/chart']

    def parse(self, response):
        """Parse the chart page and yield one DoubanspiderItem per movie row.

        :param response: the downloaded chart page.
        """
        for row in response.xpath('.//tr[@class="item"]'):
            item = DoubanspiderItem()
            # .get() returns None for a missing node instead of raising
            # IndexError the way .extract()[0] did.
            item['movie_name'] = row.xpath('td/div/a/span/text()').get()
            item['score'] = row.xpath('td/div/div/span[2]/text()').get()
            item['num_of_comment'] = row.xpath('td/div/div/span[3]/text()').get()
            yield item
5. 编写items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# 实体类
class DoubanspiderItem(scrapy.Item):
    """Container for one movie record scraped from the Douban chart."""
    # Movie title text.
    movie_name = scrapy.Field()
    # Rating text as shown on the page.
    score = scrapy.Field()
    # Comment-count text as shown on the page.
    num_of_comment = scrapy.Field()
6.编写pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
# 持久化
class DoubanspiderPipeline:
    """Persist scraped items to 'douban.json' as newline-delimited JSON (JSON Lines)."""

    def __init__(self):
        # Opened lazily in open_spider so constructing the pipeline has no
        # filesystem side effects.
        self.file = None

    def open_spider(self, spider):
        """Open the output file when the spider starts (Scrapy calls this hook)."""
        self.file = open('douban.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize one item and return it unchanged for downstream pipelines.

        The original wrote objects back-to-back ("{...}{...}"), which is not
        parseable JSON; one object per line keeps every record readable on
        its own.
        """
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        print("关闭")
        self.file.close()
7. 运行
scrapy crawl doubanspider