new_movie.py 定义爬虫
import scrapy
import re
import urllib
from ..items import NewMovieItem
class new_movie(scrapy.Spider):
    """Spider that crawls the comedy tag listing on www.87movie.com.

    Crawl flow:
      1. ``parse`` reads the pagination widget on the start page to find the
         number of listing pages and schedules a request for each one.
      2. ``parse_page`` follows every movie link found on a listing page.
      3. ``parse_info`` fills a ``NewMovieItem`` from the movie detail page.
    """

    name = 'new_movie'
    start_urls = ['http://www.87movie.com/tag/喜剧']
    allowed_domains = ['www.87movie.com']

    def parse_info(self, response):
        """Fill the item carried in ``response.meta`` from a detail page.

        Every field is stored as the raw list returned by ``extract()``;
        no post-processing is done here.

        Returns:
            The populated item that was passed in via ``meta['movie_info']``.
        """
        movie_info = response.meta['movie_info']
        movie_info['name'] = response.xpath(
            '//div[@class="white-div"]//div[@class="col-md-8"]/h3/text()'
        ).extract()
        movie_info['pic'] = response.xpath(
            '//div[@class="white-div"]//img/@src'
        ).extract()
        movie_info['content'] = response.xpath(
            '//div[@class="white-div"]//div[@class="col-md-8"]/text()'
        ).extract()
        movie_info['download'] = response.xpath(
            '//div[@class="white-div"]//ul[@class="list-unstyled"]/li/a/@href'
        ).extract()
        # This callback produces exactly one item, so a plain return is enough.
        return movie_info

    def parse_page(self, response):
        """Schedule a detail-page request for every movie on a listing page.

        Yields:
            One ``scrapy.Request`` per movie link, each carrying a fresh
            ``NewMovieItem`` in ``meta['movie_info']`` for ``parse_info``.
        """
        movie_links = response.xpath(
            '//ul[@class="list-unstyled mlist"]/li/div/div[@class="col-md-10"]/h4/a/@href'
        ).extract()
        for href in movie_links:
            movie_info = NewMovieItem()
            # urljoin resolves root-relative, relative and absolute hrefs
            # correctly, unlike plain string concatenation with the host.
            yield scrapy.Request(
                response.urljoin(href),
                meta={'movie_info': movie_info},
                callback=self.parse_info,
            )

    def parse(self, response):
        """Entry callback: schedule a request for every listing page.

        The page count is taken from the href of the last pagination entry
        (the final path segment, with any query string removed). If the
        pagination widget is absent or that segment is not a number, only
        page 1 is crawled instead of failing.

        Yields:
            One ``scrapy.Request`` per listing page, handled by ``parse_page``.
        """
        last_page_hrefs = response.xpath(
            '//ul[@class="pagination"]/li[last()]/a/@href'
        ).extract()

        num_max = 1
        if last_page_hrefs:
            last_segment = last_page_hrefs[0].split('/')[-1].split('?')[0]
            try:
                num_max = int(last_segment)
            except ValueError:
                self.logger.warning(
                    "Unexpected pagination href %r; crawling page 1 only",
                    last_page_hrefs[0],
                )

        for page_number in range(1, num_max + 1):
            page_url = "http://www.87movie.com/tag/喜剧/{}?o=date".format(page_number)
            self.logger.debug("Scheduling listing page %s", page_url)
            # yield (not return) so that every page gets scheduled.
            yield scrapy.Request(page_url, callback=self.parse_page)
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#item.py
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NewMovieItem(scrapy.Item):
    """Container for the data scraped from a single movie detail page."""

    # Title text of the movie.
    name = scrapy.Field()

    # Image source URL(s) found on the detail page.
    pic = scrapy.Field()

    # Free-text description taken from the detail page.
    content = scrapy.Field()

    # Download link(s) listed for the movie.
    download = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json#将数据存入json
import codecs
class NewMoviePipeline(object):
    """Item pipeline that writes each scraped movie to ``movie_info.json``.

    The output is in JSON-lines form: one JSON object per line, written as
    UTF-8 with non-ASCII characters left unescaped.
    """

    def __init__(self):
        # Kept open for the lifetime of the pipeline; released in close_spider().
        self.file = codecs.open('movie_info.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Normalise ``content`` and ``pic``, then append the item to the file.

        Args:
            item: Scraped item whose ``content`` and ``pic`` fields are lists.
            spider: The spider that produced the item (unused).

        Returns:
            The same item, with ``content`` and ``pic`` reduced to strings.
        """
        # The description is taken from the second-to-last extracted text
        # node. Fall back to an empty string when fewer than two nodes were
        # extracted, so one odd page does not raise IndexError.
        content_parts = item['content']
        item['content'] = content_parts[-2] if len(content_parts) >= 2 else ''

        # Only the first image is kept; a missing image yields an empty string.
        # NOTE(review): the prefix has no URL scheme ("http://") - confirm
        # whether consumers of the JSON expect that before changing it.
        pic_sources = item['pic']
        item['pic'] = ('www.87movie.com' + pic_sources[0]) if pic_sources else ''

        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file; Scrapy calls this hook when the spider closes."""
        self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias for ``close_spider``."""
        self.close_spider(spider)
settings.py
# Scrapy project settings for the new_movie crawler.

# Name of the bot implemented by this project.
BOT_NAME = 'new_movie'

# Module where newly generated spiders are placed, and the list of modules
# Scrapy searches for spiders.
NEWSPIDER_MODULE = 'new_movie.spiders'
SPIDER_MODULES = ['new_movie.spiders']

# Enabled item pipelines, mapped to their order value.
ITEM_PIPELINES = {'new_movie.pipelines.NewMoviePipeline': 300}

# Delay, in seconds, between consecutive requests.
DOWNLOAD_DELAY = 1