Crawling a movie website with Scrapy

This post walks through a crawler built with Python's Scrapy framework. It scrapes comedy movies from a specific site, collecting each film's name, poster image link, synopsis, and download links. The spider works in three steps: parse() reads the pagination bar to learn how many listing pages exist, parse_page() follows every movie link on each listing page, and parse_info() extracts the details from each movie's own page.


new_movie.py: defining the spider


import scrapy
from ..items import NewMovieItem


class new_movie(scrapy.Spider):
    name = 'new_movie'
    start_urls = ['http://www.87movie.com/tag/喜剧']
    allowed_domains = ['www.87movie.com']

    def parse_info(self, response):
        # Fill in the item that parse_page passed along via meta.
        movie_info = response.meta['movie_info']
        movie_info['name'] = response.xpath('//div[@class="white-div"]//div[@class="col-md-8"]/h3/text()').extract()
        movie_info['pic'] = response.xpath('//div[@class="white-div"]//img/@src').extract()
        movie_info['content'] = response.xpath('//div[@class="white-div"]//div[@class="col-md-8"]/text()').extract()
        movie_info['download'] = response.xpath('//div[@class="white-div"]//ul[@class="list-unstyled"]/li/a/@href').extract()
        return movie_info  # this callback produces exactly one item, so a plain return is enough

    def parse_page(self, response):
        # Collect the detail-page links of every movie on this listing page.
        movie_list = response.xpath('//ul[@class="list-unstyled mlist"]/li/div/div[@class="col-md-10"]/h4/a/@href').extract()
        basic_url = "http://www.87movie.com"
        for i in movie_list:
            movie_info = NewMovieItem()  # NewMovieItem lives in items.py, hence the import above
            yield scrapy.Request(basic_url + i, meta={'movie_info': movie_info}, callback=self.parse_info)

    def parse(self, response):  # parse() is the default callback Scrapy runs for start_urls
        # The href of the last link in the pagination bar carries the total page count.
        tmp_str = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').extract()
        num_max = int(tmp_str[0].split('/')[-1].split('?')[0])
        for i in range(1, num_max + 1):
            basic_url = "http://www.87movie.com/tag/喜剧/{}?o=date".format(i)
            print(basic_url)  # debug output: which listing page is being scheduled
            yield scrapy.Request(basic_url, callback=self.parse_page)  # yield, not return, so the loop keeps scheduling
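The page-count arithmetic in parse() is compact: it takes the href of the last pagination link and peels the page number out of it. A minimal standalone sketch, assuming the last link has the form /tag/喜剧/12?o=date (the 12 is a made-up value):

# Standalone sketch of the page-number extraction in parse().
# Assumes the last pagination href looks like '/tag/喜剧/<N>?o=date' (hypothetical).
tmp_str = ['/tag/喜剧/12?o=date']        # what extract() might return
last_part = tmp_str[0].split('/')[-1]    # '12?o=date'
num_max = int(last_part.split('?')[0])   # 12
print(num_max)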


items.py: defining the item

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NewMovieItem(scrapy.Item):
    # One field per piece of movie data the spider extracts.
    name = scrapy.Field()
    pic = scrapy.Field()
    content = scrapy.Field()
    download = scrapy.Field()
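A scrapy.Item subclass behaves like a dictionary restricted to its declared fields, which is what lets the pipeline below call dict(item). A quick illustration (the value is hypothetical):

# NewMovieItem supports dict-style access; assigning to an undeclared
# field raises KeyError, which catches typos in field names early.
item = NewMovieItem()
item['name'] = ['Some Movie']   # hypothetical value
print(dict(item))               # {'name': ['Some Movie']}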

pipelines.py: the item pipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json  # items are serialized to JSON, one object per line
import codecs


class NewMoviePipeline(object):
    def __init__(self):
        self.file = codecs.open('movie_info.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Post-process the raw extracted data before writing it out.
        item['content'] = item['content'][-2]  # the second-to-last text node holds the synopsis
        item['pic'] = 'www.87movie.com' + item['pic'][0]  # the image src is site-relative, so prepend the domain
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):  # Scrapy calls close_spider on pipelines; a method named spider_closed would never fire
        self.file.close()
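Because each item is dumped as one JSON object per line, movie_info.json is in JSON Lines format and can be read back record by record. A short sketch, assuming the default output path above:

import json

# Read back the JSON Lines file written by NewMoviePipeline,
# one movie record per line.
with open('movie_info.json', encoding='utf-8') as f:
    for line in f:
        movie = json.loads(line)
        print(movie['name'], movie['download'])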



settings.py: project settings

BOT_NAME = 'new_movie'

SPIDER_MODULES = ['new_movie.spiders']
NEWSPIDER_MODULE = 'new_movie.spiders'

DOWNLOAD_DELAY = 1

ITEM_PIPELINES = {
    'new_movie.pipelines.NewMoviePipeline': 300,
}
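With the pipeline registered in ITEM_PIPELINES and DOWNLOAD_DELAY throttling requests to one per second, the crawl is normally launched from the project root with scrapy crawl new_movie. It can also be driven from a plain Python script; a minimal sketch using Scrapy's CrawlerProcess, assuming it runs from the project root so the settings above are picked up:

# Script-based alternative to the 'scrapy crawl new_movie' CLI command.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('new_movie')  # spider name as defined in new_movie.py
process.start()             # blocks until the crawl finishes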



