Scrapy downloader middleware: adding a random User-Agent and proxy

settings.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB_NAME = "Douban"
MONGODB_SHEET_NAME = "douban_movies"

DOWNLOADER_MIDDLEWARES = {
    "douban.middlewares.RandomUserAgent": 100,
    "douban.middlewares.RandomProxy": 200,
}

ITEM_PIPELINES = {
    "douban.pipelines.DoubanPipeline": 300,
}

# Fill in real browser User-Agent strings here.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]

PROXIES = [
    # a proxy that requires authentication
    {"ip_port": "0.0.0.1:8888", "user_password": "username:password"},
    # proxies that do not require authentication
    {"ip_port": "0.0.0.1:8888", "user_password": ""},
    {"ip_port": "0.0.0.1:8888", "user_password": ""},
]
------------------------------------------------------------------------------------------
items.py

import scrapy

class DoubanItem(scrapy.Item):
    title = scrapy.Field()   # movie title
    bd = scrapy.Field()      # director / cast / year line
    star = scrapy.Field()    # rating
    quote = scrapy.Field()   # one-line blurb (not every movie has one)
------------------------------------------------------------------------------------------
douban.py

import scrapy
from douban.items import DoubanItem

class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    base_url = "https://movie.douban.com/top250?start="
    offset = 0
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            # Create a fresh item per movie; reusing one item across the loop
            # would leak the previous movie's quote into entries that lack one.
            item = DoubanItem()
            item["title"] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
            item["bd"] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
            item["star"] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
            quote = each.xpath(".//div[@class='quote']/span/text()").extract()
            # Guard against movies with no quote.
            if quote:
                item["quote"] = quote[0]
            yield item

        # Top250 spans start=0 through start=225 in steps of 25 (10 pages).
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(url=self.base_url + str(self.offset), callback=self.parse)
------------------------------------------------------------------------------------------
pipelines.py

import pymongo

class DoubanPipeline(object):

    def __init__(self, host, port, db_name, sheet_name):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.db = self.client[db_name]
        self.sheet = self.db[sheet_name]

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the MongoDB connection info out of settings.py.
        settings = crawler.settings
        return cls(
            host=settings.get("MONGODB_HOST"),
            port=settings.get("MONGODB_PORT"),
            db_name=settings.get("MONGODB_DB_NAME"),
            sheet_name=settings.get("MONGODB_SHEET_NAME"),
        )

    def process_item(self, item, spider):
        # insert_one replaces the Collection.insert call deprecated in pymongo 3.
        self.sheet.insert_one(dict(item))
        return item
------------------------------------------------------------------------------------------
middlewares.py

import random
import base64

from douban.settings import USER_AGENTS
from douban.settings import PROXIES

# Attach a random User-Agent to every outgoing request.
class RandomUserAgent(object):

    def process_request(self, request, spider):
        # Pick one of the User-Agent strings defined in settings.
        user_agent = random.choice(USER_AGENTS)
        # Add it to the request headers.
        request.headers.setdefault("User-Agent", user_agent)

# Route every request through a randomly chosen proxy.
class RandomProxy(object):

    def process_request(self, request, spider):
        # Pick one of the proxies defined in settings.
        proxy = random.choice(PROXIES)
        if not proxy["user_password"]:
            # No credentials needed: just set the proxy.
            request.meta["proxy"] = "http://" + proxy["ip_port"]
        else:
            # Credentials needed: send them via the Proxy-Authorization header.
            # b64encode works on bytes in Python 3, hence encode()/decode().
            token = base64.b64encode(proxy["user_password"].encode("utf-8")).decode("ascii")
            request.headers["Proxy-Authorization"] = "Basic " + token
            request.meta["proxy"] = "http://" + proxy["ip_port"]
------------------------------------------------------------------------------------------
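The Proxy-Authorization value that RandomProxy builds can be sanity-checked outside Scrapy. A minimal standalone sketch; "username:password" is the placeholder credential from settings.py, not a real one:

import base64

# Standalone check of the header value RandomProxy builds.
user_password = "username:password"
token = base64.b64encode(user_password.encode("utf-8")).decode("ascii")
print("Basic " + token)  # Basic dXNlcm5hbWU6cGFzc3dvcmQ=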
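The User-Agent middleware can likewise be exercised by hand, since process_request only needs a Request object. A sketch, assuming it runs from the project root so the douban.* imports resolve; passing spider=None is fine here because the middleware never touches it:

from scrapy.http import Request
from douban.middlewares import RandomUserAgent

mw = RandomUserAgent()
req = Request("https://movie.douban.com/top250")
mw.process_request(req, spider=None)
print(req.headers["User-Agent"])  # one of the USER_AGENTS entries, stored as bytes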
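Back in douban.py, pagination works by mutating self.offset on the spider. An alternative sketch for the end of parse, in place of the offset bookkeeping, that follows the page's own "next" link instead; this assumes Top250 still exposes that link under a span with class "next", as it historically did:

# inside parse(), replacing the offset-based block
next_page = response.xpath("//span[@class='next']/a/@href").extract_first()
if next_page:
    yield scrapy.Request(response.urljoin(next_page), callback=self.parse)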
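After running the spider with `scrapy crawl douban`, the stored items can be inspected directly with pymongo. A minimal sketch, assuming MongoDB is listening on the host/port configured in settings.py and a pymongo recent enough to have count_documents (3.7+):

import pymongo

client = pymongo.MongoClient("127.0.0.1", 27017)
sheet = client["Douban"]["douban_movies"]
print(sheet.count_documents({}))       # should approach 250 after a full crawl
print(sheet.find_one({}, {"_id": 0}))  # one stored movie, without the ObjectId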