Crawling Dynamic Web Page Data with Scrapy + PhantomJS

This article describes how to integrate PhantomJS into the Scrapy framework to crawl dynamically rendered web pages, covering PhantomJS installation and configuration, the Scrapy settings, and writing a custom downloader middleware.


Installing PhantomJS

Download the package from http://phantomjs.org/ ; Windows, Mac OS and Linux builds are available, so pick the one for your platform, download it and unpack it (for convenience you can add phantomjs to your environment PATH yourself). The package also contains an example folder with many ready-made scripts you can use. This article assumes PhantomJS is already installed and the environment variable has been set.
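Before wiring PhantomJS into Scrapy, it is worth a quick sanity check that Selenium can actually drive it. A minimal sketch, assuming the selenium package is installed and phantomjs is on the PATH (the URL is only an example):

# Sanity check: can Selenium start PhantomJS and render a page?
from selenium import webdriver

# If phantomjs is not on PATH, pass the binary location explicitly, e.g.
# webdriver.PhantomJS(executable_path=r"spiders\phantomjs.exe")
driver = webdriver.PhantomJS()
driver.get("http://phantomjs.org/")
print(driver.title)  # a non-empty title means rendering works
driver.quit()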

Settings in Scrapy's settings.py file

# Path to the PhantomJS binary; here it was copied into the spiders folder
JS_BIN = "spiders\\phantomjs.exe"

LOGIN_TYPE = "myCrawl"

# Anti-crawling countermeasure: do not obey robots.txt
ROBOTSTXT_OBEY = False
# Disable cookies
COOKIES_ENABLED = False
# Set the user agent: open any page in a browser, press F12 -> Network -> F5,
# click any request, and copy the value of its User-Agent header.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'

DOWNLOAD_DELAY = 3
CONCURRENT_REQUESTS = 100

# Disable the default user-agent middleware and enable the custom one
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # turn off the built-in middleware
    'JavaScriptMiddleware.JavaScriptMiddleware': 543,  # key is the middleware class path, value is its order
}
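These values can be read back anywhere in the project through Scrapy's settings API, which is how the middleware below picks up JS_BIN and LOGIN_TYPE. A minimal sketch for checking them, assuming it is run from the project root (where scrapy.cfg lives):

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('JS_BIN'))      # path to the PhantomJS binary configured above
print(settings.get('LOGIN_TYPE'))  # "myCrawl"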

Writing the middleware

What is a middleware? A downloader middleware is a hook that sits between the Scrapy engine and the downloader: its process_request method sees every outgoing request, and if it returns a Response object, that response is used directly and Scrapy's own downloader is skipped. That is exactly the trick used here: PhantomJS fetches and renders the page, and the rendered HTML is handed back to the spider as the response.

# -*- coding: utf-8 -*-
from selenium import webdriver
from scrapy.conf import settings
# from scrapy.http.response import Response
from scrapy.http import HtmlResponse
import time
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher 

class JavaScriptMiddleware(object):
    def __init__(self):
        if settings['LOGIN_TYPE'] == 'myCrawl':  # must match LOGIN_TYPE in settings.py
            '''
            self.simulation = weibo_login(settings['USERNAME'], settings['PWD'], 
            settings['COOKIE_FILE'])
            cookie_file = settings['COOKIE_FILE']
            cookie_jar = cookielib.LWPCookieJar(cookie_file)
            cookie_jar.load(ignore_discard=True, ignore_expires=True)
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
            for c in cookie_jar:
                self.driver.add_cookie({'name': c.name, 'value': c.value, 'path': '/', 'domain': c.domain})
            '''
            #  simulate user login process
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
#             login (Sina Weibo example)
#             self.driver.get('http://login.sina.com.cn/')
#             uid = self.driver.find_element_by_id('username')
#             upw = self.driver.find_element_by_id('password')
#             loginBtn = self.driver.find_element_by_class_name('smb_btn')
#             time.sleep(1)
#             uid.send_keys(settings['USERNAME'])
#             upw.send_keys(settings['PWD'])
#             loginBtn.click()
#             time.sleep(1)
        elif settings['LOGIN_TYPE'] == 'other':
            print('add login code')
            pass
        else:
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def process_request(self, request, spider):
        self.driver.get(request.url)
        print("页面渲染中····开始自动下拉页面")
        indexPage = 1000
        while indexPage<self.driver.execute_script("return document.body.offsetHeight"):
            self.driver.execute_script("scroll(0,"+str(indexPage)+")")
            indexPage = indexPage +1000
            print(indexPage)
            time.sleep(1)

        rendered_body = self.driver.page_source
        # Encoding handling: detect GBK pages so they are not decoded as UTF-8
        if r'charset="GBK"' in rendered_body or r'charset=gbk' in rendered_body:
            coding = 'gbk'
        else:
            coding = 'utf-8'
        # Returning an HtmlResponse here makes Scrapy skip its own downloader
        return HtmlResponse(request.url, body=rendered_body, encoding=coding)
    # close the browser when the spider closes
    def spider_closed(self, spider, reason):
        print('close driver......')
        self.driver.quit()  # quit() shuts down the PhantomJS process, not just the window
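On newer Scrapy versions scrapy.conf and scrapy.xlib.pydispatch have been removed, so the same wiring is usually done through from_crawler, which hands the middleware the crawler's settings and signal manager. A minimal sketch of that variant for the class above (only the construction changes; process_request and spider_closed stay the same):

from scrapy import signals
from selenium import webdriver

class JavaScriptMiddleware(object):
    def __init__(self, settings):
        self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])

    @classmethod
    def from_crawler(cls, crawler):
        mw = cls(crawler.settings)
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw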

Finally, set the spider loose on the site

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

class DmozSpider(scrapy.Spider):
    name = "crawl007"                          
    redis_key = 'blog.youkuaiyun.com'
    start_urls = ["http://blog.youkuaiyun.com/u010085423/article/details/54943875"]

    def parse(self, response):
        # //*[@id="article_details"]/div[1]/h1/span/a
        content = response.xpath("//*[@id='article_details']/div[1]/h1/span/a/text()").extract()
        if content:
            print(content[0])
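With the middleware registered in DOWNLOADER_MIDDLEWARES, the spider is started as usual with scrapy crawl crawl007, and parse() receives the PhantomJS-rendered HTML. As a sketch of where to go from here, parse() could also yield an item dict and queue further pages; the link selector below is illustrative rather than taken from the target site:

    def parse(self, response):
        # response.body here is the rendered page returned by JavaScriptMiddleware
        title = response.xpath("//*[@id='article_details']/div[1]/h1/span/a/text()").extract_first()
        if title:
            yield {'title': title, 'url': response.url}
        # Illustrative: follow other article links found in the rendered page
        for href in response.xpath("//a[contains(@href, '/article/details/')]/@href").extract():
            yield Request(response.urljoin(href), callback=self.parse)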