Install PhantomJS
Download the package from http://phantomjs.org/ ; Windows, Mac OS, and Linux builds are available, so pick the one for your platform and simply unzip it (for convenience you can add an environment variable for phantomjs yourself). The package includes an examples folder with plenty of ready-made scripts you can reuse. This article assumes PhantomJS is already installed and the environment variable has been set.
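Before wiring it into Scrapy, a quick smoke test confirms that Selenium can drive PhantomJS. A minimal sketch, assuming phantomjs is on your PATH (otherwise pass executable_path explicitly):
from selenium import webdriver

# launch a headless PhantomJS browser; pass executable_path if phantomjs is not on PATH
driver = webdriver.PhantomJS()  # e.g. webdriver.PhantomJS(executable_path=r"spiders\phantomjs.exe")
driver.get("http://phantomjs.org/")
print(driver.title)   # the rendered page title should be printed
driver.quit()         # shut down the phantomjs process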
Configure Scrapy's settings file
# Path to the phantomjs executable; here I copied it into the spiders folder
JS_BIN="spiders\\phantomjs.exe"
LOGIN_TYPE="myCrawl"
# Anti-crawling: ignore robots.txt (the default is True)
ROBOTSTXT_OBEY = False
# Disable cookies
COOKIES_ENABLED = False
# Set the User-Agent. Open any page in a browser, press F12 -> Network -> F5, click any request, and you will see a User-agent entry; just copy its value here.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
DOWNLOAD_DELAY = 3
CONCURRENT_REQUESTS=100
# Replace the default user agent middleware with our own
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,  # disable the built-in user-agent downloader middleware
    'JavaScriptMiddleware.JavaScriptMiddleware': 543  # the key is the middleware class path, the value is its order
}
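To verify that the custom keys (JS_BIN, LOGIN_TYPE) are actually picked up, you can load the project settings from a Python shell run inside the project directory; a minimal sketch:
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('JS_BIN'))               # should print spiders\phantomjs.exe
print(settings.getbool('COOKIES_ENABLED'))  # should print False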
Write the middleware
# -*- coding: utf-8 -*-
from selenium import webdriver
from scrapy.conf import settings
# from scrapy.http.response import Response
from scrapy.http import HtmlResponse
import time
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
class JavaScriptMiddleware(object):

    def __init__(self):
        if settings['LOGIN_TYPE'] == 'myCrawl':
            '''
            self.simulation = weibo_login(settings['USERNAME'], settings['PWD'],
                                          settings['COOKIE_FILE'])
            cookie_file = settings['COOKIE_FILE']
            cookie_jar = cookielib.LWPCookieJar(cookie_file)
            cookie_jar.load(ignore_discard=True, ignore_expires=True)
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
            for c in cookie_jar:
                self.driver.add_cookie({'name': c.name, 'value': c.value, 'path': '/', 'domain': c.domain})
            '''
            # simulate the user login process
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
            # login
            # self.driver.get('http://login.sina.com.cn/')
            # uid = self.driver.find_element_by_id('username')
            # upw = self.driver.find_element_by_id('password')
            # loginBtn = self.driver.find_element_by_class_name('smb_btn')
            # time.sleep(1)
            # uid.send_keys(settings['USERNAME'])
            # upw.send_keys(settings['PWD'])
            # loginBtn.click()
            # time.sleep(1)
        elif settings['LOGIN_TYPE'] == 'other':
            print('add login code')
            pass
        else:
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
        # close the browser when the spider closes
        dispatcher.connect(self.spider_closed, signals.spider_closed)
    def process_request(self, request, spider):
        self.driver.get(request.url)
        print("Rendering the page... starting to auto-scroll down")
        indexPage = 1000
        # keep scrolling in 1000px steps until the bottom so lazy-loaded content gets rendered
        while indexPage < self.driver.execute_script("return document.body.offsetHeight"):
            self.driver.execute_script("scroll(0," + str(indexPage) + ")")
            indexPage = indexPage + 1000
            print(indexPage)
            time.sleep(1)
        rendered_body = self.driver.page_source
        # handle the encoding of the rendered page
        if r'charset="GBK"' in rendered_body or r'charset=gbk' in rendered_body:
            coding = 'gbk'
        else:
            coding = 'utf-8'
        return HtmlResponse(request.url, body=rendered_body, encoding=coding)
    # close the browser
    def spider_closed(self, spider, reason):
        print('close driver......')
        self.driver.quit()  # quit() ends the phantomjs process, not just the window
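Note that from scrapy.conf import settings and scrapy.xlib.pydispatch are deprecated in recent Scrapy releases. A minimal sketch of the equivalent wiring through from_crawler, assuming a newer Scrapy version (only the plumbing is shown; the body of __init__ stays the same as above):
from scrapy import signals

class JavaScriptMiddleware(object):

    def __init__(self, settings):
        self.js_bin = settings.get('JS_BIN')
        self.login_type = settings.get('LOGIN_TYPE')
        # ... create the PhantomJS driver here, as in the version above ...

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook and passes the crawler, which carries the settings
        middleware = cls(crawler.settings)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider, reason):
        pass  # close the driver here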
Finally, set the spider loose on the site
# -*- coding: utf-8 -*-
import scrapy


class DmozSpider(scrapy.Spider):
    name = "crawl007"
    redis_key = 'blog.youkuaiyun.com'
    start_urls = ["http://blog.youkuaiyun.com/u010085423/article/details/54943875"]

    def parse(self, response):
        # //*[@id="article_details"]/div[1]/h1/span/a
        content = response.xpath("//*[@id='article_details']/div[1]/h1/span/a/text()").extract()
        if content:
            print(content[0])
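Run it from the project root with scrapy crawl crawl007. If you would rather push the extracted title through Scrapy's item pipeline instead of just printing it, here is a minimal sketch of a parse method that yields a plain dict (the field names url and title are only illustrative):
    def parse(self, response):
        title = response.xpath("//*[@id='article_details']/div[1]/h1/span/a/text()").extract_first()
        if title:
            # yield a plain dict so item pipelines and feed exports can pick it up
            yield {'url': response.url, 'title': title}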