import asyncio
import time
import random
from pyppeteer import launch # 控制模拟浏览器用
from pyppeteer.dialog import Dialog
from retrying import retry # 设置重试次数用的
# JS snippets evaluated in the page to work around headless-browser detection:
#   js1 hides navigator.webdriver (the classic automation fingerprint);
#   js2 pops an alert() showing the (now masked) webdriver value, which also
#   exercises the dialog handler registered in main().
js1 = '''() =>{Object.defineProperties(navigator,{webdriver:{get: () => undefined} })}'''
js2 = '''() => {alert ( window.navigator.webdriver )}'''
async def main(url, proxy=None):
    """Open *url* in a stealth-patched Chromium, wait 5 s, then close it.

    Args:
        url: Page to visit.
        proxy: Optional proxy string containing "ip:port".
            BUG FIX: the original body read a name ``proxy`` that was never
            defined anywhere in the file (guaranteed NameError); it is now an
            explicit parameter defaulting to None, which is backward
            compatible for existing ``main(url)`` callers.
    """
    import re  # BUG FIX: ``re`` was used below but never imported

    # Path of an unpacked Chrome extension to load into the browser.
    chrome_extension = r'C:\Users\Administrator.USER-20190313RI\AppData\Local\Chromium\User Data\Default\Extensions\dbclpoekepcmadpkeaelmhiheolhjflj\0.2.7_0'
    args = [
        '--no-sandbox',
        '--disable-gpu',
        '--log-level=3',       # log verbosity
        '--disable-infobars',  # hide the "controlled by automation" bar
        '--window-size={},{}'.format(1080, 950),
        # "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
        '--load-extension={}'.format(chrome_extension),  # load the extension
        '--disable-extensions-except={}'.format(chrome_extension),
    ]
    if proxy:
        # Only forward the proxy when it looks like "ip:port".
        ip = re.findall('(\d+\.\d+\.\d+\.\d+:\d+)', proxy)
        if len(ip) > 0:
            args.append("--proxy-server=http://{}".format(proxy))
    exepath = r'D:\Program_Files\chrome-win32\chrome.exe'  # Chromium executable path
    # dumpio=True keeps the browser from stalling on unread console output.
    params = {
        'executablePath': exepath,
        "userDataDir": r"F:\temporary",
        'headless': False,
        'args': args,
        'dumpio': True,
    }
    browser = await launch(params)   # start the pyppeteer-controlled browser
    page = await browser.newPage()   # open a fresh tab
    await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36')
    # Navigation timeout: 60 s (setDefaultNavigationTimeout is synchronous).
    page.setDefaultNavigationTimeout(1000 * 60)
    await page.setJavaScriptEnabled(True)
    # Request interception (disabled):
    # await page.setRequestInterception(True)
    await page.evaluate(js1)  # hide navigator.webdriver
    await page.evaluate(js2)  # alert() the webdriver flag -> triggers handle_dialog
    await page.setViewport({'width': 1300, 'height': 868})  # viewport size
    page.on('dialog', lambda dialog: asyncio.ensure_future(handle_dialog(page, dialog)))
    await page.goto(url)  # visit the target page
    await asyncio.sleep(5)
    await browser.close()
async def handle_dialog(page, dialog: Dialog):
    """Report a JS dialog (alert/confirm/prompt), pause, then dismiss it."""
    # Log what popped up: the dialog text, then its kind.
    print(dialog.message)
    print(dialog.type)
    # dialog.defaultValue exists only for prompt dialogs (not printed here).
    # Leave the dialog visible for two seconds before closing it.
    await page.waitFor(2000)
    await dialog.dismiss()
    # To answer a prompt with a value instead: await dialog.accept('000')
if __name__ == '__main__':
    # Demo run: drive the stealth-patched browser to Baidu once.
    target = 'https://www.baidu.com'
    asyncio.get_event_loop().run_until_complete(main(target))
from scrapy import signals
import pyppeteer
import asyncio
import os
import time
import json
import tkinter
from scrapy.http import HtmlResponse
from pyppeteer.dialog import Dialog
from Aliexpress.ConfigDB import RedisDB,RedisPool
import logging
# Quiet pyppeteer's and websockets' very chatty DEBUG/INFO logging.
pyppeteer_level = logging.WARNING
logging.getLogger('pyppeteer').setLevel(pyppeteer_level)
logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
# NOTE(review): redundant — the 'pyppeteer' logger level was already set above.
pyppeteer_logger = logging.getLogger('pyppeteer')
pyppeteer_logger.setLevel(logging.WARNING)
# redisconn=RedisDB(db=0)
# Shared Redis connection (pooled), used for user-agent and proxy rotation.
redisconn=RedisPool(db=0)
pyppeteer.DEBUG = False
def _patch_pyppeteer():
    """Patch pyppeteer's websocket Connection to disable keepalive pings.

    pyppeteer's default websocket settings drop the browser connection during
    long page loads; this swaps in a subclass whose socket has no size limit
    and no ping interval/timeout.
    """
    from typing import Any
    from pyppeteer import connection, launcher
    import websockets.client

    class NoTimeoutConnection(connection.Connection):  # type: ignore
        def __init__(self, *args: Any, **kwargs: Any) -> None:
            super().__init__(*args, **kwargs)
            # The _ws created by the base class is not yet connected, so it
            # can simply be replaced with one using friendlier defaults.
            # (connect() marks these params non-Optional even though the
            # underlying WebSocketCommonProtocol accepts None — hence the
            # type: ignore comments; fixed upstream but not yet released.)
            self._ws = websockets.client.connect(
                self._url,
                loop=self._loop,
                max_size=None,       # type: ignore
                ping_interval=None,  # type: ignore
                ping_timeout=None,   # type: ignore
            )

    connection.Connection = NoTimeoutConnection
    # launcher imports Connection as a module global, so patch it there too.
    launcher.Connection = NoTimeoutConnection
class DownloaderMiddleware(object):
    """Scrapy downloader middleware that renders requests with pyppeteer.

    One Chromium browser and one page are launched at construction time and
    reused for every request; process_request returns the rendered HTML as
    an HtmlResponse. Not all middleware methods need custom logic — Scrapy
    treats the pass-through implementations as "no modification".
    """

    def __init__(self):
        # Launch the single shared browser/page synchronously so that
        # process_request can reuse them for every request.
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.getbrowser())
        loop.run_until_complete(task)

    async def getbrowser(self):
        """Launch Chromium, open one page, and apply anti-bot-detection patches."""
        # Rotate a user-agent: pop one from the Redis set and push it onto a
        # list — presumably for recycling; confirm against the Redis schema.
        self.ua = redisconn.sget("user-agent")
        redisconn.rpush('user-agent', self.ua)
        # Proxy is fetched (side effect on the Redis set) but the flag below
        # is currently commented out.
        proxies = redisconn.sget("daxiangIP")
        Proxies = str(proxies, encoding='utf-8')
        self.browser = await pyppeteer.launch({
            'headless': False,
            'timeout': 0,
            'args': [
                # BUG FIX: was '--window-size={1600},{700}' — the braces were
                # never .format()ed, so Chrome received a malformed flag.
                '--window-size=1600,700',
                '--disable-extensions',
                '--hide-scrollbars',
                '--disable-bundled-ppapi-flash',
                '--mute-audio',
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-gpu',
                '--disable-infobars',
                # '--proxy-server={}'.format(Proxies)
            ],
            'dumpio': True,
        })
        # Alternative: connect to a remote browser instead of launching:
        # self.browser = await pyppeteer.connect({'browserWSEndpoint': 'ws://172.20.1.233:3007'})
        self.page = await self.browser.newPage()
        await self.page.setViewport(viewport={'width': 1600, 'height': 800})
        await self.page.setUserAgent(str(self.ua, encoding='utf-8'))
        await self.page.setJavaScriptEnabled(enabled=True)
        # Mask the usual headless-automation fingerprints probed by anti-bot
        # scripts: webdriver flag, window.chrome, languages, plugins.
        await self.page.evaluate(
            '''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')
        await self.page.evaluate('''() =>{ window.navigator.chrome = { runtime: {}, }; }''')
        await self.page.evaluate(
            '''() =>{ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); }''')
        await self.page.evaluate(
            '''() =>{ Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }''')
        await self.page.waitFor(10000)
        return self.page

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your middleware instance.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Render request.url in the shared page and return the HTML.

        Returning a Response object short-circuits the rest of the download
        chain, so Scrapy never fetches the URL itself.
        """
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.usePypuppeteer(request))
        loop.run_until_complete(task)
        return HtmlResponse(url=request.url, body=task.result(), encoding="utf-8", request=request)

    async def intercept_request(self, req):
        # Abort heavyweight resource types; let everything else through.
        if req.resourceType in ['image', 'media', 'eventsource', 'websocket']:
            await req.abort()
        else:
            await req.continue_()

    async def intercept_response(self, res):
        # Placeholder hook; xhr/fetch body inspection is currently disabled.
        resourceType = res.request.resourceType

    async def usePypuppeteer(self, request):
        """Navigate to request.url, scroll to trigger lazy loading, return HTML.

        Returns the page HTML, or the literal string "error" if anything in
        the interaction raises.
        """
        await self.page.goto(request.url, {'timeout': 0})
        # BUG FIX: the lambda referenced an undefined name ``page``; it must
        # use self.page. NOTE(review): this re-registers a handler on every
        # request — consider moving the registration into getbrowser().
        self.page.on('dialog',
                     lambda dialog: asyncio.ensure_future(self.handle_dialog(self.page, dialog)))
        await asyncio.sleep(10)
        try:
            self.ck = await self.page.querySelector('#product-detail ul > li:nth-child(3) > div > span')
            # BUG FIX: querySelector returns None when the element is absent;
            # clicking None raised AttributeError, sending every such page to
            # the "error" path below.
            if self.ck is not None:
                await self.ck.click()
            if R'/store' in request.url or R"SearchText=" in request.url:
                # Store/search pages: block heavy resources, reload, and jump
                # straight to the bottom of the page.
                await self.page.setRequestInterception(True)
                self.page.on('request', self.intercept_request)
                self.page.on('response', self.intercept_response)
                await self.page.goto(request.url, {'timeout': 0})
                await asyncio.sleep(5)
                await self.page.evaluate('window.scrollBy(0, document.body.scrollHeight)')
            else:
                # Detail pages: scroll down in steps so lazy content loads.
                for i in range(0, 6):
                    await self.page.evaluate('window.scrollBy(0, {})'.format(800 * i))
                    await asyncio.sleep(5)
            content = await self.page.content()
        except Exception as e:
            print("error >>", e)
            content = "error"
        return content

    async def handle_dialog(self, page, dialog: Dialog):
        """Log a JS dialog (alert/confirm/prompt), pause, then dismiss it."""
        print(dialog.message)  # dialog text
        print(dialog.type)     # alert / confirm / prompt
        # Leave the dialog visible for two seconds before closing it.
        await page.waitFor(2000)
        await dialog.dismiss()
        # To answer a prompt with a value instead: await dialog.accept('000')

    def process_response(self, request, response, spider):
        # Pass the downloader's response through unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # Returning None lets other middlewares / default handling proceed.
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# Apply the websocket keepalive patch as soon as this module is imported.
_patch_pyppeteer()