python - pychrome 页面抓取测试
Max.Bai
2019.06
记录一下去年做页面抓取时写的脚本。
使用 Chrome headless 模式抓取页面内容,借助 Python 的 pychrome 包。
需要先启动浏览器(开启远程调试端口),
然后通过 pychrome 调用 Chrome DevTools Protocol。
#! python3
# _*_ coding:utf-8 _*_
__author__ = 'Max.Bai'
__date__ = '2018.06'
import pychrome
import threadpool
import threading
# Held by BrowserManager.get_tab while it searches the pool for a free tab.
ticket_lock = threading.Lock()
# Held by BrowserManager.release_tab while it refreshes a tab's pool entry.
p_lock = threading.Lock()
import time
# start chrome first
# "C:\Program Files (x86)\Google\Chrome\Application\chrome.exe" --headless --remote-debugging-port=9222 --disable-gpu --remote-debugging-address=0.0.0.0
# google-chrome --no-sandbox --headless --remote-debugging-port=9222 --user-data-dir=/home/tools/chrome/temp/ --remote-debugging-address=0.0.0.0 --disable-gpu
class BrowserManager(object):
    """Shared pool of Chrome tabs drawn from one or more remote browsers.

    Both registries are class attributes, so every instance (and the static
    ``add_browser``) operates on the same browsers and the same tab pool.
    """

    # tab id -> tab object that can be leased through get_tab().
    _tab_pool = {}
    # "host:port" -> pychrome.Browser for that remote-debugging endpoint.
    _browsers = {}
    # Seconds to sleep between pool scans while every tab is busy.
    _POLL_INTERVAL = 0.05

    @staticmethod
    def add_browser(host, port, tab_count=5):
        """Register the browser at ``host:port`` and add its tabs to the pool.

        A browser that was registered before is reused. If it currently has
        fewer than ``tab_count`` tabs, new tabs are opened to make up the
        difference; surplus tabs are left alone.

        Args:
            host: Host name or address of the remote-debugging endpoint.
            port: Remote-debugging port.
            tab_count: Minimum number of tabs the browser should provide.
        """
        browser_key = "{}:{}".format(host.lower(), port)
        br = BrowserManager._browsers.get(browser_key)
        if br is None:
            br = pychrome.Browser(url="http://{}".format(browser_key))
            BrowserManager._browsers[browser_key] = br

        # NOTE(review): the positional 5 is presumably list_tab's request
        # timeout in seconds -- confirm against the installed pychrome.
        existing = br.list_tab(5)
        for _ in range(tab_count - len(existing)):
            br.new_tab()
        # Refresh the browser's own registry before copying it into the pool.
        br.list_tab(5)
        BrowserManager._tab_pool.update(br._tabs)

    def get_tab(self):
        """Lease a free tab, blocking until one becomes available.

        A tab is free when its status is still ``initial`` and its ``used``
        attribute is not ``True``. The leased tab is marked ``used = True``.

        Returns:
            The leased tab object.
        """
        # ``with`` guarantees the lock is released even if a scan raises.
        with ticket_lock:
            while True:
                # Scan a snapshot so a concurrent pool update cannot break
                # the iteration.
                for tab in list(BrowserManager._tab_pool.values()):
                    if (tab.status == pychrome.Tab.status_initial
                            and tab.used is not True):
                        tab.used = True
                        return tab
                # Every tab is busy: yield the CPU instead of spinning hot.
                time.sleep(BrowserManager._POLL_INTERVAL)

    def release_tab(self, tab):
        """Return ``tab`` to the pool so that get_tab() can lease it again.

        Args:
            tab: A tab previously obtained from get_tab().
        """
        with p_lock:
            for browser_key, browser in self._browsers.items():
                # The owning browser is found by its "host:port" appearing in
                # the tab's websocket URL.
                if browser_key not in tab._websocket_url:
                    continue
                # A tab that was started but never stopped (e.g. its listener
                # never fired) would otherwise stay unusable forever.  stop()
                # is already called on finished tabs elsewhere in this file;
                # NOTE(review): assumes stop() is harmless on an already
                # stopped tab -- confirm against the installed pychrome.
                if tab.status != pychrome.Tab.status_initial:
                    tab.stop()
                browser.list_tab()
                # NOTE(review): relies on list_tab() handing back a reusable
                # (initial-status) object for a stopped tab -- confirm.
                refreshed = browser._tabs[tab.id]
                # Clear the lease flag in case the same object came back.
                refreshed.used = False
                BrowserManager._tab_pool[tab.id] = refreshed

    def close_all_tab(self):
        """Close every tab of every registered browser."""
        for browser in BrowserManager._browsers.values():
            for tab in browser.list_tab():
                browser.close_tab(tab)
def callback_test(**kw):
    """Listener that snapshots the page's outer HTML and then stops the tab.

    The document root is fetched and its outer HTML is stored on
    ``tab.outerHTML``.  Any failure is treated as "not ready yet" and retried
    until a 3 second budget is used up; the tab is stopped either way.

    Args:
        **kw: Keyword arguments supplied by the event dispatcher.  Must
            contain ``tab``, the tab object to read from.
            NOTE(review): confirm the installed pychrome version actually
            passes ``tab`` to listeners.

    Raises:
        KeyError: If ``tab`` is missing from ``kw``.
    """
    tab = kw['tab']
    timeout = 3
    # Pause between retries so a failing page does not spin the CPU and
    # flood stdout for the whole timeout window.
    retry_delay = 0.05
    # Monotonic clock: the budget must not be affected by wall-clock jumps.
    started = time.monotonic()
    attempts = 0
    while time.monotonic() - started <= timeout:
        attempts += 1
        try:
            root_dom = tab.DOM.getDocument(depth=1)
            result = tab.DOM.getOuterHTML(nodeId=root_dom.get('root')['nodeId'])
            tab.outerHTML = result.get('outerHTML')
            break
        except Exception:
            # Deliberately broad: this is a best-effort retry loop.
            print('failed {} times '.format(attempts))
            time.sleep(retry_delay)
    # Stopping the tab is what lets a caller blocked in tab.wait() continue.
    tab.stop()
def get_html_content(bm, url):
    """Load ``url`` in a pooled tab and return the HTML captured for it.

    A tab is leased from ``bm``, ``callback_test`` is attached as the
    ``Network.dataReceived`` listener, the page is navigated to, and the
    function waits up to 3 seconds for the tab to finish.  The tab is always
    handed back to the pool, even when a step fails.

    Args:
        bm: Manager providing ``get_tab()`` and ``release_tab(tab)``.
        url: Address of the page to load.

    Returns:
        The value of ``tab.outerHTML`` after the wait; this is the empty
        string set below if the listener stored nothing in time.
    """
    tab = bm.get_tab()
    try:
        tab.debug = True
        tab.set_listener('Network.dataReceived', callback_test)
        # Default result in case the listener never stores anything.
        tab.outerHTML = ''
        st = time.time()
        tab.start()
        tab.Network.enable()
        tab.CSS.disable()
        tab.Page.navigate(url=url)
        tab.wait(timeout=3)
        print('one url cost:', time.time()-st)
        return tab.outerHTML
    finally:
        # Without this a failure above would leak the tab from the pool.
        bm.release_tab(tab)
def main():
    """Fetch a fixed list of pages concurrently through pooled Chrome tabs.

    Five local browsers (debugging ports 9222-9226) are registered with two
    tabs each, then every URL is handed to a 10-thread pool that runs
    ``get_html_content`` for it.  Returns once all requests have finished.
    """
    bm = BrowserManager()
    tab_count = 2
    # Further browsers (including remote hosts) are registered the same way.
    for port in (9222, 9223, 9224, 9225, 9226):
        bm.add_browser('127.0.0.1', port, tab_count=tab_count)

    urls = [
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/abababababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/ababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/abababababababababababababababababababababababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/abababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/WIFIabababababababababababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/abababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/index.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/userList.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/label.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/ababababababababababababababab/adv.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/ababababababababababababababab/media.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/orderconfig.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/orderconfig.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/voiceoperationconfig.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/picture.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/help.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/producttrace.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/albums.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/service.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/point/ababababababababababababappabababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/point/abababababababababababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/weather/abababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/weather/abababababab.html",
    ]

    thread_count = 10
    print('Thead count', thread_count)
    pool = threadpool.ThreadPool(thread_count)

    # One (positional-args, keyword-args) pair per URL, as makeRequests expects.
    call_args = [([bm, url], None) for url in urls]
    for work_item in threadpool.makeRequests(get_html_content, call_args):
        pool.putRequest(work_item)
    pool.wait()
    # Tabs are left open afterwards; bm.close_all_tab() would close them.
if __name__ == '__main__':
    # Time the complete run and report it once main() has returned.
    started_at = time.time()
    main()
    elapsed = time.time() - started_at
    print('Total Cost:', elapsed)
本文介绍了使用 Python 的 pychrome 库配合 Chrome 无头模式(headless)进行网页内容抓取的测试过程。作者详细记录了如何启动浏览器并利用 Chrome DevTools Protocol 进行操作。
1582

被折叠的 条评论
为什么被折叠?



