python - pychrome 页面抓取测试

本文介绍了使用Python的pychrome库配合Chrome无头模式(headless)进行网页内容抓取的测试过程。作者详细记录了如何启动浏览器并通过Chrome DevTools Protocol对其进行操作。

python - pychrome 页面抓取测试

Max.Bai

2019.06

 

记录一下去年做页面抓取时写的脚本。

使用 Chrome headless 模式抓取页面内容,通过 Python 的 pychrome 包进行控制。

要先启动浏览器并开启远程调试端口(启动命令见下方代码中的注释),

然后通过 pychrome 调用 Chrome DevTools Protocol。

#! python3
# _*_ coding:utf-8 _*_


__author__ = 'Max.Bai'
__date__ = '2018.06'

import pychrome
import threadpool
import threading
# Held by BrowserManager.get_tab while it searches the pool for a free tab.
ticket_lock = threading.Lock()
# Held by BrowserManager.release_tab while it refreshes a tab's pool entry.
p_lock = threading.Lock()
import time

# start chrome first
# "C:\Program Files (x86)\Google\Chrome\Application\chrome.exe" --headless --remote-debugging-port=9222 --disable-gpu --remote-debugging-address=0.0.0.0
# google-chrome --no-sandbox --headless --remote-debugging-port=9222 --user-data-dir=/home/tools/chrome/temp/ --remote-debugging-address=0.0.0.0 --disable-gpu

class BrowserManager(object):
    """Registry of pychrome browsers plus a shared pool of their tabs.

    All state is kept in class attributes, so every instance (and the static
    ``add_browser``) works on the same browsers and the same tab pool.
    Hand-out and hand-back of tabs are serialised with the module-level
    ``ticket_lock`` and ``p_lock`` respectively.
    """

    # Tab id -> tab object, merged across every registered browser.
    _tab_pool = {}
    # "host:port" -> pychrome.Browser connected to that debugging endpoint.
    _browsers = {}
    # Seconds to sleep between scans of the pool while no tab is free.
    _poll_interval = 0.01

    @staticmethod
    def add_browser(host, port, tab_count=5):
        """Register the browser at ``host:port`` and pool its tabs.

        A browser that is already registered is reused. New tabs are opened
        until the browser has at least ``tab_count`` of them; tabs beyond
        that number are kept, never closed.

        Args:
            host: Host name or address of the Chrome debugging endpoint.
            port: Remote debugging port.
            tab_count: Minimum number of tabs the browser should have.
        """
        browser_key = "{}:{}".format(host.lower(), port)
        br = BrowserManager._browsers.get(browser_key)
        if br is None:
            br = pychrome.Browser(url="http://{}".format(browser_key))
            BrowserManager._browsers[browser_key] = br

        # The positional 5 is presumably list_tab's request timeout in
        # seconds -- TODO confirm against the installed pychrome.
        missing = tab_count - len(br.list_tab(5))
        for _ in range(missing):
            br.new_tab()
        # List again so the browser's tab mapping is current before it is
        # merged into the pool (reads pychrome's private ``_tabs`` mapping).
        br.list_tab(5)
        BrowserManager._tab_pool.update(br._tabs)

    def get_tab(self):
        """Reserve and return a pooled tab that is unused and not yet started.

        Blocks, polling the pool, until such a tab exists; there is no
        timeout. The returned tab is marked with ``used = True``.
        """
        # ``with`` guarantees the lock is released even if a scan raises,
        # so one failure cannot deadlock every other caller.
        with ticket_lock:
            while True:
                for tab in BrowserManager._tab_pool.values():
                    # Compare against True explicitly: ``used`` is only ever
                    # assigned here, so a fresh tab yields whatever the tab
                    # object returns for an unset attribute.
                    # NOTE(review): presumably a truthy pychrome proxy object,
                    # so a plain truthiness test would be wrong -- confirm.
                    if (tab.status == pychrome.Tab.status_initial
                            and tab.used is not True):
                        tab.used = True
                        return tab
                # Nothing free: pause briefly instead of spinning at full CPU.
                time.sleep(BrowserManager._poll_interval)

    def release_tab(self, tab):
        """Refresh the pool entry for ``tab`` so it can be handed out again.

        For each registered browser whose ``host:port`` key occurs in the
        tab's websocket URL, the browser's tab list is refreshed and the pool
        entry for ``tab.id`` is replaced by the browser's current object.

        Raises:
            KeyError: If the browser no longer lists a tab with ``tab.id``.
        """
        with p_lock:
            for browser_key, browser in self._browsers.items():
                if browser_key in tab._websocket_url:
                    browser.list_tab()
                    BrowserManager._tab_pool[tab.id] = browser._tabs[tab.id]

    def close_all_tab(self):
        """Close every tab currently listed by every registered browser."""
        for browser in BrowserManager._browsers.values():
            for tab in browser.list_tab():
                browser.close_tab(tab)


def callback_test(**kw):
    """Event listener: store the page's outer HTML on the tab, then stop it.

    Fetches the document root and its outer HTML through the tab's ``DOM``
    domain and saves the markup in ``tab.outerHTML``. Failed attempts are
    retried for up to 3 seconds, with a short pause between them so a
    persistent failure does not spin the CPU and flood stdout. The tab is
    stopped whether or not the HTML was captured.

    Args:
        **kw: Must contain ``tab``, the tab that received the event.
            NOTE(review): confirm the installed pychrome passes ``tab`` to
            listeners; a missing key raises ``KeyError``.
    """
    st = time.time()
    tab = kw['tab']
    timeout = 3
    retry_delay = 0.05
    count = 0
    while time.time() - st <= timeout:
        count += 1
        try:
            root_dom = tab.DOM.getDocument(depth=1)
            result = tab.DOM.getOuterHTML(nodeId=root_dom.get('root')['nodeId'])
            tab.outerHTML = result.get('outerHTML')
        except Exception:
            # Deliberately best-effort: report the failed attempt and retry.
            print('failed {} times '.format(count))
            time.sleep(retry_delay)
        else:
            break
    tab.stop()
    

def get_html_content(bm, url):
    """Load ``url`` in a pooled tab and return the HTML captured for it.

    Reserves a tab from ``bm``, registers ``callback_test`` for
    ``Network.dataReceived`` events, navigates to ``url`` and waits up to
    3 seconds for the tab to finish. The tab is always handed back to the
    manager, even when starting or navigating fails, so an error cannot
    permanently remove a tab from the pool.

    Args:
        bm: Manager providing ``get_tab()`` and ``release_tab(tab)``.
        url: Address of the page to load.

    Returns:
        The value of ``tab.outerHTML`` after the wait; this is the empty
        string if the listener stored nothing in time.
    """
    tab = bm.get_tab()
    tab.debug = True
    tab.set_listener('Network.dataReceived', callback_test)
    tab.outerHTML = ''
    st = time.time()
    try:
        tab.start()
        tab.Network.enable()
        tab.CSS.disable()
        tab.Page.navigate(url=url)
        # NOTE(review): if the listener has not stopped the tab when this
        # wait ends, the tab is released while still running -- confirm the
        # pool can hand it out again in that case.
        tab.wait(timeout=3)
        print('one url cost:', time.time()-st)
        return tab.outerHTML
    finally:
        bm.release_tab(tab)

def main():
    """Fetch a fixed list of test pages through five local Chrome instances.

    Registers the browsers listening on 127.0.0.1 ports 9222-9226 with two
    tabs each, then loads every URL via ``get_html_content`` on a pool of
    ten worker threads and waits for all of them to finish.
    """
    bm = BrowserManager()
    tab_count = 2
    # One local headless Chrome per remote-debugging port.
    for port in (9222, 9223, 9224, 9225, 9226):
        bm.add_browser('127.0.0.1', port, tab_count=tab_count)
    # Remote instances can be registered the same way, e.g. host
    # 200.200.200.235 on ports 9222 and 9223.

    urls = [
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/abababababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/ababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/abababababababababababababababababababababababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/abababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/WIFIabababababababababababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/abababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/index.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/userList.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/label.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/ababababababababababababababab/adv.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/ababababababababababababababab/media.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/orderconfig.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/orderconfig.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/voiceoperationconfig.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/picture.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/help.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababababababababab/producttrace.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/albums.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/cms/manage/WEB/abababababababababababab/service.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/point/ababababababababababababappabababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/point/abababababababababababababababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/weather/abababababab.html",
        "http://1.2.3.4/svn/repositories/server/wiki/project/pages/weather/abababababab.html",
    ]

    thread_count = 10
    print('Thead count', thread_count)
    pool = threadpool.ThreadPool(thread_count)

    # Each work item is (positional args, keyword args) for get_html_content.
    work_items = [([bm, page_url], None) for page_url in urls]
    for job in threadpool.makeRequests(get_html_content, work_items):
        pool.putRequest(job)
    pool.wait()
    # Tabs are left open afterwards; bm.close_all_tab() would close them.
    


# Script entry point: run the crawl once and print the total elapsed time.
if __name__ == '__main__':
    st = time.time()
    main()
    print('Total Cost:', time.time() - st)

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值