Web scraping with a headless browser engine

Headless Chrome

chrome --headless --remote-debugging-port=9222 --disable-gpu

chrome --headless --disable-gpu --remote-debugging-port=9222 --window-size=1280x1696

const CDP = require("chrome-remote-interface");

/**
 * Scrape unique link hrefs from the current page via the Chrome DevTools
 * Protocol DOM domain.
 *
 * Note: in the original, line-end comments had swallowed the statements
 * (`// … const rootElement = …`), so the code never executed; comments are
 * now on their own lines.
 *
 * @param {Object} client - CDP client exposing the DOM domain.
 * @returns {Promise<Set<string>>} unique href values of ".content a.title" links.
 */
async function scrape(client) {
    const { DOM } = client;

    // Get the document root element's nodeId.
    const rootElement = await DOM.getDocument();
    const { root: { nodeId } } = rootElement;

    // Use a selector to find the link nodes.
    const { nodeIds: linkIDs } = await DOM.querySelectorAll({
        selector: ".content a.title",
        nodeId,
    });

    // Fetch each element's attributes in parallel.
    const attributes = await Promise.all(linkIDs.map((ID) =>
        DOM.getAttributes({ nodeId: ID })
    ));

    // Attributes are returned as one flat array of name/value pairs:
    // [..., "href", "www.example.com"]
    const links = attributes
        .map((x) => x.attributes)
        .filter((x) => x.includes("href"))
        .map((attrs) => {
            const index = attrs.indexOf("href");
            return attrs[index + 1];
        });

    // A Set keeps unique items only (no need to spread an array into it).
    return new Set(links);
}

/**
 * CDP session handler: enables the Network and Page domains, navigates to the
 * target URL, and scrapes the links once the page's load event fires.
 *
 * The session is always closed — after a successful scrape, after a scrape
 * failure, or after a navigation error — so no Chrome tab is leaked.
 *
 * @param {Object} client - connected CDP client.
 */
async function onClientHandler(client) {
    // Extract only the domains we actually use (Runtime/Overlay were unused).
    const { Network, Page } = client;

    // Scrape once the page has finished loading.
    Page.loadEventFired(() => {
        console.log("Load event fired");
        scrape(client)
            .then((links) => {
                console.log(links);
            })
            .catch((err) => {
                // Original swallowed scrape failures and never closed the client.
                console.error(err);
            })
            .finally(() => {
                client.close();
            });
    });

    try {
        await Promise.all([Network.enable(), Page.enable()]);
        await Page.navigate({ url: "https://reddit.com/r/programming" });
    } catch (err) {
        console.error(err);
        client.close();
    }
}

// Connect to the Chrome debugging endpoint, handing each session to
// onClientHandler; report connection-level failures on the console.
CDP(onClientHandler).on("error", (err) => console.error(err));

Puppeteer

 

Headless Firefox via Selenium (a modern replacement for PhantomJS)

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Launch a single headless Firefox instance.
# (The original created the driver twice, leaking a browser process that
# driver.quit() could never reach.)
options = Options()
options.add_argument('-headless')
driver = webdriver.Firefox(firefox_options=options)

try:
    driver.get('http://www.baidu.com')
    # Wait up to 10 s for the <title> element to be present in the DOM.
    title_tag = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'title')))
    print(title_tag.text)
finally:
    # Always release the browser, even if the wait times out.
    driver.quit()

Headless Firefox

/path/to/firefox -headless -screenshot https://developer.mozilla.com
var webdriver = require('selenium-webdriver'),
    By = webdriver.By,
    until = webdriver.until;
var firefox = require('selenium-webdriver/firefox');

 

# chrome headless
# chrome headless
# Keys was used below but never imported in the original (NameError at runtime).
from selenium.webdriver.common.keys import Keys

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('disable-gpu')
# options.add_argument('remote-debugging-port=9222')
driver = webdriver.Chrome(chrome_options=options)

try:
    driver.get("http://www.python.org")
    if "Python" in driver.title:
        print("first assert success")
    else:
        print("ERROR: first assert failed")

    # Type a search query and submit it with Enter.
    elem = driver.find_element_by_name("q")
    elem.send_keys("pycon")
    elem.send_keys(Keys.RETURN)

    if "No results found." not in driver.page_source:
        print("second assert success")
    else:
        print("ERROR: second assert failed")

    driver.back()
finally:
    # quit() (not close()) terminates the browser AND the chromedriver process,
    # and the try/finally guarantees cleanup if any step above raises.
    driver.quit()

 

转载于:https://my.oschina.net/shannanzi/blog/1594150

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值