canna-cloud【十六】Selenium+browsermob渲染网页爬虫

1、maven依赖引入(以下 <dependency> 片段需置于 pom.xml 的 <dependencies> 节点内):

<dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>3.141.59</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-server</artifactId>
            <version>3.141.59</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-api</artifactId>
            <version>3.141.59</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-firefox-driver</artifactId>
            <version>3.141.59</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>3.141.59</version>
        </dependency>
        <dependency>
            <groupId>net.lightbody.bmp</groupId>
            <artifactId>browsermob-core</artifactId>
            <version>2.1.5</version>
<!--            <scope>test</scope>-->
        </dependency>

2、Selenium工具

import lombok.Getter;
import lombok.Setter;
import lombok.extern.log4j.Log4j2;
import net.lightbody.bmp.BrowserMobProxy;
import net.lightbody.bmp.BrowserMobProxyServer;
import net.lightbody.bmp.client.ClientUtil;
import net.lightbody.bmp.core.har.Har;
import net.lightbody.bmp.core.har.HarEntry;
import net.lightbody.bmp.core.har.HarNameValuePair;
import net.lightbody.bmp.core.har.HarResponse;
import net.lightbody.bmp.proxy.CaptureType;
import org.apache.commons.lang3.StringUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.Proxy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.logging.LogType;
import org.openqa.selenium.logging.LoggingPreferences;
import org.openqa.selenium.remote.CapabilityType;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;

@Log4j2
public class SeleniumService {

    /** Underlying Chrome WebDriver; available after {@link #init()} has run. */
    @Getter
    private WebDriver driver;

    /** Filesystem path to the chromedriver binary; optional if it is already on PATH. */
    @Setter
    private String driverPath;

    /** When true, traffic is routed through an embedded BrowserMob proxy so HAR data can be captured. */
    @Setter
    private boolean useProxy;

    /** When true, browser console logging is enabled at Level.ALL. */
    @Setter
    private boolean useLog;

    /** When true, Chrome runs with the --headless flag. */
    @Setter
    private boolean headless;

    /** Embedded capture proxy; non-null only when {@link #useProxy} is set and init() has run. */
    private BrowserMobProxy proxy;

    /** Host of the page most recently passed to {@link #get(String)}; used to filter same-site links. */
    private String crawlerHost;

    /** Seconds to sleep after driver.get() to let the page finish loading; 0 disables the wait. */
    @Setter
    private long pageLoadTime;

    /** Same-host links (anchor hrefs plus HTML responses seen by the proxy) collected by get(). */
    @Getter
    private List<String> hrefList = new ArrayList<>();

    /** Every URL requested while loading the page, regardless of host. */
    @Getter
    private List<String> downloadUrlList = new ArrayList<>();

    /**
     * Builds the ChromeDriver according to the configured flags.
     * Must be called before {@link #get(String)}.
     */
    public void init() {
        if (StringUtils.isNotBlank(driverPath)) {
            System.setProperty("webdriver.chrome.driver", driverPath);
        }

        ChromeOptions chromeOptions = new ChromeOptions();
        if (headless) {
            chromeOptions.addArguments("--headless");
        }

        if (this.useLog) {
            LoggingPreferences logPrefs = new LoggingPreferences();
            logPrefs.enable(LogType.BROWSER, Level.ALL);
            chromeOptions.setCapability(CapabilityType.LOGGING_PREFS, logPrefs);
        }

        if (this.useProxy) {
            this.proxy = new BrowserMobProxyServer();
            // Port 0 lets BrowserMob pick any free port.
            proxy.start(0);
            proxy.enableHarCaptureTypes(CaptureType.REQUEST_CONTENT, CaptureType.RESPONSE_CONTENT);

            Proxy seleniumProxy = ClientUtil.createSeleniumProxy(proxy);

            chromeOptions.setCapability(CapabilityType.PROXY, seleniumProxy);
        }

        this.driver = new ChromeDriver(chromeOptions);
    }

    /**
     * Extracts the host portion ("host[:port]") of an http/https URL.
     * Input without a scheme is returned stripped of any path suffix.
     */
    private String getHost(String url) {
        String host = url;
        if (host.startsWith("http://")) {
            host = host.substring("http://".length());
        }

        if (host.startsWith("https://")) {
            host = host.substring("https://".length());
        }

        if (host.indexOf('/') != CommonConstants.NOT_FOUND_INDEX) {
            host = host.substring(0, host.indexOf('/'));
        }

        return host;
    }

    /**
     * Starts a fresh HAR capture for the current crawl host. The {@code url}
     * parameter is unused; {@link #get(String)} sets {@link #crawlerHost}
     * before calling this. No-op when the proxy is disabled.
     */
    public void newHar(String url) {
        if (useProxy) {
            proxy.newHar(crawlerHost);
        }
    }

    /** Finishes the current HAR capture. No-op when the proxy is disabled. */
    public void endHar() {
        if (useProxy) {
            proxy.endHar();
        }
    }

    /**
     * Navigates to {@code url}, optionally waits {@link #pageLoadTime} seconds
     * for the page to settle, then harvests links and request URLs.
     */
    public void get(String url) {
        crawlerHost = this.getHost(url);

        this.newHar(url);

        driver.get(url);

        if (pageLoadTime > 0) {
            this.waitLoad(pageLoadTime, TimeUnit.SECONDS);
        }

        this.initPageInfo();
    }

    /**
     * Collects page links and request URLs after a navigation. Anchor hrefs on
     * the same host go into hrefList; when the proxy is active, HTML responses
     * from the same host are also treated as crawlable links, and every request
     * URL is scheduled for download.
     */
    private void initPageInfo() {
        downloadUrlList.add(driver.getCurrentUrl());
        List<WebElement> aList = driver.findElements(By.tagName("a"));
        for (WebElement element: aList) {
            String href = element.getAttribute("href");
            // Anchors without an href yield null/blank here; the original
            // passed that straight into getHost() and threw an NPE.
            if (StringUtils.isBlank(href)) {
                continue;
            }
            String requestHost = this.getHost(href);
            if (hrefList.indexOf(href) == CommonConstants.NOT_FOUND_INDEX && requestHost.equals(crawlerHost)) {
                hrefList.add(href);
            }
        }

        if (useProxy) {
            Har har = proxy.getHar();
            List<HarEntry> harEntries = har.getLog().getEntries();
            for (HarEntry entry : harEntries) {
                String srcUrl = entry.getRequest().getUrl();
                String requestHost = this.getHost(srcUrl);

                if (requestHost.equals(crawlerHost)) {
                    boolean isHtml = contentTypeHtml(entry.getResponse());
                    // Deduplicate, matching the anchor scan above (the
                    // original added HAR hits unconditionally).
                    if (isHtml && hrefList.indexOf(srcUrl) == CommonConstants.NOT_FOUND_INDEX) {
                        hrefList.add(srcUrl);
                    }
                }

                downloadUrlList.add(srcUrl);
            }
        }
    }

    /**
     * Releases the proxy (if one was started) and quits the browser. Safe to
     * call when {@link #useProxy} is false — the original unconditionally
     * dereferenced {@code proxy} and threw a NullPointerException.
     */
    public void close() {
        if (this.proxy != null) {
            this.proxy.stop();
        }
        if (this.driver != null) {
            this.driver.quit();
        }
    }

    /**
     * Returns true when the response's Content-Type header (case-insensitive
     * name lookup) contains "text/html".
     */
    private boolean contentTypeHtml(HarResponse harResponse) {
        List<HarNameValuePair>  harNameValuePairList = harResponse.getHeaders();
        Map<String, String> headerMap = new HashMap<>();
        for (HarNameValuePair harNameValuePair: harNameValuePairList) {
            headerMap.put(harNameValuePair.getName().toLowerCase(), harNameValuePair.getValue());
        }
        String value = headerMap.get("content-type");
        return StringUtils.isNotBlank(value) && value.contains("text/html");
    }

    /** Sleeps the current thread, restoring the interrupt flag if interrupted. */
    private void waitLoad(long amount, TimeUnit timeUnit) {
        try {
            Thread.sleep(timeUnit.toMillis(amount));
        } catch (InterruptedException e) {
            // Preserve the interrupt status for callers that poll it.
            Thread.currentThread().interrupt();
            log.error("", e);
        }
    }
}

3、爬虫工具类

import lombok.Setter;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.function.Function;

/**
 * @author euler
 */
/**
 * Recursive rendering crawler. Crawler worker threads take URLs from a queue,
 * render each page through {@link SeleniumService}, then feed newly discovered
 * same-site links back into the queue; download workers persist every captured
 * resource under {@link #pathPrefix}.
 *
 * @author euler
 */
@Log4j2
public class CrawlerService {
    /** URLs already scheduled for crawling; atomic add() prevents re-crawling. */
    private static final Set<String> HISTORY_HREF_SET = ConcurrentHashMap.newKeySet();

    /** URLs waiting to be rendered by a crawler worker. */
    private static final BlockingQueue<String> HREF_QUEUE = new LinkedBlockingQueue<>();

    /** URLs already scheduled for download; atomic add() prevents duplicate downloads. */
    private static final Set<String> HISTORY_DOWNLOAD_SET = ConcurrentHashMap.newKeySet();

    /** URLs waiting to be written to disk by a download worker. */
    private static final BlockingQueue<String> DOWNLOAD_QUEUE = new LinkedBlockingQueue<>();

    /** When true, discovered links are crawled recursively. */
    @Setter
    private boolean recursion;

    /** Local directory prefix under which downloaded files are stored. */
    @Setter
    private String pathPrefix;

    /** Path to the chromedriver binary handed to each SeleniumService. */
    @Setter
    private String driverPath;

    /** Number of crawler worker threads. */
    @Setter
    private int crawlerAmount;

    /** Number of download worker threads. */
    @Setter
    private int downloadAmount;

    /** Maps a URL to the relative file name it is saved under; defaults to {@link #urlToFileName(String)}. */
    private Function<String, String> convertURLToFileName;

    /**
     * Installs the default URL-to-filename mapping and starts the worker
     * threads. Crawler workers are started only when {@link #recursion} is set.
     */
    public void init() {
        if (convertURLToFileName == null) {
            convertURLToFileName = (src) -> {
                try {
                    return urlToFileName(src);
                } catch (UnsupportedEncodingException e) {
                    log.error("", e);
                }

                return null;
            };
        }

        // 递归爬虫线程
        if (recursion) {
            for (int i = 0; i < crawlerAmount; i++) {
                startCrawler();
            }
        }

        for (int i = 0; i < downloadAmount; i++) {
            startDownload();
        }
    }

    /** Spawns one worker that endlessly takes URLs off HREF_QUEUE and crawls them. */
    private void startCrawler() {
        ThreadConstants.execute(() -> {
            while (true) {
                try {
                    String href = HREF_QUEUE.take();
                    crawler(href);
                } catch (Exception e) {
                    log.error("", e);
                }
            }
        });
    }

    /** Spawns one worker that endlessly takes URLs off DOWNLOAD_QUEUE and saves them. */
    private void startDownload() {
        ThreadConstants.execute(() -> {
            while (true) {
                try {
                    String url = DOWNLOAD_QUEUE.take();

                    download(url);
                } catch (Exception e) {
                    log.error("", e);
                }
            }
        });
    }

    /**
     * Seeds the crawl with a starting URL.
     *
     * @param url first page to crawl
     * @throws InterruptedException if interrupted while enqueueing
     */
    public void startCrawler(String url) throws InterruptedException {
        HREF_QUEUE.put(url);
    }

    /**
     * Downloads one URL to {@code pathPrefix + fileName}. Skips URLs whose
     * file name could not be derived (the original built a literal "null"
     * file name in that case).
     */
    private void download(String url) throws IOException {
        String fileName = convertURLToFileName.apply(url);
        if (fileName == null) {
            log.warn("skip download, cannot derive file name for {}", url);
            return;
        }
        File write = new File(pathPrefix + fileName);
        copyURLToFile(url, write);
    }

    /**
     * Derives a relative file name from a URL: decodes percent-escapes, maps
     * directory URLs to index.html, drops the query string, then strips the
     * scheme. The original removed a fixed "http://".length() prefix, which
     * mangled https URLs.
     */
    private String urlToFileName(String source) throws UnsupportedEncodingException {
        String urlCurr = URLDecoder.decode(source, Charset.defaultCharset().name());
        if (urlCurr.endsWith("/")) {
            urlCurr += "index.html";
        }

        if (urlCurr.contains("?")) {
            urlCurr = urlCurr.substring(0, urlCurr.indexOf('?'));
        }

        if (urlCurr.startsWith("https://")) {
            return urlCurr.substring("https://".length());
        }
        if (urlCurr.startsWith("http://")) {
            return urlCurr.substring("http://".length());
        }
        return urlCurr;
    }

    /** Copies the resource at {@code source} to {@code destination}, logging failures. */
    private void copyURLToFile(String source, File destination) {
        try {
            URL url = new URL(source);
            FileUtils.copyURLToFile(url, destination);
        } catch (Exception e) {
            log.error("", e);
        }
    }

    /**
     * Renders one page, enqueues newly discovered same-site links for crawling
     * and all captured resources for download. The browser/proxy pair is
     * always released, even when enqueueing fails (the original leaked it on
     * any exception before close()).
     */
    private void crawler(String url) throws InterruptedException, IOException {
        SeleniumService seleniumService = new SeleniumService();
        seleniumService.setDriverPath(driverPath);
        seleniumService.setUseProxy(true);
        seleniumService.setPageLoadTime(10L);
        seleniumService.init();

        try {
            seleniumService.get(url);

            List<String> downloadUrlList = seleniumService.getDownloadUrlList();
            List<String> hrefList = seleniumService.getHrefList();

            for (String href: hrefList) {
                // Set.add is atomic, so concurrent workers cannot
                // double-schedule the same link (contains()+put() raced).
                if (HISTORY_HREF_SET.add(href)) {
                    HREF_QUEUE.put(href);
                }
            }
            for (String download: downloadUrlList) {
                if (HISTORY_DOWNLOAD_SET.add(download)) {
                    DOWNLOAD_QUEUE.put(download);
                }
            }
        } finally {
            seleniumService.close();
        }
    }

}

说明:ThreadConstants.execute可以使用默认创建线程池代替,此处为自定义线程池。

 

4、使用方式

import java.io.IOException;

/**
 * @author euler
 */
/**
 * Entry point demonstrating how to wire up and launch the recursive crawler.
 *
 * @author euler
 */
public final class SeleniumUtils {
    /** Number of crawler worker threads. */
    private static final int CRAWLER_AMOUNT = 6;

    /** Number of download worker threads. */
    private static final int DOWNLOAD_AMOUNT = 5;

    private SeleniumUtils() {
        // do nothing
    }

    public static void main(String[] args) throws InterruptedException, IOException {
        final String seedUrl = "http://www.baidu.com";
        final String outputDir = "G:/crawler/";
        final String chromeDriverPath = "chromedriver.exe";

        CrawlerService service = new CrawlerService();
        service.setRecursion(true);
        service.setPathPrefix(outputDir);
        service.setDriverPath(chromeDriverPath);
        service.setCrawlerAmount(CRAWLER_AMOUNT);
        service.setDownloadAmount(DOWNLOAD_AMOUNT);

        // Start the worker threads, then seed the crawl with the first page.
        service.init();
        service.startCrawler(seedUrl);
    }
}

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值