1、maven依赖引入:
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-server</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-api</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-firefox-driver</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-chrome-driver</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>net.lightbody.bmp</groupId>
<artifactId>browsermob-core</artifactId>
<version>2.1.5</version>
<!-- <scope>test</scope>-->
</dependency>
2、Selenium工具
import lombok.Getter;
import lombok.Setter;
import lombok.extern.log4j.Log4j2;
import net.lightbody.bmp.BrowserMobProxy;
import net.lightbody.bmp.BrowserMobProxyServer;
import net.lightbody.bmp.client.ClientUtil;
import net.lightbody.bmp.core.har.Har;
import net.lightbody.bmp.core.har.HarEntry;
import net.lightbody.bmp.core.har.HarNameValuePair;
import net.lightbody.bmp.core.har.HarResponse;
import net.lightbody.bmp.proxy.CaptureType;
import org.apache.commons.lang3.StringUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.Proxy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.logging.LogType;
import org.openqa.selenium.logging.LoggingPreferences;
import org.openqa.selenium.remote.CapabilityType;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
@Log4j2
public class SeleniumService {
@Getter
private WebDriver driver;
@Setter
private String driverPath;
@Setter
private boolean useProxy;
@Setter
private boolean useLog;
@Setter
private boolean headless;
private BrowserMobProxy proxy;
private String crawlerHost;
@Setter
private long pageLoadTime;
@Getter
private List<String> hrefList = new ArrayList<>();
@Getter
private List<String> downloadUrlList = new ArrayList<>();
public void init() {
if (StringUtils.isNotBlank(driverPath)) {
System.setProperty("webdriver.chrome.driver", driverPath);
}
ChromeOptions chromeOptions = new ChromeOptions();
if (headless) {
chromeOptions.addArguments("--headless");
}
if (this.useLog) {
LoggingPreferences logPrefs = new LoggingPreferences();
logPrefs.enable(LogType.BROWSER, Level.ALL);
chromeOptions.setCapability(CapabilityType.LOGGING_PREFS, logPrefs);
}
if (this.useProxy) {
this.proxy = new BrowserMobProxyServer();
proxy.start(0);
proxy.enableHarCaptureTypes(CaptureType.REQUEST_CONTENT, CaptureType.RESPONSE_CONTENT);
Proxy seleniumProxy = ClientUtil.createSeleniumProxy(proxy);
chromeOptions.setCapability(CapabilityType.PROXY, seleniumProxy);
}
this.driver = new ChromeDriver(chromeOptions);
}
private String getHost(String url) {
String host = url;
if (host.startsWith("http://")) {
host = host.substring("http://".length());
}
if (host.startsWith("https://")) {
host = host.substring("https://".length());
}
if (host.indexOf('/') != CommonConstants.NOT_FOUND_INDEX) {
host = host.substring(0, host.indexOf('/'));
}
return host;
}
public void newHar(String url) {
if (useProxy) {
proxy.newHar(crawlerHost);
}
}
public void endHar() {
if (useProxy) {
proxy.endHar();
}
}
public void get(String url) {
crawlerHost = this.getHost(url);
this.newHar(url);
driver.get(url);
if (pageLoadTime > 0) {
this.waitLoad(pageLoadTime, TimeUnit.SECONDS);
}
this.initPageInfo();
}
private void initPageInfo() {
downloadUrlList.add(driver.getCurrentUrl());
List<WebElement> aList = driver.findElements(By.tagName("a"));
for (WebElement element: aList) {
String href = element.getAttribute("href");
String requestHost = this.getHost(href);
if (hrefList.indexOf(href) == CommonConstants.NOT_FOUND_INDEX && requestHost.equals(crawlerHost)) {
hrefList.add(href);
}
}
if (useProxy) {
Har har = proxy.getHar();
List<HarEntry> harEntries = har.getLog().getEntries();
for (HarEntry entry : harEntries) {
String srcUrl = entry.getRequest().getUrl();
String requestHost = this.getHost(srcUrl);
if (requestHost.equals(crawlerHost)) {
boolean isHtml = contentTypeHtml(entry.getResponse());
if (isHtml) {
hrefList.add(srcUrl);
}
}
downloadUrlList.add(srcUrl);
}
}
}
public void close() {
this.proxy.stop();
this.driver.quit();
}
private boolean contentTypeHtml(HarResponse harResponse) {
List<HarNameValuePair> harNameValuePairList = harResponse.getHeaders();
Map<String, String> headerMap = new HashMap<>();
for (HarNameValuePair harNameValuePair: harNameValuePairList) {
headerMap.put(harNameValuePair.getName().toLowerCase(), harNameValuePair.getValue());
}
String value = headerMap.get("content-type");
return StringUtils.isNotBlank(value) && value.contains("text/html");
}
private void waitLoad(long amount, TimeUnit timeUnit) {
try {
Thread.sleep(timeUnit.toMillis(amount));
} catch (InterruptedException e) {
log.error("", e);
}
}
}
3、爬虫工具类
import lombok.Setter;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.io.FileUtils;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.function.Function;
/**
* @author euler
*/
@Log4j2
public class CrawlerService {
private static final BlockingQueue<String> HISTORY_HREF_QUEUE = new LinkedBlockingQueue<>();
private static final BlockingQueue<String> HREF_QUEUE = new LinkedBlockingQueue<>();
private static final BlockingQueue<String> HISTORY_DOWLOAD_QUEUE = new LinkedBlockingQueue<>();
private static final BlockingQueue<String> DOWLOAD_QUEUE = new LinkedBlockingQueue<>();
@Setter
private boolean recursion;
@Setter
private String pathPrefix;
@Setter
private String driverPath;
@Setter
private int crawlerAmount;
@Setter
private int downloadAmount;
private Function<String, String> convertURLToFileName;
public void init() {
if (convertURLToFileName == null) {
convertURLToFileName = (src) -> {
try {
return urlToFileName(src);
} catch (UnsupportedEncodingException e) {
log.error("", e);
}
return null;
};
}
// 递归爬虫线程
if (recursion) {
for (int i = 0; i < crawlerAmount; i++) {
startCrawler();
}
}
for (int i = 0; i < downloadAmount; i++) {
startDownload();
}
}
private void startCrawler() {
ThreadConstants.execute(new Runnable() {
@Override
public void run() {
while (true) {
try {
String href = HREF_QUEUE.take();
crawler(href);
} catch (Exception e) {
log.error("", e);
}
}
}
});
}
private void startDownload() {
ThreadConstants.execute(new Runnable() {
@Override
public void run() {
while (true) {
try {
String url = DOWLOAD_QUEUE.take();
download(url);
} catch (Exception e) {
log.error("", e);
}
}
}
});
}
public void startCrawler(String url) throws InterruptedException {
HREF_QUEUE.put(url);
}
private void download(String url) throws IOException {
String fileName = convertURLToFileName.apply(url);
File write = new File(pathPrefix + fileName);
copyURLToFile(url, write);
}
private String urlToFileName(String source) throws UnsupportedEncodingException {
String urlCurr = URLDecoder.decode(source, Charset.defaultCharset().name());
if (urlCurr.endsWith("/")) {
urlCurr += "index.html";
}
if (urlCurr.contains("?")) {
urlCurr = urlCurr.substring(0, urlCurr.indexOf("?"));
}
return urlCurr.substring("http://".length());
}
private void copyURLToFile(String source, File destination) {
try {
URL url = new URL(source);
FileUtils.copyURLToFile(url, destination);
} catch (Exception e) {
log.error("", e);
}
}
private void crawler(String url) throws InterruptedException, IOException {
SeleniumService seleniumService = new SeleniumService();
seleniumService.setDriverPath(driverPath);
seleniumService.setUseProxy(true);
seleniumService.setPageLoadTime(10L);
seleniumService.init();
seleniumService.get(url);
List<String> downloadUrlList = seleniumService.getDownloadUrlList();
List<String> hrefList = seleniumService.getHrefList();
for (String href: hrefList) {
if (HISTORY_HREF_QUEUE.contains(href)) {
continue;
}
HISTORY_HREF_QUEUE.put(href);
HREF_QUEUE.put(href);
}
for (String download: downloadUrlList) {
if (HISTORY_DOWLOAD_QUEUE.contains(download)) {
continue;
}
HISTORY_DOWLOAD_QUEUE.put(download);
DOWLOAD_QUEUE.put(download);
}
seleniumService.close();
}
}
说明:ThreadConstants.execute 是本项目自定义线程池的任务提交方法;如果没有该类,可以用 JDK 自带的线程池代替(例如 Executors.newFixedThreadPool(n) 返回的 ExecutorService 的 execute 方法)。
4、使用方式
import java.io.IOException;
/**
 * Command-line demo that configures a {@code CrawlerService} and points it at a start page.
 *
 * @author euler
 */
public final class SeleniumUtils {
    /** Number of crawler workers requested from the service. */
    private static final int CRAWLER_AMOUNT = 6;
    /** Number of download workers requested from the service. */
    private static final int DOWNLOAD_AMOUNT = 5;

    private SeleniumUtils() {
        // do nothing
    }

    /**
     * Configures the crawler with the demo settings, starts its workers and queues the
     * start page.
     *
     * @param args unused
     * @throws InterruptedException if interrupted while queueing the start page
     * @throws IOException declared for compatibility; not thrown by the visible code
     */
    public static void main(String[] args) throws InterruptedException, IOException {
        CrawlerService service = new CrawlerService();

        // Where to find the driver and where to store the results.
        service.setDriverPath("chromedriver.exe");
        service.setPathPrefix("G:/crawler/");

        // Crawl behaviour and worker counts.
        service.setRecursion(true);
        service.setCrawlerAmount(CRAWLER_AMOUNT);
        service.setDownloadAmount(DOWNLOAD_AMOUNT);

        service.init();
        service.startCrawler("http://www.baidu.com");
    }
}