使用webmagic+Selenium爬取小说
优点:自动爬取,不会爬取到重复数据。
缺点:爬取速度极慢。
1.添加依赖
<!-- Selenium Java client -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<!-- WebMagic core package -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
<!-- The transitive commons-collections is excluded here; version 3.2.1 is declared explicitly at the end of this list. -->
<exclusions>
<exclusion>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- WebMagic extension package -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
<!-- NOTE(review): slf4j-log4j12 is excluded, presumably to avoid a conflicting SLF4J binding; confirm against the project's logging setup. -->
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- WebMagic Selenium integration (provides WebDriverPool and SeleniumDownloader) -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<version>0.7.3</version>
</dependency>
<!-- commons-collections, pinned explicitly because it is excluded from webmagic-core above -->
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.2.1</version>
</dependency>
2.修改 webmagic-selenium 包中的WebDriverPool 和 SeleniumDownloader
修改1:WebDriverPool中
private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";
将 Selenium 配置文件路径写死了,需要改变配置路径:
private static final String DEFAULT_CONFIG_FILE = "selenium.properties";
同时修改读取配置文件的方式:
sConfig.load(Thread.currentThread().getContextClassLoader().getResourceAsStream(configFile));
修改2:Selenium 调用 Chrome 浏览器时默认会弹出浏览器窗口,可以在创建 ChromeDriver 时通过 ChromeOptions 的 setHeadless(true) 开启无头模式,避免弹出窗口。
3.实现代码
3.1.SeleniumDownloader
package com.example;
import org.apache.log4j.Logger;
import org.openqa.selenium.*;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.PlainText;
import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
public class SeleniumDownloader implements Downloader, Closeable {
private volatile WebDriverPool webDriverPool;
private Logger logger = Logger.getLogger(getClass());
private int sleepTime = 0;
private int poolSize = 1;
private static final String DRIVER_PHANTOMJS = "phantomjs";
/**
* 新建
*
* @param chromeDriverPath chromeDriverPath
*/
public SeleniumDownloader(String chromeDriverPath) {
System.getProperties().setProperty("webdriver.chrome.driver",
chromeDriverPath);
}
/**
* Constructor without any filed. Construct PhantomJS browser
*
* @author bob.li.0718@gmail.com
*/
public SeleniumDownloader() {
// System.setProperty("phantomjs.binary.path",
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
}
/**
* set sleep time to wait until load success
*
* @param sleepTime sleepTime
* @return this
*/
public SeleniumDownloader setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
return this;
}
//调整:容许浏览器加载JS脚本
// @Override
// public Page download(Request request, Task task) {
// checkInit();
// WebDriver webDriver;
// try {
// webDriver = webDriverPool.get();
// } catch (InterruptedException e) {
// logger.warn("interrupted", e);
// return null;
// }
// logger.info("downloading page " + request.getUrl());
// webDriver.get(request.getUrl());
// try {
// Thread.sleep(sleepTime);
// } catch (InterruptedException e) {
// e.printStackTrace();
// }
// WebDriver.Options manage = webDriver.manage();
// Site site = task.getSite();
// if (site.getCookies() != null) {
// for (Map.Entry<String, String> cookieEntry : site.getCookies()
// .entrySet()) {
// Cookie cookie = new Cookie(cookieEntry.getKey(),
// cookieEntry.getValue());
// manage.addCookie(cookie);
// }
// }
//
// /*
// * TODO You can add mouse event or other processes
// *
// * @author: bob.li.0718@gmail.com
// */
//
// WebElement webElement = webDriver.findElement(By.xpath("/html"));
// String content = webElement.getAttribute("outerHTML");
// Page page = new Page();
// page.setRawText(content);
page.setHtml(new Html(content, request.getUrl()));
// page.setUrl(new PlainText(request.getUrl()));
// page.setRequest(request);
// webDriverPool.returnToPool(webDriver);
// return page;
// }
@Override
public Page download(Request request, Task task) {
checkInit();
WebDriver webDriver;
try {
webDriver = webDriverPool.get();
} catch (InterruptedException e) {
logger.warn("interrupted", e);
return null;
}
logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl());
try {
Thread.sleep(sleepTime);
} catch (InterruptedException e) {
e.printStackTrace();
}
WebDriver.Options manage = webDriver.manage();
Site site = task.getSite();
if (site.getCookies() != null) {
for (Map.Entry<String, String> cookieEntry : site.getCookies()
.entrySet()) {
Cookie cookie = new Cookie(cookieEntry.getKey(),
cookieEntry.getValue());
manage.addCookie(cookie);
}
}
/*
* TODO You can add mouse event or other processes
*
* @author: bob.li.0718@gmail.com
*/
//模拟下拉,刷新页面
String js = "";
for (int i=0; i < 20; i++){
System.out.println("休眠1s");
try {
//滚动到最底部
((JavascriptExecutor)webDriver).executeScript("window.scrollTo(0,document.body.scrollHeight)");
//休眠,等待加载页面
Thread.sleep(2000);
//往回滚一点,否则不加载
((JavascriptExecutor)webDriver).executeScript("window.scrollBy(0,-300)");
} catch (InterruptedException e) {
e.printStackTrace();
}
}
WebElement webElement = webDriver.findElement(By.xpath("/html"));
String content = webElement.getAttribute("outerHTML");
Page page = new Page();
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
webDriverPool.returnToPool(webDriver);
return page;
}
private void checkInit() {
if (webDriverPool == null) {
synchronized (this) {
webDriverPool = new WebDriverPool(poolSize);
}
}
}
@Override
public void setThread(int thread) {
this.poolSize = thread;
}
@Override
public void close() throws IOException {
webDriverPool.closeAll();
}
}
3.2.爬取业务规则
package com.example;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import java.util.List;
import java.util.regex.Pattern;
/**
 * WebMagic {@link PageProcessor} that crawls one novel from m.biquyun.com.
 *
 * <p>Chapter-list pages are scanned for further list pages and chapter pages, which are queued for
 * crawling. Chapter pages yield the fields {@code bookName} (the chapter title) and
 * {@code content} (the chapter text) for the pipelines.
 */
public class NovelPageProcessorInBiQuGe implements PageProcessor {

    private static final Logger LOGGER = LoggerFactory.getLogger(NovelPageProcessorInBiQuGe.class);

    /**
     * Regex for the URL of one page of the chapter list.
     */
    private static final String CHAPTER_URL = "http://m.biquyun.com/1_1559_\\d+/";

    /**
     * Regex for the URL of a single chapter.
     */
    private static final String CONTENT_URL = "http://m.biquyun.com/wapbook/1559_\\d+\\.html";

    /**
     * Start URL: first page of the chapter list.
     */
    private static final String NOVEL_URL = "http://m.biquyun.com/1_1559_1/";

    // Compiled once and shared; Pattern instances are immutable and safe to use from all threads.
    private static final Pattern CHAPTER_PATTERN = Pattern.compile(CHAPTER_URL);
    private static final Pattern CONTENT_PATTERN = Pattern.compile(CONTENT_URL);

    /**
     * Site settings, created eagerly so that concurrent {@link #getSite()} calls always see the
     * same, fully built instance. The domain is the host name only, not a page URL.
     */
    private final Site site = Site.me()
            .setDomain("m.biquyun.com")
            .setSleepTime(1000)
            .setCycleRetryTimes(3);

    /**
     * Dispatches the page to the chapter-list or chapter-content handler according to its URL.
     *
     * @param page downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().toString();
        if (CHAPTER_PATTERN.matcher(url).find()) {
            chapterProcess(page);
        } else if (CONTENT_PATTERN.matcher(url).find()) {
            contentProcess(page);
        } else {
            LOGGER.info("该URL:{}不是目标路径", url);
        }
    }

    /**
     * Extracts the chapter title and the chapter text from a chapter page and stores them as the
     * result fields {@code bookName} and {@code content}.
     *
     * @param page chapter page
     */
    private void contentProcess(Page page) {
        Html pageHtml = page.getHtml();
        String bookName = pageHtml.xpath("//h1[@id='chaptertitle']/text()").toString();
        String content = pageHtml.xpath("//div[@id='novelcontent']/p/text()").toString();
        page.putField("bookName", bookName);
        page.putField("content", content);
    }

    /**
     * Collects every link on a chapter-list page and queues those that point to another list page
     * or to a chapter page.
     *
     * @param page chapter-list page
     */
    private void chapterProcess(Page page) {
        List<String> links = page.getHtml().links().all();
        if (CollectionUtils.isEmpty(links)) {
            LOGGER.warn("没有取到小说章节地址!");
            return;
        }
        for (String link : links) {
            // Only chapter-list pages and chapter pages are crawled.
            if (CHAPTER_PATTERN.matcher(link).find() || CONTENT_PATTERN.matcher(link).find()) {
                page.addTargetRequest(link);
            }
        }
    }

    /**
     * @return the site settings used by the spider
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl from {@link #NOVEL_URL} with five threads, writing each chapter to a file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new NovelPageProcessorInBiQuGe())
                .addUrl(NOVEL_URL)
                // Custom pipeline; the output directory must be set.
                .addPipeline(new NovelFilePipeline("E:\\demo\\novel"))
                // The modified SeleniumDownloader.
                .setDownloader(new SeleniumDownloader("E:\\small tools\\chromedriver\\chromedriver.exe").setSleepTime(2000))
                .thread(5)
                .run();
    }
}
3.3.处理规则(输出到文件)
package com.example;
import com.sun.xml.internal.stream.writers.UTF8OutputStreamWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.*;
/**
 * WebMagic {@link Pipeline} that writes every crawled chapter to its own UTF-8 text file named
 * after the chapter title, inside the configured output directory.
 */
public class NovelFilePipeline extends FilePersistentBase implements Pipeline {

    /** Characters that are not allowed in Windows file names or that act as path separators. */
    private static final String ILLEGAL_FILE_NAME_CHARS = "[\\\\/:*?\"<>|]";

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Creates a pipeline that writes to the default directory {@code E:\demo\novel}. */
    public NovelFilePipeline() {
        setPath("E:\\demo\\novel");
    }

    /**
     * Creates a pipeline that writes to the given directory.
     *
     * @param path output directory
     */
    public NovelFilePipeline(String path) {
        setPath(path);
    }

    /**
     * Writes the {@code content} field to {@code <path>/<bookName>.txt}. Results that lack either
     * the {@code bookName} or the {@code content} field are skipped. I/O failures are logged and
     * do not abort the crawl.
     *
     * @param resultItems fields extracted by the page processor
     * @param task        the running task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String bookName = resultItems.get("bookName");
        String rawContent = resultItems.get("content");
        if (StringUtils.isEmpty(bookName) || StringUtils.isEmpty(rawContent)) {
            return;
        }
        // Turn every space into a line break followed by a tab.
        String content = rawContent.replace(" ", "\r\n\t");
        // The title is scraped from the web page: strip characters that would be rejected by the
        // file system or would redirect the output to another directory.
        String safeName = bookName.replaceAll(ILLEGAL_FILE_NAME_CHARS, "_");
        String path = this.path + PATH_SEPERATOR + safeName + ".txt";
        try {
            writeUtf8(getFile(path), content);
            // Reported only after the file has been written and closed without error.
            System.out.println("下载"+path+"成功");
        } catch (IOException e) {
            logger.error("Failed to write chapter file {}", path, e);
        }
    }

    /**
     * Writes the text to the file as UTF-8, replacing any previous content.
     *
     * @throws IOException if the file cannot be opened, written or closed
     */
    private void writeUtf8(File file, String text) throws IOException {
        // A plain Writer is used instead of PrintWriter, which would swallow write errors.
        try (Writer writer = new OutputStreamWriter(
                new FileOutputStream(file), java.nio.charset.StandardCharsets.UTF_8)) {
            writer.write(text);
        }
    }
}
4.运行结果
虽然会有报错,但并无大碍,依旧在持续爬取数据。