使用webmagic+Selenium爬取小说

最新推荐文章于 2023-12-04 18:59:54 发布

清风凌冽

最新推荐文章于 2023-12-04 18:59:54 发布

阅读量1.4k

点赞数

分类专栏： java爬虫之webmagic 文章标签： xpath selenium java 爬虫

本文链接：https://blog.youkuaiyun.com/qq_42897733/article/details/117029321

版权

java爬虫之webmagic 专栏收录该内容

2 篇文章

订阅专栏

使用webmagic+Selenium爬取小说

优点：自动爬取，不会爬取到重复数据。
缺点：爬取速度极慢。

1.添加依赖

		<!-- selenium-java 客户端 -->
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-java</artifactId>
			<version>3.141.59</version>
		</dependency>
		 <!--webmagic 核心包-->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-core</artifactId>
			<version>0.7.3</version>
			<exclusions>
				<exclusion>
					<groupId>commons-collections</groupId>
					<artifactId>commons-collections</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<!-- webmagic 扩展包 -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-extension</artifactId>
			<version>0.7.3</version>
			<exclusions>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-log4j12</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<!-- webmagic-selenium -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-selenium</artifactId>
			<version>0.7.3</version>
		</dependency>
		<!-- commons-collections -->
		<dependency>
			<groupId>commons-collections</groupId>
			<artifactId>commons-collections</artifactId>
			<version>3.2.1</version>
		</dependency>

2.修改 webmagic-selenium 包中的WebDriverPool 和 SeleniumDownloader

修改1：WebDriverPool中

private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";

原实现把 Selenium 配置文件的路径写死成了一个绝对路径，需要改为放在 classpath 下的配置文件名：

private static final String DEFAULT_CONFIG_FILE = "selenium.properties";

同时修改读取配置文件的方式：

sConfig.load(Thread.currentThread().getContextClassLoader().getResourceAsStream(configFile));

修改2：Selenium 调用 Chrome 浏览器时，默认会弹出浏览器窗口；可以在创建 ChromeDriver 时通过 ChromeOptions 的 setHeadless(true) 开启无头模式，避免弹出 Chrome 窗口。

3.实现代码

3.1.SeleniumDownloader

package com.example;

import org.apache.log4j.Logger;
import org.openqa.selenium.*;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.PlainText;

import java.io.Closeable;
import java.io.IOException;
import java.util.Map;


public class SeleniumDownloader implements Downloader, Closeable {

    private volatile WebDriverPool webDriverPool;

    private Logger logger = Logger.getLogger(getClass());

    private int sleepTime = 0;

    private int poolSize = 1;

    private static final String DRIVER_PHANTOMJS = "phantomjs";

    /**
     * 新建
     *
     * @param chromeDriverPath chromeDriverPath
     */
    public SeleniumDownloader(String chromeDriverPath) {
        System.getProperties().setProperty("webdriver.chrome.driver",
                chromeDriverPath);
    }

    /**
     * Constructor without any filed. Construct PhantomJS browser
     *
     * @author bob.li.0718@gmail.com
     */
    public SeleniumDownloader() {
        // System.setProperty("phantomjs.binary.path",
        // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
    }

    /**
     * set sleep time to wait until load success
     *
     * @param sleepTime sleepTime
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }
    //调整：容许浏览器加载JS脚本
//    @Override
//    public Page download(Request request, Task task) {
//        checkInit();
//        WebDriver webDriver;
//        try {
//            webDriver = webDriverPool.get();
//        } catch (InterruptedException e) {
//            logger.warn("interrupted", e);
//            return null;
//        }
//        logger.info("downloading page " + request.getUrl());
//        webDriver.get(request.getUrl());
//        try {
//            Thread.sleep(sleepTime);
//        } catch (InterruptedException e) {
//            e.printStackTrace();
//        }
//        WebDriver.Options manage = webDriver.manage();
//        Site site = task.getSite();
//        if (site.getCookies() != null) {
//            for (Map.Entry<String, String> cookieEntry : site.getCookies()
//                    .entrySet()) {
//                Cookie cookie = new Cookie(cookieEntry.getKey(),
//                        cookieEntry.getValue());
//                manage.addCookie(cookie);
//            }
//        }
//
//        /*
//         * TODO You can add mouse event or other processes
//         *
//         * @author: bob.li.0718@gmail.com
//         */
//
//        WebElement webElement = webDriver.findElement(By.xpath("/html"));
//        String content = webElement.getAttribute("outerHTML");
//        Page page = new Page();
//        page.setRawText(content);
        page.setHtml(new Html(content, request.getUrl()));
//        page.setUrl(new PlainText(request.getUrl()));
//        page.setRequest(request);
//        webDriverPool.returnToPool(webDriver);
//        return page;
//    }

    @Override
    public Page download(Request request, Task task) {
        checkInit();
        WebDriver webDriver;
        try {
            webDriver = webDriverPool.get();
        } catch (InterruptedException e) {
            logger.warn("interrupted", e);
            return null;
        }
        logger.info("downloading page " + request.getUrl());
        webDriver.get(request.getUrl());
        try {
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        WebDriver.Options manage = webDriver.manage();
        Site site = task.getSite();
        if (site.getCookies() != null) {
            for (Map.Entry<String, String> cookieEntry : site.getCookies()
                    .entrySet()) {
                Cookie cookie = new Cookie(cookieEntry.getKey(),
                        cookieEntry.getValue());
                manage.addCookie(cookie);
            }
        }

        /*
         * TODO You can add mouse event or other processes
         *
         * @author: bob.li.0718@gmail.com
         */

        //模拟下拉，刷新页面
        String js = "";
        for (int i=0; i < 20; i++){
            System.out.println("休眠1s");
            try {
                //滚动到最底部
                ((JavascriptExecutor)webDriver).executeScript("window.scrollTo(0,document.body.scrollHeight)");
                //休眠，等待加载页面
                Thread.sleep(2000);
                //往回滚一点，否则不加载
                ((JavascriptExecutor)webDriver).executeScript("window.scrollBy(0,-300)");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }

        WebElement webElement = webDriver.findElement(By.xpath("/html"));
        String content = webElement.getAttribute("outerHTML");
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        webDriverPool.returnToPool(webDriver);
        return page;
    }


    private void checkInit() {
        if (webDriverPool == null) {
            synchronized (this) {
                webDriverPool = new WebDriverPool(poolSize);
            }
        }
    }

    @Override
    public void setThread(int thread) {
        this.poolSize = thread;
    }

    @Override
    public void close() throws IOException {
        webDriverPool.closeAll();
    }
}

3.2.爬取业务规则

package com.example;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

import java.util.List;
import java.util.regex.Pattern;

/**
 * WebMagic {@link PageProcessor} for one novel on m.biquyun.com.
 *
 * <p>Chapter-index pages are scanned for further index pages and chapter pages, which are
 * queued for crawling. Chapter pages yield two result fields, {@code bookName} and
 * {@code content}.
 */
public class NovelPageProcessorInBiQuGe implements PageProcessor {

    private static final Logger LOGGER = LoggerFactory.getLogger(NovelPageProcessorInBiQuGe.class);

    /** Host name of the crawled site, as expected by {@link Site#setDomain(String)}. */
    private static final String DOMAIN = "m.biquyun.com";

    /** Start URL: first page of the chapter index. */
    private static final String NOVEL_URL = "http://m.biquyun.com/1_1559_1/";

    /** Regex for chapter-index pages; dots in the host are escaped so they match literally. */
    private static final String CHAPTER_URL = "http://m\\.biquyun\\.com/1_1559_\\d+/";

    /** Regex for individual chapter pages. */
    private static final String CONTENT_URL = "http://m\\.biquyun\\.com/wapbook/1559_\\d+\\.html";

    // Compiled once and shared: Pattern is immutable and safe to use from all spider threads.
    private static final Pattern CHAPTER_PATTERN = Pattern.compile(CHAPTER_URL);
    private static final Pattern CONTENT_PATTERN = Pattern.compile(CONTENT_URL);

    private final Site site = Site.me()
            .setDomain(DOMAIN)
            .setSleepTime(1000)
            .setCycleRetryTimes(3);

    /**
     * Dispatches the page to the index or chapter handler according to its URL; pages
     * matching neither pattern are only logged.
     *
     * @param page downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().toString();
        if (CHAPTER_PATTERN.matcher(url).find()) {
            chapterProcess(page);
        } else if (CONTENT_PATTERN.matcher(url).find()) {
            contentProcess(page);
        } else {
            LOGGER.info("该URL：{}不是目标路径", url);
        }
    }

    /**
     * Extracts the chapter title and chapter text from a chapter page.
     *
     * <p>The title is stored under the key {@code bookName}; the key is kept as is
     * because the pipeline reads it by that name.
     *
     * @param page chapter page
     */
    private void contentProcess(Page page) {
        Html pageHtml = page.getHtml();
        String chapterTitle = pageHtml.xpath("//h1[@id='chaptertitle']/text()").toString();
        String content = pageHtml.xpath("//div[@id='novelcontent']/p/text()").toString();
        page.putField("bookName", chapterTitle);
        page.putField("content", content);
    }

    /**
     * Queues every link on a chapter-index page that points to another index page or to
     * a chapter page.
     *
     * @param page chapter-index page
     */
    private void chapterProcess(Page page) {
        List<String> links = page.getHtml().links().all();
        if (CollectionUtils.isEmpty(links)) {
            LOGGER.warn("没有取到小说章节地址！");
            return;
        }
        for (String link : links) {
            if (isTargetUrl(link)) {
                page.addTargetRequest(link);
            }
        }
    }

    /** Returns whether the link is a chapter-index page or a chapter page of this novel. */
    private static boolean isTargetUrl(String link) {
        return CHAPTER_PATTERN.matcher(link).find() || CONTENT_PATTERN.matcher(link).find();
    }

    /**
     * @return the site settings: domain, 1000 ms sleep time between requests and
     *         3 cycle retries
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the crawl with five threads, the Selenium downloader and the file pipeline.
     * The output directory and chromedriver path are hard-coded and must be adapted to
     * the local machine.
     */
    public static void main(String[] args) {
        Spider.create(new NovelPageProcessorInBiQuGe())
                .addUrl(NOVEL_URL)
                // custom pipeline; takes the output directory
                .addPipeline(new NovelFilePipeline("E:\\demo\\novel"))
                // the modified SeleniumDownloader
                .setDownloader(new SeleniumDownloader("E:\\small tools\\chromedriver\\chromedriver.exe").setSleepTime(2000))
                .thread(5)
                .run();
    }

}

3.3.处理规则（输出到文件）

package com.example;

import com.sun.xml.internal.stream.writers.UTF8OutputStreamWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;

import java.io.*;


/**
 * WebMagic {@link Pipeline} that writes each crawled chapter to its own UTF-8 text file,
 * named after the chapter title, inside the configured directory.
 *
 * <p>The file is written with {@link OutputStreamWriter} and an explicit UTF-8 charset
 * instead of the JDK-internal {@code com.sun.xml.internal.stream.writers.UTF8OutputStreamWriter},
 * which is not accessible on JDK 9 and later. This class no longer needs that import.
 */
public class NovelFilePipeline extends FilePersistentBase implements Pipeline {

    /** Output directory used by the no-argument constructor. */
    private static final String DEFAULT_PATH = "E:\\demo\\novel";

    /** Characters that cannot appear in a Windows file name, plus both path separators. */
    private static final String ILLEGAL_FILE_NAME_CHARS = "[\\\\/:*?\"<>|]";

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Creates a pipeline writing to the default directory. */
    public NovelFilePipeline() {
        setPath(DEFAULT_PATH);
    }

    /**
     * Creates a pipeline writing to the given directory.
     *
     * @param path output directory
     */
    public NovelFilePipeline(String path) {
        setPath(path);
    }

    /**
     * Writes the {@code content} field to {@code <path>/<bookName>.txt}.
     *
     * <p>Items without a {@code bookName} or {@code content} are skipped. Characters in
     * the title that are illegal in file names are replaced by {@code _}, which also
     * keeps a title containing path separators from writing outside the output directory.
     * I/O failures are logged and do not stop the crawl.
     *
     * @param resultItems fields extracted by the page processor
     * @param task        the running task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String bookName = resultItems.get("bookName");
        String rawContent = resultItems.get("content");
        if (StringUtils.isEmpty(bookName) || StringUtils.isEmpty(rawContent)) {
            return;
        }
        String fileName = bookName.trim().replaceAll(ILLEGAL_FILE_NAME_CHARS, "_");
        if (fileName.isEmpty()) {
            return;
        }
        // Each run of four spaces becomes a line break followed by a tab.
        String content = rawContent.replace("    ", "\r\n\t");
        String path = this.path + PATH_SEPERATOR + fileName + ".txt";
        // A plain Writer is used instead of PrintWriter because PrintWriter swallows
        // IOExceptions, which would let a failed write be reported as a success.
        try (Writer writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(getFile(path)), java.nio.charset.StandardCharsets.UTF_8))) {
            writer.write(content);
        } catch (IOException e) {
            logger.error("Failed to write chapter file {}", path, e);
            return;
        }
        System.out.println("下载"+path+"成功");
    }
}