2021-09-12

Notes on the pitfalls of using WebMagic (crawling complete novels and saving them to a database)

First, some background. I wanted to build a novel-reading site backed by a crawler. As you know, a novel's detail page and its chapter contents are related, while a crawler is essentially a queue that processes URLs in order. If I simply enqueued URLs the normal way, then when saving chapter content I would have to query the database by URL to find the matching book, which feels rather inelegant (it was the only approach I could think of at the time; I don't know whether there is a better one). So I came up with the design below.

package com.java2nb.novel.utils;

import com.java2nb.novel.core.config.RedisManager;
import com.java2nb.novel.core.pipeline.BookPipeline;
import com.java2nb.novel.core.utils.IdWorker;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookError;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.mapper.BookErrorMapper;
import lombok.extern.slf4j.Slf4j;
import lombok.RequiredArgsConstructor;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
import us.codecraft.webmagic.utils.HttpConstant;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;


@Component
@Slf4j
@RequiredArgsConstructor
public class ProcessorUtil implements PageProcessor {
    Logger logger = LoggerFactory.getLogger(ProcessorUtil.class);
    private Site site = Site.me().setCharset("utf8").setSleepTime(3000).setTimeOut(10000);
    private final BookPipeline bookPipeline;
    private static Map<String, Object> errorsMap = new HashMap<>();
    private static Map<String, Object> errorsIndexMap = new HashMap<>();
    private final RedisManager redisManager;
    private final JedisPool jedisPool;
    private Page page;
    //list of pagination (category page) urls
    private List<String> pageUrlList = new ArrayList<>();
    //list of book urls on the current page
    private List<String> bookUrlList = new ArrayList<>();
    //list of chapter (table-of-contents) urls
    private List<String> directoryUrlList = new ArrayList<>();
    //page counter
    private AtomicInteger pageCount = new AtomicInteger(0);
    //book counter
    private AtomicInteger bookCount = new AtomicInteger(0);
    private static final IdWorker idWorker = new IdWorker();
    //index of the current book's last chapter, -1 when no book is in progress
    private AtomicInteger size = new AtomicInteger(-1);
    //status flags: 1 = finished (已完结), 0 = ongoing (未完结)
    private Byte ywj = 1;
    private Byte wwj = 0;
    private Date date = new Date();
    private String BOOK_NAME = null;
    private String AUTHOR_NAME = null;
    String pageReg = "https://www.xbiquge.la/fenlei/\\d_\\d+.html$";
    String bookContentReg = "https://www.xbiquge.la/\\d+/\\d+/\\d+.html";
    String bookReg = "https://www.xbiquge.la/\\d+/\\d+/$";
    Long bookId = null;
    Long bookIndexId = null;
    private final BookErrorMapper bookErrorMapper;

    @Override
    public void process(Page page) {
        Selectable url = page.getUrl();
        //generic total-page-count matching
        //pagination page
        if (url != null) {
            String uri = url.toString();
            if (Pattern.matches(pageReg, uri)) {
                String text = page.getHtml().css("a.last", "text").toString();
                if (text != null) {
                    if (pageCount.intValue() == 1) {
                        logger.info("拿到总页数,并构造总页数集合数据");
                        Integer totalPage = Integer.valueOf(text);
                        for (int i = 1; i <= totalPage; i++) {
                            pageUrlList.add("https://www.xbiquge.la/fenlei/1_" + i + ".html");
                        }
                    } else {
                        if (pageUrlList.size() > 0) {
                            pageCount.incrementAndGet();
                            page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
                        }
                    }
                    //generic page handling
                    //all book urls on this page
                    logger.info("Got the list of all book urls on the current page");
                    bookUrlList = page.getHtml().css("div#newscontent div.l ul li span.s2 a", "href").all();
                    if (bookUrlList.size() > 0) {
                        page.addTargetRequest(bookUrlList.get(0));
                        bookCount.set(0);
                        page.setSkip(true);
                    }
                }
            } else if (Pattern.matches(bookReg, uri)) {
                logger.info("爬取书籍详情页面");
                //书籍详情页面 匹配书名
                String bookName = page.getHtml().css("div#info>h1", "text").toString();
                System.out.println("bookName = " + bookName);
                if (bookName != null) {
                    System.out.println("bookName = " + bookName);
                    Selectable css = page.getHtml().css("div#info p").regex(":.*<").nodes().get(0);
                    //作者名
                    String authorName = page.getHtml().css("div#info p").regex(":.*<").nodes().get(0).regex(":.+").toString();
                    BOOK_NAME = bookName.trim();
                    AUTHOR_NAME = authorName.trim();
                    //是否完结
                    String isEnd = "连载";
                    logger.info("拿到书籍目录uri集合");
                    directoryUrlList = page.getHtml().css("div#list a", "href").all();
                    size.set(directoryUrlList.size() - 1);
                    String girl = "女生";
                    //latest update time
//                String updateTime = page.getHtml().css("p.sup").regex("^(?!.*<a)").regex(">([^/]+)</p>").toString();
                    //total word count
//                String wordCount = page.getHtml().css("p.sup").regex(".*<a.*").regex("/span>([^/]+)</p>").toString();
                    //category (hardcoded)
                    String type = "玄幻";
                    //cover image url
                    String bookUrl = page.getHtml().css("div#fmimg>img", "src").toString();
                    //description
                    String bookDetail = page.getHtml().css("div#intro p:nth-last-child(1)", "text").toString();
                    page.addTargetRequests(directoryUrlList);
                    Book book = new Book();
                    bookId = idWorker.nextId();
                    book.setBookStatus("连载".equals(isEnd) ? wwj : ywj);
                    book.setId(bookId);
                    book.setBookName(BOOK_NAME);
                    book.setAuthorName(AUTHOR_NAME);
                    book.setCatName(type);
                    book.setPicUrl(bookUrl);
                    book.setCatId(1);
                    book.setBookDesc(bookDetail);
                    book.setScore(10.0f);
                    book.setVisitCount(0L);
                    book.setUpdateTime(date);
                    book.setCreateTime(date);
                    book.setCrawlSourceId(1);
                    book.setWorkDirection("女生".equals(girl) ? ywj : wwj);
                    page.putField("book", book);
                    bookCount.incrementAndGet();
                    if (bookCount.intValue() >= bookUrlList.size()) {
                        bookCount.set(0);
                        page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
                    } else {
                        page.addTargetRequest(bookUrlList.get(bookCount.intValue()));
                    }
                }
            } else if (Pattern.matches(bookContentReg, uri)) {
                logger.info("爬取当前书章节内容页面");
                int i = directoryUrlList.indexOf(uri);
                if (i > -1 && size.intValue() >= 0) {
                    String directory = page.getHtml().css("div.bookname>h1", "text").toString();
                    if (directory != null) {
                        BookIndex bookIndex = new BookIndex();
                        bookIndex.setBookId(bookId);
                        bookIndexId = idWorker.nextId();
                        bookIndex.setId(bookIndexId);
                        bookIndex.setIndexName(directory);
                        bookIndex.setCreateTime(date);
                        bookIndex.setUpdateTime(date);
                        bookIndex.setIndexNum(i);
                        page.putField("bookIndex", bookIndex);
                        String content = page.getHtml().css("div#content").toString();

                        if (content != null) {
                            BookContent bookContent = new BookContent();
                            bookContent.setContent(content);
                            bookContent.setIndexId(bookIndexId);
                            page.putField("bookContent", bookContent);
                        }
                        if (i == size.intValue() && size.intValue() >= 0) {
                            logger.info("爬取完当前书所有章节内容");
                            Book book = new Book();
                            book.setId(bookId);
                            book.setLastIndexId(bookIndexId);
                            book.setLastIndexUpdateTime(date);
                            book.setLastIndexName(directory);
                            size.set(-1);
                            bookCount.incrementAndGet();
                            page.putField("updateBookLastIndex", book);
                            Request request = new Request();
                            Integer extra = request.getExtra(Request.CYCLE_TRIED_TIMES);
                            request.putExtra(Request.CYCLE_TRIED_TIMES, extra == null ? 1 : extra + 1);
                            logger.info("放入下一本书url");
                            request.setUrl(bookUrlList.get(bookCount.intValue()));
                            page.addTargetRequest(request);
                        }
                    }
                }
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    //add a listener (it didn't turn out to be very useful)
    private Spider addWebMagicListener(Spider spider) {
        ArrayList<SpiderListener> listeners = new ArrayList<>();
        listeners.add(new SpiderListener() {
            @Override
            public void onSuccess(Request request) {
            }

            @Override
            public void onError(Request request) {
                System.out.println("error = " + request.getUrl());
                if (pageCount.intValue() == pageUrlList.size()) {
                    spider.setExitWhenComplete(true);
                }
            }
        });
        spider.setSpiderListeners(listeners);
        return spider;
    }

    //start the crawler
    public void startCrawl() {
        Spider spider = Spider.create(this).addUrl("https://www.xbiquge.la/fenlei/1_1.html")
                .setDownloader(new ProcessorUtil.MyHttpClientDownloader())
                .thread(8)
                .addPipeline(bookPipeline)
                .setScheduler(new RedisScheduler(jedisPool)
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)));
        spider = addWebMagicListener(spider);
        spider.run();
    }

    //inner class overriding HttpClientDownloader
    class MyHttpClientDownloader extends HttpClientDownloader {

        @Override
        protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
            byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
            String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
            Page page = new Page();
            page.setBytes(bytes);
            if (!request.isBinaryContent()) {
                if (charset == null) {
                    charset = this.getHtmlCharset(contentType, bytes);
                }

                page.setCharset(charset);
                page.setRawText(new String(bytes, charset));
            }

            String url = request.getUrl();
            int i = directoryUrlList.indexOf(url);
            page.setUrl(new PlainText(url));
            page.setRequest(request);
            page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
            //status code is not 200
            if (httpResponse.getStatusLine().getStatusCode() != HttpConstant.StatusCode.CODE_200) {
                if (Pattern.matches(bookContentReg, url)) {
                    logger.info("处理错误页面,保存到数据库" + url);
                    BookError bookError = new BookError();

                    bookError.setId(idWorker.nextId());
                    bookError.setBookId(bookId);
                    bookError.setUrl(page.getUrl().toString());
                    bookError.setCreateDate(new Date());
                    bookError.setZt(Constants.WXF);
                    bookError.setIndexNum(directoryUrlList.indexOf(url));
                    long l = idWorker.nextId();
                    bookError.setBookIndexId(l);
                    bookErrorMapper.insert(bookError);
                    if (i == size.intValue() && size.intValue() >= 0) {
                        logger.info("目录最后一章,修改书籍索引,目录计数器清零,清空书籍目录");
                        Book book1 = new Book();
                        book1.setId(bookId);
                        book1.setLastIndexId(l);
                        book1.setLastIndexUpdateTime(date);
                        book1.setLastIndexName("");
                        page.putField("updateBookLastIndex", book1);
                        bookCount.incrementAndGet();
                        size.set(0);
                        page.addTargetRequest(bookUrlList.get(bookCount.intValue()));

                    }
                    page.setDownloadSuccess(false);
                } else if (Pattern.matches(bookReg, url)) {
                    if (bookCount.intValue() >= bookUrlList.size()) {
                        logger.info("大于每页书集合大小,计数器计0,重置集合");
                        bookCount.set(0);
                        page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
                        page.setSkip(true);
                    } else {
                        bookCount.incrementAndGet();
                        logger.info("书籍页面报503,则爬取下一本" + url);
                        String uri = bookUrlList.get(bookCount.intValue());
                        Request request1 = new Request();
                        request1.setUrl(uri);
                        page.addTargetRequest(request1);
                        System.out.println(" 下一页uri= " + uri);
                        page.setDownloadSuccess(true);
                        page.setSkip(true);
                    }
                }

            } else if (Pattern.matches(pageReg, url)) {
                logger.info("处理分页页面");
                if (pageCount.intValue() < pageUrlList.size()) {
                    pageCount.incrementAndGet();
                    page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
                    page.setSkip(true);
                }
                page.setDownloadSuccess(true);
            }


            page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
            return page;
        }

        private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
            String charset = CharsetUtils.detectCharset(contentType, contentBytes);
            if (charset == null) {
                charset = Charset.defaultCharset().name();
            }

            return charset;
        }
    }


}




The core idea: all pagination URLs go into one list and the current page's book URLs into another, and only one URL is taken at a time; once all of a book's content URLs have been crawled, the next book's URL is pushed into the queue. But this design surfaced a problem: a lot of URLs come back as 503, so I wanted to handle those error pages. I did it by overriding HttpClientDownloader and adding the new URL inside handleResponse, but after a lot of experimenting this only worked occasionally; most of the time the program simply exited and the newly added request was never executed (to this day I don't quite understand why it sometimes worked).
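As an aside on the open question from the intro: WebMagic can carry context along with each Request through its extras map, so a chapter request could bring its bookId with it instead of forcing a database lookup by URL. A minimal sketch; chapterUrl and the "bookId" key are illustrative, while putExtra and getExtra are standard Request methods:

// When enqueueing a chapter page in process(), attach the owning book's id:
Request chapterRequest = new Request(chapterUrl);
chapterRequest.putExtra("bookId", bookId);
page.addTargetRequest(chapterRequest);

// When process() later receives that chapter page, read the id back:
Long ownerBookId = page.getRequest().getExtra("bookId");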

So I traced through the source and found that HttpClientDownloader's handleResponse is written like this:

 protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
        String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
        Page page = new Page();
        page.setBytes(bytes);
        if (!request.isBinaryContent()) {
            if (charset == null) {
                charset = this.getHtmlCharset(contentType, bytes);
            }

            page.setCharset(charset);
            page.setRawText(new String(bytes, charset));
        }

        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        page.setDownloadSuccess(true);
        if (this.responseHeader) {
            page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
        }

        return page;
    }

Looking at this, the Page returned here is only examined afterwards in Spider: its target requests reach the scheduler only via onDownloadSuccess, and only when the status code is in the Site's acceptStatCode set. For an error page, the request I add inside the downloader is simply dropped; either downloadSuccess is false and onDownloaderFail runs instead, or the 503 status code isn't accepted and extractAndAddRequests is never called (the relevant Spider source is quoted further down). So that road was blocked. Then I remembered that WebMagic has a cycle-retry mechanism; couldn't I just use its retry method to append my request to the back of the queue?
So I went hunting for the retry logic, and in WebMagic's Spider class you can find these methods:

    private void processRequest(Request request) {
        Page page;
        if (null != request.getDownloader()) {
            page = request.getDownloader().download(request, this);
        } else {
            page = this.downloader.download(request, this);
        }

        if (page.isDownloadSuccess()) {
            this.onDownloadSuccess(request, page);
        } else {
            this.onDownloaderFail(request);
        }

    }
    private void onDownloaderFail(Request request) {
        if (this.site.getCycleRetryTimes() == 0) {
            this.sleep(this.site.getSleepTime());
        } else {
            this.doCycleRetry(request);
        }

    }

//cycle retry count
    private void doCycleRetry(Request request) {
        Object cycleTriedTimesObject = request.getExtra("_cycle_tried_times");
        if (cycleTriedTimesObject == null) {
            this.addRequest(((Request)SerializationUtils.clone(request)).setPriority(0L).putExtra("_cycle_tried_times", 1));
        } else {
            int cycleTriedTimes = (Integer)cycleTriedTimesObject;
            ++cycleTriedTimes;
            if (cycleTriedTimes < this.site.getCycleRetryTimes()) {
                this.addRequest(((Request)SerializationUtils.clone(request)).setPriority(0L).putExtra("_cycle_tried_times", cycleTriedTimes));
            }
        }

        this.sleep(this.site.getRetrySleepTime());
    }
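
A note in passing: with the default Site, cycleRetryTimes is 0, so onDownloaderFail never reaches doCycleRetry on its own. WebMagic's built-in retry for failed downloads only kicks in once it is enabled on the Site, for example:

// Hedged example: let WebMagic itself retry failed downloads up to 3 times,
// pausing 3s between attempts (both setters are standard Site methods).
private Site site = Site.me().setCharset("utf8").setTimeOut(10000)
        .setCycleRetryTimes(3).setRetrySleepTime(3000);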

In other words, why not just call this doCycleRetry method with my own request? But as you can see, the method is private, so its visibility has to be widened, which only requires subclassing Spider and changing the modifier. A minimal sketch of the idea follows, and after it the complete class I ended up with.
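A minimal sketch, under the hypothetical name RetryableSpider to keep it apart from the full class below. It assumes the webmagic version used above, where Spider's scheduler field is protected, so a subclass can push requests back onto the scheduler directly:

import org.apache.commons.lang3.SerializationUtils;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class RetryableSpider extends Spider {

    public RetryableSpider(PageProcessor pageProcessor) {
        super(pageProcessor);
    }

    //same logic as Spider#doCycleRetry, but public: clone the request,
    //bump _cycle_tried_times and push the copy back onto the scheduler
    public void doCycleRetry(Request request) {
        Integer tried = request.getExtra(Request.CYCLE_TRIED_TIMES);
        if (tried == null) {
            pushRetry(request, 1);
        } else if (tried + 1 < getSite().getCycleRetryTimes()) {
            pushRetry(request, tried + 1);
        }
        //Spider#doCycleRetry also sleeps for site.getRetrySleepTime() here
    }

    private void pushRetry(Request request, int triedTimes) {
        Request retry = (Request) SerializationUtils.clone(request);
        retry.setPriority(0L).putExtra(Request.CYCLE_TRIED_TIMES, triedTimes);
        scheduler.push(retry, this);
    }
}

And here is the complete class: essentially Spider's decompiled source copied over, with doCycleRetry made public.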

package com.java2nb.novel.utils;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;

import java.io.Closeable;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

public class MySpider extends Spider {
    public MySpider(PageProcessor pageProcessor) {
        super(pageProcessor);
        this.newUrlCondition = this.newUrlLock.newCondition();
        this.pageCount = new AtomicLong(0L);
        this.emptySleepTime = 30000;
        this.pageProcessor = pageProcessor;
        this.site = pageProcessor.getSite();
    }
    private final ReentrantLock newUrlLock = new ReentrantLock();
    private final Condition newUrlCondition;
    private List<SpiderListener> spiderListeners;
    private final AtomicLong pageCount;
    private Date startTime;
    private int emptySleepTime;

    public static MySpider create(PageProcessor pageProcessor) {
        return new MySpider(pageProcessor);
    }


    public MySpider startUrls(List<String> startUrls) {
        this.checkIfRunning();
        this.startRequests = UrlUtils.convertToRequests(startUrls);
        return this;
    }

    public MySpider startRequest(List<Request> startRequests) {
        this.checkIfRunning();
        this.startRequests = startRequests;
        return this;
    }

    public MySpider setUUID(String uuid) {
        this.uuid = uuid;
        return this;
    }

    /** @deprecated */
    @Deprecated
    public MySpider scheduler(Scheduler scheduler) {
        return this.setScheduler(scheduler);
    }

    public MySpider setScheduler(Scheduler scheduler) {
        this.checkIfRunning();
        Scheduler oldScheduler = this.scheduler;
        this.scheduler = scheduler;
        Request request;
        if (oldScheduler != null) {
            while((request = oldScheduler.poll(this)) != null) {
                this.scheduler.push(request, this);
            }
        }

        return this;
    }

    /** @deprecated */
    @Deprecated
    public MySpider pipeline(Pipeline pipeline) {
        return this.addPipeline(pipeline);
    }

    public MySpider addPipeline(Pipeline pipeline) {
        this.checkIfRunning();
        this.pipelines.add(pipeline);
        return this;
    }

    public MySpider setPipelines(List<Pipeline> pipelines) {
        this.checkIfRunning();
        this.pipelines = pipelines;
        return this;
    }

    public MySpider clearPipeline() {
        this.pipelines = new ArrayList();
        return this;
    }

    /** @deprecated */
    @Deprecated
    public MySpider downloader(Downloader downloader) {
        return this.setDownloader(downloader);
    }

    public MySpider setDownloader(Downloader downloader) {
        this.checkIfRunning();
        this.downloader = downloader;
        return this;
    }

    protected void initComponent() {
        if (this.downloader == null) {
            this.downloader = new HttpClientDownloader();
        }

        if (this.pipelines.isEmpty()) {
            this.pipelines.add(new ConsolePipeline());
        }

        this.downloader.setThread(this.threadNum);
        if (this.threadPool == null || this.threadPool.isShutdown()) {
            if (this.executorService != null && !this.executorService.isShutdown()) {
                this.threadPool = new CountableThreadPool(this.threadNum, this.executorService);
            } else {
                this.threadPool = new CountableThreadPool(this.threadNum);
            }
        }

        if (this.startRequests != null) {
            Iterator var1 = this.startRequests.iterator();

            while(var1.hasNext()) {
                Request request = (Request)var1.next();
                this.addRequest(request);
            }

            this.startRequests.clear();
        }

        this.startTime = new Date();
    }
    @Override
    public void run() {
        this.checkRunningStat();
        this.initComponent();
        this.logger.info("Spider {} started!", this.getUUID());

        while(!Thread.currentThread().isInterrupted() && this.stat.get() == 1) {
            final Request request = this.scheduler.poll(this);
            if (request == null) {
                if (this.threadPool.getThreadAlive() == 0 && this.exitWhenComplete) {
                    break;
                }

                this.waitNewUrl();
            } else {
                this.threadPool.execute(new Runnable() {
                    public void run() {
                        try {
                            MySpider.this.processRequest(request);
                            MySpider.this.onSuccess(request);
                        } catch (Exception var5) {
                            MySpider.this.onError(request, var5);
                            MySpider.this.logger.error("process request " + request + " error", var5);
                        } finally {
                            MySpider.this.pageCount.incrementAndGet();
                            MySpider.this.signalNewUrl();
                        }

                    }
                });
            }
        }

        this.stat.set(2);
        if (this.destroyWhenExit) {
            this.close();
        }

        this.logger.info("Spider {} closed! {} pages downloaded.", this.getUUID(), this.pageCount.get());
    }

    /** @deprecated */
    @Deprecated
    protected void onError(Request request) {
    }

    protected void onError(Request request, Exception e) {
        this.onError(request);
        if (CollectionUtils.isNotEmpty(this.spiderListeners)) {
            Iterator var3 = this.spiderListeners.iterator();

            while(var3.hasNext()) {
                SpiderListener spiderListener = (SpiderListener)var3.next();
                spiderListener.onError(request, e);
            }
        }

    }

    protected void onSuccess(Request request) {
        if (CollectionUtils.isNotEmpty(this.spiderListeners)) {
            Iterator var2 = this.spiderListeners.iterator();

            while(var2.hasNext()) {
                SpiderListener spiderListener = (SpiderListener)var2.next();
                spiderListener.onSuccess(request);
            }
        }

    }

    private void checkRunningStat() {
        int statNow;
        do {
            statNow = this.stat.get();
            if (statNow == 1) {
                throw new IllegalStateException("Spider is already running!");
            }
        } while(!this.stat.compareAndSet(statNow, 1));

    }

    public void close() {
        this.destroyEach(this.downloader);
        this.destroyEach(this.pageProcessor);
        this.destroyEach(this.scheduler);
        Iterator var1 = this.pipelines.iterator();

        while(var1.hasNext()) {
            Pipeline pipeline = (Pipeline)var1.next();
            this.destroyEach(pipeline);
        }

        this.threadPool.shutdown();
    }

    private void destroyEach(Object object) {
        if (object instanceof Closeable) {
            try {
                ((Closeable)object).close();
            } catch (IOException var3) {
                var3.printStackTrace();
            }
        }

    }

    public void test(String... urls) {
        this.initComponent();
        if (urls.length > 0) {
            String[] var2 = urls;
            int var3 = urls.length;

            for(int var4 = 0; var4 < var3; ++var4) {
                String url = var2[var4];
                this.processRequest(new Request(url));
            }
        }

    }

    private void processRequest(Request request) {
        Page page;
        if (null != request.getDownloader()) {
            page = request.getDownloader().download(request, this);
        } else {
            page = this.downloader.download(request, this);
        }

        if (page.isDownloadSuccess()) {
            this.onDownloadSuccess(request, page);
        } else {
            this.onDownloaderFail(request);
        }

    }

    private void onDownloadSuccess(Request request, Page page) {
        if (this.site.getAcceptStatCode().contains(page.getStatusCode())) {
            this.pageProcessor.process(page);
            this.extractAndAddRequests(page, this.spawnUrl);
            if (!page.getResultItems().isSkip()) {
                Iterator var3 = this.pipelines.iterator();

                while(var3.hasNext()) {
                    Pipeline pipeline = (Pipeline)var3.next();
                    pipeline.process(page.getResultItems(), this);
                }
            }
        } else {
            this.logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
        }

        this.sleep(this.site.getSleepTime());
    }

    private void onDownloaderFail(Request request) {
        if (this.site.getCycleRetryTimes() == 0) {
            this.sleep(this.site.getSleepTime());
        } else {
            this.doCycleRetry(request);
        }

    }
//changed from private to public
    public void doCycleRetry(Request request) {
        Object cycleTriedTimesObject = request.getExtra("_cycle_tried_times");
        if (cycleTriedTimesObject == null) {
            this.addRequest(((Request)SerializationUtils.clone(request)).setPriority(0L).putExtra("_cycle_tried_times", 1));
        } else {
            int cycleTriedTimes = (Integer)cycleTriedTimesObject;
            ++cycleTriedTimes;
            if (cycleTriedTimes < this.site.getCycleRetryTimes()) {
                this.addRequest(((Request)SerializationUtils.clone(request)).setPriority(0L).putExtra("_cycle_tried_times", cycleTriedTimes));
            }
        }

        this.sleep(this.site.getRetrySleepTime());
    }

    protected void sleep(int time) {
        try {
            Thread.sleep((long)time);
        } catch (InterruptedException var3) {
            this.logger.error("Thread interrupted when sleep", var3);
        }

    }

    protected void extractAndAddRequests(Page page, boolean spawnUrl) {
        if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
            Iterator var3 = page.getTargetRequests().iterator();

            while(var3.hasNext()) {
                Request request = (Request)var3.next();
                this.addRequest(request);
            }
        }

    }

    private void addRequest(Request request) {
        if (this.site.getDomain() == null && request != null && request.getUrl() != null) {
            this.site.setDomain(UrlUtils.getDomain(request.getUrl()));
        }

        this.scheduler.push(request, this);
    }

    protected void checkIfRunning() {
        if (this.stat.get() == 1) {
            throw new IllegalStateException("Spider is already running!");
        }
    }

    public void runAsync() {
        Thread thread = new Thread(this);
        thread.setDaemon(false);
        thread.start();
    }

    public MySpider addUrl(String... urls) {
        String[] var2 = urls;
        int var3 = urls.length;

        for(int var4 = 0; var4 < var3; ++var4) {
            String url = var2[var4];
            this.addRequest(new Request(url));
        }

        this.signalNewUrl();
        return this;
    }

    public <T> List<T> getAll(Collection<String> urls) {
        this.destroyWhenExit = false;
        this.spawnUrl = false;
        if (this.startRequests != null) {
            this.startRequests.clear();
        }

        Iterator var2 = UrlUtils.convertToRequests(urls).iterator();

        while(var2.hasNext()) {
            Request request = (Request)var2.next();
            this.addRequest(request);
        }

        CollectorPipeline collectorPipeline = this.getCollectorPipeline();
        this.pipelines.add(collectorPipeline);
        this.run();
        this.spawnUrl = true;
        this.destroyWhenExit = true;
        return collectorPipeline.getCollected();
    }

    protected CollectorPipeline getCollectorPipeline() {
        return new ResultItemsCollectorPipeline();
    }

    public <T> T get(String url) {
        List<String> urls = WMCollections.newArrayList(new String[]{url});
        List<T> resultItemses = this.getAll(urls);
        return resultItemses != null && resultItemses.size() > 0 ? resultItemses.get(0) : null;
    }

    public MySpider addRequest(Request... requests) {
        Request[] var2 = requests;
        int var3 = requests.length;

        for(int var4 = 0; var4 < var3; ++var4) {
            Request request = var2[var4];
            this.addRequest(request);
        }

        this.signalNewUrl();
        return this;
    }
    private void waitNewUrl() {
        this.newUrlLock.lock();

        try {
            if (this.threadPool.getThreadAlive() == 0 && this.exitWhenComplete) {
                return;
            }

            this.newUrlCondition.await((long)this.emptySleepTime, TimeUnit.MILLISECONDS);
        } catch (InterruptedException var5) {
            this.logger.warn("waitNewUrl - interrupted, error {}", var5);
        } finally {
            this.newUrlLock.unlock();
        }

    }

    private void signalNewUrl() {
        try {
            this.newUrlLock.lock();
            this.newUrlCondition.signalAll();
        } finally {
            this.newUrlLock.unlock();
        }

    }

    public void start() {
        this.runAsync();
    }

    public void stop() {
        if (this.stat.compareAndSet(1, 2)) {
            this.logger.info("Spider " + this.getUUID() + " stop success!");
        } else {
            this.logger.info("Spider " + this.getUUID() + " stop fail!");
        }

    }

    public MySpider thread(int threadNum) {
        this.checkIfRunning();
        this.threadNum = threadNum;
        if (threadNum <= 0) {
            throw new IllegalArgumentException("threadNum should be more than one!");
        } else {
            return this;
        }
    }

    public MySpider thread(ExecutorService executorService, int threadNum) {
        this.checkIfRunning();
        this.threadNum = threadNum;
        if (threadNum <= 0) {
            throw new IllegalArgumentException("threadNum should be more than one!");
        } else {
            this.executorService = executorService;
            return this;
        }
    }

    public boolean isExitWhenComplete() {
        return this.exitWhenComplete;
    }

    public MySpider setExitWhenComplete(boolean exitWhenComplete) {
        this.exitWhenComplete = exitWhenComplete;
        return this;
    }

    public boolean isSpawnUrl() {
        return this.spawnUrl;
    }

    public long getPageCount() {
        return this.pageCount.get();
    }

    public Spider.Status getStatus() {
        return Spider.Status.fromValue(this.stat.get());
    }

    public int getThreadAlive() {
        return this.threadPool == null ? 0 : this.threadPool.getThreadAlive();
    }

    public MySpider setSpawnUrl(boolean spawnUrl) {
        this.spawnUrl = spawnUrl;
        return this;
    }

    public String getUUID() {
        if (this.uuid != null) {
            return this.uuid;
        } else if (this.site != null) {
            return this.site.getDomain();
        } else {
            this.uuid = UUID.randomUUID().toString();
            return this.uuid;
        }
    }

    public MySpider setExecutorService(ExecutorService executorService) {
        this.checkIfRunning();
        this.executorService = executorService;
        return this;
    }

    public Site getSite() {
        return this.site;
    }

    public List<SpiderListener> getSpiderListeners() {
        return this.spiderListeners;
    }

    public Spider setSpiderListeners(List<SpiderListener> spiderListeners) {
        this.spiderListeners = spiderListeners;
        return this;
    }

    public Date getStartTime() {
        return this.startTime;
    }

    public Scheduler getScheduler() {
        return this.scheduler;
    }

    public void setEmptySleepTime(int emptySleepTime) {
        this.emptySleepTime = emptySleepTime;
    }

    public static enum Status {
        Init(0),
        Running(1),
        Stopped(2);

        private int value;

        private Status(int value) {
            this.value = value;
        }

        int getValue() {
            return this.value;
        }

        public static MySpider.Status fromValue(int value) {
            MySpider.Status[] var1 = values();
            int var2 = var1.length;

            for(int var3 = 0; var3 < var2; ++var3) {
                MySpider.Status status = var1[var3];
                if (status.getValue() == value) {
                    return status;
                }
            }

            return Init;
        }
    }
}

The new ProcessorUtil then simply calls this method, which fixes the problem of the program exiting when a request is added from inside the downloader.
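Compared with the first version, the functional changes are small: the processor keeps a reference to the running MySpider, pageCount now starts at 1, and the 503 branch for book pages in the downloader hands the next request to spider.doCycleRetry instead of page.addTargetRequest. Excerpted:

    private MySpider spider;                              //set in startCrawl()
    private AtomicInteger pageCount = new AtomicInteger(1);

    //in MyHttpClientDownloader#handleResponse, the 503 branch for a book page:
    Request request1 = new Request();
    request1.setUrl(uri);
    spider.doCycleRetry(request1);                        //was: page.addTargetRequest(request1);

The full class follows: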

package com.java2nb.novel.utils;

import com.java2nb.novel.core.config.RedisManager;
import com.java2nb.novel.core.pipeline.BookPipeline;
import com.java2nb.novel.core.utils.IdWorker;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookError;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.mapper.BookErrorMapper;
import lombok.extern.slf4j.Slf4j;
import lombok.RequiredArgsConstructor;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
import us.codecraft.webmagic.utils.HttpConstant;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;


@Component
@Slf4j
@RequiredArgsConstructor
public class ProcessorUtil implements PageProcessor {
    Logger logger = LoggerFactory.getLogger(ProcessorUtil.class);
    private Site site = Site.me().setCharset("utf8").setSleepTime(3000).setTimeOut(10000);
    private final BookPipeline bookPipeline;
    private static Map<String, Object> errorsMap = new HashMap<>();
    private static Map<String, Object> errorsIndexMap = new HashMap<>();
    private final RedisManager redisManager;
    private final JedisPool jedisPool;
    private MySpider spider;
    //list of pagination (category page) urls
    private List<String> pageUrlList = new ArrayList<>();
    //list of book urls on the current page
    private List<String> bookUrlList = new ArrayList<>();
    //list of chapter (table-of-contents) urls
    private List<String> directoryUrlList = new ArrayList<>();
    //page counter, starting at 1 in this version
    private AtomicInteger pageCount = new AtomicInteger(1);
    //book counter
    private AtomicInteger bookCount = new AtomicInteger(0);
    private static final IdWorker idWorker = new IdWorker();
    //index of the current book's last chapter, -1 when no book is in progress
    private AtomicInteger size = new AtomicInteger(-1);
    //status flags: 1 = finished (已完结), 0 = ongoing (未完结)
    private Byte ywj = 1;
    private Byte wwj = 0;
    private Date date = new Date();
    private String BOOK_NAME = null;
    private String AUTHOR_NAME = null;
    String pageReg = "https://www.xbiquge.la/fenlei/\\d_\\d+.html$";
    String bookContentReg = "https://www.xbiquge.la/\\d+/\\d+/\\d+.html";
    String bookReg = "https://www.xbiquge.la/\\d+/\\d+/$";
    Long bookId = null;
    Long bookIndexId = null;
    private final BookErrorMapper bookErrorMapper;

    @Override
    public void process(Page page) {
        Selectable url = page.getUrl();
        //generic total-page-count matching
        //pagination page
        if (url != null) {
            String uri = url.toString();
            if (Pattern.matches(pageReg, uri)) {
                String text = page.getHtml().css("a.last", "text").toString();
                if (text != null) {
                    if (pageCount.intValue() == 1) {
                        logger.info("拿到总页数,并构造总页数集合数据");
                        Integer totalPage = Integer.valueOf(text);
                        for (int i = 1; i <= totalPage; i++) {
                            pageUrlList.add("https://www.xbiquge.la/fenlei/1_" + i + ".html");
                        }
                    } else {
                        if (pageUrlList.size() > 0) {
                            pageCount.incrementAndGet();
                            page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
                        }
                    }
                    //generic page handling
                    //all book urls on this page
                    logger.info("Got the list of all book urls on the current page");
                    bookUrlList = page.getHtml().css("div#newscontent div.l ul li span.s2 a", "href").all();
                    if (bookUrlList.size() > 0) {
                        page.addTargetRequest(bookUrlList.get(0));
                        bookCount.set(0);
                        page.setSkip(true);
                    }
                }
            } else if (Pattern.matches(bookReg, uri)) {
                logger.info("爬取书籍详情页面");
                //书籍详情页面 匹配书名
                String bookName = page.getHtml().css("div#info>h1", "text").toString();
                System.out.println("bookName = " + bookName);
                if (bookName != null) {
                    System.out.println("bookName = " + bookName);
                    Selectable css = page.getHtml().css("div#info p").regex(":.*<").nodes().get(0);
                    //作者名
                    String authorName = page.getHtml().css("div#info p").regex(":.*<").nodes().get(0).regex(":.+").toString();
                    BOOK_NAME = bookName.trim();
                    AUTHOR_NAME = authorName.trim();
                    //是否完结
                    String isEnd = "连载";
                    logger.info("拿到书籍目录uri集合");
                    directoryUrlList = page.getHtml().css("div#list a", "href").all();
                    size.set(directoryUrlList.size() - 1);
                    String girl = "女生";
                    //latest update time
//                String updateTime = page.getHtml().css("p.sup").regex("^(?!.*<a)").regex(">([^/]+)</p>").toString();
                    //total word count
//                String wordCount = page.getHtml().css("p.sup").regex(".*<a.*").regex("/span>([^/]+)</p>").toString();
                    //category (hardcoded)
                    String type = "玄幻";
                    //cover image url
                    String bookUrl = page.getHtml().css("div#fmimg>img", "src").toString();
                    //description
                    String bookDetail = page.getHtml().css("div#intro p:nth-last-child(1)", "text").toString();
                    page.addTargetRequests(directoryUrlList);
                    Book book = new Book();
                    bookId = idWorker.nextId();
                    book.setBookStatus("连载".equals(isEnd) ? wwj : ywj);
                    book.setId(bookId);
                    book.setBookName(BOOK_NAME);
                    book.setAuthorName(AUTHOR_NAME);
                    book.setCatName(type);
                    book.setPicUrl(bookUrl);
                    book.setCatId(1);
                    book.setBookDesc(bookDetail);
                    book.setScore(10.0f);
                    book.setVisitCount(0L);
                    book.setUpdateTime(date);
                    book.setCreateTime(date);
                    book.setCrawlSourceId(1);
                    book.setWorkDirection("女生".equals(girl) ? ywj : wwj);
                    page.putField("book", book);
                    bookCount.incrementAndGet();
                    if (bookCount.intValue() >= bookUrlList.size()) {
                        bookCount.set(0);
                        page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
                    } else {
                        page.addTargetRequest(bookUrlList.get(bookCount.intValue()));
                    }
                }
            } else if (Pattern.matches(bookContentReg, uri)) {
                logger.info("爬取当前书章节内容页面");
                int i = directoryUrlList.indexOf(uri);
                if (i > -1 && size.intValue() >= 0) {
                    String directory = page.getHtml().css("div.bookname>h1", "text").toString();
                    if (directory != null) {
                        BookIndex bookIndex = new BookIndex();
                        bookIndex.setBookId(bookId);
                        bookIndexId = idWorker.nextId();
                        bookIndex.setId(bookIndexId);
                        bookIndex.setIndexName(directory);
                        bookIndex.setCreateTime(date);
                        bookIndex.setUpdateTime(date);
                        bookIndex.setIndexNum(i);
                        page.putField("bookIndex", bookIndex);
                        String content = page.getHtml().css("div#content").toString();

                        if (content != null) {
                            BookContent bookContent = new BookContent();
                            bookContent.setContent(content);
                            bookContent.setIndexId(bookIndexId);
                            page.putField("bookContent", bookContent);
                        }
                        if (i == size.intValue() && size.intValue() >= 0) {
                            logger.info("爬取完当前书所有章节内容");
                            Book book = new Book();
                            book.setId(bookId);
                            book.setLastIndexId(bookIndexId);
                            book.setLastIndexUpdateTime(date);
                            book.setLastIndexName(directory);
                            size.set(-1);
                            bookCount.incrementAndGet();
                            page.putField("updateBookLastIndex", book);
                            Request request = new Request();
                            Integer extra = request.getExtra(Request.CYCLE_TRIED_TIMES);
                            request.putExtra(Request.CYCLE_TRIED_TIMES, extra == null ? 1 : extra + 1);
                            logger.info("放入下一本书url");
                            request.setUrl(bookUrlList.get(bookCount.intValue()));
                            page.addTargetRequest(request);
                        }
                    }
                }
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    //add a listener (it didn't turn out to be very useful)
    private MySpider addWebMagicListener(MySpider spider) {
        ArrayList<SpiderListener> listeners = new ArrayList<>();
        listeners.add(new SpiderListener() {
            @Override
            public void onSuccess(Request request) {
            }

            @Override
            public void onError(Request request) {
                System.out.println("error = " + request.getUrl());
                if (pageCount.intValue() == pageUrlList.size()) {
                    spider.setExitWhenComplete(true);
                }
            }
        });
        spider.setSpiderListeners(listeners);
        return spider;
    }

    //start the crawler
    public void startCrawl() {
        MySpider spider = MySpider.create(this).addUrl("https://www.xbiquge.la/fenlei/1_1.html")
                .setDownloader(new ProcessorUtil.MyHttpClientDownloader())
                .thread(8)
                .addPipeline(bookPipeline)
                .setScheduler(new RedisScheduler(jedisPool)
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)));
        spider = addWebMagicListener(spider);
        this.spider=spider;
        spider.run();
    }

    //inner class overriding HttpClientDownloader
    class MyHttpClientDownloader extends HttpClientDownloader {

        @Override
        protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
            byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
            String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
            Page page = new Page();
            page.setBytes(bytes);
            if (!request.isBinaryContent()) {
                if (charset == null) {
                    charset = this.getHtmlCharset(contentType, bytes);
                }

                page.setCharset(charset);
                page.setRawText(new String(bytes, charset));
            }

            String url = request.getUrl();
            int i = directoryUrlList.indexOf(url);
            page.setUrl(new PlainText(url));
            page.setRequest(request);
            page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
            //status code is not 200
            if (httpResponse.getStatusLine().getStatusCode() != HttpConstant.StatusCode.CODE_200) {
                if (Pattern.matches(bookContentReg, url)) {
                    logger.info("处理错误页面,保存到数据库" + url);
                    BookError bookError = new BookError();

                    bookError.setId(idWorker.nextId());
                    bookError.setBookId(bookId);
                    bookError.setUrl(page.getUrl().toString());
                    bookError.setCreateDate(new Date());
                    bookError.setZt(Constants.WXF);
                    bookError.setIndexNum(directoryUrlList.indexOf(url));
                    long l = idWorker.nextId();
                    bookError.setBookIndexId(l);
                    bookErrorMapper.insert(bookError);
                    if (i == size.intValue() && size.intValue() >= 0) {
                        logger.info("目录最后一章,修改书籍索引,目录计数器清零,清空书籍目录");
                        Book book1 = new Book();
                        book1.setId(bookId);
                        book1.setLastIndexId(l);
                        book1.setLastIndexUpdateTime(date);
                        book1.setLastIndexName("");
                        page.putField("updateBookLastIndex", book1);
                        bookCount.incrementAndGet();
                        size.set(0);
                        page.addTargetRequest(bookUrlList.get(bookCount.intValue()));

                    }
                    page.setDownloadSuccess(false);
                } else if (Pattern.matches(bookReg, url)) {
                    if (bookCount.intValue() >= bookUrlList.size()) {
                        logger.info("大于每页书集合大小,计数器计0,重置集合");
                        bookCount.set(0);
                        page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
                        page.setSkip(true);
                    } else {
                        bookCount.incrementAndGet();
                        logger.info("书籍页面报503,则爬取下一本" + url);
                        String uri = bookUrlList.get(bookCount.intValue());
                        Request request1 = new Request();
                        request1.setUrl(uri);
                        spider.doCycleRetry(request1);
                        System.out.println(" 下一页uri= " + uri);
                        page.setDownloadSuccess(true);
                        page.setSkip(true);
                    }
                }

            } else if (Pattern.matches(pageReg, url)) {
                logger.info("处理分页页面");
                if (pageCount.intValue() < pageUrlList.size()) {
                    pageCount.incrementAndGet();
                    page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
                    page.setSkip(true);
                }
                page.setDownloadSuccess(true);
            }


            page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
            return page;
        }

        private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
            String charset = CharsetUtils.detectCharset(contentType, contentBytes);
            if (charset == null) {
                charset = Charset.defaultCharset().name();
            }

            return charset;
        }
    }


}
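
For completeness, a hypothetical entry point (assuming the usual Spring Boot wiring, since ProcessorUtil is a @Component) could look like the sketch below; note that startCrawl() blocks until the spider exits:

import lombok.RequiredArgsConstructor;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

@Component
@RequiredArgsConstructor
public class CrawlRunner implements CommandLineRunner {

    private final ProcessorUtil processorUtil;

    @Override
    public void run(String... args) {
        //spider.run() inside startCrawl() blocks this thread until the crawl finishes
        processorUtil.startCrawl();
    }
}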



