Notes on WebMagic pitfalls (crawling complete novels and saving them to a database)
First, some background. I wanted to build a site that crawls novels. The snag: a novel's detail page and its chapter-content pages are related to each other, while a crawler is essentially a queue that processes URLs in order. If you just push URLs the usual way, then when saving a chapter you have to query the database by URL to figure out which book it belongs to, which felt rather inelegant (it was the only approach I could think of at the time; I'm not sure whether there is a better one).
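As an aside, one alternative worth knowing about: WebMagic's Request can carry arbitrary metadata via putExtra, which the chapter handler can read back from page.getRequest(), so no database lookup by URL is needed. A rough sketch of the idea (ExtraIdea, onBookDetailPage, and onChapterPage are made-up names, not from my project):

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;

class ExtraIdea {
    void onBookDetailPage(Page page, long bookId, String chapterUrl) {
        Request chapterReq = new Request(chapterUrl);
        chapterReq.putExtra("bookId", bookId); // attach metadata to the request
        page.addTargetRequest(chapterReq);     // queued like any other target request
    }

    void onChapterPage(Page page) {
        Long bookId = (Long) page.getRequest().getExtra("bookId");
        // ... save the chapter under bookId, no lookup by URL required
    }
}

I didn't go down that road at the time, though. Anyway, here is the approach I first came up with: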
package com.java2nb.novel.utils;
import com.java2nb.novel.core.config.RedisManager;
import com.java2nb.novel.core.pipeline.BookPipeline;
import com.java2nb.novel.core.utils.IdWorker;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookError;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.mapper.BookErrorMapper;
import lombok.extern.slf4j.Slf4j;
import lombok.RequiredArgsConstructor;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
import us.codecraft.webmagic.utils.HttpConstant;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
@Component
@Slf4j
@RequiredArgsConstructor
public class ProcessorUtil implements PageProcessor {
Logger logger = LoggerFactory.getLogger(ProcessorUtil.class);
private Site site = Site.me().setCharset("utf8").setSleepTime(3000).setTimeOut(10000);
private final BookPipeline bookPipeline;
private static Map<String, Object> errorsMap = new HashMap<>();
private static Map<String, Object> errorsIndexMap = new HashMap<>();
private final RedisManager redisManager;
private final JedisPool jedisPool;
// pagination URLs for the category listing
private List<String> pageUrlList = new ArrayList<>();
// book URLs on the current listing page
private List<String> bookUrlList = new ArrayList<>();
// chapter URLs (table of contents) of the current book
private List<String> directoryUrlList = new ArrayList<>();
// listing-page counter
private AtomicInteger pageCount = new AtomicInteger(0);
// book counter
private AtomicInteger bookCount = new AtomicInteger(0);
private static final IdWorker idWorker = new IdWorker();
// index of the current book's last chapter (-1 = no book in progress)
private AtomicInteger size = new AtomicInteger(-1);
// book status flags: ywj = finished (已完结), wwj = ongoing (未完结)
private Byte ywj = 1;
private Byte wwj = 0;
private Date date = new Date();
private String BOOK_NAME = null;
private String AUTHOR_NAME = null;
// URL patterns: category listing page, chapter content page, book detail page
String pageReg = "https://www.xbiquge.la/fenlei/\\d_\\d+.html$";
String bookContentReg = "https://www.xbiquge.la/\\d+/\\d+/\\d+.html";
String bookReg = "https://www.xbiquge.la/\\d+/\\d+/$";
Long bookId = null;
Long bookIndexId = null;
private final BookErrorMapper bookErrorMapper;
@Override
public void process(Page page) {
Selectable url = page.getUrl();
// listing page: extract the total page count and the book URLs on it
if (url != null) {
String uri = url.toString();
if (Pattern.matches(pageReg, uri)) {
String text = page.getHtml().css("a.last", "text").toString();
if (text != null) {
if (pageCount.intValue() == 1) {
logger.info("拿到总页数,并构造总页数集合数据");
Integer totalPage = Integer.valueOf(text);
for (int i = 1; i <= totalPage; i++) {
pageUrlList.add("https://www.xbiquge.la/fenlei/1_" + i + ".html");
}
} else {
if (pageUrlList.size() > 0) {
pageCount.incrementAndGet();
page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
}
}
// all book URLs on the current listing page
logger.info("collected all book URLs on the current listing page");
bookUrlList = page.getHtml().css("div#newscontent div.l ul li span.s2 a", "href").all();
if (bookUrlList.size() > 0) {
page.addTargetRequest(bookUrlList.get(0));
bookCount.set(0);
page.setSkip(true);
}
}
} else if (Pattern.matches(bookReg, uri)) {
logger.info("爬取书籍详情页面");
//书籍详情页面 匹配书名
String bookName = page.getHtml().css("div#info>h1", "text").toString();
System.out.println("bookName = " + bookName);
if (bookName != null) {
System.out.println("bookName = " + bookName);
Selectable css = page.getHtml().css("div#info p").regex(":.*<").nodes().get(0);
//作者名
String authorName = page.getHtml().css("div#info p").regex(":.*<").nodes().get(0).regex(":.+").toString();
BOOK_NAME = bookName.trim();
AUTHOR_NAME = authorName.trim();
// completion status ("连载" = ongoing; hard-coded for now)
String isEnd = "连载";
logger.info("collected the book's chapter URL list");
directoryUrlList = page.getHtml().css("div#list a", "href").all();
size.set(directoryUrlList.size() - 1);
String girl = "女生";
// latest update time
// String updateTime = page.getHtml().css("p.sup").regex("^(?!.*<a)").regex(">([^/]+)</p>").toString();
// total word count
// String wordCount = page.getHtml().css("p.sup").regex(".*<a.*").regex("/span>([^/]+)</p>").toString();
// category (hard-coded for now)
String type = "玄幻";
// cover image URL
String bookUrl = page.getHtml().css("div#fmimg>img", "src").toString();
// book description
String bookDetail = page.getHtml().css("div#intro p:nth-last-child(1)", "text").toString();
page.addTargetRequests(directoryUrlList);
Book book = new Book();
bookId = idWorker.nextId();
book.setBookStatus("连载".equals(isEnd) ? wwj : ywj);
book.setId(bookId);
book.setBookName(BOOK_NAME);
book.setAuthorName(AUTHOR_NAME);
book.setCatName(type);
book.setPicUrl(bookUrl);
book.setCatId(1);
book.setBookDesc(bookDetail);
book.setScore(10.0f);
book.setVisitCount(0L);
book.setUpdateTime(date);
book.setCreateTime(date);
book.setCrawlSourceId(1);
book.setWorkDirection("女生".equals(girl) ? ywj : wwj);
page.putField("book", book);
bookCount.incrementAndGet();
if (bookCount.intValue() >= bookUrlList.size()) {
bookCount.set(0);
page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
} else {
page.addTargetRequest(bookUrlList.get(bookCount.intValue()));
}
}
} else if (Pattern.matches(bookContentReg, uri)) {
logger.info("爬取当前书章节内容页面");
int i = directoryUrlList.indexOf(uri);
if (i > -1 && size.intValue() >= 0) {
String directory = page.getHtml().css("div.bookname>h1", "text").toString();
if (directory != null) {
BookIndex bookIndex = new BookIndex();
bookIndex.setBookId(bookId);
bookIndexId = idWorker.nextId();
bookIndex.setId(bookIndexId);
bookIndex.setIndexName(directory);
bookIndex.setCreateTime(date);
bookIndex.setUpdateTime(date);
bookIndex.setIndexNum(i);
page.putField("bookIndex", bookIndex);
String content = page.getHtml().css("div#content").toString();
if (content != null) {
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexId(bookIndexId);
page.putField("bookContent", bookContent);
}
if (i == size.intValue() && size.intValue() >= 0) {
logger.info("爬取完当前书所有章节内容");
Book book = new Book();
book.setId(bookId);
book.setLastIndexId(bookIndexId);
book.setLastIndexUpdateTime(date);
book.setLastIndexName(directory);
size.set(-1);
bookCount.incrementAndGet();
page.putField("updateBookLastIndex", book);
Request request = new Request();
Integer extra = request.getExtra(Request.CYCLE_TRIED_TIMES);
request.putExtra(Request.CYCLE_TRIED_TIMES, extra == null ? 1 : extra + 1);
logger.info("放入下一本书url");
request.setUrl(bookUrlList.get(bookCount.intValue()));
page.addTargetRequest(request);
}
}
}
}
}
}
@Override
public Site getSite() {
return site;
}
// attach a SpiderListener (honestly this didn't seem to do much)
private Spider addWebMagicListener(Spider spider) {
ArrayList<SpiderListener> listeners = new ArrayList<>();
listeners.add(new SpiderListener() {
@Override
public void onSuccess(Request request) {
}
@Override
public void onError(Request request) {
System.out.println("error = " + request.getUrl());
if (pageCount.intValue() == pageUrlList.size()) {
spider.setExitWhenComplete(true);
}
}
});
spider.setSpiderListeners(listeners);
return spider;
}
// start the crawler
public void startCrawl() {
Spider spider = Spider.create(this).addUrl("https://www.xbiquge.la/fenlei/1_1.html")
.setDownloader(new ProcessorUtil.MyHttpClientDownloader())
.thread(8)
.addPipeline(bookPipeline)
.setScheduler(new RedisScheduler(jedisPool)
.setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)));
spider = addWebMagicListener(spider);
spider.run();
}
// inner class overriding HttpClientDownloader
class MyHttpClientDownloader extends HttpClientDownloader {
@Override
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
Page page = new Page();
page.setBytes(bytes);
if (!request.isBinaryContent()) {
if (charset == null) {
charset = this.getHtmlCharset(contentType, bytes);
}
page.setCharset(charset);
page.setRawText(new String(bytes, charset));
}
String url = request.getUrl();
int i = directoryUrlList.indexOf(url);
page.setUrl(new PlainText(url));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
// status code other than 200
if (httpResponse.getStatusLine().getStatusCode() != HttpConstant.StatusCode.CODE_200) {
if (Pattern.matches(bookContentReg, url)) {
logger.info("处理错误页面,保存到数据库" + url);
BookError bookError = new BookError();
bookError.setId(idWorker.nextId());
bookError.setBookId(bookId);
bookError.setUrl(page.getUrl().toString());
bookError.setCreateDate(new Date());
bookError.setZt(Constants.WXF);
bookError.setIndexNum(directoryUrlList.indexOf(url));
long l = idWorker.nextId();
bookError.setBookIndexId(l);
bookErrorMapper.insert(bookError);
if (i == size.intValue() && size.intValue() >= 0) {
logger.info("目录最后一章,修改书籍索引,目录计数器清零,清空书籍目录");
Book book1 = new Book();
book1.setId(bookId);
book1.setLastIndexId(l);
book1.setLastIndexUpdateTime(date);
book1.setLastIndexName("");
page.putField("updateBookLastIndex", book1);
bookCount.incrementAndGet();
size.set(0);
page.addTargetRequest(bookUrlList.get(bookCount.intValue()));
}
page.setDownloadSuccess(false);
} else if (Pattern.matches(bookReg, url)) {
if (bookCount.intValue() >= bookUrlList.size()) {
logger.info("大于每页书集合大小,计数器计0,重置集合");
bookCount.set(0);
page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
page.setSkip(true);
} else {
bookCount.incrementAndGet();
logger.info("书籍页面报503,则爬取下一本" + url);
String uri = bookUrlList.get(bookCount.intValue());
Request request1 = new Request();
request1.setUrl(uri);
page.addTargetRequest(request1);
System.out.println(" 下一页uri= " + uri);
page.setDownloadSuccess(true);
page.setSkip(true);
}
}
} else if (Pattern.matches(pageReg, url)) {
logger.info("处理分页页面");
if (pageCount.intValue() < pageUrlList.size()) {
pageCount.incrementAndGet();
page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
page.setSkip(true);
}
page.setDownloadSuccess(true);
}
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
return page;
}
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
if (charset == null) {
charset = Charset.defaultCharset().name();
}
return charset;
}
}
}
The core idea: keep all listing-page URLs in one list and the current page's book URLs in another, and only ever take one at a time; once every content URL of a book has been crawled, push the next book's URL into the queue. This is where the trouble started: a lot of URLs come back with a 503, so error pages needed handling. My approach was to override HttpClientDownloader and add the replacement URL in its handleResponse method, but after much trial and error this only worked occasionally; most of the time the program simply terminated and the newly added request was never executed (to this day I don't fully understand why it sometimes did work).
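Before going further: WebMagic has a built-in cycle-retry mechanism configured on Site, so the site field above could be extended like this sketch (assuming webmagic 0.7.x):

// Site-level retry knobs (sketch). These are the settings that WebMagic's
// doCycleRetry (shown further down) reads.
private Site site = Site.me()
        .setCharset("utf8")
        .setSleepTime(3000)
        .setTimeOut(10000)
        .setCycleRetryTimes(3)     // re-queue a failed request up to 3 times
        .setRetrySleepTime(1000);  // pause 1s before each retry

The catch is twofold. Spider only triggers a cycle retry when the downloader reports a failed download (a network exception, or an explicit page.setDownloadSuccess(false)); a 503 response by itself still counts as a successful download. And a cycle retry can only re-queue the same request, while I wanted to enqueue a different URL (the next book) on failure. So this knob alone didn't cover my case.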
So I started tracing through the source, and found that HttpClientDownloader's handleResponse method is written like this:
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
Page page = new Page();
page.setBytes(bytes);
if (!request.isBinaryContent()) {
if (charset == null) {
charset = this.getHtmlCharset(contentType, bytes);
}
page.setCharset(charset);
page.setRawText(new String(bytes, charset));
}
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
page.setDownloadSuccess(true);
if (this.responseHeader) {
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
}
return page;
}
Following that Page onward through Spider makes the problem clear. handleResponse builds the Page and hands it back, and any target requests added to it only reach the scheduler via extractAndAddRequests inside onDownloadSuccess, which runs only when the status code is in site.getAcceptStatCode() (just 200 by default). And if downloadSuccess is false, onDownloaderFail runs instead and the Page's target requests are never read at all. Either way, a request added in the downloader for a 503 page is silently dropped, so this road is a dead end. Then I remembered that WebMagic has a built-in retry count: couldn't I just use its retry method to append my request to the queue?
So I went looking for the retry logic, and found these methods in WebMagic's Spider class:
private void processRequest(Request request) {
Page page;
if (null != request.getDownloader()) {
page = request.getDownloader().download(request, this);
} else {
page = this.downloader.download(request, this);
}
if (page.isDownloadSuccess()) {
this.onDownloadSuccess(request, page);
} else {
this.onDownloaderFail(request);
}
}
private void onDownloaderFail(Request request) {
if (this.site.getCycleRetryTimes() == 0) {
this.sleep(this.site.getSleepTime());
} else {
this.doCycleRetry(request);
}
}
// the cycle-retry method
private void doCycleRetry(Request request) {
Object cycleTriedTimesObject = request.getExtra("_cycle_tried_times");
if (cycleTriedTimesObject == null) {
this.addRequest(((Request)SerializationUtils.clone(request)).setPriority(0L).putExtra("_cycle_tried_times", 1));
} else {
int cycleTriedTimes = (Integer)cycleTriedTimesObject;
++cycleTriedTimes;
if (cycleTriedTimes < this.site.getCycleRetryTimes()) {
this.addRequest(((Request)SerializationUtils.clone(request)).setPriority(0L).putExtra("_cycle_tried_times", cycleTriedTimes));
}
}
this.sleep(this.site.getRetrySleepTime());
}
In other words, I just need to call this doCycleRetry method with my own request. But the method is private, so its scope has to be widened. A subclass cannot simply re-expose a private method, and doCycleRetry also leans on the equally private addRequest, so I extended Spider and carried over the class body with doCycleRetry made public. The full code is below (a slimmer alternative sketch follows it):
package com.java2nb.novel.utils;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.Closeable;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
public class MySpider extends Spider {
public MySpider(PageProcessor pageProcessor) {
super(pageProcessor);
this.newUrlCondition = this.newUrlLock.newCondition();
this.pageCount = new AtomicLong(0L);
this.emptySleepTime = 30000;
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
}
private final ReentrantLock newUrlLock = new ReentrantLock();
private final Condition newUrlCondition;
private List<SpiderListener> spiderListeners;
private final AtomicLong pageCount;
private Date startTime;
private int emptySleepTime;
public static MySpider create(PageProcessor pageProcessor) {
return new MySpider(pageProcessor);
}
public MySpider startUrls(List<String> startUrls) {
this.checkIfRunning();
this.startRequests = UrlUtils.convertToRequests(startUrls);
return this;
}
public MySpider startRequest(List<Request> startRequests) {
this.checkIfRunning();
this.startRequests = startRequests;
return this;
}
public MySpider setUUID(String uuid) {
this.uuid = uuid;
return this;
}
/** @deprecated */
@Deprecated
public MySpider scheduler(Scheduler scheduler) {
return this.setScheduler(scheduler);
}
public MySpider setScheduler(Scheduler scheduler) {
this.checkIfRunning();
Scheduler oldScheduler = this.scheduler;
this.scheduler = scheduler;
Request request;
if (oldScheduler != null) {
while((request = oldScheduler.poll(this)) != null) {
this.scheduler.push(request, this);
}
}
return this;
}
/** @deprecated */
@Deprecated
public MySpider pipeline(Pipeline pipeline) {
return this.addPipeline(pipeline);
}
public MySpider addPipeline(Pipeline pipeline) {
this.checkIfRunning();
this.pipelines.add(pipeline);
return this;
}
public MySpider setPipelines(List<Pipeline> pipelines) {
this.checkIfRunning();
this.pipelines = pipelines;
return this;
}
public MySpider clearPipeline() {
this.pipelines = new ArrayList();
return this;
}
/** @deprecated */
@Deprecated
public MySpider downloader(Downloader downloader) {
return this.setDownloader(downloader);
}
public MySpider setDownloader(Downloader downloader) {
this.checkIfRunning();
this.downloader = downloader;
return this;
}
protected void initComponent() {
if (this.downloader == null) {
this.downloader = new HttpClientDownloader();
}
if (this.pipelines.isEmpty()) {
this.pipelines.add(new ConsolePipeline());
}
this.downloader.setThread(this.threadNum);
if (this.threadPool == null || this.threadPool.isShutdown()) {
if (this.executorService != null && !this.executorService.isShutdown()) {
this.threadPool = new CountableThreadPool(this.threadNum, this.executorService);
} else {
this.threadPool = new CountableThreadPool(this.threadNum);
}
}
if (this.startRequests != null) {
Iterator var1 = this.startRequests.iterator();
while(var1.hasNext()) {
Request request = (Request)var1.next();
this.addRequest(request);
}
this.startRequests.clear();
}
this.startTime = new Date();
}
@Override
public void run() {
this.checkRunningStat();
this.initComponent();
this.logger.info("Spider {} started!", this.getUUID());
while(!Thread.currentThread().isInterrupted() && this.stat.get() == 1) {
final Request request = this.scheduler.poll(this);
if (request == null) {
if (this.threadPool.getThreadAlive() == 0 && this.exitWhenComplete) {
break;
}
this.waitNewUrl();
} else {
this.threadPool.execute(new Runnable() {
public void run() {
try {
MySpider.this.processRequest(request);
MySpider.this.onSuccess(request);
} catch (Exception var5) {
MySpider.this.onError(request, var5);
MySpider.this.logger.error("process request " + request + " error", var5);
} finally {
MySpider.this.pageCount.incrementAndGet();
MySpider.this.signalNewUrl();
}
}
});
}
}
this.stat.set(2);
if (this.destroyWhenExit) {
this.close();
}
this.logger.info("Spider {} closed! {} pages downloaded.", this.getUUID(), this.pageCount.get());
}
/** @deprecated */
@Deprecated
protected void onError(Request request) {
}
protected void onError(Request request, Exception e) {
this.onError(request);
if (CollectionUtils.isNotEmpty(this.spiderListeners)) {
Iterator var3 = this.spiderListeners.iterator();
while(var3.hasNext()) {
SpiderListener spiderListener = (SpiderListener)var3.next();
spiderListener.onError(request, e);
}
}
}
protected void onSuccess(Request request) {
if (CollectionUtils.isNotEmpty(this.spiderListeners)) {
Iterator var2 = this.spiderListeners.iterator();
while(var2.hasNext()) {
SpiderListener spiderListener = (SpiderListener)var2.next();
spiderListener.onSuccess(request);
}
}
}
private void checkRunningStat() {
int statNow;
do {
statNow = this.stat.get();
if (statNow == 1) {
throw new IllegalStateException("Spider is already running!");
}
} while(!this.stat.compareAndSet(statNow, 1));
}
public void close() {
this.destroyEach(this.downloader);
this.destroyEach(this.pageProcessor);
this.destroyEach(this.scheduler);
Iterator var1 = this.pipelines.iterator();
while(var1.hasNext()) {
Pipeline pipeline = (Pipeline)var1.next();
this.destroyEach(pipeline);
}
this.threadPool.shutdown();
}
private void destroyEach(Object object) {
if (object instanceof Closeable) {
try {
((Closeable)object).close();
} catch (IOException var3) {
var3.printStackTrace();
}
}
}
public void test(String... urls) {
this.initComponent();
if (urls.length > 0) {
String[] var2 = urls;
int var3 = urls.length;
for(int var4 = 0; var4 < var3; ++var4) {
String url = var2[var4];
this.processRequest(new Request(url));
}
}
}
private void processRequest(Request request) {
Page page;
if (null != request.getDownloader()) {
page = request.getDownloader().download(request, this);
} else {
page = this.downloader.download(request, this);
}
if (page.isDownloadSuccess()) {
this.onDownloadSuccess(request, page);
} else {
this.onDownloaderFail(request);
}
}
private void onDownloadSuccess(Request request, Page page) {
if (this.site.getAcceptStatCode().contains(page.getStatusCode())) {
this.pageProcessor.process(page);
this.extractAndAddRequests(page, this.spawnUrl);
if (!page.getResultItems().isSkip()) {
Iterator var3 = this.pipelines.iterator();
while(var3.hasNext()) {
Pipeline pipeline = (Pipeline)var3.next();
pipeline.process(page.getResultItems(), this);
}
}
} else {
this.logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
}
this.sleep(this.site.getSleepTime());
}
private void onDownloaderFail(Request request) {
if (this.site.getCycleRetryTimes() == 0) {
this.sleep(this.site.getSleepTime());
} else {
this.doCycleRetry(request);
}
}
// changed to public (this is the whole point of the subclass)
public void doCycleRetry(Request request) {
Object cycleTriedTimesObject = request.getExtra("_cycle_tried_times");
if (cycleTriedTimesObject == null) {
this.addRequest(((Request)SerializationUtils.clone(request)).setPriority(0L).putExtra("_cycle_tried_times", 1));
} else {
int cycleTriedTimes = (Integer)cycleTriedTimesObject;
++cycleTriedTimes;
if (cycleTriedTimes < this.site.getCycleRetryTimes()) {
this.addRequest(((Request)SerializationUtils.clone(request)).setPriority(0L).putExtra("_cycle_tried_times", cycleTriedTimes));
}
}
this.sleep(this.site.getRetrySleepTime());
}
protected void sleep(int time) {
try {
Thread.sleep((long)time);
} catch (InterruptedException var3) {
this.logger.error("Thread interrupted when sleep", var3);
}
}
protected void extractAndAddRequests(Page page, boolean spawnUrl) {
if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
Iterator var3 = page.getTargetRequests().iterator();
while(var3.hasNext()) {
Request request = (Request)var3.next();
this.addRequest(request);
}
}
}
private void addRequest(Request request) {
if (this.site.getDomain() == null && request != null && request.getUrl() != null) {
this.site.setDomain(UrlUtils.getDomain(request.getUrl()));
}
this.scheduler.push(request, this);
}
protected void checkIfRunning() {
if (this.stat.get() == 1) {
throw new IllegalStateException("Spider is already running!");
}
}
public void runAsync() {
Thread thread = new Thread(this);
thread.setDaemon(false);
thread.start();
}
public MySpider addUrl(String... urls) {
String[] var2 = urls;
int var3 = urls.length;
for(int var4 = 0; var4 < var3; ++var4) {
String url = var2[var4];
this.addRequest(new Request(url));
}
this.signalNewUrl();
return this;
}
public <T> List<T> getAll(Collection<String> urls) {
this.destroyWhenExit = false;
this.spawnUrl = false;
if (this.startRequests != null) {
this.startRequests.clear();
}
Iterator var2 = UrlUtils.convertToRequests(urls).iterator();
while(var2.hasNext()) {
Request request = (Request)var2.next();
this.addRequest(request);
}
CollectorPipeline collectorPipeline = this.getCollectorPipeline();
this.pipelines.add(collectorPipeline);
this.run();
this.spawnUrl = true;
this.destroyWhenExit = true;
return collectorPipeline.getCollected();
}
protected CollectorPipeline getCollectorPipeline() {
return new ResultItemsCollectorPipeline();
}
public <T> T get(String url) {
List<String> urls = WMCollections.newArrayList(new String[]{url});
List<T> resultItemses = this.getAll(urls);
return resultItemses != null && resultItemses.size() > 0 ? resultItemses.get(0) : null;
}
public MySpider addRequest(Request... requests) {
Request[] var2 = requests;
int var3 = requests.length;
for(int var4 = 0; var4 < var3; ++var4) {
Request request = var2[var4];
this.addRequest(request);
}
this.signalNewUrl();
return this;
}
private void waitNewUrl() {
this.newUrlLock.lock();
try {
if (this.threadPool.getThreadAlive() == 0 && this.exitWhenComplete) {
return;
}
this.newUrlCondition.await((long)this.emptySleepTime, TimeUnit.MILLISECONDS);
} catch (InterruptedException var5) {
this.logger.warn("waitNewUrl - interrupted, error {}", var5);
} finally {
this.newUrlLock.unlock();
}
}
private void signalNewUrl() {
try {
this.newUrlLock.lock();
this.newUrlCondition.signalAll();
} finally {
this.newUrlLock.unlock();
}
}
public void start() {
this.runAsync();
}
public void stop() {
if (this.stat.compareAndSet(1, 2)) {
this.logger.info("Spider " + this.getUUID() + " stop success!");
} else {
this.logger.info("Spider " + this.getUUID() + " stop fail!");
}
}
public MySpider thread(int threadNum) {
this.checkIfRunning();
this.threadNum = threadNum;
if (threadNum <= 0) {
throw new IllegalArgumentException("threadNum should be more than one!");
} else {
return this;
}
}
public MySpider thread(ExecutorService executorService, int threadNum) {
this.checkIfRunning();
this.threadNum = threadNum;
if (threadNum <= 0) {
throw new IllegalArgumentException("threadNum should be more than one!");
} else {
this.executorService = executorService;
return this;
}
}
public boolean isExitWhenComplete() {
return this.exitWhenComplete;
}
public MySpider setExitWhenComplete(boolean exitWhenComplete) {
this.exitWhenComplete = exitWhenComplete;
return this;
}
public boolean isSpawnUrl() {
return this.spawnUrl;
}
public long getPageCount() {
return this.pageCount.get();
}
public Spider.Status getStatus() {
return Spider.Status.fromValue(this.stat.get());
}
public int getThreadAlive() {
return this.threadPool == null ? 0 : this.threadPool.getThreadAlive();
}
public MySpider setSpawnUrl(boolean spawnUrl) {
this.spawnUrl = spawnUrl;
return this;
}
public String getUUID() {
if (this.uuid != null) {
return this.uuid;
} else if (this.site != null) {
return this.site.getDomain();
} else {
this.uuid = UUID.randomUUID().toString();
return this.uuid;
}
}
public MySpider setExecutorService(ExecutorService executorService) {
this.checkIfRunning();
this.executorService = executorService;
return this;
}
public Site getSite() {
return this.site;
}
public List<SpiderListener> getSpiderListeners() {
return this.spiderListeners;
}
public Spider setSpiderListeners(List<SpiderListener> spiderListeners) {
this.spiderListeners = spiderListeners;
return this;
}
public Date getStartTime() {
return this.startTime;
}
public Scheduler getScheduler() {
return this.scheduler;
}
public void setEmptySleepTime(int emptySleepTime) {
this.emptySleepTime = emptySleepTime;
}
public static enum Status {
Init(0),
Running(1),
Stopped(2);
private int value;
private Status(int value) {
this.value = value;
}
int getValue() {
return this.value;
}
public static MySpider.Status fromValue(int value) {
MySpider.Status[] var1 = values();
int var2 = var1.length;
for(int var3 = 0; var3 < var2; ++var3) {
MySpider.Status status = var1[var3];
if (status.getValue() == value) {
return status;
}
}
return Init;
}
}
}
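Looking back, since site, scheduler, and sleep are all protected in Spider, a much slimmer subclass would probably have sufficed. A minimal, untested sketch (assuming webmagic 0.7.x; RetryableSpider is just an illustrative name):

import org.apache.commons.lang3.SerializationUtils;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class RetryableSpider extends Spider {

    public RetryableSpider(PageProcessor pageProcessor) {
        super(pageProcessor);
    }

    public static RetryableSpider create(PageProcessor pageProcessor) {
        return new RetryableSpider(pageProcessor);
    }

    // Public re-implementation of Spider#doCycleRetry. Spider#addRequest is
    // private, but the scheduler and site fields are protected, so pushing
    // straight to the scheduler achieves the same effect here.
    public void doCycleRetry(Request request) {
        Object triedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES);
        if (triedTimes == null) {
            scheduler.push(SerializationUtils.clone(request)
                    .setPriority(0L)
                    .putExtra(Request.CYCLE_TRIED_TIMES, 1), this);
        } else {
            int times = (Integer) triedTimes + 1;
            if (times < site.getCycleRetryTimes()) {
                scheduler.push(SerializationUtils.clone(request)
                        .setPriority(0L)
                        .putExtra(Request.CYCLE_TRIED_TIMES, times), this);
            }
        }
        sleep(site.getRetrySleepTime());
    }
}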
Back to the real code: the new ProcessorUtil calls MySpider's now-public doCycleRetry directly (see the spider.doCycleRetry(request1) call inside MyHttpClientDownloader), which fixed the problem of the program exiting right after a request was added in the downloader:
package com.java2nb.novel.utils;
import com.java2nb.novel.core.config.RedisManager;
import com.java2nb.novel.core.pipeline.BookPipeline;
import com.java2nb.novel.core.utils.IdWorker;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookError;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.mapper.BookErrorMapper;
import lombok.extern.slf4j.Slf4j;
import lombok.RequiredArgsConstructor;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
import us.codecraft.webmagic.utils.HttpConstant;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
@Component
@Slf4j
@RequiredArgsConstructor
public class ProcessorUtil implements PageProcessor {
Logger logger = LoggerFactory.getLogger(ProcessorUtil.class);
private Site site = Site.me().setCharset("utf8").setSleepTime(3000).setTimeOut(10000);
private final BookPipeline bookPipeline;
private static Map<String, Object> errorsMap = new HashMap<>();
private static Map<String, Object> errorsIndexMap = new HashMap<>();
private final RedisManager redisManager;
private final JedisPool jedisPool;
private MySpider spider;
// pagination URLs for the category listing
private List<String> pageUrlList = new ArrayList<>();
// book URLs on the current listing page
private List<String> bookUrlList = new ArrayList<>();
// chapter URLs (table of contents) of the current book
private List<String> directoryUrlList = new ArrayList<>();
// listing-page counter (starts at 1 here, so the first listing page builds pageUrlList)
private AtomicInteger pageCount = new AtomicInteger(1);
// book counter
private AtomicInteger bookCount = new AtomicInteger(0);
private static final IdWorker idWorker = new IdWorker();
// index of the current book's last chapter (-1 = no book in progress)
private AtomicInteger size = new AtomicInteger(-1);
// book status flags: ywj = finished (已完结), wwj = ongoing (未完结)
private Byte ywj = 1;
private Byte wwj = 0;
private Date date = new Date();
private String BOOK_NAME = null;
private String AUTHOR_NAME = null;
// URL patterns: category listing page, chapter content page, book detail page
String pageReg = "https://www.xbiquge.la/fenlei/\\d_\\d+.html$";
String bookContentReg = "https://www.xbiquge.la/\\d+/\\d+/\\d+.html";
String bookReg = "https://www.xbiquge.la/\\d+/\\d+/$";
Long bookId = null;
Long bookIndexId = null;
private final BookErrorMapper bookErrorMapper;
@Override
public void process(Page page) {
Selectable url = page.getUrl();
// listing page: extract the total page count and the book URLs on it
if (url != null) {
String uri = url.toString();
if (Pattern.matches(pageReg, uri)) {
String text = page.getHtml().css("a.last", "text").toString();
if (text != null) {
if (pageCount.intValue() == 1) {
logger.info("拿到总页数,并构造总页数集合数据");
Integer totalPage = Integer.valueOf(text);
for (int i = 1; i <= totalPage; i++) {
pageUrlList.add("https://www.xbiquge.la/fenlei/1_" + i + ".html");
}
} else {
if (pageUrlList.size() > 0) {
pageCount.incrementAndGet();
page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
}
}
// all book URLs on the current listing page
logger.info("collected all book URLs on the current listing page");
bookUrlList = page.getHtml().css("div#newscontent div.l ul li span.s2 a", "href").all();
if (bookUrlList.size() > 0) {
page.addTargetRequest(bookUrlList.get(0));
bookCount.set(0);
page.setSkip(true);
}
}
} else if (Pattern.matches(bookReg, uri)) {
logger.info("爬取书籍详情页面");
//书籍详情页面 匹配书名
String bookName = page.getHtml().css("div#info>h1", "text").toString();
System.out.println("bookName = " + bookName);
if (bookName != null) {
System.out.println("bookName = " + bookName);
Selectable css = page.getHtml().css("div#info p").regex(":.*<").nodes().get(0);
//作者名
String authorName = page.getHtml().css("div#info p").regex(":.*<").nodes().get(0).regex(":.+").toString();
BOOK_NAME = bookName.trim();
AUTHOR_NAME = authorName.trim();
// completion status ("连载" = ongoing; hard-coded for now)
String isEnd = "连载";
logger.info("collected the book's chapter URL list");
directoryUrlList = page.getHtml().css("div#list a", "href").all();
size.set(directoryUrlList.size() - 1);
String girl = "女生";
// latest update time
// String updateTime = page.getHtml().css("p.sup").regex("^(?!.*<a)").regex(">([^/]+)</p>").toString();
// total word count
// String wordCount = page.getHtml().css("p.sup").regex(".*<a.*").regex("/span>([^/]+)</p>").toString();
// category (hard-coded for now)
String type = "玄幻";
// cover image URL
String bookUrl = page.getHtml().css("div#fmimg>img", "src").toString();
// book description
String bookDetail = page.getHtml().css("div#intro p:nth-last-child(1)", "text").toString();
page.addTargetRequests(directoryUrlList);
Book book = new Book();
bookId = idWorker.nextId();
book.setBookStatus("连载".equals(isEnd) ? wwj : ywj);
book.setId(bookId);
book.setBookName(BOOK_NAME);
book.setAuthorName(AUTHOR_NAME);
book.setCatName(type);
book.setPicUrl(bookUrl);
book.setCatId(1);
book.setBookDesc(bookDetail);
book.setScore(10.0f);
book.setVisitCount(0L);
book.setUpdateTime(date);
book.setCreateTime(date);
book.setCrawlSourceId(1);
book.setWorkDirection("女生".equals(girl) ? ywj : wwj);
page.putField("book", book);
bookCount.incrementAndGet();
if (bookCount.intValue() >= bookUrlList.size()) {
bookCount.set(0);
page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
} else {
page.addTargetRequest(bookUrlList.get(bookCount.intValue()));
}
}
} else if (Pattern.matches(bookContentReg, uri)) {
logger.info("爬取当前书章节内容页面");
int i = directoryUrlList.indexOf(uri);
if (i > -1 && size.intValue() >= 0) {
String directory = page.getHtml().css("div.bookname>h1", "text").toString();
if (directory != null) {
BookIndex bookIndex = new BookIndex();
bookIndex.setBookId(bookId);
bookIndexId = idWorker.nextId();
bookIndex.setId(bookIndexId);
bookIndex.setIndexName(directory);
bookIndex.setCreateTime(date);
bookIndex.setUpdateTime(date);
bookIndex.setIndexNum(i);
page.putField("bookIndex", bookIndex);
String content = page.getHtml().css("div#content").toString();
if (content != null) {
BookContent bookContent = new BookContent();
bookContent.setContent(content);
bookContent.setIndexId(bookIndexId);
page.putField("bookContent", bookContent);
}
if (i == size.intValue() && size.intValue() >= 0) {
logger.info("爬取完当前书所有章节内容");
Book book = new Book();
book.setId(bookId);
book.setLastIndexId(bookIndexId);
book.setLastIndexUpdateTime(date);
book.setLastIndexName(directory);
size.set(-1);
bookCount.incrementAndGet();
page.putField("updateBookLastIndex", book);
Request request = new Request();
Integer extra = request.getExtra(Request.CYCLE_TRIED_TIMES);
request.putExtra(Request.CYCLE_TRIED_TIMES, extra == null ? 1 : extra + 1);
logger.info("放入下一本书url");
request.setUrl(bookUrlList.get(bookCount.intValue()));
page.addTargetRequest(request);
}
}
}
}
}
}
@Override
public Site getSite() {
return site;
}
// attach a SpiderListener (honestly this didn't seem to do much)
private MySpider addWebMagicListener(MySpider spider) {
ArrayList<SpiderListener> listeners = new ArrayList<>();
listeners.add(new SpiderListener() {
@Override
public void onSuccess(Request request) {
}
@Override
public void onError(Request request) {
System.out.println("error = " + request.getUrl());
if (pageCount.intValue() == pageUrlList.size()) {
spider.setExitWhenComplete(true);
}
}
});
spider.setSpiderListeners(listeners);
return spider;
}
// start the crawler
public void startCrawl() {
MySpider spider = MySpider.create(this).addUrl("https://www.xbiquge.la/fenlei/1_1.html")
.setDownloader(new ProcessorUtil.MyHttpClientDownloader())
.thread(8)
.addPipeline(bookPipeline)
.setScheduler(new RedisScheduler(jedisPool)
.setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)));
spider = addWebMagicListener(spider);
this.spider = spider;
spider.run();
}
// inner class overriding HttpClientDownloader
class MyHttpClientDownloader extends HttpClientDownloader {
@Override
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
Page page = new Page();
page.setBytes(bytes);
if (!request.isBinaryContent()) {
if (charset == null) {
charset = this.getHtmlCharset(contentType, bytes);
}
page.setCharset(charset);
page.setRawText(new String(bytes, charset));
}
String url = request.getUrl();
int i = directoryUrlList.indexOf(url);
page.setUrl(new PlainText(url));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
// status code other than 200
if (httpResponse.getStatusLine().getStatusCode() != HttpConstant.StatusCode.CODE_200) {
if (Pattern.matches(bookContentReg, url)) {
logger.info("处理错误页面,保存到数据库" + url);
BookError bookError = new BookError();
bookError.setId(idWorker.nextId());
bookError.setBookId(bookId);
bookError.setUrl(page.getUrl().toString());
bookError.setCreateDate(new Date());
bookError.setZt(Constants.WXF);
bookError.setIndexNum(directoryUrlList.indexOf(url));
long l = idWorker.nextId();
bookError.setBookIndexId(l);
bookErrorMapper.insert(bookError);
if (i == size.intValue() && size.intValue() >= 0) {
logger.info("目录最后一章,修改书籍索引,目录计数器清零,清空书籍目录");
Book book1 = new Book();
book1.setId(bookId);
book1.setLastIndexId(l);
book1.setLastIndexUpdateTime(date);
book1.setLastIndexName("");
page.putField("updateBookLastIndex", book1);
bookCount.incrementAndGet();
size.set(0);
page.addTargetRequest(bookUrlList.get(bookCount.intValue()));
}
page.setDownloadSuccess(false);
} else if (Pattern.matches(bookReg, url)) {
if (bookCount.intValue() >= bookUrlList.size()) {
logger.info("大于每页书集合大小,计数器计0,重置集合");
bookCount.set(0);
page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
page.setSkip(true);
} else {
bookCount.incrementAndGet();
logger.info("书籍页面报503,则爬取下一本" + url);
String uri = bookUrlList.get(bookCount.intValue());
Request request1 = new Request();
request1.setUrl(uri);
spider.doCycleRetry(request1);
System.out.println(" 下一页uri= " + uri);
page.setDownloadSuccess(true);
page.setSkip(true);
}
}
} else if (Pattern.matches(pageReg, url)) {
logger.info("处理分页页面");
if (pageCount.intValue() < pageUrlList.size()) {
pageCount.incrementAndGet();
page.addTargetRequest(pageUrlList.get(pageCount.intValue()));
page.setSkip(true);
}
page.setDownloadSuccess(true);
}
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
return page;
}
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
if (charset == null) {
charset = Charset.defaultCharset().name();
}
return charset;
}
}
}
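For completeness, one hypothetical way to kick the crawl off in a Spring Boot app (illustrative wiring, not from the original project; CrawlStarter and startNovelCrawl are made-up names). spider.run() blocks, hence the dedicated thread (MySpider also inherits runAsync() if you prefer):

import org.springframework.boot.CommandLineRunner;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class CrawlStarter {

    // Start the crawl once the application context is up, on its own thread
    // so it does not block startup.
    @Bean
    CommandLineRunner startNovelCrawl(ProcessorUtil processorUtil) {
        return args -> new Thread(processorUtil::startCrawl, "novel-crawler").start();
    }
}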