Using Multithreading to Speed Up Crawler Jobs
Thread Pool
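A fixed thread pool keeps a constant number of worker threads alive and queues any tasks beyond that, which caps how many pages the crawler downloads at once. A minimal standalone sketch of the idea (the pool size of 4 is an arbitrary example here; the crawler below reads its pool size from a config file instead):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class FixedPoolDemo {
    public static void main(String[] args) {
        // 4 worker threads; the remaining tasks wait in the pool's queue
        ExecutorService pool = Executors.newFixedThreadPool(4);
        for (int i = 0; i < 10; i++) {
            final int taskId = i;
            pool.execute(new Runnable() {
                public void run() {
                    System.out.println("task " + taskId + " on thread "
                            + Thread.currentThread().getId());
                }
            });
        }
        pool.shutdown(); // stop accepting new tasks; queued tasks still run
    }
}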
package work.spider.start;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import work.spider.entity.Page;
import work.spider.service.IDonwLoadService;
import work.spider.service.IProcessService;
import work.spider.service.IStoreService;
import work.spider.service.impl.优快云ProcessService;
import work.spider.service.impl.MysqlStoreService;
import work.spider.util.JsonUtil;
import work.spider.util.LoadPropertyUtil;
import work.spider.util.ThreadUtil;
import work.spider.service.impl.HttpClientDownloadService;
/*
 * Entry class for the 优快云 blog crawler
 * @author lwr
 * created 2020-03-13
 */
public class Start优快云Count {

    private IDonwLoadService downLoadService;
    private IProcessService processService;
    private IStoreService storeService;

    // Fixed-size thread pool; the pool size is read from the "threadNum" config entry
    private static ExecutorService newFixedThreadPool =
            Executors.newFixedThreadPool(Integer.parseInt(LoadPropertyUtil.getconfig("threadNum")));

    public IStoreService getStoreService() {
        return storeService;
    }

    public void setStoreService(IStoreService storeService) {
        this.storeService = storeService;
    }

    public IDonwLoadService getDownLoadService() {
        return downLoadService;
    }

    public void setDownLoadService(IDonwLoadService downLoadService) {
        this.downLoadService = downLoadService;
    }

    public IProcessService getProcessService() {
        return processService;
    }

    public void setProcessService(IProcessService processService) {
        this.processService = processService;
    }

    // Download a page
    public Page downloadPage(String url) {
        return this.downLoadService.download(url);
    }

    // Parse a downloaded page
    public void processPage(Page page) {
        this.processService.process(page);
    }

    // Store the parsed page data
    public void storePageInfo(Page page) {
        this.storeService.store(page);
    }

    public static void main(String[] args) {
        final Start优快云Count csdn = new Start优快云Count();
        csdn.setDownLoadService(new HttpClientDownloadService());
        csdn.setProcessService(new 优快云ProcessService());
        csdn.setStoreService(new MysqlStoreService());

        // Collect the top 20 bloggers (2 pages of 10) into urlList
        List<String> urlList = new ArrayList<String>();
        for (int i = 0; i < 2; i++) {
            List<String> list = new LinkedList<String>();
            try {
                list = JsonUtil.httpclientMethod("https://blog.youkuaiyun.com/api/WritingRank/weekList?username=mid_Faker&page="
                        + (i + 1) + "&size=10");
            } catch (IOException e) {
                e.printStackTrace();
            }
            urlList.addAll(list);
        }

        // Both loop bounds are capped at 1 here, so only the first blogger's first
        // article-list page is crawled; raise them to cover more of urlList
        for (int t = 0; t < 1; t++) {
            for (int u = 0; u < 1; u++) {
                final String url = "https://blog.youkuaiyun.com/" + urlList.get(t) + "/article/list/" + (u + 1);
                newFixedThreadPool.execute(new Runnable() {
                    public void run() {
                        System.out.println("Current thread id: " + Thread.currentThread().getId());
                        // Download the article-list page
                        Page page = csdn.downloadPage(url);
                        page.setUrl(url);
                        // Parse it to extract the detail-page URLs
                        csdn.processPage(page);
                        // Download, parse and store each detail page in turn
                        for (int p = 0; p < page.getUrlList().size(); p++) {
                            System.out.println(page.getUrlList().get(p));
                            Page currentPage = csdn.downloadPage(page.getUrlList().get(p));
                            currentPage.setUrl(page.getUrlList().get(p));
                            csdn.processPage(currentPage);
                            csdn.storePageInfo(currentPage);
                        }
                        // Throttle between tasks; the delay is read from the "millions_1" config entry
                        ThreadUtil.sleep(Integer.parseInt(LoadPropertyUtil.getconfig("millions_1")));
                    }
                });
            }
        }
        newFixedThreadPool.shutdown(); // stop accepting new tasks; queued tasks still run
    }
}
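The helper classes LoadPropertyUtil, ThreadUtil and JsonUtil are not shown in this post. As a rough sketch of what the first two might look like, reconstructed only from how the main class calls them (the spider.properties filename is an assumption):

package work.spider.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

// Hypothetical reconstruction: loads key/value settings such as
// "threadNum" and "millions_1" from a properties file on the classpath.
public class LoadPropertyUtil {
    private static final Properties PROPS = new Properties();
    static {
        // "spider.properties" is an assumed filename, not confirmed by the post
        try (InputStream in = LoadPropertyUtil.class.getClassLoader()
                .getResourceAsStream("spider.properties")) {
            PROPS.load(in);
        } catch (IOException e) {
            throw new ExceptionInInitializerError(e);
        }
    }
    public static String getconfig(String key) {
        return PROPS.getProperty(key);
    }
}

package work.spider.util;

// Hypothetical reconstruction: Thread.sleep wrapped so callers
// don't have to handle InterruptedException themselves.
public class ThreadUtil {
    public static void sleep(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}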

This post described a way to speed up a crawler with multithreading: a fixed-size thread pool downloads and processes multiple URLs concurrently, which noticeably improves the crawler's throughput.
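One caveat: shutdown() only stops the pool from accepting new tasks; it does not wait for the queued crawl tasks to finish before main returns. If the main thread needs to block until everything completes (for example, to close a shared database connection afterwards), awaitTermination can be added after the shutdown call. A minimal sketch, with the one-hour timeout chosen arbitrarily:

newFixedThreadPool.shutdown();            // stop accepting new tasks
try {
    // Block until all submitted crawl tasks finish, or give up after 1 hour
    if (!newFixedThreadPool.awaitTermination(1, java.util.concurrent.TimeUnit.HOURS)) {
        newFixedThreadPool.shutdownNow(); // interrupt anything still running
    }
} catch (InterruptedException e) {
    newFixedThreadPool.shutdownNow();
    Thread.currentThread().interrupt();
}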