Using Multithreading to Speed Up Crawler Jobs
Thread Pool
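A fixed thread pool keeps a constant number of worker threads alive and queues any tasks beyond that, which caps how many pages the crawler downloads at once. A minimal standalone sketch of the idea (the pool size of 4 is an arbitrary example here; the crawler below reads its pool size from a config file instead):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class FixedPoolDemo {
    public static void main(String[] args) {
        // 4 worker threads; the remaining tasks wait in the pool's queue
        ExecutorService pool = Executors.newFixedThreadPool(4);
        for (int i = 0; i < 10; i++) {
            final int taskId = i;
            pool.execute(new Runnable() {
                public void run() {
                    System.out.println("task " + taskId + " on thread "
                            + Thread.currentThread().getId());
                }
            });
        }
        pool.shutdown(); // stop accepting new tasks; queued tasks still run
    }
}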
package work.spider.start;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import work.spider.entity.Page;
import work.spider.service.IDonwLoadService;
import work.spider.service.IProcessService;
import work.spider.service.IStoreService;
import work.spider.service.impl.优快云ProcessService;
import work.spider.service.impl.MysqlStoreService;
import work.spider.util.JsonUtil;
import work.spider.util.LoadPropertyUtil;
import work.spider.util.ThreadUtil;
import work.spider.service.impl.HttpClientDownloadService;
/*
 * Entry class for the 优快云 blog crawler
 * @author lwr
 * created 2020-03-13
 */
public class Start优快云Count {

    private IDonwLoadService downLoadService;
    private IProcessService processService;
    private IStoreService storeService;

    // Fixed-size thread pool; the pool size is read from the "threadNum" config entry
    private static ExecutorService newFixedThreadPool =
            Executors.newFixedThreadPool(Integer.parseInt(LoadPropertyUtil.getconfig("threadNum")));

    public IStoreService getStoreService() {
        return storeService;
    }

    public void setStoreService(IStoreService storeService) {
        this.storeService = storeService;
    }

    public IDonwLoadService getDownLoadService() {
        return downLoadService;
    }

    public void setDownLoadService(IDonwLoadService downLoadService) {
        this.downLoadService = downLoadService;
    }

    public IProcessService getProcessService() {
        return processService;
    }

    public void setProcessService(IProcessService processService) {
        this.processService = processService;
    }

    // Download a page
    public Page downloadPage(String url) {
        return this.downLoadService.download(url);
    }

    // Parse a downloaded page
    public void processPage(Page page) {
        this.processService.process(page);
    }

    // Store the parsed page data
    public void storePageInfo(Page page) {
        this.storeService.store(page);
    }

    public static void main(String[] args) {
        final Start优快云Count csdn = new Start优快云Count();
        csdn.setDownLoadService(new HttpClientDownloadService());
        csdn.setProcessService(new 优快云ProcessService());
        csdn.setStoreService(new MysqlStoreService());

        // Collect the top 20 bloggers (2 pages of 10) into urlList
        List<String> urlList = new ArrayList<String>();
        for (int i = 0; i < 2; i++) {
            List<String> list = new LinkedList<String>();
            try {
                list = JsonUtil.httpclientMethod("https://blog.youkuaiyun.com/api/WritingRank/weekList?username=mid_Faker&page="
                        + (i + 1) + "&size=10");
            } catch (IOException e) {
                e.printStackTrace();
            }
            urlList.addAll(list);
        }

        // Both loop bounds are capped at 1 here, so only the first blogger's first
        // article-list page is crawled; raise them to cover more of urlList
        for (int t = 0; t < 1; t++) {
            for (int u = 0; u < 1; u++) {
                final String url = "https://blog.youkuaiyun.com/" + urlList.get(t) + "/article/list/" + (u + 1);
                newFixedThreadPool.execute(new Runnable() {
                    public void run() {
                        System.out.println("Current thread id: " + Thread.currentThread().getId());
                        // Download the article-list page
                        Page page = csdn.downloadPage(url);
                        page.setUrl(url);
                        // Parse it to extract the detail-page URLs
                        csdn.processPage(page);
                        // Download, parse and store each detail page in turn
                        for (int p = 0; p < page.getUrlList().size(); p++) {
                            System.out.println(page.getUrlList().get(p));
                            Page currentPage = csdn.downloadPage(page.getUrlList().get(p));
                            currentPage.setUrl(page.getUrlList().get(p));
                            csdn.processPage(currentPage);
                            csdn.storePageInfo(currentPage);
                        }
                        // Throttle between tasks; the delay is read from the "millions_1" config entry
                        ThreadUtil.sleep(Integer.parseInt(LoadPropertyUtil.getconfig("millions_1")));
                    }
                });
            }
        }
        newFixedThreadPool.shutdown(); // stop accepting new tasks; queued tasks still run
    }
}
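The helper classes LoadPropertyUtil, ThreadUtil and JsonUtil are not shown in this post. As a rough sketch of what the first two might look like, reconstructed only from how the main class calls them (the spider.properties filename is an assumption):

package work.spider.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

// Hypothetical reconstruction: loads key/value settings such as
// "threadNum" and "millions_1" from a properties file on the classpath.
public class LoadPropertyUtil {
    private static final Properties PROPS = new Properties();
    static {
        // "spider.properties" is an assumed filename, not confirmed by the post
        try (InputStream in = LoadPropertyUtil.class.getClassLoader()
                .getResourceAsStream("spider.properties")) {
            PROPS.load(in);
        } catch (IOException e) {
            throw new ExceptionInInitializerError(e);
        }
    }
    public static String getconfig(String key) {
        return PROPS.getProperty(key);
    }
}

package work.spider.util;

// Hypothetical reconstruction: Thread.sleep wrapped so callers
// don't have to handle InterruptedException themselves.
public class ThreadUtil {
    public static void sleep(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}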

This post described a way to speed up a crawler with multithreading: a fixed-size thread pool downloads and processes multiple URLs concurrently, which noticeably improves the crawler's throughput.
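One caveat: shutdown() only stops the pool from accepting new tasks; it does not wait for the queued crawl tasks to finish before main returns. If the main thread needs to block until everything completes (for example, to close a shared database connection afterwards), awaitTermination can be added after the shutdown call. A minimal sketch, with the one-hour timeout chosen arbitrarily:

newFixedThreadPool.shutdown();            // stop accepting new tasks
try {
    // Block until all submitted crawl tasks finish, or give up after 1 hour
    if (!newFixedThreadPool.awaitTermination(1, java.util.concurrent.TimeUnit.HOURS)) {
        newFixedThreadPool.shutdownNow(); // interrupt anything still running
    }
} catch (InterruptedException e) {
    newFixedThreadPool.shutdownNow();
    Thread.currentThread().interrupt();
}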