网页自动采集之内涵吧内涵段子自动采集

最新推荐文章于 2021-02-12 18:43:05 发布

原创最新推荐文章于 2021-02-12 18:43:05 发布 · 3.6k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#内容采集 #网页抓取 #java #内涵吧 #网页采集

java 专栏收录该内容

40 篇文章

订阅专栏

这篇博客扩展了上一篇文章的内容，详细介绍了如何进行网页自动采集，特别是针对内涵吧内涵段子的采集。文中提到的上一篇博客为基础类的介绍，而本篇则专注于内涵吧的采集实现，包括采集入口类Neihan8Crawl和更新列表页采集类Neihan8List的代码实现。提供了源代码下载链接。

转载请注明出处：http://blog.youkuaiyun.com/xiaojimanman/article/details/19168917

这篇博客属于博客 http://blog.youkuaiyun.com/xiaojimanman/article/details/19158815 的拓展，建议阅读此篇博客前先阅读上一篇博客。

上一篇博客介绍了关于笑话集网站的自动采集，这篇将对其进行扩展，介绍对内涵吧内涵段子的自动采集。

上一篇博客已经详细地介绍了几个基础类，现在只需构建子类，就可以实现内涵吧内涵段子的采集。

内涵吧内涵段子采集入口类Neihan8Crawl实现如下。这里没有实现抓取程序的周期性采集，可以根据自己的需要编写相应的线程。

 /**  
 *@Description:     
 */ 
package cn.lulei.crawl.neihan8;  

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;

import cn.lulei.db.neihan8.Neihan8DbOperation;
import cn.lulei.model.Neihan8;
import cn.lulei.util.ParseUtil;
import cn.lulei.util.ThreadUtil;
  
/**
 * Entry point for collecting neihan8 articles: walks a range of "latest updates"
 * list pages and hands every article found on them to {@code Neihan8DbOperation}.
 * Periodic (scheduled) crawling is not implemented in this class.
 */
public class Neihan8Crawl {

	// URL template of a neihan8 list page; the "%pno%" token is replaced by the page number
	private static final String LIST_PAGE_URL_TEMPLATE = "http://www.neihan8.com/article/list_5_%pno%.html";
	// Pause between two consecutive page requests, in milliseconds
	private static final int REQUEST_INTERVAL_MS = 500;

	/**
	 * @param start first list page to crawl; values below 1 are treated as 1
	 * @param end last list page to crawl (inclusive)
	 * @throws IOException propagated from loading a list page or a detail page
	 * @Date: 2014-2-13
	 * @Author: lulei
	 * @Description: Crawls the list pages from start to end; for every article URL
	 *               obtained from a list page, loads the detail page and inserts the
	 *               parsed record through Neihan8DbOperation.
	 */
	public void crawlMain(int start, int end) throws IOException {
		int firstPage = Math.max(start, 1);
		Neihan8DbOperation db = new Neihan8DbOperation();
		for (int pageNo = firstPage; pageNo <= end; pageNo++) {
			// Pause before every list-page request
			ThreadUtil.sleep(REQUEST_INTERVAL_MS);
			String listUrl = LIST_PAGE_URL_TEMPLATE.replace("%pno%", String.valueOf(pageNo));
			Neihan8List listPage = new Neihan8List(listUrl);
			ArrayList<String> linkList = listPage.getPageUrls();
			// presumably removes duplicate links and normalizes them — confirm in ParseUtil
			HashSet<String> detailUrls = ParseUtil.parseArrayToHashNeihan8(linkList);
			for (String detailUrl : detailUrls) {
				Neihan8 article = new Neihan8Detail(detailUrl).getNeihan8();
				db.insert(article);
				System.out.println("网址：" + detailUrl + "采集完成！");
				// Pause after every detail-page request as well
				ThreadUtil.sleep(REQUEST_INTERVAL_MS);
			}
		}
	}

	/**
	 * @param args command-line arguments (not used)
	 * @Date: 2014-2-13
	 * @Author: lulei
	 * @Description: Runs a single crawl over list pages 1 to 41 and prints the stack
	 *               trace of any exception that aborts it.
	 */
	public static void main(String[] args) {
		Neihan8Crawl crawler = new Neihan8Crawl();
		try {
			crawler.crawlMain(1, 41);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}

更新列表页采集类Neihan8List实现如下：

 /**  
 *@Description:  内涵吧更新列表页
 */ 
package cn.lulei.crawl.neihan8;  

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import cn.lulei.crawl.CrawlListPageBase;
  
/**
 * Crawler for a neihan8 "latest updates" list page. Supplies the request
 * parameters and the link-extraction regex that the overridden
 * {@code CrawlListPageBase} hooks expose.
 */
public class Neihan8List extends CrawlListPageBase {

	// Charset used when the caller does not name one
	private static final String DEFAULT_CHARSET = "gb2312";
	// Regex for an <h4> followed by a link; capture group 1 is the href value
	private static final String ARTICLE_LINK_REGEX = "<h4>\\s*<a href=\"(.*?)\">";
	// Capture group of ARTICLE_LINK_REGEX that holds the URL
	private static final int ARTICLE_LINK_GROUP = 1;
	// Request parameters (Host, Referer, User-Agent) handed to the base class;
	// presumably sent as HTTP request headers — confirm in CrawlListPageBase
	private static final HashMap<String, String> REQUEST_PARAMS = buildRequestParams();

	/**
	 * Builds the fixed request-parameter map shared by all instances.
	 */
	private static HashMap<String, String> buildRequestParams() {
		HashMap<String, String> map = new HashMap<String, String>();
		map.put("Host", "www.neihan8.com");
		map.put("Referer", "www.neihan8.com");
		map.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
		return map;
	}

	/**
	 * Creates a list-page crawler for the given URL using the "gb2312" charset.
	 *
	 * @param urlStr URL of the list page
	 * @throws IOException propagated from the two-argument constructor
	 */
	public Neihan8List(String urlStr) throws IOException {
		this(urlStr, DEFAULT_CHARSET);
	}

	/**
	 * Creates a list-page crawler for the given URL and charset; delegates to the
	 * base-class constructor with the "get" method and the shared request parameters.
	 *
	 * @param urlStr URL of the list page
	 * @param charsetName name of the charset passed to the base class
	 * @throws IOException propagated from the base-class constructor
	 */
	public Neihan8List(String urlStr, String charsetName) throws IOException {
		super(urlStr, charsetName, "get", REQUEST_PARAMS);
	}

	/**
	 * @return the regex whose capture group holds an article link on the list page
	 */
	@Override
	public String getUrlRegexString() {
		return ARTICLE_LINK_REGEX;
	}

	/**
	 * @return the index of the capture group that holds the link (1)
	 */
	@Override
	public int getUrlRegexStringNum() {
		return ARTICLE_LINK_GROUP;
	}

	/**
	 * @param args command-line arguments (not used)
	 * @Date: 2014-2-12
	 * @Author: lulei
	 * @Description: Manual test: loads list page 41 and prints every URL returned by
	 *               getPageUrls(); prints the stack trace on IOException.
	 */
	public static void main(String[] args) {
		try {
			Neihan8List listPage = new Neihan8List("http://www.neihan8.com/article/list_5_41.html", "gb2312");
			ArrayList<String> urls = listPage.getPageUrls();
			for (int i = 0; i < urls.size(); i++) {
				System.out.println(urls.get(i));
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}

内容详细页采集类Neihan8Detail实现如下：

 /**  
 *@Description:  内涵吧详细内容页
 */ 
package cn.lulei.crawl.neihan8;  

import java.io.IOException;
import java.util.HashMap;

import org.apache.commons.httpclient.HttpException;

import cn.lulei.crawl.CrawlBase;
import cn.lulei.model.Neihan8;
import cn.lulei.util.DoRegex;
import cn.lulei.util.ParseMD5;
  
  
/**
 *@Description:  Crawler for a single neihan8 article (detail) page; extracts the
 *               title, content and info line from the page source with regexes.
 *@Author: lulei
 *@Date: 2014-2-13
 *@Version: 1.1.0
 */
public class Neihan8Detail extends CrawlBase {

	// Regex whose group 1 is the text inside the page's <h1> element
	private static final String TITLE_REGEX = "<h1>(.*?)</h1>";
	// Regex whose group 1 is the first table cell inside the div with class "con"
	private static final String CONTENT_REGEX = "<div class=\"con\">\\s*<table>\\s*<tr>\\s*<td>(.*?)</td>";
	// Regex whose group 1 is the text between the opening <p class="info"> tag and the next <span
	private static final String INFO_REGEX = "<p class=\"info\">(.*?)<span";
	// Request parameters (Host, Referer, User-Agent) handed to readPage;
	// presumably sent as HTTP request headers — confirm in CrawlBase
	private static final HashMap<String, String> REQUEST_PARAMS = buildRequestParams();

	// URL this instance was created for
	private final String pageUrl;

	/**
	 * Builds the fixed request-parameter map shared by all instances.
	 */
	private static HashMap<String, String> buildRequestParams() {
		HashMap<String, String> map = new HashMap<String, String>();
		map.put("Host", "www.neihan8.com");
		map.put("Referer", "www.neihan8.com");
		map.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
		return map;
	}

	/**
	 * Loads the detail page at the given URL using the "gb2312" charset.
	 *
	 * @param urlStr URL of the detail page
	 * @throws HttpException propagated from the two-argument constructor
	 * @throws IOException propagated from the two-argument constructor
	 */
	protected Neihan8Detail(String urlStr) throws HttpException, IOException {
		this(urlStr, "gb2312");
	}

	/**
	 * Remembers the URL and reads the page through readPage with the "get" method
	 * and the shared request parameters.
	 *
	 * @param urlStr URL of the detail page
	 * @param charsetName name of the charset passed to readPage
	 * @throws HttpException propagated from readPage
	 * @throws IOException propagated from readPage
	 */
	protected Neihan8Detail(String urlStr, String charsetName) throws HttpException, IOException {
		this.pageUrl = urlStr;
		readPage(urlStr, charsetName, "get", REQUEST_PARAMS);
	}

	/**
	 * @return a new Neihan8 populated with the page URL, the value of
	 *         ParseMD5.ParseStrToMd5L32 for that URL, and the extracted title,
	 *         content and info
	 * @Date: 2014-2-13
	 * @Author: lulei
	 * @Description: Assembles the model object for this detail page.
	 */
	protected Neihan8 getNeihan8() {
		Neihan8 record = new Neihan8();

		record.setPageUrl(pageUrl);
		// The md5 value is derived from the URL, not from the page content
		record.setMd5(ParseMD5.ParseStrToMd5L32(pageUrl));
		record.setTitle(getTitle());
		record.setContent(getContent());
		record.setInfo(getInfo());

		return record;
	}

	/**
	 * @return the result of applying the title regex (group 1) to the page source
	 * @Date: 2014-2-13
	 * @Author: lulei
	 * @Description: Extracts the page title.
	 */
	private String getTitle() {
		String source = getPageSourceCode();
		return DoRegex.getFirstString(source, TITLE_REGEX, 1);
	}

	/**
	 * @return the content cell with every "<...>" span replaced by the literal
	 *         marker "#br#" and every "&...;" span removed
	 * @Date: 2014-2-13
	 * @Author: lulei
	 * @Description: Extracts the page content and strips markup from it.
	 */
	private String getContent() {
		String source = getPageSourceCode();
		// NOTE(review): if DoRegex.getFirstString can return null, the next call
		// throws NullPointerException — confirm its no-match behavior
		String rawCell = DoRegex.getFirstString(source, CONTENT_REGEX, 1);
		String tagsMarked = rawCell.replaceAll("<.*?>", "#br#");
		String entitiesRemoved = tagsMarked.replaceAll("&.*?;", "");
		return entitiesRemoved;
	}

	/**
	 * @return the result of applying the info regex (group 1) to the page source
	 * @Date: 2014-2-13
	 * @Author: lulei
	 * @Description: Extracts the page's key attribute (info) line.
	 */
	private String getInfo() {
		String source = getPageSourceCode();
		return DoRegex.getFirstString(source, INFO_REGEX, 1);
	}

	/**
	 * @param args command-line arguments (not used)
	 * @throws IOException
	 * @throws HttpException
	 * @Date: 2014-2-12
	 * @Author: lulei
	 * @Description: Manual test: loads one article page and prints its content,
	 *               title and info, in that order.
	 */
	public static void main(String[] args) throws HttpException, IOException {
		Neihan8Detail detailPage = new Neihan8Detail("http://www.neihan8.com/article/23553.html");
		String content = detailPage.getContent();
		System.out.println(content);
		String title = detailPage.getTitle();
		System.out.println(title);
		String info = detailPage.getInfo();
		System.out.println(info);
	}
}

有了上一篇博客中创建的基类，再实现内涵吧内涵段子的采集就简单许多：从分析内涵吧的页面结构到完成编码、测试、运行，总共耗时不到4小时。当然，这也有两个网站的基础结构比较类似的原因。

源代码下载地址：http://download.youkuaiyun.com/detail/xiaojimanman/6920219