Please credit the source when reposting: http://blog.youkuaiyun.com/xiaojimanman/article/details/19158815
This post mainly describes how to crawl the content of the recently-updated list pages on the joke site jokeji (www.jokeji.cn). The program source code can be downloaded at: http://download.youkuaiyun.com/detail/xiaojimanman/6918997
First, the crawl entry point. Periodic collection is not implemented here; if you need it, write a corresponding scheduling thread yourself.
/**
 * @Description: crawl scheduling entry point for jokeji.cn
 */
package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.concurrent.TimeUnit;

import cn.lulei.db.jokeji.JokeDbOperation;
import cn.lulei.model.Jokeji;
import cn.lulei.util.ParseUtil;

public class JokeCrawl {
    // URL template for the jokeji.cn update-list pages
    private static String listPageUrl = "http://www.jokeji.cn/list_%pno%.htm";
    // interval between two page requests, in ms
    private static int sleepTime = 500;

    /**
     * @param start first page
     * @param end last page
     * @throws IOException
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: crawl the content linked from the update-list pages
     */
    public void crawlMain(int start, int end) throws IOException {
        start = start < 1 ? 1 : start;
        JokeDbOperation jokeDbOperation = new JokeDbOperation();
        for ( ; start <= end; start++) {
            sleep(sleepTime);
            JokeList jokeList = new JokeList(listPageUrl.replace("%pno%", start + ""));
            ArrayList<String> array = jokeList.getPageUrls();
            // de-duplicate the detail-page URLs before fetching them
            HashSet<String> hash = ParseUtil.parseArrayToHash(array);
            for (String s : hash) {
                JokeDetail jokeDetail = new JokeDetail(s);
                Jokeji jokeji = jokeDetail.getJokeji();
                jokeDbOperation.insert(jokeji);
                System.out.println("URL: " + s + " crawled successfully!");
                sleep(sleepTime);
            }
        }
    }

    /**
     * @param sleepTime
     * @Date: 2014-2-13
     * @Author: lulei
     * @Description: pause the current thread for sleepTime milliseconds
     */
    public void sleep(int sleepTime) {
        try {
            TimeUnit.MILLISECONDS.sleep(sleepTime);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        try {
            new JokeCrawl().crawlMain(1, 380);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
The method public void crawlMain(int start, int end) crawls the list pages from page start to page end. The interval between two page requests is set to 500 ms here; you can adjust it for your machine and network conditions, but do not make it too small, or jokeji.cn will block the crawler.
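If you do want periodic collection, a minimal sketch using the JDK's ScheduledExecutorService could look like the following (JokeCrawlScheduler is a made-up class name, and the page range and period are arbitrary choices, not values from the project):
package cn.lulei.crawl.jokeji;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class JokeCrawlScheduler {
    public static void main(String[] args) {
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        // re-crawl the first few list pages every 6 hours; tune the range and period as needed
        scheduler.scheduleWithFixedDelay(new Runnable() {
            public void run() {
                try {
                    new JokeCrawl().crawlMain(1, 5);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }, 0, 6, TimeUnit.HOURS);
    }
}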
The CrawlBase class below fetches the web page resource itself; its pageSourceCode field stores the source code of the current page for the subsequent processing steps. It acts as a base class for fetching page resources: for each page format and piece of content to be extracted, you only need to build a corresponding subclass.
/**
 * @Description: base class for fetching web page content
 */
package cn.lulei.crawl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;

public abstract class CrawlBase {
    private static Logger log = Logger.getLogger(CrawlBase.class);

    // source code of the fetched page
    private String pageSourceCode = "";
    // response headers
    private Header[] responseHeaders = null;
    // connect timeout, in ms
    private static int connectTimeout = 3500;
    // read (socket) timeout, in ms
    private static int readTimeout = 3500;
    // default maximum number of attempts
    private static int maxConnectTimes = 3;

    private static HttpClient httpClient = new HttpClient();

    static {
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(connectTimeout);
        httpClient.getHttpConnectionManager().getParams().setSoTimeout(readTimeout);
    }

    /**
     * @param urlStr
     * @param charsetName
     * @param method
     * @param params
     * @return whether the request succeeded
     * @throws HttpException
     * @throws IOException
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: fetch the page with the given HTTP method
     */
    public boolean readPage(String urlStr, String charsetName, String method, HashMap<String, String> params) throws HttpException, IOException {
        if ("post".equalsIgnoreCase(method)) {
            return readPageByPost(urlStr, charsetName, params);
        } else {
            return readPageByGet(urlStr, charsetName, params);
        }
    }

    /**
     * @param urlStr
     * @param charsetName
     * @param params
     * @return whether the request succeeded
     * @throws HttpException
     * @throws IOException
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: fetch the page with a GET request
     */
    public boolean readPageByGet(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
        GetMethod getMethod = createGetMethod(urlStr, params);
        return readPage(getMethod, charsetName, urlStr);
    }

    /**
     * @param urlStr
     * @param charsetName
     * @param params
     * @return whether the request succeeded
     * @throws HttpException
     * @throws IOException
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: fetch the page with a POST request
     */
    public boolean readPageByPost(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
        PostMethod postMethod = createPostMethod(urlStr, params);
        return readPage(postMethod, charsetName, urlStr);
    }

    /**
     * @param method
     * @param charsetName
     * @param urlStr
     * @return whether the request succeeded
     * @throws HttpException
     * @throws IOException
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: read the page body and the response headers, retrying up to maxConnectTimes times on failure
     */
    private boolean readPage(HttpMethod method, String charsetName, String urlStr) throws HttpException, IOException {
        int n = maxConnectTimes;
        while (n > 0) {
            try {
                if (httpClient.executeMethod(method) != HttpStatus.SC_OK) {
                    log.error("can not connect " + urlStr);
                    return false;
                }
                // response headers
                responseHeaders = method.getResponseHeaders();
                // page source code
                InputStream inputStream = method.getResponseBodyAsStream();
                BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
                StringBuffer stringBuffer = new StringBuffer();
                String lineString = null;
                while ((lineString = bufferedReader.readLine()) != null) {
                    stringBuffer.append(lineString);
                }
                bufferedReader.close();
                pageSourceCode = stringBuffer.toString();
                return true;
            } catch (Exception e) {
                System.out.println(urlStr + " -- can't connect, attempt " + (maxConnectTimes - n + 1));
                n--;
            }
        }
        return false;
    }

    /**
     * @param urlStr
     * @param params
     * @return GetMethod
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: build a GET request; the params map carries request headers
     */
    private GetMethod createGetMethod(String urlStr, HashMap<String, String> params) {
        GetMethod getMethod = new GetMethod(urlStr);
        if (params == null) {
            return getMethod;
        }
        // for GET requests, params are sent as request headers (Host, User-Agent, ...)
        for (Map.Entry<String, String> entry : params.entrySet()) {
            getMethod.setRequestHeader(entry.getKey(), entry.getValue());
        }
        return getMethod;
    }

    /**
     * @param urlStr
     * @param params
     * @return PostMethod
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: build a POST request; the params map carries form parameters
     */
    private PostMethod createPostMethod(String urlStr, HashMap<String, String> params) {
        PostMethod postMethod = new PostMethod(urlStr);
        if (params == null) {
            return postMethod;
        }
        for (Map.Entry<String, String> entry : params.entrySet()) {
            postMethod.setParameter(entry.getKey(), entry.getValue());
        }
        return postMethod;
    }

    /**
     * @param urlStr
     * @param charsetName
     * @return whether the request succeeded
     * @throws IOException
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: fetch the page directly, without setting any headers
     */
    public boolean readPageByGet(String urlStr, String charsetName) throws IOException {
        return this.readPageByGet(urlStr, charsetName, null);
    }

    /**
     * @return String
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: return the page source code
     */
    public String getPageSourceCode() {
        return pageSourceCode;
    }

    /**
     * @return Header[]
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: return the response headers
     */
    public Header[] getHeader() {
        return responseHeaders;
    }

    /**
     * @param timeout
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: set the connect timeout
     */
    public void setConnectTimeout(int timeout) {
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);
    }

    /**
     * @param timeout
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: set the read timeout
     */
    public void setReadTimeout(int timeout) {
        httpClient.getHttpConnectionManager().getParams().setSoTimeout(timeout);
    }

    /**
     * @param maxConnectTimes
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: set the maximum number of attempts, used when a connection fails
     */
    public static void setMaxConnectTimes(int maxConnectTimes) {
        CrawlBase.maxConnectTimes = maxConnectTimes;
    }

    /**
     * @param connectTimeout
     * @param readTimeout
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: set the connect and read timeouts together
     */
    public void setTimeout(int connectTimeout, int readTimeout) {
        setConnectTimeout(connectTimeout);
        setReadTimeout(readTimeout);
    }
}
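CrawlBase declares no abstract methods; it is abstract only to force subclassing. So the simplest possible use is a subclass that forwards a URL to readPageByGet. A minimal sketch (SimplePage is a made-up name, not part of the project):
package cn.lulei.crawl;

import java.io.IOException;

public class SimplePage extends CrawlBase {

    public SimplePage(String urlStr, String charsetName) throws IOException {
        readPageByGet(urlStr, charsetName);
    }

    public static void main(String[] args) throws IOException {
        // fetch one list page and dump its source; jokeji.cn pages are gb2312-encoded
        SimplePage page = new SimplePage("http://www.jokeji.cn/list_1.htm", "gb2312");
        System.out.println(page.getPageSourceCode());
    }
}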
As for the detail-page links on an update-list page, most sites share the same structure here, so CrawlBase is wrapped once more into a CrawlListPageBase class that extracts the link URLs from an update-list page.
/**
 * @Description: base class for extracting link addresses from a list page
 */
package cn.lulei.crawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import cn.lulei.util.DoRegex;

public abstract class CrawlListPageBase extends CrawlBase {
    private String pageurl;

    /**
     * @param urlStr
     * @param charsetName
     * @throws IOException
     */
    public CrawlListPageBase(String urlStr, String charsetName) throws IOException {
        readPageByGet(urlStr, charsetName);
        pageurl = urlStr;
    }

    /**
     * @param urlStr
     * @param charsetName
     * @param method
     * @param params
     * @throws IOException
     */
    public CrawlListPageBase(String urlStr, String charsetName, String method, HashMap<String, String> params) throws IOException {
        readPage(urlStr, charsetName, method, params);
        pageurl = urlStr;
    }

    /**
     * @return ArrayList<String>
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: return the link addresses of interest on this page
     */
    public ArrayList<String> getPageUrls() {
        return DoRegex.getArrayList(getPageSourceCode(), getUrlRegexString(), pageurl, getUrlRegexStringNum());
    }

    /**
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: the regular expression matching the link addresses of interest
     */
    public abstract String getUrlRegexString();

    /**
     * @return int
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: the index of the capture group to extract from that regular expression
     */
    public abstract int getUrlRegexStringNum();
}
/**
 * @Description: jokeji.cn recently-updated list page
 * @Author: lulei
 * @Date: 2014-2-12
 * @Version: 1.1.0
 */
package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import cn.lulei.crawl.CrawlListPageBase;

public class JokeList extends CrawlListPageBase {
    // request headers for the jokeji.cn update-list pages
    private static HashMap<String, String> params = new HashMap<String, String>();

    static {
        params.put("Host", "www.jokeji.cn");
        params.put("Pragma", "no-cache");
        params.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
    }

    public JokeList(String urlStr) throws IOException {
        this(urlStr, "gb2312");
    }

    public JokeList(String urlStr, String charsetName) throws IOException {
        super(urlStr, charsetName, "get", params);
    }

    @Override
    public String getUrlRegexString() {
        // regex for the detail-page links on the list page
        return "<li><b><a href=\"(.*?)\"target=\"_blank\"";
    }

    @Override
    public int getUrlRegexStringNum() {
        return 1;
    }

    /**
     * @param args
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: simple main test
     */
    public static void main(String[] args) {
        try {
            JokeList jokeList = new JokeList("http://www.jokeji.cn/list_1.htm", "gb2312");
            ArrayList<String> array = jokeList.getPageUrls();
            for (String s : array) {
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
With the layering above, extracting the link addresses from a list page becomes straightforward. The implementation also uses the regex matching utility class DoRegex, whose code follows:
/**
 * @Description: regular-expression utilities
 */
package cn.lulei.util;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DoRegex {
    // site root, e.g. "http://www.jokeji.cn/"
    private static String rootUrlRegex = "(http://.*?/)";
    // directory of the current page, up to and including the last "/"
    private static String currentUrlRegex = "(http://.*/)";
    // one or more Chinese characters
    private static String ChRegex = "([\u4e00-\u9fa5]+)";

    /**
     * @param dealStr
     * @param regexStr
     * @param splitStr
     * @param n
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: join all matches of capture group n, separated by splitStr
     */
    public static String getString(String dealStr, String regexStr, String splitStr, int n) {
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return "";
        }
        splitStr = (splitStr == null) ? "" : splitStr;
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        StringBuffer stringBuffer = new StringBuffer();
        while (matcher.find()) {
            stringBuffer.append(matcher.group(n).trim());
            stringBuffer.append(splitStr);
        }
        String reStr = stringBuffer.toString();
        // drop the trailing separator
        if (!splitStr.isEmpty() && reStr.endsWith(splitStr)) {
            reStr = reStr.substring(0, reStr.length() - splitStr.length());
        }
        return reStr;
    }

    /**
     * @param dealStr
     * @param regexStr
     * @param n
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: join all matches of capture group n into one string
     */
    public static String getString(String dealStr, String regexStr, int n) {
        return getString(dealStr, regexStr, null, n);
    }

    /**
     * @param dealStr
     * @param regexStr
     * @param n
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: return the first match of capture group n
     */
    public static String getFirstString(String dealStr, String regexStr, int n) {
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return "";
        }
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        if (matcher.find()) {
            return matcher.group(n).trim();
        }
        return "";
    }

    /**
     * @param dealStr
     * @param regexStr
     * @param n
     * @return ArrayList<String>
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: collect all matches of capture group n into a list
     */
    public static ArrayList<String> getArrayList(String dealStr, String regexStr, int n) {
        ArrayList<String> reArrayList = new ArrayList<String>();
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return reArrayList;
        }
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        while (matcher.find()) {
            reArrayList.add(matcher.group(n).trim());
        }
        return reArrayList;
    }

    /**
     * @param url
     * @param currentUrl
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: turn a (possibly relative) link into an absolute URL
     */
    private static String getHttpUrl(String url, String currentUrl) {
        try {
            url = encodeUrlCh(url);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        if (url.indexOf("http") == 0) {
            // already absolute
            return url;
        }
        if (url.indexOf("/") == 0) {
            // root-relative: prepend the site root
            return getFirstString(currentUrl, rootUrlRegex, 1) + url.substring(1);
        }
        // page-relative: prepend the current page's directory
        return getFirstString(currentUrl, currentUrlRegex, 1) + url;
    }

    /**
     * @param dealStr
     * @param regexStr
     * @param currentUrl
     * @param n
     * @return ArrayList<String>
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: collect all matching links as absolute URLs
     */
    public static ArrayList<String> getArrayList(String dealStr, String regexStr, String currentUrl, int n) {
        ArrayList<String> reArrayList = new ArrayList<String>();
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return reArrayList;
        }
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        while (matcher.find()) {
            reArrayList.add(getHttpUrl(matcher.group(n).trim(), currentUrl));
        }
        return reArrayList;
    }

    /**
     * @param url
     * @return
     * @throws UnsupportedEncodingException
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: URL-encode the Chinese characters in a link address
     */
    public static String encodeUrlCh(String url) throws UnsupportedEncodingException {
        while (true) {
            String s = getFirstString(url, ChRegex, 1);
            if ("".equals(s)) {
                return url;
            }
            // replace each run of Chinese characters with its UTF-8 percent-encoding
            url = url.replace(s, URLEncoder.encode(s, "utf-8"));
        }
    }
}
This class implements the regex matching and lookup helpers, as well as the conversion of relative page addresses into absolute ones; see the comments in the code for details.
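To make the relative-to-absolute conversion concrete, here is a small example (the HTML snippet and the DoRegexDemo class are invented for illustration): a root-relative link gets the site root prepended, while a page-relative link gets the current page's directory prepended.
import java.util.ArrayList;

import cn.lulei.util.DoRegex;

public class DoRegexDemo {
    public static void main(String[] args) {
        String html = "<a href=\"/jokehtml/a.htm\">A</a> <a href=\"b.htm\">B</a>";
        // resolve both links against the current page URL
        ArrayList<String> urls = DoRegex.getArrayList(html, "<a href=\"(.*?)\">", "http://www.jokeji.cn/list_1.htm", 1);
        for (String url : urls) {
            System.out.println(url);
        }
        // prints:
        // http://www.jokeji.cn/jokehtml/a.htm
        // http://www.jokeji.cn/b.htm
    }
}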
With the detail-page URLs obtained through JokeList, all that remains is a JokeDetail class that processes a joke detail page. The code is as follows:
/**
 * @Description: jokeji.cn detail (content) page
 */
package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.HashMap;

import org.apache.commons.httpclient.HttpException;

import cn.lulei.crawl.CrawlBase;
import cn.lulei.model.Jokeji;
import cn.lulei.util.DoRegex;
import cn.lulei.util.ParseMD5;

public class JokeDetail extends CrawlBase {
    // request headers for the jokeji.cn detail pages
    private static HashMap<String, String> params = new HashMap<String, String>();
    // regex for the content block of the page
    private static String contentAllRegexString = "<span id=\"text110\">(.*?)</span>";
    private String pageUrl;

    static {
        params.put("Host", "www.jokeji.cn");
        params.put("Pragma", "no-cache");
        params.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
        params.put("Referer", "http://www.jokeji.cn/list.htm");
    }

    /**
     * @param urlStr
     * @throws HttpException
     * @throws IOException
     */
    protected JokeDetail(String urlStr) throws HttpException, IOException {
        this(urlStr, "gb2312");
    }

    /**
     * @param urlStr
     * @param charsetName
     * @throws HttpException
     * @throws IOException
     */
    protected JokeDetail(String urlStr, String charsetName) throws HttpException, IOException {
        this.pageUrl = urlStr;
        readPage(urlStr, charsetName, "get", params);
    }

    /**
     * @return Jokeji
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: build the Jokeji model object for this detail page
     */
    protected Jokeji getJokeji() {
        Jokeji jokeji = new Jokeji();
        jokeji.setPageUrl(pageUrl);
        // MD5 of the URL serves as the de-duplication key
        jokeji.setUrlMd5(ParseMD5.ParseStrToMd5L32(pageUrl));
        jokeji.setContent(getContent());
        return jokeji;
    }

    /**
     * @return String
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: extract the joke text from the page source
     */
    private String getContent() {
        String contentAll = DoRegex.getFirstString(getPageSourceCode(), contentAllRegexString, 1);
        // strip HTML entities, mark line breaks with #br#, then drop the remaining tags
        contentAll = contentAll.replaceAll("&.*?;", "")
                .replaceAll("<br>", "#br#")
                .replaceAll("<BR>", "#br#")
                .replaceAll("</BR>", "#br#")
                .replaceAll("</br>", "#br#")
                .replaceAll("</P>", "#br#")
                .replaceAll("</p>", "#br#")
                .replaceAll("<.*?>", "");
        return contentAll;
    }

    /**
     * @param args
     * @throws IOException
     * @throws HttpException
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: simple main test
     */
    public static void main(String[] args) throws HttpException, IOException {
        JokeDetail jokeDetail = new JokeDetail("http://www.jokeji.cn/jokehtml/bxnn/20090926220449.htm");
        System.out.println(jokeDetail.getContent());
    }
}
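getJokeji() above relies on ParseMD5.ParseStrToMd5L32 to reduce the page URL to a 32-character lowercase MD5 string. The real implementation ships with the source download; a minimal sketch of such a helper, assuming only the standard java.security.MessageDigest API, could look like this:
package cn.lulei.util;

import java.security.MessageDigest;

public class ParseMD5 {
    /**
     * Returns the 32-character lowercase hex MD5 of str, or null on failure.
     */
    public static String ParseStrToMd5L32(String str) {
        try {
            MessageDigest md5 = MessageDigest.getInstance("MD5");
            byte[] digest = md5.digest(str.getBytes("utf-8"));
            StringBuilder hex = new StringBuilder();
            for (byte b : digest) {
                // pad each byte to two hex characters
                hex.append(String.format("%02x", b));
            }
            return hex.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}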
At this point, the crawl of jokeji.cn from the update-list pages through to the detail pages is complete; for the overall control flow, refer to the JokeCrawl class above.
Everything above concerns crawling the jokeji.cn site itself. Not much design went into data storage: the JokeCrawl class simply calls the corresponding storage and de-duplication methods, whose implementations can be found in the accompanying source code.
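For reference, the only parts of the model and utility layers this post depends on are the three fields set in JokeDetail.getJokeji() and the list-to-set conversion used in JokeCrawl. Minimal sketches consistent with that usage (the real classes ship with the source download, so treat these as assumptions about their shape):
package cn.lulei.model;

public class Jokeji {
    // detail-page URL
    private String pageUrl;
    // MD5 of the URL, used as the de-duplication key
    private String urlMd5;
    // joke text, with #br# as the line-break marker
    private String content;

    public String getPageUrl() { return pageUrl; }
    public void setPageUrl(String pageUrl) { this.pageUrl = pageUrl; }
    public String getUrlMd5() { return urlMd5; }
    public void setUrlMd5(String urlMd5) { this.urlMd5 = urlMd5; }
    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
}
and the de-duplication helper:
package cn.lulei.util;

import java.util.ArrayList;
import java.util.HashSet;

public class ParseUtil {
    // copy an ArrayList into a HashSet, dropping duplicate URLs
    public static HashSet<String> parseArrayToHash(ArrayList<String> array) {
        HashSet<String> hash = new HashSet<String>();
        if (array != null) {
            hash.addAll(array);
        }
        return hash;
    }
}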