Please credit the source when reposting: http://blog.youkuaiyun.com/xiaojimanman/article/details/19158815
This post mainly describes how to crawl the content of the recently-updated list pages on the joke site jokeji (www.jokeji.cn). The program source code can be downloaded at: http://download.youkuaiyun.com/detail/xiaojimanman/6918997
First, the crawl entry point. Periodic collection is not implemented here; if you need it, write a corresponding scheduling thread yourself.
/**
 * @Description: crawl scheduling entry point for jokeji.cn
 */
package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.concurrent.TimeUnit;

import cn.lulei.db.jokeji.JokeDbOperation;
import cn.lulei.model.Jokeji;
import cn.lulei.util.ParseUtil;

public class JokeCrawl {
    // URL template for the jokeji.cn update-list pages
    private static String listPageUrl = "http://www.jokeji.cn/list_%pno%.htm";
    // interval between two page requests, in ms
    private static int sleepTime = 500;

    /**
     * @param start first page
     * @param end last page
     * @throws IOException
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: crawl the content linked from the update-list pages
     */
    public void crawlMain(int start, int end) throws IOException {
        start = start < 1 ? 1 : start;
        JokeDbOperation jokeDbOperation = new JokeDbOperation();
        for ( ; start <= end; start++) {
            sleep(sleepTime);
            JokeList jokeList = new JokeList(listPageUrl.replace("%pno%", start + ""));
            ArrayList<String> array = jokeList.getPageUrls();
            // de-duplicate the detail-page URLs before fetching them
            HashSet<String> hash = ParseUtil.parseArrayToHash(array);
            for (String s : hash) {
                JokeDetail jokeDetail = new JokeDetail(s);
                Jokeji jokeji = jokeDetail.getJokeji();
                jokeDbOperation.insert(jokeji);
                System.out.println("URL: " + s + " crawled successfully!");
                sleep(sleepTime);
            }
        }
    }

    /**
     * @param sleepTime
     * @Date: 2014-2-13
     * @Author: lulei
     * @Description: pause the current thread for sleepTime milliseconds
     */
    public void sleep(int sleepTime) {
        try {
            TimeUnit.MILLISECONDS.sleep(sleepTime);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        try {
            new JokeCrawl().crawlMain(1, 380);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
The method public void crawlMain(int start, int end) crawls the list pages from page start to page end. The interval between two page requests is set to 500 ms here; you can adjust it for your machine and network conditions, but do not make it too small, or jokeji.cn will block the crawler.
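If you do want periodic collection, a minimal sketch using the JDK's ScheduledExecutorService could look like the following (JokeCrawlScheduler is a made-up class name, and the page range and period are arbitrary choices, not values from the project):
package cn.lulei.crawl.jokeji;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class JokeCrawlScheduler {
    public static void main(String[] args) {
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        // re-crawl the first few list pages every 6 hours; tune the range and period as needed
        scheduler.scheduleWithFixedDelay(new Runnable() {
            public void run() {
                try {
                    new JokeCrawl().crawlMain(1, 5);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }, 0, 6, TimeUnit.HOURS);
    }
}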
The CrawlBase class below fetches the web page resource itself; its pageSourceCode field stores the source code of the current page for the subsequent processing steps. It acts as a base class for fetching page resources: for each page format and piece of content to be extracted, you only need to build a corresponding subclass.
/**
 * @Description: base class for fetching web page content
 */
package cn.lulei.crawl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;

public abstract class CrawlBase {
    private static Logger log = Logger.getLogger(CrawlBase.class);

    // source code of the fetched page
    private String pageSourceCode = "";
    // response headers
    private Header[] responseHeaders = null;
    // connect timeout, in ms
    private static int connectTimeout = 3500;
    // read (socket) timeout, in ms
    private static int readTimeout = 3500;
    // default maximum number of attempts
    private static int maxConnectTimes = 3;

    private static HttpClient httpClient = new HttpClient();

    static {
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(connectTimeout);
        httpClient.getHttpConnectionManager().getParams().setSoTimeout(readTimeout);
    }

    /**
     * @param urlStr
     * @param charsetName
     * @param method
     * @param params
     * @return whether the request succeeded
     * @throws HttpException
     * @throws IOException
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: fetch the page with the given HTTP method
     */
    public boolean readPage(String urlStr, String charsetName, String method, HashMap<String, String> params) throws HttpException, IOException {
        if ("post".equalsIgnoreCase(method)) {
            return readPageByPost(urlStr, charsetName, params);
        } else {
            return readPageByGet(urlStr, charsetName, params);
        }
    }

    /**
     * @param urlStr
     * @param charsetName
     * @param params
     * @return whether the request succeeded
     * @throws HttpException
     * @throws IOException
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: fetch the page with a GET request
     */
    public boolean readPageByGet(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
        GetMethod getMethod = createGetMethod(urlStr, params);
        return readPage(getMethod, charsetName, urlStr);
    }

    /**
     * @param urlStr
     * @param charsetName
     * @param params
     * @return whether the request succeeded
     * @throws HttpException
     * @throws IOException
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: fetch the page with a POST request
     */
    public boolean readPageByPost(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
        PostMethod postMethod = createPostMethod(urlStr, params);
        return readPage(postMethod, charsetName, urlStr);
    }

    /**
     * @param method
     * @param charsetName
     * @param urlStr
     * @return whether the request succeeded
     * @throws HttpException
     * @throws IOException
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: read the page body and the response headers, retrying up to maxConnectTimes times on failure
     */
    private boolean readPage(HttpMethod method, String charsetName, String urlStr) throws HttpException, IOException {
        int n = maxConnectTimes;
        while (n > 0) {
            try {
                if (httpClient.executeMethod(method) != HttpStatus.SC_OK) {
                    log.error("can not connect " + urlStr);
                    return false;
                }
                // response headers
                responseHeaders = method.getResponseHeaders();
                // page source code
                InputStream inputStream = method.getResponseBodyAsStream();
                BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
                StringBuffer stringBuffer = new StringBuffer();
                String lineString = null;
                while ((lineString = bufferedReader.readLine()) != null) {
                    stringBuffer.append(lineString);
                }
                bufferedReader.close();
                pageSourceCode = stringBuffer.toString();
                return true;
            } catch (Exception e) {
                System.out.println(urlStr + " -- can't connect, attempt " + (maxConnectTimes - n + 1));
                n--;
            }
        }
        return false;
    }

    /**
     * @param urlStr
     * @param params
     * @return GetMethod
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: build a GET request; the params map carries request headers
     */
    private GetMethod createGetMethod(String urlStr, HashMap<String, String> params) {
        GetMethod getMethod = new GetMethod(urlStr);
        if (params == null) {
            return getMethod;
        }
        // for GET requests, params are sent as request headers (Host, User-Agent, ...)
        for (Map.Entry<String, String> entry : params.entrySet()) {
            getMethod.setRequestHeader(entry.getKey(), entry.getValue());
        }
        return getMethod;
    }

    /**
     * @param urlStr
     * @param params
     * @return PostMethod
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: build a POST request; the params map carries form parameters
     */
    private PostMethod createPostMethod(String urlStr, HashMap<String, String> params) {
        PostMethod postMethod = new PostMethod(urlStr);
        if (params == null) {
            return postMethod;
        }
        for (Map.Entry<String, String> entry : params.entrySet()) {
            postMethod.setParameter(entry.getKey(), entry.getValue());
        }
        return postMethod;
    }

    /**
     * @param urlStr
     * @param charsetName
     * @return whether the request succeeded
     * @throws IOException
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: fetch the page directly, without setting any headers
     */
    public boolean readPageByGet(String urlStr, String charsetName) throws IOException {
        return this.readPageByGet(urlStr, charsetName, null);
    }

    /**
     * @return String
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: return the page source code
     */
    public String getPageSourceCode() {
        return pageSourceCode;
    }

    /**
     * @return Header[]
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: return the response headers
     */
    public Header[] getHeader() {
        return responseHeaders;
    }

    /**
     * @param timeout
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: set the connect timeout
     */
    public void setConnectTimeout(int timeout) {
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);
    }

    /**
     * @param timeout
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: set the read timeout
     */
    public void setReadTimeout(int timeout) {
        httpClient.getHttpConnectionManager().getParams().setSoTimeout(timeout);
    }

    /**
     * @param maxConnectTimes
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: set the maximum number of attempts, used when a connection fails
     */
    public static void setMaxConnectTimes(int maxConnectTimes) {
        CrawlBase.maxConnectTimes = maxConnectTimes;
    }

    /**
     * @param connectTimeout
     * @param readTimeout
     * @Date: 2013-9-12
     * @Author: lulei
     * @Description: set the connect and read timeouts together
     */
    public void setTimeout(int connectTimeout, int readTimeout) {
        setConnectTimeout(connectTimeout);
        setReadTimeout(readTimeout);
    }
}
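CrawlBase declares no abstract methods; it is abstract only to force subclassing. So the simplest possible use is a subclass that forwards a URL to readPageByGet. A minimal sketch (SimplePage is a made-up name, not part of the project):
package cn.lulei.crawl;

import java.io.IOException;

public class SimplePage extends CrawlBase {

    public SimplePage(String urlStr, String charsetName) throws IOException {
        readPageByGet(urlStr, charsetName);
    }

    public static void main(String[] args) throws IOException {
        // fetch one list page and dump its source; jokeji.cn pages are gb2312-encoded
        SimplePage page = new SimplePage("http://www.jokeji.cn/list_1.htm", "gb2312");
        System.out.println(page.getPageSourceCode());
    }
}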
As for the detail-page links on an update-list page, most sites share the same structure here, so CrawlBase is wrapped once more into a CrawlListPageBase class that extracts the link URLs from an update-list page.
/**
 * @Description: base class for extracting link addresses from a list page
 */
package cn.lulei.crawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import cn.lulei.util.DoRegex;

public abstract class CrawlListPageBase extends CrawlBase {
    private String pageurl;

    /**
     * @param urlStr
     * @param charsetName
     * @throws IOException
     */
    public CrawlListPageBase(String urlStr, String charsetName) throws IOException {
        readPageByGet(urlStr, charsetName);
        pageurl = urlStr;
    }

    /**
     * @param urlStr
     * @param charsetName
     * @param method
     * @param params
     * @throws IOException
     */
    public CrawlListPageBase(String urlStr, String charsetName, String method, HashMap<String, String> params) throws IOException {
        readPage(urlStr, charsetName, method, params);
        pageurl = urlStr;
    }

    /**
     * @return ArrayList<String>
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: return the link addresses of interest on this page
     */
    public ArrayList<String> getPageUrls() {
        return DoRegex.getArrayList(getPageSourceCode(), getUrlRegexString(), pageurl, getUrlRegexStringNum());
    }

    /**
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: the regular expression matching the link addresses of interest
     */
    public abstract String getUrlRegexString();

    /**
     * @return int
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: the index of the capture group to extract from that regular expression
     */
    public abstract int getUrlRegexStringNum();
}
/**
 * @Description: jokeji.cn recently-updated list page
 * @Author: lulei
 * @Date: 2014-2-12
 * @Version: 1.1.0
 */
package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import cn.lulei.crawl.CrawlListPageBase;

public class JokeList extends CrawlListPageBase {
    // request headers for the jokeji.cn update-list pages
    private static HashMap<String, String> params = new HashMap<String, String>();

    static {
        params.put("Host", "www.jokeji.cn");
        params.put("Pragma", "no-cache");
        params.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
    }

    public JokeList(String urlStr) throws IOException {
        this(urlStr, "gb2312");
    }

    public JokeList(String urlStr, String charsetName) throws IOException {
        super(urlStr, charsetName, "get", params);
    }

    @Override
    public String getUrlRegexString() {
        // regex for the detail-page links on the list page
        return "<li><b><a href=\"(.*?)\"target=\"_blank\"";
    }

    @Override
    public int getUrlRegexStringNum() {
        return 1;
    }

    /**
     * @param args
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: simple main test
     */
    public static void main(String[] args) {
        try {
            JokeList jokeList = new JokeList("http://www.jokeji.cn/list_1.htm", "gb2312");
            ArrayList<String> array = jokeList.getPageUrls();
            for (String s : array) {
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
With the layering above, extracting the link addresses from a list page becomes straightforward. The implementation also uses the regex matching utility class DoRegex, whose code follows:
/**
 * @Description: regular-expression utilities
 */
package cn.lulei.util;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DoRegex {
    // site root, e.g. "http://www.jokeji.cn/"
    private static String rootUrlRegex = "(http://.*?/)";
    // directory of the current page, up to and including the last "/"
    private static String currentUrlRegex = "(http://.*/)";
    // one or more Chinese characters
    private static String ChRegex = "([\u4e00-\u9fa5]+)";

    /**
     * @param dealStr
     * @param regexStr
     * @param splitStr
     * @param n
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: join all matches of capture group n, separated by splitStr
     */
    public static String getString(String dealStr, String regexStr, String splitStr, int n) {
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return "";
        }
        splitStr = (splitStr == null) ? "" : splitStr;
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        StringBuffer stringBuffer = new StringBuffer();
        while (matcher.find()) {
            stringBuffer.append(matcher.group(n).trim());
            stringBuffer.append(splitStr);
        }
        String reStr = stringBuffer.toString();
        // drop the trailing separator
        if (!splitStr.isEmpty() && reStr.endsWith(splitStr)) {
            reStr = reStr.substring(0, reStr.length() - splitStr.length());
        }
        return reStr;
    }

    /**
     * @param dealStr
     * @param regexStr
     * @param n
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: join all matches of capture group n into one string
     */
    public static String getString(String dealStr, String regexStr, int n) {
        return getString(dealStr, regexStr, null, n);
    }

    /**
     * @param dealStr
     * @param regexStr
     * @param n
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: return the first match of capture group n
     */
    public static String getFirstString(String dealStr, String regexStr, int n) {
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return "";
        }
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        if (matcher.find()) {
            return matcher.group(n).trim();
        }
        return "";
    }

    /**
     * @param dealStr
     * @param regexStr
     * @param n
     * @return ArrayList<String>
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: collect all matches of capture group n into a list
     */
    public static ArrayList<String> getArrayList(String dealStr, String regexStr, int n) {
        ArrayList<String> reArrayList = new ArrayList<String>();
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return reArrayList;
        }
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        while (matcher.find()) {
            reArrayList.add(matcher.group(n).trim());
        }
        return reArrayList;
    }

    /**
     * @param url
     * @param currentUrl
     * @return String
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: turn a (possibly relative) link into an absolute URL
     */
    private static String getHttpUrl(String url, String currentUrl) {
        try {
            url = encodeUrlCh(url);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        if (url.indexOf("http") == 0) {
            // already absolute
            return url;
        }
        if (url.indexOf("/") == 0) {
            // root-relative: prepend the site root
            return getFirstString(currentUrl, rootUrlRegex, 1) + url.substring(1);
        }
        // page-relative: prepend the current page's directory
        return getFirstString(currentUrl, currentUrlRegex, 1) + url;
    }

    /**
     * @param dealStr
     * @param regexStr
     * @param currentUrl
     * @param n
     * @return ArrayList<String>
     * @Date: 2013-9-13
     * @Author: lulei
     * @Description: collect all matching links as absolute URLs
     */
    public static ArrayList<String> getArrayList(String dealStr, String regexStr, String currentUrl, int n) {
        ArrayList<String> reArrayList = new ArrayList<String>();
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return reArrayList;
        }
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        while (matcher.find()) {
            reArrayList.add(getHttpUrl(matcher.group(n).trim(), currentUrl));
        }
        return reArrayList;
    }

    /**
     * @param url
     * @return
     * @throws UnsupportedEncodingException
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: URL-encode the Chinese characters in a link address
     */
    public static String encodeUrlCh(String url) throws UnsupportedEncodingException {
        while (true) {
            String s = getFirstString(url, ChRegex, 1);
            if ("".equals(s)) {
                return url;
            }
            // replace each run of Chinese characters with its UTF-8 percent-encoding
            url = url.replace(s, URLEncoder.encode(s, "utf-8"));
        }
    }
}
This class implements the regex matching and lookup helpers, as well as the conversion of relative page addresses into absolute ones; see the comments in the code for details.
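To make the relative-to-absolute conversion concrete, here is a small example (the HTML snippet and the DoRegexDemo class are invented for illustration): a root-relative link gets the site root prepended, while a page-relative link gets the current page's directory prepended.
import java.util.ArrayList;

import cn.lulei.util.DoRegex;

public class DoRegexDemo {
    public static void main(String[] args) {
        String html = "<a href=\"/jokehtml/a.htm\">A</a> <a href=\"b.htm\">B</a>";
        // resolve both links against the current page URL
        ArrayList<String> urls = DoRegex.getArrayList(html, "<a href=\"(.*?)\">", "http://www.jokeji.cn/list_1.htm", 1);
        for (String url : urls) {
            System.out.println(url);
        }
        // prints:
        // http://www.jokeji.cn/jokehtml/a.htm
        // http://www.jokeji.cn/b.htm
    }
}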
With the detail-page URLs obtained through JokeList, all that remains is a JokeDetail class that processes a joke detail page. The code is as follows:
/**
 * @Description: jokeji.cn detail (content) page
 */
package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.HashMap;

import org.apache.commons.httpclient.HttpException;

import cn.lulei.crawl.CrawlBase;
import cn.lulei.model.Jokeji;
import cn.lulei.util.DoRegex;
import cn.lulei.util.ParseMD5;

public class JokeDetail extends CrawlBase {
    // request headers for the jokeji.cn detail pages
    private static HashMap<String, String> params = new HashMap<String, String>();
    // regex for the content block of the page
    private static String contentAllRegexString = "<span id=\"text110\">(.*?)</span>";
    private String pageUrl;

    static {
        params.put("Host", "www.jokeji.cn");
        params.put("Pragma", "no-cache");
        params.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
        params.put("Referer", "http://www.jokeji.cn/list.htm");
    }

    /**
     * @param urlStr
     * @throws HttpException
     * @throws IOException
     */
    protected JokeDetail(String urlStr) throws HttpException, IOException {
        this(urlStr, "gb2312");
    }

    /**
     * @param urlStr
     * @param charsetName
     * @throws HttpException
     * @throws IOException
     */
    protected JokeDetail(String urlStr, String charsetName) throws HttpException, IOException {
        this.pageUrl = urlStr;
        readPage(urlStr, charsetName, "get", params);
    }

    /**
     * @return Jokeji
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: build the Jokeji model object for this detail page
     */
    protected Jokeji getJokeji() {
        Jokeji jokeji = new Jokeji();
        jokeji.setPageUrl(pageUrl);
        // MD5 of the URL serves as the de-duplication key
        jokeji.setUrlMd5(ParseMD5.ParseStrToMd5L32(pageUrl));
        jokeji.setContent(getContent());
        return jokeji;
    }

    /**
     * @return String
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: extract the joke text from the page source
     */
    private String getContent() {
        String contentAll = DoRegex.getFirstString(getPageSourceCode(), contentAllRegexString, 1);
        // strip HTML entities, mark line breaks with #br#, then drop the remaining tags
        contentAll = contentAll.replaceAll("&.*?;", "")
                .replaceAll("<br>", "#br#")
                .replaceAll("<BR>", "#br#")
                .replaceAll("</BR>", "#br#")
                .replaceAll("</br>", "#br#")
                .replaceAll("</P>", "#br#")
                .replaceAll("</p>", "#br#")
                .replaceAll("<.*?>", "");
        return contentAll;
    }

    /**
     * @param args
     * @throws IOException
     * @throws HttpException
     * @Date: 2014-2-12
     * @Author: lulei
     * @Description: simple main test
     */
    public static void main(String[] args) throws HttpException, IOException {
        JokeDetail jokeDetail = new JokeDetail("http://www.jokeji.cn/jokehtml/bxnn/20090926220449.htm");
        System.out.println(jokeDetail.getContent());
    }
}
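getJokeji() above relies on ParseMD5.ParseStrToMd5L32 to reduce the page URL to a 32-character lowercase MD5 string. The real implementation ships with the source download; a minimal sketch of such a helper, assuming only the standard java.security.MessageDigest API, could look like this:
package cn.lulei.util;

import java.security.MessageDigest;

public class ParseMD5 {
    /**
     * Returns the 32-character lowercase hex MD5 of str, or null on failure.
     */
    public static String ParseStrToMd5L32(String str) {
        try {
            MessageDigest md5 = MessageDigest.getInstance("MD5");
            byte[] digest = md5.digest(str.getBytes("utf-8"));
            StringBuilder hex = new StringBuilder();
            for (byte b : digest) {
                // pad each byte to two hex characters
                hex.append(String.format("%02x", b));
            }
            return hex.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}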
At this point, the crawl of jokeji.cn from the update-list pages through to the detail pages is complete; for the overall control flow, refer to the JokeCrawl class above.
Everything above concerns crawling the jokeji.cn site itself. Not much design went into data storage: the JokeCrawl class simply calls the corresponding storage and de-duplication methods, whose implementations can be found in the accompanying source code.
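For reference, the only parts of the model and utility layers this post depends on are the three fields set in JokeDetail.getJokeji() and the list-to-set conversion used in JokeCrawl. Minimal sketches consistent with that usage (the real classes ship with the source download, so treat these as assumptions about their shape):
package cn.lulei.model;

public class Jokeji {
    // detail-page URL
    private String pageUrl;
    // MD5 of the URL, used as the de-duplication key
    private String urlMd5;
    // joke text, with #br# as the line-break marker
    private String content;

    public String getPageUrl() { return pageUrl; }
    public void setPageUrl(String pageUrl) { this.pageUrl = pageUrl; }
    public String getUrlMd5() { return urlMd5; }
    public void setUrlMd5(String urlMd5) { this.urlMd5 = urlMd5; }
    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
}
and the de-duplication helper:
package cn.lulei.util;

import java.util.ArrayList;
import java.util.HashSet;

public class ParseUtil {
    // copy an ArrayList into a HashSet, dropping duplicate URLs
    public static HashSet<String> parseArrayToHash(ArrayList<String> array) {
        HashSet<String> hash = new HashSet<String>();
        if (array != null) {
            hash.addAll(array);
        }
        return hash;
    }
}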