转载请注明出处:http://blog.youkuaiyun.com/xiaojimanman/article/details/19168917
这篇博客属于博客 http://blog.youkuaiyun.com/xiaojimanman/article/details/19158815 的拓展,建议阅读此篇博客前先阅读上一篇博客。
上一篇博客介绍了关于笑话集网站的自动采集,这篇将对其进行扩展,介绍对内涵吧内涵段子的自动采集。
上一篇博客已经详细地介绍了几个基础类,现在就只需构建子类,来实现内涵吧内涵段子的采集。
内涵吧内涵段子采集入口类Neihan8Crawl实现如下。这里没有实现抓取程序的周期性采集,可以根据自己的需要来写相应的线程。
/**
*@Description:
*/
package cn.lulei.crawl.neihan8;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import cn.lulei.db.neihan8.Neihan8DbOperation;
import cn.lulei.model.Neihan8;
import cn.lulei.util.ParseUtil;
import cn.lulei.util.ThreadUtil;
/**
 * Entry point of the neihan8 crawler: walks a range of "latest updates" list
 * pages, fetches every article linked from them and hands each one to
 * {@code Neihan8DbOperation.insert}.
 */
public class Neihan8Crawl {
    // URL template of the list pages; the %pno% placeholder is replaced by the page number
    private static String listPageUrl = "http://www.neihan8.com/article/list_5_%pno%.html";
    // Pause between two consecutive page requests, in milliseconds
    private static int sleepTime = 500;

    /**
     * Crawls the list pages numbered {@code start} to {@code end} (inclusive)
     * and every detail page they link to.
     *
     * @param start first list page to crawl; values below 1 are treated as 1
     * @param end   last list page to crawl; nothing is crawled when it is
     *              smaller than the effective start page
     * @throws IOException if fetching a list page or a detail page fails;
     *                     the crawl stops at the first failure
     */
    public void crawlMain(int start, int end) throws IOException {
        int pageNo = Math.max(start, 1);
        Neihan8DbOperation dbOperation = new Neihan8DbOperation();
        while (pageNo <= end) {
            // Throttle before each list-page request
            ThreadUtil.sleep(sleepTime);
            String listUrl = listPageUrl.replace("%pno%", String.valueOf(pageNo));
            ArrayList<String> linkedUrls = new Neihan8List(listUrl).getPageUrls();
            // NOTE(review): helper not visible here; presumably de-duplicates/normalizes the links
            HashSet<String> detailUrls = ParseUtil.parseArrayToHashNeihan8(linkedUrls);
            for (String detailUrl : detailUrls) {
                Neihan8 article = new Neihan8Detail(detailUrl).getNeihan8();
                dbOperation.insert(article);
                System.out.println("网址:" + detailUrl + "采集完成!");
                // Throttle after each detail-page request as well
                ThreadUtil.sleep(sleepTime);
            }
            pageNo++;
        }
    }

    /**
     * Runs a one-off crawl of list pages 1 through 41 and prints the stack
     * trace of any failure.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Neihan8Crawl crawler = new Neihan8Crawl();
        try {
            crawler.crawlMain(1, 41);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
更新列表页采集类Neihan8List实现如下:
/**
*@Description: 内涵吧更新列表页
*/
package cn.lulei.crawl.neihan8;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import cn.lulei.crawl.CrawlListPageBase;
/**
 * Fetches one neihan8 "latest updates" list page; the link-extraction rules
 * are supplied to {@code CrawlListPageBase} through the two overridden methods.
 */
public class Neihan8List extends CrawlListPageBase {
    // Request headers sent when fetching a neihan8 list page
    private static HashMap<String, String> params = buildParams();

    /** Builds the fixed set of request headers used for every list-page request. */
    private static HashMap<String, String> buildParams() {
        HashMap<String, String> headers = new HashMap<String, String>();
        headers.put("Host", "www.neihan8.com");
        headers.put("Referer", "www.neihan8.com");
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
        return headers;
    }

    /**
     * Fetches the list page using the default {@code gb2312} charset.
     *
     * @param urlStr URL of the list page
     * @throws IOException if the page cannot be fetched
     */
    public Neihan8List(String urlStr) throws IOException {
        super(urlStr, "gb2312", "get", params);
    }

    /**
     * Fetches the list page with an explicit charset.
     *
     * @param urlStr      URL of the list page
     * @param charsetName charset used to decode the page
     * @throws IOException if the page cannot be fetched
     */
    public Neihan8List(String urlStr, String charsetName) throws IOException {
        super(urlStr, charsetName, "get", params);
    }

    /**
     * @return regular expression matching an article link inside an
     *         {@code <h4>} heading; the link target is capture group 1
     */
    @Override
    public String getUrlRegexString() {
        return "<h4>\\s*<a href=\"(.*?)\">";
    }

    /**
     * @return the number 1 — presumably the capture-group index the base class
     *         reads the URL from (base class not visible here)
     */
    @Override
    public int getUrlRegexStringNum() {
        return 1;
    }

    /**
     * Manual test: prints every article URL found on list page 41.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            ArrayList<String> urls =
                    new Neihan8List("http://www.neihan8.com/article/list_5_41.html", "gb2312").getPageUrls();
            for (int i = 0; i < urls.size(); i++) {
                System.out.println(urls.get(i));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
内容详细页采集类Neihan8Detail实现如下:
/**
*@Description: 内涵吧详细内容页
*/
package cn.lulei.crawl.neihan8;
import java.io.IOException;
import java.util.HashMap;
import org.apache.commons.httpclient.HttpException;
import cn.lulei.crawl.CrawlBase;
import cn.lulei.model.Neihan8;
import cn.lulei.util.DoRegex;
import cn.lulei.util.ParseMD5;
/**
 * Fetches one neihan8 article (detail) page and extracts its title, content
 * and info line into a {@code Neihan8} model object.
 *
 * @author lulei
 * @version 1.1.0 (2014-2-13)
 */
public class Neihan8Detail extends CrawlBase {
    // Request headers sent when fetching a neihan8 detail page
    private static HashMap<String, String> params = new HashMap<String, String>();
    // URL this instance was created for
    private String pageUrl;
    // Regex capturing the page title (group 1)
    private static final String titleRegexString = "<h1>(.*?)</h1>";
    // Regex capturing the article body (group 1)
    private static final String contentRegexString = "<div class=\"con\">\\s*<table>\\s*<tr>\\s*<td>(.*?)</td>";
    // Regex capturing the info line of the page (group 1)
    private static final String infoRegexString = "<p class=\"info\">(.*?)<span";
    // Regex matching any HTML tag inside the captured body
    private static final String tagRegexString = "<.*?>";
    // Regex matching one well-formed HTML character reference such as &nbsp; or &#160;.
    // Deliberately NOT "&.*?;": that pattern also swallows ordinary text between a
    // literal '&' and the next ';' (e.g. "A & B; C" would lose "& B;").
    private static final String entityRegexString = "&#?[0-9a-zA-Z]+;";

    static {
        params.put("Host", "www.neihan8.com");
        params.put("Referer", "www.neihan8.com");
        params.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
    }

    /**
     * Fetches the detail page using the default {@code gb2312} charset.
     *
     * @param urlStr URL of the detail page
     * @throws HttpException if the HTTP request fails
     * @throws IOException   if the page cannot be read
     */
    protected Neihan8Detail(String urlStr) throws HttpException, IOException {
        this(urlStr, "gb2312");
    }

    /**
     * Fetches the detail page with an explicit charset.
     *
     * @param urlStr      URL of the detail page
     * @param charsetName charset used to decode the page
     * @throws HttpException if the HTTP request fails
     * @throws IOException   if the page cannot be read
     */
    protected Neihan8Detail(String urlStr, String charsetName) throws HttpException, IOException {
        this.pageUrl = urlStr;
        readPage(urlStr, charsetName, "get", params);
    }

    /**
     * Builds the model object for this page.
     *
     * @return a {@code Neihan8} carrying the page URL, a digest of that URL
     *         (from {@code ParseMD5.ParseStrToMd5L32}), the title, the cleaned
     *         content and the info line
     */
    protected Neihan8 getNeihan8() {
        Neihan8 neihan8 = new Neihan8();
        neihan8.setPageUrl(pageUrl);
        neihan8.setMd5(ParseMD5.ParseStrToMd5L32(pageUrl));
        neihan8.setTitle(getTitle());
        neihan8.setContent(getContent());
        neihan8.setInfo(getInfo());
        return neihan8;
    }

    /**
     * @return the first match of the title regex in the page source
     */
    private String getTitle() {
        return DoRegex.getFirstString(getPageSourceCode(), titleRegexString, 1);
    }

    /**
     * Extracts the article body and cleans it up: every HTML tag becomes the
     * placeholder {@code #br#} and every HTML character reference is removed.
     *
     * @return the cleaned article body
     */
    private String getContent() {
        String contentAll = DoRegex.getFirstString(getPageSourceCode(), contentRegexString, 1);
        // NOTE(review): DoRegex is not visible here; this assumes it never returns null
        // (the original code made the same assumption).
        return contentAll.replaceAll(tagRegexString, "#br#")
                .replaceAll(entityRegexString, "");
    }

    /**
     * @return the first match of the info regex in the page source
     */
    private String getInfo() {
        return DoRegex.getFirstString(getPageSourceCode(), infoRegexString, 1);
    }

    /**
     * Manual test: fetches one article and prints its content, title and info.
     *
     * @param args unused
     * @throws HttpException if the HTTP request fails
     * @throws IOException   if the page cannot be read
     */
    public static void main(String[] args) throws HttpException, IOException {
        Neihan8Detail neihan8Detail = new Neihan8Detail("http://www.neihan8.com/article/23553.html");
        System.out.println(neihan8Detail.getContent());
        System.out.println(neihan8Detail.getTitle());
        System.out.println(neihan8Detail.getInfo());
    }
}
通过上一篇博客中的基类的创建,再实现内涵吧内涵段子的采集就简单许多,从分析内涵吧的页面结构到完成编码、测试、运行等消耗不到4小时的时间,当然这也得益于两个网站的基础结构比较类似。
源代码下载地址:http://download.youkuaiyun.com/detail/xiaojimanman/6920219
这篇博客扩展了上一篇文章的内容,详细介绍了如何进行网页自动采集,特别是针对内涵吧内涵段子的采集。文中提到的上一篇博客为基础类的介绍,而本篇则专注于内涵吧的采集实现,包括采集入口类Neihan8Crawl和更新列表页采集类Neihan8List的代码实现。提供了源代码下载链接。
891

被折叠的 条评论
为什么被折叠?



