I've been building a portal site recently and needed to collect content for its news module. When information gathering comes up, a web crawler is the first thing that comes to mind; after all, we don't have the energy to write news ourselves, so sina and sohu, apologies in advance, we'll borrow your content for a while. The web encourages resource sharing, which is something I've always believed in, and I had been meaning to use a crawler to pull some resources off the web anyway.

Below is the parsing utility class, which uses HttpClient to fetch pages and HtmlParser to extract content (a sketch of the WebHttpClient helper and a usage example follow after the class):

package com.opensky.util;

import java.util.HashMap;
import java.util.Map;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasParentFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Parsing web pages with HttpClient and HtmlParser.
 *
 * @author Administrator
 */
public class HtmlparseUtil {

    WebHttpClient util = new WebHttpClient();

    /**
     * Collect the hyperlinks in a page, storing href and text as map(href, text).
     *
     * @param url
     * @param charset
     * @return
     */
    public Map<String, String> linkGet(String url, String charset) {
        String content = util.getWebContentByGet(url, charset);
        Map<String, String> linkMap = new HashMap<String, String>();
        try {
            // Start parsing
            Parser parser = Parser.createParser(content, charset);
            // Keep only <a></a> tags
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            Node node = null;
            for (int i = 0; i < list.size(); i++) {
                node = list.elementAt(i);
                // Store each link as map(href, text)
                linkMap.put(((LinkTag) node).getLink(),
                        processText(((LinkTag) node).getLinkText()));
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return linkMap;
    }

    /**
     * Get the content inside the page's <body></body> tag.
     *
     * @param url
     * @param charset
     * @return
     */
    public String bodyGet(String url, String charset) {
        String content = util.getWebContentByGet(url, charset);
        String body = "";
        try {
            Parser parser = Parser.createParser(content, charset);
            // Keep only the <body></body> tag
            NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);
            NodeList list = parser.extractAllNodesThatMatch(bodyFilter);
            Node node = null;
            for (int i = 0; i < list.size(); i++) {
                node = list.elementAt(i);
                // Store the body content
                body = ((BodyTag) node).getBody();
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return body;
    }

    /**
     * Filter out the <span> elements whose class is "term" (plus several related
     * fields on the same page) and collect their text.
     *
     * @param url
     * @param charset
     * @return
     */
    public Map<String, String> termGet(String url, String charset) {
        // Fetch the full HTML of the page
        String content = util.getWebContentByGet(url, charset);
        Map<String, String> map = new HashMap<String, String>();
        try {
            // Keep only <span> elements whose class is "term"
            // TagNameFilter(tag name) ---- HasAttributeFilter(attribute, attribute value)
            Parser parser = Parser.createParser(content, charset);
            AndFilter filter = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "term"));
            Node node = null;
            NodeList nodeList = parser.parse(filter);
            for (int i = 0; i < nodeList.size(); i++) {
                node = nodeList.elementAt(i);
                map.put("term", node.toPlainTextString());
            }

            // Keep only <span> elements whose class is "start-time"
            Parser parser2 = Parser.createParser(content, charset);
            AndFilter filter2 = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "start-time"));
            NodeList nodeList2 = parser2.parse(filter2);
            for (int i = 0; i < nodeList2.size(); i++) {
                node = nodeList2.elementAt(i);
                map.put("start-time", node.toPlainTextString());
            }

            // Keep only the <span> element whose id is "J_SingleEndTimeLabel"
            Parser parser3 = Parser.createParser(content, charset);
            AndFilter filter3 = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("id", "J_SingleEndTimeLabel"));
            NodeList nodeList3 = parser3.parse(filter3);
            for (int i = 0; i < nodeList3.size(); i++) {
                node = nodeList3.elementAt(i);
                map.put("end-time", node.toPlainTextString());
            }

            // Keep only <div> elements whose class is "box post"
            Parser parser4 = Parser.createParser(content, charset);
            AndFilter filter4 = new AndFilter(new TagNameFilter("div"),
                    new HasAttributeFilter("class", "box post"));
            NodeList nodeList4 = parser4.parse(filter4);
            for (int i = 0; i < nodeList4.size(); i++) {
                node = nodeList4.elementAt(i);
                String temp = node.toPlainTextString().trim();
                // Fixed-position substring; fragile if the page layout changes
                temp = temp.substring(10, 20).trim();
                map.put("pre-term", temp);
            }

            // Keep only elements whose class is "J_AwardNumber"
            Parser parser5 = Parser.createParser(content, charset);
            NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class", "J_AwardNumber"));
            StringBuffer buffer = new StringBuffer();
            for (int i = 0; i < nodeList5.size(); i++) {
                node = nodeList5.elementAt(i);
                buffer.append("," + node.toPlainTextString());
            }
            buffer.append("|");

            // Keep only elements whose class is "blue J_AwardNumber"
            Parser parser6 = Parser.createParser(content, charset);
            NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class", "blue J_AwardNumber"));
            for (int i = 0; i < nodeList6.size(); i++) {
                node = nodeList6.elementAt(i);
                buffer.append(node.toPlainTextString() + ",");
            }
            map.put("numbers", buffer.toString());
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return map;
    }

    /**
     * Filter out the <ul> element whose class is "list_00f_f14" and collect the
     * text of its <li> children -- the domestic headlines block on Sina's
     * domestic news page.
     *
     * @param url
     * @param charset
     * @return
     */
    public Map<String, String> sinaChinaNewsGet(String url, String charset) {
        // Fetch the full HTML of the page
        String content = util.getWebContentByGet(url, charset);
        Map<String, String> map = new HashMap<String, String>();
        try {
            // Keep only <li> elements whose parent is <ul class="list_00f_f14">
            Parser parser = Parser.createParser(content, charset);
            AndFilter filter = new AndFilter(new TagNameFilter("li"),
                    new HasParentFilter(new AndFilter(new TagNameFilter("ul"),
                            new HasAttributeFilter("class", "list_00f_f14"))));
            Node node = null;
            NodeList nodeList = parser.parse(filter);
            for (int i = 0; i < nodeList.size(); i++) {
                node = nodeList.elementAt(i);
                // Headline text
                map.put("title" + i, node.toPlainTextString());
                // Walk the children of the <li> to find the link's href
                NodeList nodeChildList = node.getChildren();
                Node nodeChild = null;
                for (int j = 0; j < nodeChildList.size(); j++) {
                    nodeChild = nodeChildList.elementAt(j);
                    if (nodeChild instanceof LinkTag) {
                        String hrefStr = ((LinkTag) nodeChild).getAttribute("href");
                        map.put("href" + i, hrefStr);
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return map;
    }

    private String processText(String content) {
        content = content.trim();
        // content = content.replaceAll("<p>", "\n");
        // content = content.replaceAll("</TD>", "");
        // content = content.replaceAll("</div>", "");
        // content = content.replaceAll("</a>", "");
        // content = content.replaceAll("<a href=.*>", "");
        return content;
    }
}
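The class above depends on a small WebHttpClient helper whose getWebContentByGet(url, charset) returns the raw HTML of a page; that class is not shown in this post. Here is a minimal sketch of what it might look like, assuming Apache Commons HttpClient 3.x (the implementation details and error handling are my own placeholders, not the original code):

package com.opensky.util;

import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

/**
 * Assumed implementation of the WebHttpClient helper used by HtmlparseUtil,
 * sketched with Apache Commons HttpClient 3.x.
 */
public class WebHttpClient {

    /**
     * Fetch a page with an HTTP GET and decode the body with the given charset.
     * Returns an empty string on any failure.
     */
    public String getWebContentByGet(String url, String charset) {
        HttpClient client = new HttpClient();
        GetMethod get = new GetMethod(url);
        try {
            int status = client.executeMethod(get);
            if (status == HttpStatus.SC_OK) {
                // Decode the raw bytes ourselves so the caller controls the charset
                return new String(get.getResponseBody(), charset);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the connection back to the pool
            get.releaseConnection();
        }
        return "";
    }
}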
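To tie it together, a quick usage example: call sinaChinaNewsGet against Sina's domestic-news page and print the title/href pairs it collects. The URL and the gb2312 charset are assumptions about the target page and may need adjusting to whatever you are actually crawling:

package com.opensky.util;

import java.util.Map;

public class HtmlparseUtilTest {

    public static void main(String[] args) {
        HtmlparseUtil util = new HtmlparseUtil();
        // URL and charset are placeholders for the Sina domestic-news page
        Map<String, String> news = util.sinaChinaNewsGet(
                "http://news.sina.com.cn/china/", "gb2312");
        // Entries come back as title0/href0, title1/href1, ...
        for (Map.Entry<String, String> entry : news.entrySet()) {
            System.out.println(entry.getKey() + " -> " + entry.getValue());
        }
    }
}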