曾经用HTMLParser过滤HTML, 但发现HTMLParser有时候对不规范的HTML解析不了, 并且不支持XPath,
后来在Web-Harvest开源爬虫网站找到了HtmlCleaner, 它能够帮助我们将HTML文档转化为结构化的XML文档。虽然目前已经有了类似这样的工具, 但是HtmlCleaner能够完成几乎所有的HTML转换, 而且不到30k, 这是它值得称道的地方。
1.HtmlCleaner的文档对象模型现在拥有了一些函数,处理节点和属性,所以现在在序列化之前搜索或者编辑是非常容易的。
2.提供基本
HtmlCleaner DOM的XPath支持
3. 解析后编程轻量级文档对象,能够很容易的被转换到DOM或者JDom标准文档,或者通过各种方式(压缩,打印)连续输出XML。
转换完成后, 能用JDOM或dom4j对文档进行处理
package com.citgee.webclip;
import org.htmlcleaner.*;
import java.net.*;
import java.io.*;
import java.util.*;
import org.jdom.*;
//import org.jdom.output.*;
import org.jdom.contrib.helpers.XPathHelper;
import org.jdom.filter.Filter;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;
public class WebClipUtils {

    /**
     * Fetches the page at {@code url}, cleans the (possibly malformed) HTML
     * with HtmlCleaner, and returns it as a JDOM {@link Document}.
     *
     * @param url     the page URL to fetch
     * @param charset the character set used to decode the response
     * @return the cleaned page as a JDOM document
     * @throws MalformedURLException if {@code url} is not a valid URL
     * @throws IOException           if fetching the page fails
     */
    public static Document getDocumentByURL(String url, String charset)
            throws MalformedURLException, IOException {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        CleanerProperties props = htmlCleaner.getProperties();
        TagNode node = htmlCleaner.clean(new URL(url), charset);
        // second argument: escape XML entities during serialization
        JDomSerializer jdomSerializer = new JDomSerializer(props, true);
        return jdomSerializer.createJDom(node);
    }

    /**
     * Returns every element in {@code doc} (in document order) whose tag name
     * equals {@code tagName}, including the root element if it matches.
     */
    public static List<Element> getElementsByTagName(Document doc, String tagName) {
        List<Element> eleList = new ArrayList<Element>();
        buildList(doc.getRootElement(), tagName, eleList);
        return eleList;
    }

    /**
     * Depth-first walk from {@code rootEle} collecting elements named
     * {@code tagName} into {@code eleList}.
     */
    private static void buildList(Element rootEle, String tagName, List<Element> eleList) {
        if (rootEle.getName().equals(tagName)) {
            eleList.add(rootEle);
        }
        // JDOM 1.x getChildren() returns a raw List, hence the cast.
        for (Object child : rootEle.getChildren()) {
            buildList((Element) child, tagName, eleList);
        }
    }

    /**
     * Serializes {@code ele} as XML to {@code System.out} using GB2312
     * encoding (for readable output on a Chinese-locale console).
     *
     * @throws IOException if writing to stdout fails
     */
    public static void printElement(Element ele) throws IOException {
        XMLOutputter outputer = new XMLOutputter();
        Format format = outputer.getFormat();
        format.setEncoding("GB2312");
        outputer.setFormat(format);
        outputer.output(ele, System.out);
    }

    /**
     * Demo entry point: fetches a page, prints the XPath of the root and of
     * every {@code <div>} element, then dumps each div as XML.
     */
    public static void main(String[] args) throws Exception {
        // Reuse getDocumentByURL instead of duplicating the clean/serialize steps.
        Document doc = getDocumentByURL("http://www.huanqiu.com", "UTF-8");
        Element rootEle = doc.getRootElement();
        System.out.println(XPathHelper.getPathString(rootEle));

        final String tagName = "div";
        List<Element> list = getElementsByTagName(doc, tagName);
        System.out.println(list.size());
        for (Element ele : list) {
            System.out.println();
            System.out.println("*****************************************");
            System.out.println(XPathHelper.getPathString(ele));
            System.out.println("*****************************************");
            printElement(ele);
        }
    }
}
public class HtmlClean {

    /**
     * Reads the HTML file at {@code htmlurl}, cleans it with HtmlCleaner, and
     * writes the result as pretty-printed XML to {@code xmlurl}. Elapsed time
     * is printed after cleaning and again after serialization. IO failures are
     * reported to stderr; the method itself never throws.
     *
     * @param htmlurl path of the HTML input file
     * @param xmlurl  path of the XML output file to write
     */
    public void cleanHtml(String htmlurl, String xmlurl) {
        try {
            long startMillis = System.currentTimeMillis();

            HtmlCleaner htmlCleaner = new HtmlCleaner();
            CleanerProperties cleanerProps = htmlCleaner.getProperties();
            // Serialization options: CDATA-wrap scripts/styles, decode Unicode
            // entities, self-close empty tags, escape XML specials, translate
            // named entities, and emit boolean attributes with empty values.
            cleanerProps.setUseCdataForScriptAndStyle(true);
            cleanerProps.setRecognizeUnicodeChars(true);
            cleanerProps.setUseEmptyElementTags(true);
            cleanerProps.setAdvancedXmlEscape(true);
            cleanerProps.setTranslateSpecialEntities(true);
            cleanerProps.setBooleanAttributeValues("empty");

            TagNode rootNode = htmlCleaner.clean(new File(htmlurl));
            System.out.println("vreme:" + (System.currentTimeMillis() - startMillis));

            new PrettyXmlSerializer(cleanerProps).writeXmlToFile(rootNode, xmlurl);
            System.out.println("vreme:" + (System.currentTimeMillis() - startMillis));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}