背景:
获取 HTML 页面中所有含文本(text)的节点的 XPath
思路:
NekoHTML 是一个 Java 语言的 HTML 扫描器和标签补全器(tag balancer),使程序能够解析 HTML 文档,并用标准的 XML 接口访问其中的信息;而 dom4j 可以很好地处理 XML、XPath 和 XSL。
1. 用 NekoHTML 将 HTML 解析为 DOM,再用 dom4j 把它转换成 XML 文档
2. 遍历该 XML 文档,收集所有含文本的元素的 XPath
环境:
<dependency>
<groupId>net.sourceforge.nekohtml</groupId>
<artifactId>nekohtml</artifactId>
<version>1.9.10</version>
</dependency>
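<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<!-- 下面的代码使用了 org.apache.commons.lang.StringUtils,因此还需要 commons-lang -->
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>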
实现如下:
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.cyberneko.html.parsers.DOMParser;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.DOMReader;
import org.xml.sax.InputSource;
/**
 * Command-line demo that parses an HTML file with NekoHTML, converts the
 * resulting W3C DOM into a dom4j {@link Document}, and prints a map from the
 * unique path of each text-only element to that element's text.
 */
public class NekoHTML {

    /** File that is parsed when no path is supplied on the command line. */
    private static final String DEFAULT_HTML_PATH = "D:/dom/1.htm";

    /**
     * Parses the HTML file and prints the collected path-to-text map to
     * standard output.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to
     *             parse. When absent, {@link #DEFAULT_HTML_PATH} is used.
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        String htmlPath = (args != null && args.length > 0) ? args[0] : DEFAULT_HTML_PATH;

        DOMParser parser = new DOMParser();
        FileInputStream in = new FileInputStream(htmlPath);
        try {
            parser.parse(new InputSource(in));
        } finally {
            // Release the file handle whether or not parsing succeeded.
            in.close();
        }

        Document document = new DOMReader().read(parser.getDocument());

        // Insertion-ordered map so the printed entries follow the traversal
        // order; nothing here is shared between threads.
        Map<String, String> map = new java.util.LinkedHashMap<String, String>();
        dom2XPathMap(document.getRootElement(), map);
        System.out.println(map);
    }

    /**
     * Recursively walks the element tree below {@code root} and records every
     * text-only element whose text is non-empty.
     *
     * <p>Elements for which {@code isTextOnly()} is false are only descended
     * into; their own text is not recorded. Whitespace-only text counts as
     * non-empty because the check is {@code StringUtils.isNotEmpty}.
     *
     * @param root element to start from; {@code null} is ignored
     * @param map  receives {@code getUniquePath()} as key and {@code getText()}
     *             as value for every recorded element
     */
    private static void dom2XPathMap(Element root, Map<String, String> map) {
        if (root == null) {
            return;
        }

        if (root.isTextOnly()) {
            String text = root.getText();
            if (StringUtils.isNotEmpty(text)) {
                map.put(root.getUniquePath(), text);
            }
            return;
        }

        // elementIterator() is a raw Iterator in dom4j 1.6.1, so iterate with a
        // wildcard and cast each child instead of relying on an unchecked
        // conversion.
        for (Iterator<?> children = root.elementIterator(); children.hasNext();) {
            dom2XPathMap((Element) children.next(), map);
        }
    }
}