HTML解析htmlparser
htmlparser
首页:http://sourceforge.net/projects/htmlparser/
下载:http://sourceforge.net/project/showfiles.php?group_id=24399
文件:HTMLParser-2.0-SNAPSHOT-bin.zip
cpdetector
首页:http://cpdetector.sourceforge.net/
下载:http://sourceforge.net/project/showfiles.php?group_id=114421
文件:cpdetector_eclipse_project_1.0.7.zip
解开压缩后,运行ANT打包命令,build.xml有些地方需要稍微根据具体情况调整一下
ant jar.htmlentitydecoder
得到JAR包
cpdetector_1.0.7.jar
HTML工具类函数一:自动探测URL的HTML内容的编码
/**
 * Detects the character encoding of the content behind a URL.
 * <p>
 * Registers a {@code ParsingDetector}, {@code JChardetFacade},
 * {@code ASCIIDetector} and {@code UnicodeDetector} (in that order) on the
 * {@code CodepageDetectorProxy} instance and asks it for the charset of the
 * resource.
 * <p>
 * NOTE(review): the detectors are added to the instance returned by
 * {@code CodepageDetectorProxy.getInstance()} on every call; confirm that the
 * proxy de-duplicates them, otherwise register them once instead.
 *
 * @param url textual URL of the page to inspect
 * @return the name of the detected charset, or the name of the platform
 *         default charset when the URL is malformed, the resource cannot be
 *         read, or the detector returns {@code null}
 */
public static String autoDetectCharset(String url) {
    URL source;
    try {
        source = new URL(url);
    } catch (MalformedURLException e) {
        log.error(e);
        // There is nothing to probe without a valid URL; never hand a null
        // URL to the detector, fall back to the default charset instead.
        return Charset.defaultCharset().name();
    }

    CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
    detector.add(new ParsingDetector(false));
    detector.add(JChardetFacade.getInstance());
    detector.add(ASCIIDetector.getInstance());
    detector.add(UnicodeDetector.getInstance());

    Charset charset = null;
    try {
        charset = detector.detectCodepage(source);
    } catch (IOException e) {
        // Best effort: a read failure is logged and the default is used.
        log.error(e);
    }
    if (charset == null) {
        charset = Charset.defaultCharset();
    }
    return charset.name();
}
HTML工具类函数二:读取URL中的HTML文本
/**
 * Reads the complete text content behind a URL.
 * <p>
 * The resource is read line by line and every line is re-terminated with the
 * platform line separator, so the original line endings are not preserved.
 * Failures are logged rather than thrown.
 *
 * @param url     textual URL of the resource to read
 * @param charset name of the charset used to decode the bytes
 * @return the decoded content; an empty string when the URL is malformed,
 *         the resource cannot be opened or the charset name is unsupported;
 *         the text read so far when an I/O error interrupts reading
 */
public static String readURL(String url, String charset) {
    /* Initial capacity of the accumulating buffer. */
    final int TRANSFER_SIZE = 4096;
    /* Line separator of the current platform. */
    String lineSep = System.getProperty("line.separator");

    StringBuilder temp = new StringBuilder(TRANSFER_SIZE);
    InputStream in = null;
    BufferedReader reader = null;
    try {
        in = new URL(url).openStream();
        reader = new BufferedReader(new InputStreamReader(in, charset));
        String line;
        while ((line = reader.readLine()) != null) {
            temp.append(line);
            temp.append(lineSep);
        }
    } catch (IOException e) {
        // MalformedURLException and UnsupportedEncodingException are both
        // IOExceptions, so one handler covers every failure stage.
        log.error(e);
    } finally {
        try {
            // Closing the reader also closes the wrapped stream; close the
            // raw stream directly only if the reader was never created.
            if (reader != null) {
                reader.close();
            } else if (in != null) {
                in.close();
            }
        } catch (IOException e) {
            log.error(e);
        }
    }
    return temp.toString();
}
HTML工具类函数三:解析HTML得到其中的所有TAG
/**
 * Parses the page behind a URL and collects the tags accepted by
 * {@code PostFormFilter}.
 * <p>
 * The parser is given a node factory on which {@code ScclSelectBizCodesTag},
 * {@code InputTag} and {@code TextareaTag} are registered.
 *
 * @param url textual URL of the page to parse
 * @return the matching nodes, or {@code null} when the parser throws a
 *         {@code ParserException} (the exception is logged)
 */
public static NodeList getFormNodeList(String url) {
    // Detect the charset once and reuse it: the same value decodes the page
    // bytes and is passed on to the parser.
    String charset = autoDetectCharset(url);
    Parser parser = Parser.createParser(readURL(url, charset), charset);

    PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
    factory.registerTag(new ScclSelectBizCodesTag());
    factory.registerTag(new InputTag());
    factory.registerTag(new TextareaTag());
    parser.setNodeFactory(factory);

    NodeFilter formFilter = new PostFormFilter();
    NodeList nodeList = null;
    try {
        nodeList = parser.extractAllNodesThatMatch(formFilter);
    } catch (ParserException e) {
        log.error(e);
    }
    return nodeList;
}
HTML工具类函数四:解析TAG中的属性,生成所有的PageField的POJO
/**
 * Builds one {@code PageField} per form-related tag found on a page.
 * <p>
 * Input tags contribute their {@code name} and {@code type} attributes,
 * {@code ScclSelectBizCodesTag} nodes contribute their {@code id}, and
 * textarea tags contribute their {@code name}.
 *
 * @param url textual URL of the page to analyse
 * @return the fields in node order, or {@code null} when
 *         {@code getFormNodeList} returns {@code null} or an empty list
 */
public static List<PageField> getPageFields(String url) {
    NodeList nodes = getFormNodeList(url);
    if (nodes == null || nodes.size() <= 0) {
        // Nothing to build from: callers receive null, not an empty list.
        return null;
    }

    int count = nodes.size();
    List<PageField> fields = new ArrayList<PageField>(count);
    for (int idx = 0; idx < count; idx++) {
        TagNode tag = (TagNode) nodes.elementAt(idx);
        PageField field;
        if (tag instanceof InputTag) {
            field = new PageField(tag.getAttribute("name"),
                    PageField.TAG_TYPE_INPUT, tag.getAttribute("type"));
        } else if (tag instanceof ScclSelectBizCodesTag) {
            field = new PageField(tag.getAttribute("id"),
                    PageField.TAG_TYPE_SELECT, null);
        } else if (tag instanceof TextareaTag) {
            field = new PageField(tag.getAttribute("name"),
                    PageField.TAG_TYPE_TEXTAREA, null);
        } else {
            // Any other tag type is skipped.
            continue;
        }
        fields.add(field);
    }
    return fields;
}
扩展自定义标签<sccl:selectBizCodes>
public class ScclSelectBizCodesTag extends TagNode {
private static final long serialVersionUID = -6352090777443844707L;
private static final String[] ids = new String[] { "sccl:selectBizCodes" };
public String[] getIds() {
return (ids);
}
public String[] getEnders() {
return (ids);
}
public String getCategory(){
return super.getAttribute("category");
}
public String getId(){
return super.getAttribute("id");
}
public String getSelected(){
return super.getAttribute("selected");
}
}
用FILTER方式过滤访问TAG
/**
 * Node filter that accepts {@code InputTag}, {@code ScclSelectBizCodesTag}
 * and {@code TextareaTag} nodes and rejects everything else.
 */
public class PostFormFilter implements NodeFilter {
    private static final long serialVersionUID = 8162322553987269165L;

    /**
     * @param node the node to test
     * @return {@code true} when the node is one of the three accepted tag types
     */
    public boolean accept(Node node) {
        return node instanceof InputTag
                || node instanceof ScclSelectBizCodesTag
                || node instanceof TextareaTag;
    }
}
测试
/**
 * Manual smoke test: extracts the page fields of a local JSP file and
 * reports how many were found.
 *
 * @param args unused
 */
public static void main(String[] args)
        throws org.htmlparser.util.ParserException, IOException {
    String url = "file:///E:\\work\\html\\editOrder.jsp";
    List<PageField> list = getPageFields(url);
    // getPageFields returns null when the page yields no matching tags, so
    // the result must be checked before the first element is accessed.
    if (list == null || list.isEmpty()) {
        System.out.println("No form fields found in " + url);
        return;
    }
    System.out.println(list.size() + " form field(s) found; first: " + list.get(0));
}
以上代码可以解析<input>、<textarea>以及自定义的select类标签,例如:
<sccl:selectBizCodes category="worksheet" id="worksheetCode" selected="cl" onChange="go();" html="style='test';"/>
问题一
拷贝cpdetector_1.0.7.jar到项目中后
同时也要拷贝ext下面的chardet.jar到lib下面,不然在调用
detector.add(JChardetFacade.getInstance());时要报错,找不到类
nsICharsetDetectionObserver
问题二
拷贝htmlparser相关包如下:
htmlparser.jar
htmllexer.jar
htmlparser
首页:http://sourceforge.net/projects/htmlparser/
下载:http://sourceforge.net/project/showfiles.php?group_id=24399
文件:HTMLParser-2.0-SNAPSHOT-bin.zip
cpdetector
首页:http://cpdetector.sourceforge.net/
下载:http://sourceforge.net/project/showfiles.php?group_id=114421
文件:cpdetector_eclipse_project_1.0.7.zip
解开压缩后,运行ANT打包命令,build.xml有些地方需要稍微根据具体情况调整一下
ant jar.htmlentitydecoder
得到JAR包
cpdetector_1.0.7.jar
HTML工具类函数一:自动探测URL的HTML内容的编码
/**
 * Detects the character encoding of the content behind a URL.
 * <p>
 * Registers a {@code ParsingDetector}, {@code JChardetFacade},
 * {@code ASCIIDetector} and {@code UnicodeDetector} (in that order) on the
 * {@code CodepageDetectorProxy} instance and asks it for the charset of the
 * resource.
 * <p>
 * NOTE(review): the detectors are added to the instance returned by
 * {@code CodepageDetectorProxy.getInstance()} on every call; confirm that the
 * proxy de-duplicates them, otherwise register them once instead.
 *
 * @param url textual URL of the page to inspect
 * @return the name of the detected charset, or the name of the platform
 *         default charset when the URL is malformed, the resource cannot be
 *         read, or the detector returns {@code null}
 */
public static String autoDetectCharset(String url) {
    URL source;
    try {
        source = new URL(url);
    } catch (MalformedURLException e) {
        log.error(e);
        // There is nothing to probe without a valid URL; never hand a null
        // URL to the detector, fall back to the default charset instead.
        return Charset.defaultCharset().name();
    }

    CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
    detector.add(new ParsingDetector(false));
    detector.add(JChardetFacade.getInstance());
    detector.add(ASCIIDetector.getInstance());
    detector.add(UnicodeDetector.getInstance());

    Charset charset = null;
    try {
        charset = detector.detectCodepage(source);
    } catch (IOException e) {
        // Best effort: a read failure is logged and the default is used.
        log.error(e);
    }
    if (charset == null) {
        charset = Charset.defaultCharset();
    }
    return charset.name();
}
HTML工具类函数二:读取URL中的HTML文本
/**
 * Reads the complete text content behind a URL.
 * <p>
 * The resource is read line by line and every line is re-terminated with the
 * platform line separator, so the original line endings are not preserved.
 * Failures are logged rather than thrown.
 *
 * @param url     textual URL of the resource to read
 * @param charset name of the charset used to decode the bytes
 * @return the decoded content; an empty string when the URL is malformed,
 *         the resource cannot be opened or the charset name is unsupported;
 *         the text read so far when an I/O error interrupts reading
 */
public static String readURL(String url, String charset) {
    /* Initial capacity of the accumulating buffer. */
    final int TRANSFER_SIZE = 4096;
    /* Line separator of the current platform. */
    String lineSep = System.getProperty("line.separator");

    StringBuilder temp = new StringBuilder(TRANSFER_SIZE);
    InputStream in = null;
    BufferedReader reader = null;
    try {
        in = new URL(url).openStream();
        reader = new BufferedReader(new InputStreamReader(in, charset));
        String line;
        while ((line = reader.readLine()) != null) {
            temp.append(line);
            temp.append(lineSep);
        }
    } catch (IOException e) {
        // MalformedURLException and UnsupportedEncodingException are both
        // IOExceptions, so one handler covers every failure stage.
        log.error(e);
    } finally {
        try {
            // Closing the reader also closes the wrapped stream; close the
            // raw stream directly only if the reader was never created.
            if (reader != null) {
                reader.close();
            } else if (in != null) {
                in.close();
            }
        } catch (IOException e) {
            log.error(e);
        }
    }
    return temp.toString();
}
HTML工具类函数三:解析HTML得到其中的所有TAG
/**
 * Parses the page behind a URL and collects the tags accepted by
 * {@code PostFormFilter}.
 * <p>
 * The parser is given a node factory on which {@code ScclSelectBizCodesTag},
 * {@code InputTag} and {@code TextareaTag} are registered.
 *
 * @param url textual URL of the page to parse
 * @return the matching nodes, or {@code null} when the parser throws a
 *         {@code ParserException} (the exception is logged)
 */
public static NodeList getFormNodeList(String url) {
    // Detect the charset once and reuse it: the same value decodes the page
    // bytes and is passed on to the parser.
    String charset = autoDetectCharset(url);
    Parser parser = Parser.createParser(readURL(url, charset), charset);

    PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
    factory.registerTag(new ScclSelectBizCodesTag());
    factory.registerTag(new InputTag());
    factory.registerTag(new TextareaTag());
    parser.setNodeFactory(factory);

    NodeFilter formFilter = new PostFormFilter();
    NodeList nodeList = null;
    try {
        nodeList = parser.extractAllNodesThatMatch(formFilter);
    } catch (ParserException e) {
        log.error(e);
    }
    return nodeList;
}
HTML工具类函数四:解析TAG中的属性,生成所有的PageField的POJO
/**
 * Builds one {@code PageField} per form-related tag found on a page.
 * <p>
 * Input tags contribute their {@code name} and {@code type} attributes,
 * {@code ScclSelectBizCodesTag} nodes contribute their {@code id}, and
 * textarea tags contribute their {@code name}.
 *
 * @param url textual URL of the page to analyse
 * @return the fields in node order, or {@code null} when
 *         {@code getFormNodeList} returns {@code null} or an empty list
 */
public static List<PageField> getPageFields(String url) {
    NodeList nodes = getFormNodeList(url);
    if (nodes == null || nodes.size() <= 0) {
        // Nothing to build from: callers receive null, not an empty list.
        return null;
    }

    int count = nodes.size();
    List<PageField> fields = new ArrayList<PageField>(count);
    for (int idx = 0; idx < count; idx++) {
        TagNode tag = (TagNode) nodes.elementAt(idx);
        PageField field;
        if (tag instanceof InputTag) {
            field = new PageField(tag.getAttribute("name"),
                    PageField.TAG_TYPE_INPUT, tag.getAttribute("type"));
        } else if (tag instanceof ScclSelectBizCodesTag) {
            field = new PageField(tag.getAttribute("id"),
                    PageField.TAG_TYPE_SELECT, null);
        } else if (tag instanceof TextareaTag) {
            field = new PageField(tag.getAttribute("name"),
                    PageField.TAG_TYPE_TEXTAREA, null);
        } else {
            // Any other tag type is skipped.
            continue;
        }
        fields.add(field);
    }
    return fields;
}
扩展自定义标签<sccl:selectBizCodes>
public class ScclSelectBizCodesTag extends TagNode {
private static final long serialVersionUID = -6352090777443844707L;
private static final String[] ids = new String[] { "sccl:selectBizCodes" };
public String[] getIds() {
return (ids);
}
public String[] getEnders() {
return (ids);
}
public String getCategory(){
return super.getAttribute("category");
}
public String getId(){
return super.getAttribute("id");
}
public String getSelected(){
return super.getAttribute("selected");
}
}
用FILTER方式过滤访问TAG
/**
 * Node filter that accepts {@code InputTag}, {@code ScclSelectBizCodesTag}
 * and {@code TextareaTag} nodes and rejects everything else.
 */
public class PostFormFilter implements NodeFilter {
    private static final long serialVersionUID = 8162322553987269165L;

    /**
     * @param node the node to test
     * @return {@code true} when the node is one of the three accepted tag types
     */
    public boolean accept(Node node) {
        return node instanceof InputTag
                || node instanceof ScclSelectBizCodesTag
                || node instanceof TextareaTag;
    }
}
测试
/**
 * Manual smoke test: extracts the page fields of a local JSP file and
 * reports how many were found.
 *
 * @param args unused
 */
public static void main(String[] args)
        throws org.htmlparser.util.ParserException, IOException {
    String url = "file:///E:\\work\\html\\editOrder.jsp";
    List<PageField> list = getPageFields(url);
    // getPageFields returns null when the page yields no matching tags, so
    // the result must be checked before the first element is accessed.
    if (list == null || list.isEmpty()) {
        System.out.println("No form fields found in " + url);
        return;
    }
    System.out.println(list.size() + " form field(s) found; first: " + list.get(0));
}
以上代码可以解析<input>、<textarea>以及自定义的select类标签,例如:
<sccl:selectBizCodes category="worksheet" id="worksheetCode" selected="cl" onChange="go();" html="style='test';"/>
问题一
拷贝cpdetector_1.0.7.jar到项目中后
同时也要拷贝ext下面的chardet.jar到lib下面,不然在调用
detector.add(JChardetFacade.getInstance());时要报错,找不到类
nsICharsetDetectionObserver
问题二
拷贝htmlparser相关包如下:
htmlparser.jar
htmllexer.jar