ftp client的开源实现

最新推荐文章于 2025-12-06 09:28:08 发布

原创最新推荐文章于 2025-12-06 09:28:08 发布 · 126 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#Ant #PHP #.net #HTML #EXT

JAVA 专栏收录该内容

309 篇文章

订阅专栏

本文介绍了一种使用HTMLParser和cpdetector库解析HTML文档的方法，包括自动检测网页编码、读取HTML内容、提取表单元素及自定义标签，并生成PageField对象列表。

HTML解析htmlparser

htmlparser
首页：http://sourceforge.net/projects/htmlparser/
下载：http://sourceforge.net/project/showfiles.php?group_id=24399
文件：HTMLParser-2.0-SNAPSHOT-bin.zip

cpdetector
首页：http://cpdetector.sourceforge.net/
下载：http://sourceforge.net/project/showfiles.php?group_id=114421
文件：cpdetector_eclipse_project_1.0.7.zip

解开压缩后，运行ANT打包命令，build.xml有些地方需要稍微根据具体情况调整一下
ant jar.htmlentitydecoder
得到JAR包
cpdetector_1.0.7.jar

HTML工具类函数一：自动探测URL的HTML内容的编码

/**
 * Detects the character encoding of the content behind a URL.
 *
 * <p>The URL is probed with a chain of cpdetector detectors
 * (ParsingDetector, JChardetFacade, ASCIIDetector, UnicodeDetector).
 * Failures are logged rather than propagated.
 *
 * @param url the location whose encoding should be detected
 * @return the name of the detected charset, or the name of the platform
 *         default charset when the URL is malformed, detection fails with
 *         an I/O error, or the detector returns {@code null}
 */
public static String autoDetectCharset(String url) {
    URL source;
    try {
        source = new URL(url);
    } catch (MalformedURLException e) {
        log.error(e);
        // Nothing can be probed without a valid URL; fall back to the
        // default instead of handing a null URL to the detector.
        return Charset.defaultCharset().name();
    }

    // NOTE(review): getInstance() looks like a shared instance, so the
    // detectors below are re-added on every call - confirm this is harmless.
    CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
    detector.add(new ParsingDetector(false));
    detector.add(JChardetFacade.getInstance());
    detector.add(ASCIIDetector.getInstance());
    detector.add(UnicodeDetector.getInstance());

    Charset charset = null;
    try {
        charset = detector.detectCodepage(source);
    } catch (IOException e) {
        log.error(e);
    }

    if (charset == null) {
        charset = Charset.defaultCharset();
    }
    return charset.name();
}

HTML工具类函数二：读取URL中的HTML文本
/**
 * Reads the text behind a URL and returns it as a single string.
 *
 * <p>The content is read line by line and re-joined with the platform line
 * separator; a separator is appended after every line, including the last.
 * Failures are logged rather than propagated.
 *
 * @param url     the location to read
 * @param charset the name of the charset used to decode the bytes
 * @return the decoded text; an empty string when the URL is malformed, the
 *         stream cannot be opened or the charset is not supported; the text
 *         read so far when an I/O error interrupts reading
 */
public static String readURL(String url, String charset) {
    /* Initial capacity of the text buffer */
    final int initialCapacity = 4096;

    /* Line separator of the current platform */
    String lineSep = System.getProperty("line.separator");

    URL source;
    try {
        source = new URL(url);
    } catch (MalformedURLException e) {
        log.error(e);
        return "";
    }

    InputStream in;
    try {
        in = source.openStream();
    } catch (IOException e) {
        log.error(e);
        return "";
    }

    StringBuilder text = new StringBuilder(initialCapacity);
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new InputStreamReader(in, charset));
        String line;
        while ((line = reader.readLine()) != null) {
            text.append(line);
            text.append(lineSep);
        }
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException (an IOException subclass)
        // raised for an unknown charset name.
        log.error(e);
    } finally {
        // Always release the stream, even when reading failed. Closing the
        // reader closes the wrapped stream; if the reader was never created,
        // close the raw stream directly.
        try {
            if (reader != null) {
                reader.close();
            } else {
                in.close();
            }
        } catch (IOException e) {
            log.error(e);
        }
    }
    return text.toString();
}

HTML工具类函数三：解析HTML得到其中的所有TAG
/**
 * Parses the HTML behind a URL and collects the nodes accepted by
 * {@link PostFormFilter}.
 *
 * <p>The custom {@link ScclSelectBizCodesTag} is registered with the node
 * factory together with InputTag and TextareaTag so the parser creates
 * those node types.
 *
 * @param url the location of the HTML document
 * @return the matching nodes, or {@code null} when the parser throws a
 *         ParserException (the exception is logged)
 */
public static NodeList getFormNodeList(String url) {
    // Detect the encoding once and use it both for reading the text and for
    // the parser; readURL requires the charset as its second argument.
    String charset = autoDetectCharset(url);
    Parser parser = Parser.createParser(readURL(url, charset), charset);

    PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
    factory.registerTag(new ScclSelectBizCodesTag());
    factory.registerTag(new InputTag());
    factory.registerTag(new TextareaTag());
    parser.setNodeFactory(factory);

    NodeFilter formFilter = new PostFormFilter();

    NodeList nodeList = null;
    try {
        nodeList = parser.extractAllNodesThatMatch(formFilter);
    } catch (ParserException e) {
        log.error(e);
    }
    return nodeList;
}

HTML工具类函数四：解析TAG中的属性，生成所有的PageField的POJO
/**
 * Builds a PageField for each form-related tag found in the HTML document
 * behind a URL.
 *
 * <p>InputTag nodes use their "name" and "type" attributes,
 * ScclSelectBizCodesTag nodes use their "id" attribute, and TextareaTag
 * nodes use their "name" attribute. Nodes of any other type are skipped.
 *
 * @param url the location of the HTML document
 * @return the list of fields, or {@code null} when no matching nodes were
 *         found
 */
public static List<PageField> getPageFields(String url) {
    NodeList nodes = getFormNodeList(url);
    if (nodes == null || nodes.size() <= 0) {
        // Nothing to convert; callers receive null in this case.
        return null;
    }

    int count = nodes.size();
    List<PageField> fields = new ArrayList<PageField>(count);
    for (int idx = 0; idx < count; idx++) {
        TagNode tag = (TagNode) nodes.elementAt(idx);
        PageField field;
        if (tag instanceof InputTag) {
            field = new PageField(tag.getAttribute("name"),
                    PageField.TAG_TYPE_INPUT, tag.getAttribute("type"));
        } else if (tag instanceof ScclSelectBizCodesTag) {
            field = new PageField(tag.getAttribute("id"),
                    PageField.TAG_TYPE_SELECT, null);
        } else if (tag instanceof TextareaTag) {
            field = new PageField(tag.getAttribute("name"),
                    PageField.TAG_TYPE_TEXTAREA, null);
        } else {
            continue;
        }
        fields.add(field);
    }
    return fields;
}

扩展自定义标签<sccl:selectBizCodes>
/**
 * Tag node for the custom {@code <sccl:selectBizCodes>} element, with
 * accessors for its "category", "id" and "selected" attributes.
 */
public class ScclSelectBizCodesTag extends TagNode {
    private static final long serialVersionUID = -6352090777443844707L;

    /** Tag names this node type is registered for. */
    private static final String[] TAG_NAMES = { "sccl:selectBizCodes" };

    /** Returns the tag names handled by this node type. */
    public String[] getIds() {
        return TAG_NAMES;
    }

    /** Returns the same tag names as {@link #getIds()}. */
    public String[] getEnders() {
        return TAG_NAMES;
    }

    /** Returns the value of the "category" attribute. */
    public String getCategory() {
        return attribute("category");
    }

    /** Returns the value of the "id" attribute. */
    public String getId() {
        return attribute("id");
    }

    /** Returns the value of the "selected" attribute. */
    public String getSelected() {
        return attribute("selected");
    }

    /** Looks up an attribute through the TagNode implementation. */
    private String attribute(String name) {
        return super.getAttribute(name);
    }
}

用FILTER方式过滤访问TAG
/**
 * Node filter that accepts only InputTag, ScclSelectBizCodesTag and
 * TextareaTag nodes.
 */
public class PostFormFilter implements NodeFilter {
    private static final long serialVersionUID = 8162322553987269165L;

    /**
     * @param node the node to test
     * @return {@code true} when the node is one of the three accepted tag
     *         types, {@code false} otherwise
     */
    public boolean accept(Node node) {
        return node instanceof InputTag
                || node instanceof ScclSelectBizCodesTag
                || node instanceof TextareaTag;
    }
}

测试
/**
 * Small manual test: extracts the page fields of one document and prints
 * them.
 *
 * @param args optional; {@code args[0]} is the URL of the document to
 *             parse. Without arguments the sample file URL below is used.
 */
public static void main(String[] args)
        throws org.htmlparser.util.ParserException, IOException {
    String url = (args != null && args.length > 0)
            ? args[0]
            : "file:///E:\\work\\html\\editOrder.jsp";

    List<PageField> list = getPageFields(url);
    // getPageFields returns null when nothing matched, so guard before use.
    if (list == null || list.isEmpty()) {
        System.out.println("No form fields found in " + url);
        return;
    }
    for (PageField field : list) {
        System.out.println(field);
    }
}

以上代码可以解析<input>、<textarea>以及自定义标签<sccl:selectBizCodes>，自定义标签示例如下：
<sccl:selectBizCodes category="worksheet" id="worksheetCode" selected="cl" onChange="go();" html="style='test';"/>

问题一
拷贝cpdetector_1.0.7.jar到项目中后，
还需要把ext目录下的chardet.jar一并拷贝到lib目录下，否则在调用
detector.add(JChardetFacade.getInstance()); 时会报错，提示找不到类
nsICharsetDetectionObserver

问题二
拷贝htmlparser相关包如下：
htmlparser.jar
htmllexer.jar