利用HtmlParser 完成含有html标签的字符截取并补全标签

截取一定长度、带有HTML标签的文本内容时,如果按普通文字用substring进行截取,一旦把html标签截断而没有完整闭合,整个页面都可能因此而变形。利用HtmlParser可以很好地保留原有HTML标签,并将被截断的标签补齐。

需要注意一点:<strong>***</strong>标签需要重新定义(即下面代码中注册的自定义StrongTag),因为目前strong标签的getEndTag()返回null,不知道这是不是htmlparser的一个bug。


package com.test.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Stack;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Tag;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.ParserException;

public class SubstringHTML {

private final String CONTENT;
private Parser parser;
private Stack<TagNode> nodeStack;
private int subLength;
private int textLength = 0;
private int pos = 0;

public SubstringHTML(String content) {
CONTENT = content;

parser = Parser.createParser(content, "GBK");
factory = new PrototypicalNodeFactory();
factory.registerTag(new StrongTag());
parser.setNodeFactory(factory);

nodeStack = new Stack<TagNode>();
}

private void recusive(NodeIterator iterator) throws ParserException {

while (iterator.hasMoreNodes()) {
Node node = iterator.nextNode();
if (node instanceof TagNode) {
TagNode tagNode = (TagNode)node;
Tag tag = tagNode.getEndTag();

if (tag != null) {
nodeStack.push(tagNode);
}
}
else if (node instanceof TextNode) {
if (node.getText().trim().length() == 0) {
continue;
}

if (node.getTagName.equals("SCRIPT") {
continue;
}

String nodeText = node.getText();
int tLen = nodeText.length();
if ((textLength < subLength) && ((textLength + tLen) > subLength)) {
pos = node.getStartPosition() + subLength - textLength;
textLength = subLength;
return;
}
else {
textLength += tLen;
pos = node.getEndPosition();
}
}

if (node.getChildren() == null) {
continue;
}
recusive(node.getChildren().elements());

if (subLength <= textLength) {
return;
}
}
}

public String subString(int length, String end) {
if (length >= CONTENT.length() || length <= 0) {
return CONTENT;
}

subLength = length;
try {
recusive(parser.elements());
} catch (ParserException e) {
System.out.println("parser error:" + e.getMessage());
return CONTENT;
}

int size = nodeStack.size();
StringBuffer buffer = new StringBuffer();
buffer.append(CONTENT.substring(0, pos));

while (size > 0) {
TagNode node = nodeStack.pop();
size--;

if (node.getEndTag().getEndPosition() <= pos || node.getTagBegin() >= pos) {
continue;
}

buffer.append("</");
buffer.append(node.getTagName());
buffer.append(">");
}

buffer.append(end);
return buffer.toString();
}

private static String getContent() {
byte[] con = null;
InputStream in = SubstringHTML.class.getResourceAsStream("content.txt");
try {
int length = in.available();
con = new byte[length];
in.read(con, 0, length);
} catch (IOException e) {
e.printStackTrace();
}

try {
return new String(con, "GBK");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
return "";
}
}

public static void main(String[] args) {
String content = getContent();
SubstringHTML app = new SubstringHTML(content);
String str = app.subString(200, "");
System.out.println(str);
}
}

class StrongTag extends CompositeTag {
private static final long serialVersionUID = 1L;
private static final String[] mIds = new String[] { "STRONG" };
private static final String[] mEndTagEnders = new String[] {"BODY", "HTML"};

public String[] getIds() {
return mIds;
}

public String[] getEndTagEnders ()
{
return (mEndTagEnders);
}
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值