截取一定长度带有HTML标签的文件内容时,如果按普通文字用substring进行截取,当HTML标签被截断而没有完整闭合时,整个页面都可能因此而变形。利用HtmlParser可以很好地保留原HTML标签,并将被截断的标签补齐。
需要注意一点:<strong>***</strong>标签需要重新定义,因为现在strong标签的getEndTag()返回的是null,不知道这是不是htmlparser的一个bug。
package com.test.util;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Stack;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Tag;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.ParserException;
public class SubstringHTML {
private final String CONTENT;
private Parser parser;
private Stack<TagNode> nodeStack;
private int subLength;
private int textLength = 0;
private int pos = 0;
public SubstringHTML(String content) {
CONTENT = content;
parser = Parser.createParser(content, "GBK");
factory = new PrototypicalNodeFactory();
factory.registerTag(new StrongTag());
parser.setNodeFactory(factory);
nodeStack = new Stack<TagNode>();
}
private void recusive(NodeIterator iterator) throws ParserException {
while (iterator.hasMoreNodes()) {
Node node = iterator.nextNode();
if (node instanceof TagNode) {
TagNode tagNode = (TagNode)node;
Tag tag = tagNode.getEndTag();
if (tag != null) {
nodeStack.push(tagNode);
}
}
else if (node instanceof TextNode) {
if (node.getText().trim().length() == 0) {
continue;
}
if (node.getTagName.equals("SCRIPT") {
continue;
}
String nodeText = node.getText();
int tLen = nodeText.length();
if ((textLength < subLength) && ((textLength + tLen) > subLength)) {
pos = node.getStartPosition() + subLength - textLength;
textLength = subLength;
return;
}
else {
textLength += tLen;
pos = node.getEndPosition();
}
}
if (node.getChildren() == null) {
continue;
}
recusive(node.getChildren().elements());
if (subLength <= textLength) {
return;
}
}
}
public String subString(int length, String end) {
if (length >= CONTENT.length() || length <= 0) {
return CONTENT;
}
subLength = length;
try {
recusive(parser.elements());
} catch (ParserException e) {
System.out.println("parser error:" + e.getMessage());
return CONTENT;
}
int size = nodeStack.size();
StringBuffer buffer = new StringBuffer();
buffer.append(CONTENT.substring(0, pos));
while (size > 0) {
TagNode node = nodeStack.pop();
size--;
if (node.getEndTag().getEndPosition() <= pos || node.getTagBegin() >= pos) {
continue;
}
buffer.append("</");
buffer.append(node.getTagName());
buffer.append(">");
}
buffer.append(end);
return buffer.toString();
}
private static String getContent() {
byte[] con = null;
InputStream in = SubstringHTML.class.getResourceAsStream("content.txt");
try {
int length = in.available();
con = new byte[length];
in.read(con, 0, length);
} catch (IOException e) {
e.printStackTrace();
}
try {
return new String(con, "GBK");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
return "";
}
}
public static void main(String[] args) {
String content = getContent();
SubstringHTML app = new SubstringHTML(content);
String str = app.subString(200, "");
System.out.println(str);
}
}
/**
 * Composite tag definition for {@code STRONG}, meant to be registered with the
 * parser's node factory so that STRONG elements are built as composite tags.
 *
 * <p>NOTE(review): {@code getIds()} and {@code getEndTagEnders()} are expected
 * to override the hooks of the same name in HTMLParser's tag classes -- confirm
 * against the library version in use.
 */
class StrongTag extends CompositeTag {

    private static final long serialVersionUID = 1L;

    /** Tag names this class is registered for. */
    private static final String[] TAG_IDS = { "STRONG" };

    /** End-tag names reported by {@link #getEndTagEnders()}. */
    private static final String[] END_TAG_ENDERS = { "BODY", "HTML" };

    /**
     * @return the tag names handled by this class, i.e. {@code STRONG}
     */
    public String[] getIds() {
        return TAG_IDS;
    }

    /**
     * @return the end-tag names {@code BODY} and {@code HTML}
     */
    public String[] getEndTagEnders() {
        return END_TAG_ENDERS;
    }
}