HTML过滤和补齐（一）

最新推荐文章于 2021-02-13 04:08:17 发布

magic_dreamer

最新推荐文章于 2021-02-13 04:08:17 发布

阅读量158

点赞数

CC 4.0 BY-SA版权

分类专栏： JAVA 文章标签： HTML Apache Security XML J#

本文链接：https://blog.csdn.net/magic_dreamer/article/details/83525469

JAVA 专栏收录该内容

309 篇文章

订阅专栏

主要使用了一个工具类（Util）来过滤HTML。
其中用到了alibaba的以下几个类：
import com.alibaba.common.lang.ObjectUtil;
import com.alibaba.common.lang.StringEscapeUtil;
import com.alibaba.common.lang.i18n.LocaleUtil;
import com.alibaba.common.lang.internal.Entities;
import com.alibaba.common.lang.StringUtil;

package com.megaeyes.ipcamera.service.util;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.codec.binary.Base64;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.PatternMatcherInput;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Substitution;
import org.apache.oro.text.regex.Util;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.alibaba.common.lang.ObjectUtil;
import com.alibaba.common.lang.StringEscapeUtil;
import com.alibaba.common.lang.i18n.LocaleUtil;
import com.alibaba.common.lang.internal.Entities;
import com.alibaba.common.lang.StringUtil;

public class TBStringUtil {
/** MD5 digest used by {@link #hash(String)}; assigned in the static initializer. */
private static MessageDigest mHasher;

/** Lower-case hexadecimal digits, indexed by nibble value (0-15). */
private static char[] digits = "0123456789abcdef".toCharArray();

/** Pattern for plain {@code http://} URLs; compiled in the static initializer. */
private static Pattern escapeURLsInHTMLPattern;

/**
 * Case-insensitive pattern for {@code http://} URLs on a taobao.com
 * sub-domain; compiled in the static initializer.
 */
private static Pattern escapeSpecialHTMLPattern;

// ---------------------------------------------------------------------
// Attribute whitelists handed to ElementRemover.acceptElement(...) by
// escapeSpecialHTML(String, boolean). Each array lists the attributes that
// are kept on the corresponding element(s).
// ---------------------------------------------------------------------

/** Attributes kept on generic inline/block elements (b, i, span, p, ...). */
private static String[] commonAttribute = {
    "style", "align", "valign", "class", "bgcolor", "background", "title"
};

/** Attributes kept on {@code div}; same as the common set minus "style". */
private static String[] divAttribute = {
    "align", "valign", "class", "bgcolor", "background", "title"
};

/** Attributes kept on {@code img}. */
private static String[] imgAttribute = {
    "style", "align", "valign", "class", "bgcolor", "background", "title",
    "src", "border", "width", "height", "alt", "usemap"
};

/** Attributes kept on {@code font}. */
private static String[] fontAttribute = {
    "style", "align", "valign", "class", "bgcolor", "background", "title",
    "color", "size", "face"
};

/**
 * Attributes kept on {@code table}.
 * NOTE(review): "blockquote" is an element name rather than an attribute;
 * it is preserved here exactly as found - confirm whether it is intended.
 */
private static String[] tableAttribute = {
    "style", "align", "valign", "class", "bgcolor", "background", "title",
    "border", "width", "height", "cellpadding", "cellspacing", "bordercolor",
    "blockquote"
};

/** Attributes kept on {@code tr} and {@code td}. */
private static String[] tdAttribute = {
    "style", "align", "valign", "class", "bgcolor", "background", "title",
    "width", "height", "colspan", "rowspan"
};

/** Attributes kept on {@code marquee}. */
private static String[] marqueeAttribute = {
    "style", "align", "valign", "class", "bgcolor", "background", "title",
    "scrollamount", "direction", "behavior", "width", "height", "scrolldelay"
};

/** Attributes kept on {@code a}. */
private static String[] aAttribute = {
    "style", "align", "valign", "class", "bgcolor", "background", "title",
    "target", "name", "href"
};

/** Attributes kept on {@code bgsound}. */
private static String[] bgsoundAttribute = { "src", "loop" };

/** Attributes kept on {@code map}. */
private static String[] mapAttribute = { "name" };

/** Attributes kept on {@code area}. */
private static String[] areaAttribute = { "href", "shape", "coords" };

/**
 * Tag names registered by the static initializer (img, br, input).
 * Presumably the tags that are written without a separate closing tag -
 * TODO confirm against the code that reads this set.
 */
private static Set INLINE_CLOSED_TAG = new HashSet();

// One-time class setup: fills INLINE_CLOSED_TAG, compiles the two URL
// patterns and obtains the MD5 digest. Failures are printed to stderr and
// leave the affected field(s) null rather than aborting class loading.
static {
    String[] tags = { "img", "br", "input" };
    for (int i = 0; i < tags.length; i++) {
        INLINE_CLOSED_TAG.add(tags[i]);
    }

    try {
        Perl5Compiler compiler = new Perl5Compiler();
        escapeURLsInHTMLPattern =
                compiler.compile("(http://[a-zA-Z0-9_/&=?\\.;]*)");
        escapeSpecialHTMLPattern =
                compiler.compile("^http://[a-z0-9]+\\.taobao\\.com.*$",
                        Perl5Compiler.CASE_INSENSITIVE_MASK);
    } catch (Exception e) {
        // If the first pattern fails to compile, the second is left null too.
        e.printStackTrace();
    }

    try {
        mHasher = MessageDigest.getInstance("MD5");
    } catch (NoSuchAlgorithmException nex) {
        mHasher = null;
        nex.printStackTrace();
    }
}

/**
 * Digests a string with the shared {@code mHasher} (MD5, see the static
 * initializer) and renders the result as lower-case hexadecimal, two
 * characters per digest byte.
 *
 * <p>Access to the shared digest is serialized by synchronizing on it.
 *
 * <p>NOTE(review): {@code str.getBytes()} uses the platform default
 * charset, so non-ASCII input can hash differently on different machines.
 *
 * @param str the text to hash; must not be {@code null}
 * @return the hexadecimal digest string
 * @throws NullPointerException if {@code str} is {@code null} or the digest
 *         could not be created during class initialization
 */
public static String hash(String str) {
    byte[] digest;
    synchronized (mHasher) {
        digest = mHasher.digest(str.getBytes());
    }

    char[] hex = new char[digest.length * 2];
    int pos = 0;
    for (int i = 0; i < digest.length; i++) {
        int unsigned = digest[i] & 0xFF; // strip sign extension
        hex[pos++] = digits[unsigned >>> 4];   // high nibble
        hex[pos++] = digits[unsigned & 0x0F];  // low nibble
    }

    return String.valueOf(hex);
}

/**
 * Converts a string so that it is suitable for HTML output.
 *
 * <p>Delegates to {@link #escapeEntities(Entities, String, Writer)} with the
 * {@code Entities.HTML40} table: characters that have an entity name are
 * written as entity references, line feeds become {@code <br/>} and
 * carriage returns are dropped.
 *
 * @param strInput the text to escape; may be {@code null}
 * @return the escaped text, the input itself when the helper reports no
 *         change, or an empty string when {@code strInput} is {@code null}
 */
public static String escapeHTML(String strInput) {
    if (strInput == null) {
        return "";
    }

    StringWriter buffer = new StringWriter(strInput.length());
    boolean changed;

    try {
        changed = escapeEntities(Entities.HTML40, strInput, buffer);
    } catch (IOException e) {
        // A StringWriter never actually raises this exception.
        return "";
    }

    return changed ? buffer.toString() : strInput;
}

/**
 * Writes {@code str} to {@code out}, converting some characters on the way:
 * <ul>
 * <li>a character for which {@code entities} knows an entity name is
 *     written as {@code &name;};</li>
 * <li>a line feed ({@code '\n'}) is written as {@code <br/>};</li>
 * <li>a carriage return ({@code '\r'}) is dropped;</li>
 * <li>every other character is copied unchanged.</li>
 * </ul>
 *
 * <p>The return value reports whether the text written to {@code out}
 * differs from {@code str}. Characters that are merely copied do not count
 * as a change, so callers can keep using the original string when this
 * method returns {@code false}.
 *
 * @param entities the entity table; must not be {@code null}
 * @param str the text to escape; may be {@code null}, in which case nothing
 *        is written
 * @param out the character sink; must not be {@code null}
 * @return {@code true} if at least one character was replaced or dropped,
 *         {@code false} if the text is unchanged (or {@code str} is
 *         {@code null})
 * @throws IllegalArgumentException if {@code entities} or {@code out} is
 *         {@code null}
 * @throws IOException if writing to {@code out} fails
 */
protected static boolean escapeEntities(Entities entities, String str,
        Writer out) throws IOException {
    if (entities == null) {
        throw new IllegalArgumentException("The Entities must not be null");
    }

    if (out == null) {
        throw new IllegalArgumentException("The Writer must not be null");
    }

    if (str == null) {
        return false;
    }

    boolean changed = false;

    for (int i = 0, len = str.length(); i < len; i++) {
        char ch = str.charAt(i);
        String entityName = entities.getEntityName(ch);

        if (entityName != null) {
            out.write('&');
            out.write(entityName);
            out.write(';');
            changed = true;
        } else if (ch == '\n') {
            // Line breaks are rendered as an HTML break element.
            out.write("<br/>");
            changed = true;
        } else if (ch == '\r') {
            // Carriage returns are dropped, which also alters the text.
            changed = true;
        } else {
            // Plain copy: must NOT mark the text as changed.
            out.write(ch);
        }
    }

    return changed;
}

/**
 * Compares two strings, treating blank values as equal to each other, so
 * that {@code ""} and {@code null} compare as equal. Extends the comparison
 * offered by {@code com.alibaba.common.lang.StringUtil}.
 *
 * <p>"Blank" is whatever {@code StringUtil.isBlank} reports; presumably that
 * also covers whitespace-only strings - confirm against that library.
 *
 * @param str1 the first string; may be {@code null}
 * @param str2 the second string; may be {@code null}
 * @return {@code true} if both strings are blank, otherwise the result of
 *         {@code StringUtil.equals(str1, str2)}
 */
public static boolean equals(String str1, String str2) {
    boolean bothBlank = StringUtil.isBlank(str1) && StringUtil.isBlank(str2);
    return bothBlank || StringUtil.equals(str1, str2);
}

/**
 * Removes special HTML markup and completes incomplete HTML, with checking
 * enabled. Shorthand for {@code escapeSpecialHTML(str, true)}.
 *
 * @param str the HTML before conversion
 * @return the HTML after conversion
 */
public static String escapeSpecialHTML(String str) {
    final boolean check = true;
    return escapeSpecialHTML(str, check);
}

/**
 * Removes HTML markup from a string.
 *
 * <p>The input is parsed as an HTML fragment through an
 * {@code ElementRemover} filter that accepts no elements. For that filter,
 * {@code acceptElement} names tags to keep and {@code removeElement} names
 * tags to delete together with their children; any other tag is stripped
 * while its content is kept. Here {@code script}, {@code style},
 * {@code head} and {@code select} are deleted with their content. The
 * parsed fragment is serialized with {@code getHTML(fragment, false)}.
 *
 * <p>This is best-effort: if parsing or serialization fails for any reason
 * the failure is ignored and the input is returned entity-escaped via
 * {@link #escapeHTML(String)} instead.
 *
 * @param str the HTML text; may be {@code null}
 * @return the text without markup, or an empty string for blank input
 */
public static String stripHTML(String str) {
    if (StringUtil.isBlank(str)) {
        return "";
    }

    try {
        // Tag filter: nothing is accepted, so every tag is stripped; the
        // tags listed below are removed including their children.
        ElementRemover remover = new ElementRemover();
        String[] removedTags = { "script", "style", "head", "select" };
        for (int i = 0; i < removedTags.length; i++) {
            remover.removeElement(removedTags[i]);
        }

        DOMFragmentParser parser = new DOMFragmentParser();
        parser.setProperty("http://cyberneko.org/html/properties/filters",
                new XMLDocumentFilter[] { remover });

        InputSource source = new InputSource(new StringReader(str));
        source.setEncoding("GBK");

        HTMLDocument document = new HTMLDocumentImpl();
        DocumentFragment fragment = document.createDocumentFragment();
        parser.parse(source, fragment);

        return getHTML(fragment, false).toString();
    } catch (Exception ignored) {
        // Deliberately ignored (covers IOException and SAXException too):
        // fall through to plain entity escaping below.
    }

    return escapeHTML(str);
}

/**
 * Removes special HTML markup and completes incomplete HTML.
 *
 * <p>The input is parsed as an HTML fragment through an
 * {@code ElementRemover} filter. For that filter, {@code acceptElement}
 * names tags to keep (with the listed attributes) and {@code removeElement}
 * names tags to delete together with their children; any other tag is
 * stripped while its content is kept. The parsed fragment is serialized with
 * {@code getHTML(fragment, check)}.
 *
 * <p>This is best-effort: if parsing or serialization fails for any reason
 * the failure is ignored and the input is returned entity-escaped via
 * {@link #escapeHTML(String)} instead.
 *
 * @param str the HTML before conversion; may be {@code null}
 * @param check {@code true} to verify that tables and links are legal,
 *        {@code false} to skip that verification. In this system
 *        {@code true} is used before an item is stored in the database when
 *        it is published, and {@code false} when the item is displayed.
 * @return the HTML after conversion, or an empty string for blank input
 */
public static String escapeSpecialHTML(String str, boolean check) {
    if (StringUtil.isBlank(str)) {
        return "";
    }

    try {
        ElementRemover remover = new ElementRemover();

        // Tags that keep only the common attribute set.
        String[] commonTags = {
            "b", "i", "u", "br", "hr", "sup", "sub", "strong", "em",
            "strike", "ol", "li", "ul", "h1", "h2", "h3", "h4", "h5",
            "span", "p", "caption", "blockquote", "cite"
        };
        for (int i = 0; i < commonTags.length; i++) {
            remover.acceptElement(commonTags[i], commonAttribute);
        }

        // Tags with their own attribute whitelist.
        remover.acceptElement("div", divAttribute);
        remover.acceptElement("a", aAttribute);
        remover.acceptElement("img", imgAttribute);
        remover.acceptElement("font", fontAttribute);
        remover.acceptElement("table", tableAttribute);
        remover.acceptElement("tr", tdAttribute);
        remover.acceptElement("td", tdAttribute);
        remover.acceptElement("bgsound", bgsoundAttribute);
        remover.acceptElement("map", mapAttribute);
        remover.acceptElement("area", areaAttribute);
        remover.acceptElement("marquee", marqueeAttribute);

        // Tags that are removed together with everything inside them.
        String[] removedTags = { "script", "style", "head", "select" };
        for (int i = 0; i < removedTags.length; i++) {
            remover.removeElement(removedTags[i]);
        }

        DOMFragmentParser parser = new DOMFragmentParser();
        parser.setProperty(
                "http://cyberneko.org/html/properties/default-encoding",
                "GBK");
        parser.setProperty("http://cyberneko.org/html/properties/filters",
                new XMLDocumentFilter[] { remover });

        InputSource source = new InputSource(new StringReader(str));
        source.setEncoding("GBK");

        HTMLDocument document = new HTMLDocumentImpl();
        DocumentFragment fragment = document.createDocumentFragment();
        parser.parse(source, fragment);

        return getHTML(fragment, check).toString();
    } catch (Exception ignored) {
        // Deliberately ignored (covers IOException and SAXException too):
        // fall through to plain entity escaping below.
    }

    return escapeHTML(str);
}