HTML输入流编码探测

最新推荐文章于 2021-07-05 16:22:11 发布

原创最新推荐文章于 2021-07-05 16:22:11 发布 · 1.9k 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#html #encoding #string #文档 #byte #input

JAVA 专栏收录该内容

17 篇文章

订阅专栏

本文介绍了一种从HTML输入流中解析文档编码的方法。通过分析HTML头部的meta标记来确定文档编码，确保正确解析文档内容。

博学，切问，近思--詹子知(http://blog.csdn.net/zhiqiangzhan)

/*
 * Analysing an HTML input stream requires knowing the document's character encoding first;
 * decoding with the wrong one produces garbled text. Many web servers do not declare the
 * encoding in their response headers, so the stream itself has to be inspected. One criterion
 * is the charset given in the document's <meta> tag: most documents provide it, which makes
 * checking it a simple and very effective approach. The catch is that reading the tag requires
 * character content while the encoding is still unknown. Because the <meta> tag sits in the
 * document head, it is enough to read the first bytes of the stream, decode them into a
 * string, and extract the encoding of the whole document from that string.
 *
 * ISO-8859-1 is used as the default encoding, largely because it is lossless: if the real
 * encoding cannot be determined, a string decoded as ISO-8859-1 can be encoded back with
 * ISO-8859-1 to recover the original bytes, which can then be decoded again once the correct
 * encoding is known.
 */
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Decodes an HTML byte stream into text after working out its character encoding.
 *
 * <p>The encoding is taken, in order of preference, from a byte-order mark, from the
 * alternating-zero-byte pattern of BOM-less UTF-16, from a {@code charset} declared in a
 * {@code <meta>} tag within the first {@value #BLOCK_SIZE} bytes, and finally from the
 * ISO-8859-1 default.
 */
public class HtmlInputStreamDecoder {
    private static final Logger LOGGER = Logger.getLogger(HtmlInputStreamDecoder.class.getName());

    /**
     * Matches both {@code <meta http-equiv=... content="text/html; charset=X">} and the HTML5
     * form {@code <meta charset="X">}; group 1 is the charset name without quotes.
     */
    private static final String ENCODING_TAG =
            "<meta\\b[^>]*?\\bcharset\\s*=\\s*[\"']?\\s*([^\\s\"'/>;]+)";

    /** Matches a whole element (open tag, content, close tag) that carries no readable text. */
    private static final String IGNORE_TAG_REGEX =
            "<(head|script|style|iframe|comment)\\b[^>]*>.*?</\\1>";

    private static final String DEFAULT_ENCODING = "iso8859-1";

    private static final Pattern ENCODING_PATTERN =
            Pattern.compile(ENCODING_TAG, Pattern.CASE_INSENSITIVE);

    private static final Pattern IGNORE_PATTERN = Pattern.compile(
            IGNORE_TAG_REGEX, Pattern.DOTALL | Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);

    private static final Pattern EMPTY_LINE =
            Pattern.compile("\n^\\s*$", Pattern.MULTILINE | Pattern.UNIX_LINES);

    /** Read-buffer size, and the number of leading bytes inspected for the encoding. */
    private static final int BLOCK_SIZE = 1024;

    /** Byte-order mark as it appears after decoding. */
    private static final char BYTE_ORDER_MARK = (char) 0xFEFF;

    /**
     * Reads the stream to its end, detects the encoding and returns the decoded text with
     * blank lines removed. The stream is not closed.
     *
     * @param is the HTML byte stream
     * @return the decoded document, or an empty string when the stream has no content
     * @throws IOException if reading fails or the resolved encoding cannot be used
     */
    public String decode(InputStream is) throws IOException {
        // The whole document is buffered before decoding: decoding fixed-size chunks one by
        // one would corrupt any multi-byte character that straddles a chunk boundary.
        byte[] content = readFully(is);
        if (content.length == 0) {
            return "";
        }

        String encoding = resolveEncoding(content, Math.min(content.length, BLOCK_SIZE));
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Current encoding is " + encoding);
        }

        String text = new String(content, encoding);
        if (text.length() > 0 && text.charAt(0) == BYTE_ORDER_MARK) {
            // The mark is encoding metadata, not document content.
            text = text.substring(1);
        }
        return stripEmptyLine(text);
    }

    /**
     * Removes irrelevant elements such as script, style and head, including their content.
     *
     * @param input HTML text
     * @return the text without those elements
     */
    public String stripIngoreTag(String input) {
        return IGNORE_PATTERN.matcher(input).replaceAll("");
    }

    /**
     * Removes redundant blank lines.
     *
     * @param input text to clean
     * @return the text without lines that are empty or contain only whitespace
     */
    protected String stripEmptyLine(String input) {
        return EMPTY_LINE.matcher(input).replaceAll("");
    }

    /**
     * Resolves the encoding from the first {@code len} bytes of {@code bytes}.
     *
     * @param bytes buffer holding the start of the document
     * @param len number of valid bytes in {@code bytes}; values outside the buffer are clamped
     * @return the detected charset name, or the ISO-8859-1 default when nothing usable is found
     * @throws UnsupportedEncodingException if the default encoding is unavailable
     */
    protected String resolveEncoding(byte[] bytes, int len) throws UnsupportedEncodingException {
        int usable = Math.max(0, Math.min(len, bytes.length));
        // Trim to the bytes actually read so stale buffer content never takes part in detection.
        byte[] head = (usable == bytes.length) ? bytes : Arrays.copyOf(bytes, usable);

        String encoding = resolveEncoding(head);
        if (encoding != null) {
            return encoding;
        }

        String detector = new String(head, DEFAULT_ENCODING);
        Matcher m = ENCODING_PATTERN.matcher(detector);
        while (m.find()) {
            String declared = m.group(1);
            // A charset this JVM cannot decode is skipped rather than failing the whole decode.
            if (isSupportedCharset(declared)) {
                return declared;
            }
        }
        return DEFAULT_ENCODING;
    }

    /**
     * Detects an encoding from the leading bytes alone: a UTF-16 or UTF-8 byte-order mark, or
     * the alternating zero bytes of BOM-less UTF-16 holding ASCII-range characters.
     *
     * @param rawBytes the start of the document; every byte of the array is considered valid
     * @return {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"}, or {@code null} when the
     *         bytes give no indication
     */
    public String resolveEncoding(byte[] rawBytes) {
        if (rawBytes == null || rawBytes.length < 2) {
            return null;
        }
        int b0 = rawBytes[0] & 0xFF;
        int b1 = rawBytes[1] & 0xFF;

        if (b0 == 0xFE && b1 == 0xFF) {
            return "UTF-16BE";
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            return "UTF-16LE";
        }
        if (rawBytes.length >= 3 && b0 == 0xEF && b1 == 0xBB && (rawBytes[2] & 0xFF) == 0xBF) {
            return "UTF-8";
        }

        if (rawBytes.length >= 4) {
            int b2 = rawBytes[2] & 0xFF;
            int b3 = rawBytes[3] & 0xFF;
            if (b0 != 0 && b1 == 0 && b2 != 0 && b3 == 0) {
                return "UTF-16LE";
            }
            if (b0 == 0 && b1 != 0 && b2 == 0 && b3 != 0) {
                return "UTF-16BE";
            }
        }
        return null;
    }

    /** Reads every remaining byte of the stream. */
    private static byte[] readFully(InputStream is) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] block = new byte[BLOCK_SIZE];
        int len;
        while ((len = is.read(block)) != -1) {
            buffer.write(block, 0, len);
        }
        return buffer.toByteArray();
    }

    /** Tells whether this JVM can decode the named charset; illegal names count as unsupported. */
    private static boolean isSupportedCharset(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            return false;
        }
    }

    public static void main(String[] args) throws IOException {
        HtmlInputStreamDecoder decoder = new HtmlInputStreamDecoder();
        InputStream in = new URL("http://www.csdn.com/").openStream();
        try {
            String text = decoder.decode(in);
            System.out.println(text);
            System.out.println(decoder.stripIngoreTag(text));
        } finally {
            in.close();
        }
    }
}