/**//* * EncodeGoter.java * * Created on 2007年9月30日, 下午4:49 * * To change this template, choose Tools | Template Manager * and open the template in the editor. */package com.ckcs.url;import java.io.BufferedInputStream;import java.io.ByteArrayOutputStream;import java.io.InputStream;import java.net.URL;import java.net.URLConnection;import java.nio.charset.Charset;import java.util.regex.Matcher;import java.util.regex.Pattern;/** *//** * * @author admin */public class EncodeGoter ...{ /** *//** * Creates a new instance of EncodeGoter */ public EncodeGoter() ...{ } /** *//** * 获得页面的字符编码 */ private String getEncode(String size) throws Exception...{ URL url = new URL(size); String charset = null; Pattern pattern = Pattern.compile("charset.*=.*>?", Pattern.CASE_INSENSITIVE); URLConnection con = url.openConnection(); String contentType = con.getContentType(); //先尝试从http响应头获取字符编码 charset = doGetEncode(pattern, contentType); if(charset == null) ...{ //如果得不到,尝试从页面的元数据信息上获取 InputStream is = url.openStream(); BufferedInputStream bis = new BufferedInputStream(is); ByteArrayOutputStream bos = new ByteArrayOutputStream(); int count = 0; byte[] bytes = new byte[1024]; while((count = bis.read(bytes)) != -1) ...{ //每次读1024把字符截断了怎么办 bos.write(bytes, 0, count); bos.flush(); charset = doGetEncode(pattern, bos.toString()); if(charset != null) ...{ //找到编码 break; } bos.reset(); } } return charset; } /** *//** * 读取页面数据匹配模式 */ private String doGetEncode(Pattern pattern, String str) throws Exception...{ Matcher matcher = null; String matchStr = null; String charset = null; matcher = pattern.matcher(str); if(matcher.find()) ...{ //找到第一个符合要求的 matchStr = matcher.group(); //截取希望处理的字符串,替换可能的特殊符号 charset = matchStr.substring(matchStr.indexOf("=") + 1).replaceAll("["|/|/|/s].*[/>|>]", ""); } return charset; } public static void main(String[] args) throws Exception ...{ EncodeGoter eg = new EncodeGoter();// eg.getEncode("http://java.sun.com");// eg.getEncode("http://www.sun.com");// eg.getEncode("http://www.youkuaiyun.com");// eg.getEncode("http://www.dmoz.org/");// eg.getEncode("http://www.baidu.com/search/image_recommend.html"); String charset = eg.getEncode("http://java.sun.com"); if (charset != null) ...{ System.out.println("页面的字符编码应该为:" + charset); } else ...{ charset = Charset.defaultCharset().toString(); //使用默认编码 System.out.println("找不到页面字符编码,平台默认编码为:" + charset); } }}