1:字符编码是怎么回事?
举个例子:一张图片,它可以保存为多种格式:jpg、png、bmp、gif等等。虽然数据格式不同,但它们传递的信息是一样的。同理,字符串(string)是信息,而字符编码就是数据格式。
信息是抽象的,而用来记录信息的数据是具体的。比如你脑子里有个想法,这个想法就是个抽象的东西。虽然它的的确确是存在的,但如果你不把它表达出来,那它事实上等于没有。怎么表达呢?说话或写字,可以用汉语、英语、日语……
在C#中,string就是信息,是抽象的。string本身是没有编码的,但string对应的byte[]数组必须有编码。这就是信息-数据的特点:信息是抽象的,而数据必须有数据格式。你在程序中有一个string,你想把这个string输出到文件,怎么办呢?请看.net字符串输出函数:
byte[] Encoding.GetEncoding(string charset).GetBytes(string text)
text是信息,charset可以把这个信息保存为不同格式的数据。
反过来:
string Encoding.GetEncoding(string Charset).GetString(byte[] data)
可以根据特定格式数据获得信息
所以:字符编码就是string转byte[]时所用的数据格式。"字符(string)的编码"是个错误概念。"字符数据(byte[])的编码"才是正确的。
再回到图片,不同的图片保存方式有不同的特点。jpg压缩比高省空间,bmp质量最高,png各方面性能较为均衡而且支持透明色。string对应的不同编码格式也各有优缺点。如果要表达汉字,一般情况下gb2312最高效。gbk比gb2312能表达更多汉字。而如果你需要表达多国语言,那么需要采用unicode系列如utf-8。
2:网页文件
很多朋友在下载到网页文件后(html、css或js等),发现里面是乱码。乱码的主要原因就是在由byte[]转string过程中,编码格式不对。咋不对呢?就是string转byte[]用的是A编码,可你在反转时用的却是编码B或C。
下面代码是我项目中的一个类,这个项目是把网页保存成mht格式文件。这个类用来下载网页资源,它里面包含了对编码的判定。需要说明的是:html 文件里面一般都包含编码信息,而css或js文件一般没有。少数情况下下载函数可以返回编码类型,更多情况需要自己判定css或js文件的编码类型。怎么判定呢?utf-8可以通过0xef 0xbb 0xbf头进行判定。其它的就很难判定了,对于不能判定情况,我采用html文件的编码。
using System;
using System.Collections.Generic;
using System.Windows.Forms;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Net.Cache;
using System.Text;
using System.Text.RegularExpressions;
namespace HtmlShow
{
public class DownloadFile
{
string url;

/// <summary>
/// Path (URL) of the resource.
/// </summary>
public string Url
{
    get
    {
        return this.url;
    }
    set
    {
        this.url = value;
    }
}
string originalCharset;

/// <summary>
/// Name of the original character encoding of the resource.
/// </summary>
public string OriginalCharset
{
    get
    {
        return this.originalCharset;
    }
    set
    {
        this.originalCharset = value;
    }
}
string charset;

/// <summary>
/// Character encoding name. Returns the explicitly assigned value when there
/// is one, otherwise the original character encoding name.
/// </summary>
public string CharSet
{
    get
    {
        return this.charset ?? this.originalCharset;
    }
    set
    {
        this.charset = value;
    }
}
string type;

/// <summary>
/// MIME type of the resource (read-only).
/// </summary>
public string ContentType
{
    get
    {
        return this.type;
    }
}
EncodeType encode = EncodeType.none;

/// <summary>
/// Data (transfer) encoding applied to the bytes. Defaults to
/// <see cref="EncodeType.none"/>.
/// </summary>
public EncodeType Encode
{
    get
    {
        return this.encode;
    }
    set
    {
        this.encode = value;
    }
}
/// <summary>
/// The downloaded bytes decoded to text.
/// </summary>
/// <remarks>
/// The encoding is chosen in this order:
/// 1. the already known original charset, when it names a usable encoding;
/// 2. a charset declared in a &lt;meta&gt; tag of the document;
/// 3. UTF-8, when the bytes start with the EF BB BF byte order mark;
/// 4. the system default encoding.
/// Steps 2-4 also record the detected name as the original charset.
/// Returns an empty string when there are no bytes to decode.
/// </remarks>
public string Content
{
    get
    {
        // Nothing downloaded (or an empty body): there is nothing to decode,
        // and indexing into the buffer below would throw.
        if (data == null || data.Length == 0)
        {
            return string.Empty;
        }

        Encoding declared = LookupEncoding(originalCharset);
        if (declared != null)
        {
            return AdjustCharSet(declared.GetString(data));
        }

        // Decode once with the default encoding just to be able to search the markup.
        string sniffed = Encoding.Default.GetString(data);

        // Accepts both charset=gb2312 inside a content attribute and the
        // quoted form charset="utf-8"; the name stops at quotes, whitespace,
        // ';', '/' and tag delimiters.
        Regex metaCharset = new Regex("<meta[^<>]+?charset\\s*=\\s*[\"']?(?<name>[^\"'\\s;/<>]+)", RegexOptions.IgnoreCase);
        Match meta = metaCharset.Match(sniffed);
        if (meta.Success)
        {
            string name = meta.Groups["name"].Value;
            Encoding fromMeta = LookupEncoding(name);
            if (fromMeta != null)
            {
                originalCharset = name;
                return AdjustCharSet(fromMeta.GetString(data));
            }
        }

        // UTF-8 byte order mark.
        if (data.Length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
        {
            originalCharset = "utf-8";
            return AdjustCharSet(Encoding.UTF8.GetString(data));
        }

        originalCharset = Encoding.Default.WebName;
        return sniffed;
    }
}

/// <summary>
/// Resolves an encoding by name without throwing.
/// </summary>
/// <param name="name">Encoding name; may be null or empty.</param>
/// <returns>The encoding, or null when the name is missing or not recognised.</returns>
private static Encoding LookupEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        // Not a valid code page name.
        return null;
    }
    catch (NotSupportedException)
    {
        // Known name, but not supported on this platform.
        return null;
    }
}
byte[] data;

/// <summary>
/// Binary form of the resource, shaped by <see cref="Encode"/>.
/// </summary>
/// <remarks>
/// For <see cref="EncodeType.base64"/> the raw downloaded bytes are returned
/// as base64 text with a CRLF line break every 76 characters.
/// For every other value the text content is re-encoded with the effective
/// charset; for <see cref="EncodeType.none"/> and <see cref="EncodeType.bit8"/>
/// UTF-8 output additionally gets a byte order mark if it does not have one.
/// </remarks>
public byte[] Data
{
    get
    {
        switch (encode)
        {
            case EncodeType.base64:
                if (this.data == null)
                {
                    return new byte[0];
                }
                // The framework inserts "\r\n" after every 76 characters and
                // never drops a tail or appends a trailing line break.
                string wrapped = Convert.ToBase64String(this.data, Base64FormattingOptions.InsertLineBreaks);
                // Base64 text and CRLF are pure ASCII.
                return Encoding.ASCII.GetBytes(wrapped);
            case EncodeType.none:
            case EncodeType.bit8:
                return EncodeContent(true);
            default:
                return EncodeContent(false);
        }
    }
}

/// <summary>
/// Encodes the text content with the effective charset.
/// </summary>
/// <param name="withUtf8Bom">When true and the charset is utf-8, makes sure the result starts with EF BB BF.</param>
/// <returns>The encoded bytes.</returns>
private byte[] EncodeContent(bool withUtf8Bom)
{
    // Read Content before CharSet: decoding the content may detect and
    // record the original charset that CharSet falls back to.
    string text = Content;
    string name = CharSet;

    // With no charset known at all, fall back to the system default
    // rather than passing null to Encoding.GetEncoding.
    Encoding target = name == null ? Encoding.Default : Encoding.GetEncoding(name);
    byte[] encoded = target.GetBytes(text);

    if (!withUtf8Bom || !string.Equals(name, "utf-8", StringComparison.OrdinalIgnoreCase))
    {
        return encoded;
    }

    // Compare all three mark bytes; other text can also encode to a leading 0xEF.
    bool hasBom = encoded.Length >= 3 &&
                  encoded[0] == 0xEF && encoded[1] == 0xBB && encoded[2] == 0xBF;
    if (hasBom)
    {
        return encoded;
    }

    byte[] result = new byte[encoded.Length + 3];
    result[0] = 0xEF;
    result[1] = 0xBB;
    result[2] = 0xBF;
    Buffer.BlockCopy(encoded, 0, result, 3, encoded.Length);
    return result;
}
/// <summary>
/// Creates a download descriptor for the given resource path.
/// </summary>
/// <param name="url">Path (URL) of the resource.</param>
public DownloadFile(string url)
{
    Url = url;
}
/// <summary>
/// Downloads the resource at <see cref="Url"/> into memory.
/// </summary>
/// <remarks>
/// Redirects are followed manually (up to a fixed limit) with a shared cookie
/// container; after a redirect <see cref="Url"/> is updated to the final
/// address. On success the MIME type, the charset declared in the
/// Content-Type header (if any) and the decompressed body bytes are stored.
/// </remarks>
/// <returns>true when the body was downloaded; false on any failure.</returns>
public bool Download()
{
    const int MaxRedirects = 10;

    HttpWebResponse response = null;
    Stream body = null;
    try
    {
        CookieContainer cookies = new CookieContainer();
        int redirects = 0;

        while (true)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
            request.Timeout = 16000;
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR " + Environment.Version.ToString() + ")";
            request.Accept = "*/*";
            request.KeepAlive = true;
            request.Method = "GET";
            request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
            request.Headers.Add("Accept-Encoding", "gzip, deflate");
            request.ContentType = "application/x-www-form-urlencoded";
            // Redirects are handled below so cookies and the final URL can be tracked.
            request.AllowAutoRedirect = false;
            request.CachePolicy = new RequestCachePolicy(RequestCacheLevel.CacheIfAvailable);
            request.Credentials = CredentialCache.DefaultCredentials;
            request.CookieContainer = cookies;

            response = (HttpWebResponse)request.GetResponse();
            if (!IsRedirect(response.StatusCode))
            {
                break;
            }

            string location = response.Headers["Location"];
            redirects++;
            // A redirect without a target, or a redirect loop, cannot be followed.
            if (string.IsNullOrEmpty(location) || redirects > MaxRedirects)
            {
                return false;
            }

            cookies.Add(response.Cookies);
            // Location may be relative; resolve it against the address just requested.
            Url = new Uri(response.ResponseUri, location).AbsoluteUri;

            // Release the connection of the redirect response before the next request.
            response.Close();
            response = null;
        }

        if (redirects > 0)
        {
            this.Url = response.ResponseUri.OriginalString;
        }

        type = response.ContentType;
        Match declared = Regex.Match(type ?? string.Empty, "charset=(?<set>[^\"\\s;]+)", RegexOptions.IgnoreCase);
        if (declared.Success)
        {
            originalCharset = declared.Groups["set"].Value;
        }

        body = Decompress(response);
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[1024];
            int read;
            while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            // ToArray copies exactly the bytes written. GetBuffer would expose
            // the whole internal buffer, including unused trailing zero bytes.
            data = buffer.ToArray();
        }
        return true;
    }
    catch (Exception)
    {
        // Best effort by design: any failure is reported through the return value.
        return false;
    }
    finally
    {
        if (body != null)
        {
            body.Close();
        }
        if (response != null)
        {
            response.Close();
        }
    }
}

/// <summary>
/// Tells whether a status code is a redirect that carries a Location header to follow.
/// </summary>
private static bool IsRedirect(HttpStatusCode status)
{
    return status == HttpStatusCode.MovedPermanently   // 301 (same value as Moved)
        || status == HttpStatusCode.Redirect           // 302
        || status == HttpStatusCode.RedirectMethod     // 303
        || status == HttpStatusCode.TemporaryRedirect; // 307
}
/// <summary>
/// Works around documents whose title tag comes before the charset
/// declaration: moves the first &lt;meta ... charset=...&gt; tag so that it
/// directly follows the opening &lt;head&gt; tag.
/// </summary>
/// <param name="text">Document markup.</param>
/// <returns>
/// The adjusted markup, or <paramref name="text"/> unchanged when it has no
/// charset meta tag or no head tag.
/// </returns>
private string AdjustCharSet(string text)
{
    // The whole tag is matched with [^<>] so the match can never run across
    // into following tags.
    Regex metaPattern = new Regex("<meta[^<>]+?charset=[^<>]*>", RegexOptions.IgnoreCase);
    // Requires whitespace or '>' right after "head" so that <header> is not
    // mistaken for the document head.
    Regex headPattern = new Regex("<head(\\s[^>]*)?>", RegexOptions.IgnoreCase);

    Match meta = metaPattern.Match(text);
    if (!meta.Success)
    {
        return text;
    }

    string withoutMeta = text.Replace(meta.Value, "");
    Match head = headPattern.Match(withoutMeta);
    if (!head.Success)
    {
        return text;
    }

    // Plain insertion after the first head tag only: no other tag is touched
    // and '$' in the markup is not treated as a regex substitution.
    return withoutMeta.Insert(head.Index + head.Length, meta.Value);
}
/// <summary>
/// Converts a byte array to its base64 text representation (no line breaks).
/// </summary>
/// <param name="binBuffer">Bytes to encode.</param>
/// <returns>The base64 string; empty for an empty array.</returns>
/// <exception cref="ArgumentNullException"><paramref name="binBuffer"/> is null.</exception>
public static string toBase64(byte[] binBuffer)
{
    // The framework sizes the output itself; no manual buffer arithmetic needed.
    return Convert.ToBase64String(binBuffer);
}
/// <summary>
/// Returns a readable stream over the response body, decompressing it first
/// when the Content-Encoding is gzip or deflate.
/// </summary>
/// <param name="HWResp">Response whose body is to be read.</param>
/// <returns>
/// For gzip/deflate: an in-memory stream holding the decompressed body,
/// positioned at the start (the response stream is closed).
/// Otherwise: the response stream itself.
/// </returns>
public Stream Decompress(HttpWebResponse HWResp)
{
    Stream raw = HWResp.GetResponseStream();
    string contentEncoding = HWResp.ContentEncoding;

    if (string.Equals(contentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
    {
        return ReadToMemory(new GZipStream(raw, CompressionMode.Decompress));
    }
    if (string.Equals(contentEncoding, "deflate", StringComparison.OrdinalIgnoreCase))
    {
        return ReadToMemory(new DeflateStream(raw, CompressionMode.Decompress));
    }
    return raw;
}

/// <summary>
/// Reads a stream to its end into memory, closes it, and returns the copy rewound to the start.
/// </summary>
private static MemoryStream ReadToMemory(Stream source)
{
    MemoryStream copy = new MemoryStream();
    using (source)
    {
        byte[] chunk = new byte[40960];
        int read;
        while ((read = source.Read(chunk, 0, chunk.Length)) != 0)
        {
            copy.Write(chunk, 0, read);
        }
    }
    copy.Seek(0L, SeekOrigin.Begin);
    return copy;
}
}
/// <summary>
/// Re-encoding applied to the data of a downloaded resource.
/// </summary>
public enum EncodeType
{
    /// <summary>8-bit data; handled the same way as <see cref="none"/> when the bytes are produced.</summary>
    bit8 = 0,

    /// <summary>Base64 text.</summary>
    base64 = 1,

    /// <summary>No re-encoding.</summary>
    none = 2
}
}
注:EncodeType是对数据进行再编码,为了便于网络传输,和今天讨论主题无关。
最后:
较真地说:C#中的string其实也是有编码的,因为绝对抽象的东西是不存在的。就像你心里的话,在没有从嘴里说出或写到纸上之前,它是由你的脑细胞、脑电波构成的……
C#中string虽然对你抽象了,但.NET运行时(CLR)还是要把它保存为具体数据格式的。C#中string在内存中采用的是unicode(utf-16)编码。string的这种内部编码仅对运行时有用,对用户(程序员)透明。
如何判断css和js文件编码,我至今没有一个完美的解决方案(不知道网络浏览器是怎么判定的),大家有好办法欢迎补充。