1, 基本概念,几种编码方式
1,unicode: 是一个字符集; utf16, UCS-2编码,little endian格式
2,Unicode big endian编码
3,utf8: unicode的一种,变长编码
4,ansi: 本地编码
英文-ascii ,英文字符和二进制之间的关系,包含128个字符;128--255扩展字符,
简体中文- gb2312,DBCS编码,一个汉字占两个字节(相当于两个英文字符), 是对ascii的中文扩展: 一个小于127的字符的意义与原来相同,但两个大于127的字符连在一起时,就表示一个汉字;数字等符号既在127以内存在,也在127以外存在: <127的是半角字符, >127的是全角字符; 字符集范围: GB2312 < GBK < GB18030;
繁体中文-big5
2,字符串长度: "abc汉字" unicode: 5个字符,一个字符两个字节; dbcs: 7个字节,每个汉字占两个字节(相当于两个英文字符)
3, 实现,如果有多国语言,则用unicode来显示。工程内部用utf8来传递
// Demo: when text in several languages has to be shown, keep it as a wide
// (Unicode) string and display it with the wide-character API.
// NOTE(review): WideStrToString / StringToWideStr are not defined in the part
// of the file visible here; presumably they wrap WideCharToMultiByte /
// MultiByteToWideChar with the given code page - confirm.
wchar_t test[120] = L"مرحباً 你好にac字ㄽㄾ";
::MessageBoxW(NULL, test, L"test", MB_OK);
// Round trip through the local (ANSI) code page: data is lost.
string acpStr = WideStrToString(test, CP_ACP);
wstring acpWstr = StringToWideStr(acpStr.c_str(), CP_ACP);
::MessageBoxW(NULL, acpWstr.c_str(), L"acpWstr", MB_OK);
// Round trip through UTF-8: no data is lost.
string utf8Str = WideStrToString(test, CP_UTF8);
wstring utf8Wstr = StringToWideStr(utf8Str.c_str(), CP_UTF8);
::MessageBoxW(NULL, utf8Wstr.c_str(), L"utf8Wstr", MB_OK);
#ifndef __K_UTILITY_H__
#define __K_UTILITY_H__
/************************************************************************/
/*
编码转换
utf8 - gb2312
在unix平台中可以使用iconv来做转换
在windows平台可以用MultiByteToWideChar/WideCharToMultiByte 函数.
char - wchar_t
使用CRT库的mbstowcs()函数和wcstombs()函数,平台无关,需设定locale
*/
/************************************************************************/
#include "OPPOS.h"
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#ifndef OPP_WIN32
#include <iconv.h>
#endif
#include <string>
using namespace std;
inline string IntToString( int nVal );
inline string Int64ToString( __int64 liVal );
inline string FloatToString( float fVal );
inline int StringToInt( const char * pVal );
inline __int64 StringToInt64( const char * pVal );
inline float StringToFloat( const char* pVal );
inline int WideStrToInt( const wchar_t * pVal );
inline __int64 WideStrToInt64( const wchar_t * pVal );
inline float WideStrToFloat( const wchar_t * pVal );
inline wstring IntToWideStr( int nVal );
inline wstring Int64ToWideStr( __int64 lnVal );
inline wstring FloatToWideStr( float fVal );
inline string ws2s(const wstring& ws);
inline wstring s2ws(const string& s);
//GB2312 转为 UTF-8
inline void GB2312ToUTF_8(string& pOut,char *pText, int pLen);
//UTF-8 转为 GB2312
inline void UTF_8ToGB2312(string &pOut, char *pText, int pLen);
#ifndef OPP_WIN32
//代码转换:从一种编码转为另一种编码
inline int code_convert(char *from_charset,char *to_charset,char *inbuf,int inlen,char *outbuf,int outlen);
//UNICODE码转为GB2312码
inline int u2g(char *inbuf,int inlen,char *outbuf,int outlen);
//GB2312码转为UNICODE码
inline int g2u(char *inbuf,size_t inlen,char *outbuf,size_t outlen);
#else
// 把UTF-8转换成Unicode
inline void UTF_8ToUnicode(WCHAR* pOut,char *pText);
// Unicode 转换成UTF-8
inline void UnicodeToUTF_8(char* pOut,WCHAR* pText);
// 把Unicode 转换成 GB2312
inline void UnicodeToGB2312(char* pOut,unsigned short uData);
// GB2312 转换成 Unicode
inline void Gb2312ToUnicode(WCHAR* pOut,char *gbBuffer);
//GB2312 转为 UTF-8
inline void GB2312ToUTF_8ByWin(string& pOut,char *pText, int pLen);
//UTF-8 转为 GB2312
inline void UTF_8ToGB2312ByWin(string &pOut, char *pText, int pLen);
inline char* UTF8ToString(const char* src, char* dest, int dest_size);
#endif
//
// Renders a signed integer as base-10 text.
//   nVal    : value to format
//   returns : the decimal representation of nVal
inline string IntToString( int nVal )
{
    // 16 chars are enough for a 32-bit int: "-2147483648" is 11 characters
    // plus the terminating NUL.
    char szDigits[16] = {'\0'};
    _itoa_s(nVal, szDigits, sizeof(szDigits), 10);
    string strResult(szDigits);
    return strResult;
}
// Renders a signed 64-bit integer as base-10 text.
//   liVal   : value to format
//   returns : the decimal representation of liVal
inline string Int64ToString( __int64 liVal )
{
    // A 64-bit value needs at most 20 characters (sign included) plus the
    // terminating NUL, so 32 chars leave ample room.
    char szDigits[32] = {'\0'};
    _i64toa_s(liVal, szDigits, sizeof(szDigits), 10);
    string strResult(szDigits);
    return strResult;
}
// Formats a float in fixed notation with six decimals (printf "%f").
//   fVal    : value to format
//   returns : the formatted text, or an empty string if formatting fails
inline std::string FloatToString( float fVal )
{
    // "%f" prints every integral digit, so -FLT_MAX needs 1 + 39 + 1 + 6 = 47
    // characters plus the terminator. The former 32-byte buffer was too small
    // for such values; 64 bytes cover the whole float range.
    char Buf[64] = {'\0'};
    const int nWritten = snprintf(Buf, sizeof(Buf), "%f", fVal);
    if (nWritten < 0 || nWritten >= static_cast<int>(sizeof(Buf)))
    {
        return std::string();
    }
    return std::string(Buf, static_cast<size_t>(nWritten));
}
// Parses the leading decimal integer of a narrow C string via atoi.
//   pVal    : NUL-terminated text; must not be null (checked by assert only)
//   returns : the parsed value as produced by atoi
inline int StringToInt( const char * pVal )
{
    // A null pointer is a caller error; trap it in debug builds.
    assert(pVal != NULL);
    const int nParsed = ::atoi(pVal);
    return nParsed;
}
// Parses the leading decimal integer of a narrow C string as a 64-bit value
// via _atoi64.
//   pVal    : NUL-terminated text; must not be null (checked by assert only)
//   returns : the parsed value as produced by _atoi64
inline __int64 StringToInt64( const char * pVal )
{
    // A null pointer is a caller error; trap it in debug builds.
    assert(pVal != NULL);
    const __int64 llParsed = ::_atoi64(pVal);
    return llParsed;
}
// Parses a narrow C string as a floating-point number via atof and narrows
// the double result to float.
//   pVal    : NUL-terminated text; must not be null (checked by assert only)
//   returns : the parsed value converted to float
inline float StringToFloat( const char* pVal )
{
    // A null pointer is a caller error; trap it in debug builds.
    assert(pVal != NULL);
    const double dParsed = ::atof(pVal);
    return static_cast<float>(dParsed);
}
// Parses the leading decimal integer of a wide C string via _wtoi.
//   pVal    : NUL-terminated wide text; must not be null (checked by assert only)
//   returns : the parsed value as produced by _wtoi
inline int WideStrToInt( const wchar_t * pVal )
{
    // A null pointer is a caller error; trap it in debug builds.
    assert(pVal != NULL);
    const int nParsed = ::_wtoi(pVal);
    return nParsed;
}
// Parses the leading decimal integer of a wide C string as a 64-bit value
// via _wtoi64.
//   pVal    : NUL-terminated wide text; must not be null (checked by assert only)
//   returns : the parsed value as produced by _wtoi64
inline __int64 WideStrToInt64( const wchar_t * pVal )
{
    // A null pointer is a caller error; trap it in debug builds.
    assert(pVal != NULL);
    const __int64 llParsed = ::_wtoi64(pVal);
    return llParsed;
}
// Parses a wide C string as a floating-point number via _wtof and narrows the
// double result to float.
//   pVal    : NUL-terminated wide text; must not be null (checked by assert only)
//   returns : the parsed value converted to float
inline float WideStrToFloat( const wchar_t * pVal )
{
    // A null pointer is a caller error; trap it in debug builds.
    assert(pVal != NULL);
    const double dParsed = ::_wtof(pVal);
    return static_cast<float>(dParsed);
}
// Renders a signed integer as base-10 wide text.
//   nVal    : value to format
//   returns : the decimal representation of nVal as a wide string
inline wstring IntToWideStr( int nVal )
{
    // 32 wide characters are far more than the digits, sign and terminator
    // of a 32-bit int require.
    wchar_t szDigits[32] = {L"\0"};
    _itow_s(nVal, szDigits, 32, 10);
    wstring strResult(szDigits);
    return strResult;
}
// Renders a signed 64-bit integer as base-10 wide text.
//   lnVal   : value to format
//   returns : the decimal representation of lnVal as a wide string
inline wstring Int64ToWideStr( __int64 lnVal )
{
    // A 64-bit value needs at most 20 characters (sign included) plus the
    // terminator, so 32 wide characters are sufficient.
    wchar_t szDigits[32] = {L"\0"};
    _i64tow_s(lnVal, szDigits, 32, 10);
    wstring strResult(szDigits);
    return strResult;
}
// Formats a float in fixed notation with six decimals (printf "%f") as a
// wide string.
//   fVal    : value to format
//   returns : the formatted text, or an empty string if formatting fails
inline std::wstring FloatToWideStr( float fVal )
{
    // wsprintfW does not implement floating-point conversions, so "%f" never
    // produced the number. The standard swprintf does, and it is bounded by
    // the buffer size it is given.
    // -FLT_MAX needs 47 characters plus the terminator, hence 64 elements
    // instead of the former 32.
    wchar_t buf[64] = {L'\0'};
    const size_t nCapacity = sizeof(buf) / sizeof(buf[0]);
    const int nWritten = swprintf(buf, nCapacity, L"%f", fVal);
    if (nWritten < 0)
    {
        return std::wstring();
    }
    return std::wstring(buf, static_cast<size_t>(nWritten));
}
// Converts a wide string to a multibyte string with wcstombs.
// The C locale is switched to "chs" for the duration of the call so that, on
// systems that know this locale name, the GBK code page is used; the caller's
// locale is restored before returning. Where "chs" is unknown, setlocale
// fails and the conversion runs in the caller's current locale.
//   ws      : text to convert (conversion stops at the first L'\0')
//   returns : the converted bytes; if a character cannot be converted, only
//             the part converted before it (possibly empty) is returned
inline std::string ws2s(const std::wstring& ws)
{
    // setlocale may return NULL; never build a std::string from it directly.
    const char* pszCurrent = setlocale(LC_ALL, NULL);
    const std::string savedLocale = (pszCurrent != NULL) ? pszCurrent : "C";
    setlocale(LC_ALL, "chs");

    // Size the buffer for the worst case of the locale that is really active.
    // A fixed "2 bytes per character" is too small for e.g. UTF-8 locales, and
    // wcstombs does not NUL-terminate when it fills the buffer completely.
    const size_t nCapacity = ws.size() * static_cast<size_t>(MB_CUR_MAX) + 1;
    std::string buffer(nCapacity, '\0');
    wcstombs(&buffer[0], ws.c_str(), nCapacity);

    setlocale(LC_ALL, savedLocale.c_str());

    // The buffer is zero-filled and one byte larger than the longest possible
    // output, so it is always terminated.
    return std::string(buffer.c_str());
}
// Converts a multibyte string to a wide string with mbstowcs.
// The C locale is switched to "chs" for the duration of the call so that, on
// systems that know this locale name, the input is read as GBK; the caller's
// locale is restored before returning (it used to be forced to "C").
//   s       : bytes to convert (conversion stops at the first '\0')
//   returns : the converted text; if the input is not valid in the active
//             locale, only what mbstowcs stored before failing is returned
inline std::wstring s2ws(const std::string& s)
{
    // setlocale may return NULL; never build a std::string from it directly.
    const char* pszCurrent = setlocale(LC_ALL, NULL);
    const std::string savedLocale = (pszCurrent != NULL) ? pszCurrent : "C";
    setlocale(LC_ALL, "chs");

    // Every wide character consumes at least one input byte, so size()+1
    // elements always hold the result and its terminator.
    const size_t nCapacity = s.size() + 1;
    std::wstring buffer(nCapacity, L'\0');
    mbstowcs(&buffer[0], s.c_str(), nCapacity);

    setlocale(LC_ALL, savedLocale.c_str());

    return std::wstring(buffer.c_str());
}
#ifndef OPP_WIN32
// Converts a byte buffer from one character set to another with iconv.
//   from_charset : iconv name of the source encoding
//   to_charset   : iconv name of the target encoding
//   inbuf/inlen  : input bytes and their count
//   outbuf/outlen: output buffer and its capacity; it is zero-filled first, so
//                  the result is NUL-terminated whenever it is shorter than
//                  outlen
//   returns      : 0 on success, -1 on invalid arguments, unknown charsets or
//                  a failed conversion (including an output buffer that is too
//                  small)
int code_convert(char *from_charset,char *to_charset,char *inbuf,int inlen,char *outbuf,int outlen)
{
    if (from_charset == NULL || to_charset == NULL ||
        inbuf == NULL || outbuf == NULL || inlen < 0 || outlen < 0)
    {
        return -1;
    }

    // iconv_open reports failure as (iconv_t)-1, not as 0.
    iconv_t cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1)
    {
        return -1;
    }

    memset(outbuf, 0, static_cast<size_t>(outlen));

    // iconv advances the pointers and counts it is given, and the counts must
    // be size_t objects (passing int* is invalid where sizes differ).
    char* pIn = inbuf;
    char* pOut = outbuf;
    size_t nInLeft = static_cast<size_t>(inlen);
    size_t nOutLeft = static_cast<size_t>(outlen);
    const size_t rc = iconv(cd, &pIn, &nInLeft, &pOut, &nOutLeft);

    // Close the descriptor on every path so a failed conversion does not leak it.
    iconv_close(cd);

    return (rc == (size_t)-1) ? -1 : 0;
}
// Converts UTF-8 bytes to GB2312 through code_convert. Despite the "u" in
// the name, the source charset handed to iconv is "utf-8".
//   inbuf/inlen  : UTF-8 input and its byte count
//   outbuf/outlen: output buffer and its capacity
//   returns      : whatever code_convert returns (0 on success, -1 on failure)
int u2g(char *inbuf,int inlen,char *outbuf,int outlen)
{
    // code_convert takes non-const char*, so pass writable arrays instead of
    // string literals (a literal cannot convert to char* in standard C++).
    char szFrom[] = "utf-8";
    char szTo[] = "gb2312";
    return code_convert(szFrom, szTo, inbuf, inlen, outbuf, outlen);
}
// Converts GB2312 bytes to UTF-8 through code_convert. Despite the "u" in
// the name, the target charset handed to iconv is "utf-8".
//   inbuf/inlen  : GB2312 input and its byte count
//   outbuf/outlen: output buffer and its capacity
//   returns      : -1 if a length does not fit in int, otherwise whatever
//                  code_convert returns (0 on success, -1 on failure)
int g2u(char *inbuf,size_t inlen,char *outbuf,size_t outlen)
{
    // code_convert works with int lengths; refuse values that would be
    // silently truncated by the conversion.
    if (inlen > static_cast<size_t>(INT_MAX) || outlen > static_cast<size_t>(INT_MAX))
    {
        return -1;
    }
    // Writable arrays instead of string literals, because code_convert takes
    // non-const char*.
    char szFrom[] = "gb2312";
    char szTo[] = "utf-8";
    return code_convert(szFrom, szTo, inbuf, static_cast<int>(inlen),
                        outbuf, static_cast<int>(outlen));
}
#else
// Decodes one three-byte UTF-8 sequence into a 16-bit code unit.
//   pOut  : receives the code unit; its two bytes are written individually,
//           low byte first
//   pText : points at the three bytes of the sequence; only pText[0..2] are
//           read and no validation is done
void UTF_8ToUnicode(WCHAR* pOut,char *pText)
{
    char* pBytes = (char *)pOut;
    const char chLead = pText[0];
    const char chMid = pText[1];
    const char chTail = pText[2];

    // High byte: the 4 payload bits of the lead byte followed by the upper
    // 4 payload bits of the middle byte.
    pBytes[1] = ((chLead & 0x0F) << 4) | ((chMid >> 2) & 0x0F);
    // Low byte: the lower 2 payload bits of the middle byte followed by the
    // 6 payload bits of the last byte.
    pBytes[0] = ((chMid & 0x03) << 6) | (chTail & 0x3F);
}
// Encodes one 16-bit code unit as a three-byte UTF-8 sequence.
//   pOut  : receives exactly three bytes (no terminator is written)
//   pText : points at the code unit; its two bytes are read individually,
//           low byte first, high byte second
void UnicodeToUTF_8(char* pOut,WCHAR* pText)
{
    const char* pBytes = (char *)pText;
    const char chLow = pBytes[0];
    const char chHigh = pBytes[1];

    // 1110xxxx : upper 4 bits of the high byte.
    pOut[0] = (0xE0 | ((chHigh & 0xF0) >> 4));
    // 10xxxxxx : lower 4 bits of the high byte, then upper 2 bits of the low byte.
    pOut[1] = (0x80 | ((chHigh & 0x0F) << 2)) | ((chLow & 0xC0) >> 6);
    // 10xxxxxx : lower 6 bits of the low byte.
    pOut[2] = (0x80 | (chLow & 0x3F));
}
// 把Unicode 转换成 GB2312
void UnicodeToGB2312(char* pOut,unsigned short uData)
{
WideCharToMultiByte(CP_ACP,NULL,(WCHAR*)&uData,1,pOut,sizeof(WCHAR),NULL,NULL);
return;
}
// Converts one double-byte character of the system ANSI code page (CP_ACP)
// to a single 16-bit code unit.
//   pOut     : receives one WCHAR
//   gbBuffer : points at the two bytes of the character; exactly two bytes
//              are handed to the API
void Gb2312ToUnicode(WCHAR* pOut,char *gbBuffer)
{
    const int cbInput = 2;
    const int cchOutput = 1;
    ::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, gbBuffer, cbInput, pOut, cchOutput);
}
//GB2312 转为 UTF-8
void GB2312ToUTF_8ByWin(string& pOut,char *pText, int pLen)
{
char buf[4];
char* rst = new char[pLen + (pLen >> 2) + 2];
memset(buf,0,4);
memset(rst,0,pLen + (pLen >> 2) + 2);
int i = 0;
int j = 0;
while(i < pLen)
{
//如果是英文直接复制就可以
if( *(pText + i) >= 0)
{
rst[j++] = pText[i++];
}
else
{
WCHAR pbuffer;
Gb2312ToUnicode(&pbuffer,pText+i);
UnicodeToUTF_8(buf,&pbuffer);
unsigned short int tmp = 0;
tmp = rst[j] = buf[0];
tmp = rst[j+1] = buf[1];
tmp = rst[j+2] = buf[2];
j += 3;
i += 2;
}
}
rst[j] = '\0';
//返回结果
pOut = rst;
delete []rst;
return;
}
// Re-encodes a NUL-terminated string into dest by going through a wide string.
// With _WIN32 defined the input is read in the system ANSI code page (CP_ACP)
// and written as UTF-8; otherwise mbstowcs/wcstombs are used, i.e. both
// directions use the current C locale.
//   src       : NUL-terminated input; at most 2047 wide characters are handled
//   dest      : output buffer
//   dest_size : capacity of dest in bytes
//   returns   : dest. When dest is usable it is always NUL-terminated: it
//               holds an empty string if src is null or a conversion fails,
//               and on the non-Windows path a result that does not fit is cut
//               at dest_size-1 bytes (possibly inside a multibyte character).
char* UTF8ToString(const char* src, char* dest, int dest_size)
{
    if (dest == NULL || dest_size <= 0)
    {
        return dest;
    }
    dest[0] = '\0';
    if (src == NULL)
    {
        return dest;
    }

    wchar_t wbuffer[2048];
#ifdef _WIN32
    // Both calls return 0 on failure (e.g. input longer than the buffer);
    // never feed an unfilled wbuffer to the second conversion.
    if (MultiByteToWideChar(CP_ACP, 0, src, -1, wbuffer, 2048) == 0)
    {
        return dest;
    }
    if (WideCharToMultiByte(CP_UTF8, 0, wbuffer, -1, dest, dest_size, NULL, NULL) == 0)
    {
        dest[0] = '\0';
    }
#else
    const size_t nWideCapacity = sizeof(wbuffer) / sizeof(wbuffer[0]);
    size_t nWide = mbstowcs(wbuffer, src, nWideCapacity);
    if (nWide == (size_t)-1)
    {
        return dest;
    }
    // mbstowcs writes no terminator when it fills the buffer completely.
    if (nWide >= nWideCapacity)
    {
        nWide = nWideCapacity - 1;
    }
    wbuffer[nWide] = L'\0';

    const size_t nBytes = wcstombs(dest, wbuffer, static_cast<size_t>(dest_size));
    if (nBytes == (size_t)-1)
    {
        dest[0] = '\0';
    }
    else if (nBytes >= static_cast<size_t>(dest_size))
    {
        // Same for wcstombs: terminate explicitly when the output was cut.
        dest[dest_size - 1] = '\0';
    }
#endif
    return dest;
}
//UTF-8 转为 GB2312
void UTF_8ToGB2312ByWin(string &pOut, char *pText, int pLen)
{
char * newBuf = new char[pLen+1];
newBuf[pLen] = '\0';
char Ctemp[4] = {"\0"};
int i =0;
int j = 0;
while(i < pLen)
{
if(pText[i] > 0)
{
newBuf[j++] = pText[i++];
}
else
{
WCHAR Wtemp;
UTF_8ToUnicode(&Wtemp,pText + i);
UnicodeToGB2312(Ctemp,Wtemp);
newBuf[j] = Ctemp[0];
newBuf[j + 1] = Ctemp[1];
i += 3;
j += 2;
}
}
newBuf[j] = '\0';
pOut = newBuf;
delete []newBuf;
return;
}
#endif
//GB2312 转为 UTF-8
inline void GB2312ToUTF_8(string& pOut,char *pText, int pLen)
{
#ifndef OPP_WIN32
int outLen = pLen + (pLen >> 2) + 2;
g2u(pText, pLen, pOut, outLen);
#else
GB2312ToUTF_8ByWin(pOut, pText, pLen);
#endif
}
//UTF-8 转为 GB2312
inline void UTF_8ToGB2312(string &pOut, char *pText, int pLen)
{
#ifndef OPP_WIN32
u2g(pText, pLen, pOut, pLen+1);
#else
UTF_8ToGB2312ByWin(pOut, pText, pLen);
#endif
}
#endif // end of __K_UTILITY_H__
url: http://blog.youkuaiyun.com/lijie_sh/article/details/4396872