unix/win32: utf8, unicode, ansi

最新推荐文章于 2024-07-19 22:55:18 发布

zozoiiiiiiii

最新推荐文章于 2024-07-19 22:55:18 发布

阅读量1.2k

点赞数

分类专栏： c++ 文章标签： string null float integer 平台扩展

本文链接：https://blog.youkuaiyun.com/zozoiiiiiiii/article/details/6651660

版权

c++ 专栏收录该内容

20 篇文章

订阅专栏

1, 基本概念，几种编码方式

1，unicode: 是一个字符集; utf16, UCS-2编码，little endian格式

2，Unicode big endian编码

3，utf8: unicode的一种编码方式，变长编码（每个字符占1～4个字节）

4，ansi：本地编码

英文-ascii ，英文字符和二进制之间的关系，包含128个字符；128--255扩展字符，

简体中文- gb2312，DBCS编码，一个汉字占两个英文字符的宽度，是对ascii的中文扩展：一个小于127的字符的意义与原来相同，但两个大于127的字符连在一起时，就表示一个汉字；数字、标点等既在127以内存在（半角字符），也在127以外存在（全角字符）。字符集范围：GB2312 < GBK < GB18030;

繁体中文-big5

2，字符串长度： "abc汉字" unicode: 5个字符，一个字符两个字节（共10个字节）； dbcs: 7个字节，一个汉字占两个英文字符

3, 实现,如果有多国语言，则用unicode来显示。工程内部用utf8来传递

   // When text in several languages has to be displayed, keep it in a wide
   // (Unicode) string and show it with the wide-character API.
   // NOTE(review): WideStrToString / StringToWideStr are not defined in the
   // header below; they presumably wrap WideCharToMultiByte /
   // MultiByteToWideChar with the given code page -- confirm.
   wchar_t test[120] = L"مرحباً 你好にac字ㄽㄾ";
   ::MessageBoxW(NULL, test, L"test", MB_OK);

   // Round trip through the local (ANSI) code page: data is lost.
   string acpStr = WideStrToString(test, CP_ACP);
   wstring acpWstr = StringToWideStr(acpStr.c_str(), CP_ACP);
   ::MessageBoxW(NULL, acpWstr.c_str(), L"acpWstr", MB_OK);

   // Round trip through UTF-8: no data is lost.
   string utf8Str = WideStrToString(test, CP_UTF8);
   wstring utf8Wstr = StringToWideStr(utf8Str.c_str(), CP_UTF8);

::MessageBoxW(NULL, utf8Wstr.c_str(), L"utf8Wstr", MB_OK);

#ifndef __K_UTILITY_H__
#define __K_UTILITY_H__

/************************************************************************/
/*
编码转换

utf8 - gb2312
在unix平台中可以使用iconv来做转换
在windows平台可以用MultiByteToWideChar/WideCharToMultiByte 函数.

char - wchar_t
使用CRT库的mbstowcs()函数和wcstombs()函数，平台无关，需设定locale

*/
/************************************************************************/

#include "OPPOS.h"

#include <locale.h>

#ifndef OPP_WIN32
#include <iconv.h>
#endif

#include <cassert>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <string>
using namespace std;

// Number -> narrow (char) decimal string.
inline string IntToString( int nVal );
inline string Int64ToString( __int64 liVal );
inline string FloatToString( float fVal );

// Narrow (char) string -> number. pVal must not be NULL (asserted in the definitions).
inline int		StringToInt( const char * pVal );
inline __int64	StringToInt64( const char * pVal );
inline float	StringToFloat( const char* pVal );

// Wide (wchar_t) string -> number. pVal must not be NULL (asserted in the definitions).
inline int		WideStrToInt( const wchar_t * pVal );
inline __int64	WideStrToInt64( const wchar_t * pVal );
inline float	WideStrToFloat( const wchar_t * pVal );

// Number -> wide (wchar_t) decimal string.
inline wstring	IntToWideStr( int nVal );
inline wstring	Int64ToWideStr( __int64 lnVal );
inline wstring	FloatToWideStr( float fVal );

// wchar_t <-> char conversion through the CRT (wcstombs / mbstowcs); both
// definitions switch the process locale to "chs" while converting.
inline string ws2s(const wstring& ws);
inline wstring s2ws(const string& s);

// GB2312 -> UTF-8: converts pLen bytes at pText and stores the result in pOut.
inline void GB2312ToUTF_8(string& pOut,char *pText, int pLen);
// UTF-8 -> GB2312: converts pLen bytes at pText and stores the result in pOut.
inline void UTF_8ToGB2312(string &pOut, char *pText, int pLen);


#ifndef OPP_WIN32
// Charset conversion via iconv: converts inlen bytes at inbuf from
// from_charset to to_charset, writing into outbuf (capacity outlen).
inline int code_convert(char *from_charset,char *to_charset,char *inbuf,int inlen,char *outbuf,int outlen);

// UTF-8 -> GB2312 (the definition calls code_convert with "utf-8" and "gb2312").
inline int u2g(char *inbuf,int inlen,char *outbuf,int outlen);

// GB2312 -> UTF-8 (the definition calls code_convert with "gb2312" and "utf-8").
inline int g2u(char *inbuf,size_t inlen,char *outbuf,size_t outlen);

#else

// Decodes one three-byte UTF-8 sequence at pText into a single WCHAR.
inline void UTF_8ToUnicode(WCHAR* pOut,char *pText);
// Encodes a single WCHAR as three UTF-8 bytes written to pOut.
inline void UnicodeToUTF_8(char* pOut,WCHAR* pText);
// Converts one WCHAR to the system ANSI code page (CP_ACP), at most two bytes.
inline void UnicodeToGB2312(char* pOut,unsigned short uData);
// Converts two bytes in the system ANSI code page (CP_ACP) to one WCHAR.
inline void Gb2312ToUnicode(WCHAR* pOut,char *gbBuffer);

// GB2312 -> UTF-8 using the Win32 helpers above.
inline void GB2312ToUTF_8ByWin(string& pOut,char *pText, int pLen);
// UTF-8 -> GB2312 using the Win32 helpers above.
inline void UTF_8ToGB2312ByWin(string &pOut, char *pText, int pLen);

// NOTE(review): despite the name, the definition converts src from the ANSI
// code page (CP_ACP) to UTF-8 on Windows -- confirm the intended direction.
inline char* UTF8ToString(const char* src, char* dest, int dest_size);


#endif

//

// Formats a signed integer as decimal text.
//
// Uses the standard snprintf instead of the Microsoft-only _itoa_s so the
// function also builds on the non-Windows targets this header supports.
inline std::string IntToString( int nVal )
{
	// Longest possible text for a 32-bit int is "-2147483648" (11 characters);
	// 16 bytes leaves room for it and the terminating NUL.
	char text[16] = {'\0'};
	std::snprintf(text, sizeof(text), "%d", nVal);
	return std::string(text);
}

inline string Int64ToString( __int64 liVal )
{
	// integer_64 MAX : 18446744073709551615L
	char Buf[32] = {'\0'};
	_i64toa_s(liVal, Buf, sizeof(Buf), 10L);
	return string(Buf);
}

// Formats a float in fixed notation with six decimals ("%f").
//
// "%f" never switches to exponent notation, so the largest finite float prints
// 39 integer digits: with sign, ".000000" and the NUL that is 48 bytes. The
// former 32-byte buffer was too small for such values, so it is now 64 bytes
// and written through the bounded, standard snprintf.
inline std::string FloatToString( float fVal )
{
	char text[64] = {'\0'};
	std::snprintf(text, sizeof(text), "%f", fVal);
	return std::string(text);
}


// Parses the leading decimal integer in pVal with atoi.
// pVal must not be NULL (checked with assert).
inline int	StringToInt( const char * pVal )
{
	assert(pVal);

	const int parsed = ::atoi(pVal);
	return parsed;
}

// Parses the leading decimal integer in pVal as a signed 64-bit value.
// pVal must not be NULL (checked with assert).
//
// Uses the standard strtoll instead of the Microsoft-only _atoi64 so the
// function also builds on the non-Windows targets this header supports.
// Out-of-range input saturates at the 64-bit limits.
inline __int64	StringToInt64( const char * pVal )
{
	assert(pVal);
	return static_cast<__int64>( std::strtoll(pVal, NULL, 10) );
}

// Parses the leading floating-point number in pVal with atof and narrows the
// result to float. pVal must not be NULL (checked with assert).
inline float	StringToFloat( const char* pVal )
{
	assert(pVal);

	const double parsed = ::atof(pVal);
	return float( parsed );
}

// Parses the leading decimal integer in the wide string pVal.
// pVal must not be NULL (checked with assert).
//
// Uses the standard wcstol instead of the Microsoft-only _wtoi so the function
// also builds on the non-Windows targets this header supports. Values outside
// the int range saturate at INT_MAX / INT_MIN.
inline int		WideStrToInt( const wchar_t * pVal )
{
	assert(pVal);

	const long parsed = std::wcstol(pVal, NULL, 10);

	// long may be wider than int (LP64), so clamp instead of truncating.
	if (parsed > INT_MAX)
		return INT_MAX;
	if (parsed < INT_MIN)
		return INT_MIN;
	return static_cast<int>(parsed);
}

// Parses the leading decimal integer in the wide string pVal as a signed
// 64-bit value. pVal must not be NULL (checked with assert).
//
// Uses the standard wcstoll instead of the Microsoft-only _wtoi64 so the
// function also builds on the non-Windows targets this header supports.
// Out-of-range input saturates at the 64-bit limits.
inline __int64	WideStrToInt64( const wchar_t * pVal )
{
	assert(pVal);
	return static_cast<__int64>( std::wcstoll(pVal, NULL, 10) );
}

// Parses the leading floating-point number in the wide string pVal and narrows
// it to float. pVal must not be NULL (checked with assert).
//
// Uses the standard wcstod instead of the Microsoft-only _wtof so the function
// also builds on the non-Windows targets this header supports.
inline float	WideStrToFloat( const wchar_t * pVal )
{
	assert(pVal);
	return static_cast<float>( std::wcstod(pVal, NULL) );
}

// Formats a signed integer as decimal wide-character text.
//
// Uses the standard, bounded swprintf instead of the Microsoft-only _itow_s so
// the function also builds on the non-Windows targets this header supports.
inline std::wstring	IntToWideStr( int nVal )
{
	// "-2147483648" needs 12 wide characters including the terminator.
	wchar_t text[32] = {L'\0'};

	std::swprintf(text, sizeof(text) / sizeof(text[0]), L"%d", nVal);

	return std::wstring(text);
}

inline wstring	Int64ToWideStr( __int64 lnVal )
{
	wchar_t buf[32] = {L"\0"};

	_i64tow_s(
		lnVal,
		buf,
		32,
		10 );

	return wstring(buf);
}

// Formats a float in fixed notation with six decimals ("%f") as wide text.
//
// The previous implementation called wsprintfW, which does not support
// floating-point conversions, so the result was never the number. The standard
// swprintf handles "%f" and is bounded by the buffer size.
inline std::wstring	FloatToWideStr( float fVal )
{
	// The largest finite float prints 39 integer digits; with sign, ".000000"
	// and the terminator that is 48 wide characters, so 64 is sufficient.
	wchar_t text[64] = {L'\0'};

	std::swprintf(text, sizeof(text) / sizeof(text[0]), L"%f", fVal);

	return std::wstring(text);
}

// Converts a wide string to a multibyte string with wcstombs.
//
// The process locale is switched to "chs" for the conversion (the original
// author's note: translate through the GBK code page) and then restored.
// If a character cannot be converted, the text converted before it is
// returned (best effort), as before.
inline std::string ws2s(const std::wstring& ws)
{
	// setlocale(..., NULL) queries the current locale; guard against a NULL
	// answer before building a std::string from it.
	const char* active = setlocale(LC_ALL, NULL);
	const std::string savedLocale = (active != NULL) ? active : "C";

	setlocale(LC_ALL, "chs");

	// Size the buffer for the locale actually in effect: if "chs" is not
	// available the previous locale stays active and may need more than two
	// bytes per character (e.g. UTF-8), which a 2*n+1 buffer cannot hold.
	const size_t capacity = ws.size() * static_cast<size_t>(MB_CUR_MAX) + 1;
	std::string buffer(capacity, '\0');

	// Convert at most capacity-1 bytes so the zero-filled buffer always ends
	// in a terminator.
	std::wcstombs(&buffer[0], ws.c_str(), capacity - 1);

	setlocale(LC_ALL, savedLocale.c_str());

	// Cut at the first NUL to drop the unused tail of the buffer.
	return std::string(buffer.c_str());
}

// Converts a multibyte string to a wide string with mbstowcs.
//
// The process locale is switched to "chs" for the conversion (the original
// author's note: translate through the GBK code page). Afterwards the locale
// that was active on entry is restored; previously it was forced to "C",
// which silently changed the caller's locale.
// If a byte sequence cannot be converted, the text converted before it is
// returned (best effort), as before.
inline std::wstring s2ws(const std::string& s)
{
	// setlocale(..., NULL) queries the current locale; guard against a NULL
	// answer before building a std::string from it.
	const char* active = setlocale(LC_ALL, NULL);
	const std::string savedLocale = (active != NULL) ? active : "C";

	setlocale(LC_ALL, "chs");

	// Every wide character comes from at least one byte, so s.size() wide
	// characters plus a terminator is always enough.
	const size_t capacity = s.size() + 1;
	std::wstring buffer(capacity, L'\0');
	std::mbstowcs(&buffer[0], s.c_str(), capacity - 1);

	setlocale(LC_ALL, savedLocale.c_str());

	// Cut at the first terminator to drop the unused tail of the buffer.
	return std::wstring(buffer.c_str());
}


#ifndef OPP_WIN32
// Charset conversion: converts inlen bytes at inbuf from from_charset to
// to_charset with iconv and writes the result to outbuf (capacity outlen
// bytes, zero-filled first).
// Returns 0 on success and -1 on invalid arguments, an unsupported charset
// pair, or a conversion error.
int code_convert(char *from_charset,char *to_charset,char *inbuf,int inlen,char *outbuf,int outlen)
{
	if (from_charset == NULL || to_charset == NULL || inbuf == NULL || outbuf == NULL)
		return -1;
	if (inlen < 0 || outlen < 0)
		return -1;

	// iconv_open signals failure with (iconv_t)-1, not with a null descriptor.
	iconv_t cd = iconv_open(to_charset, from_charset);
	if (cd == (iconv_t)-1)
		return -1;

	std::memset(outbuf, 0, static_cast<size_t>(outlen));

	// iconv advances the buffer pointers and decrements the remaining byte
	// counts; the counts must be size_t objects, so work on local copies.
	char *in = inbuf;
	char *out = outbuf;
	size_t inLeft = static_cast<size_t>(inlen);
	size_t outLeft = static_cast<size_t>(outlen);
	const size_t rc = iconv(cd, &in, &inLeft, &out, &outLeft);

	// Release the descriptor on every path, including conversion failure.
	iconv_close(cd);

	return (rc == (size_t)-1) ? -1 : 0;
}
// UTF-8 -> GB2312: converts inlen bytes at inbuf into outbuf (capacity outlen).
// Returns the result of code_convert.
int u2g(char *inbuf,int inlen,char *outbuf,int outlen)
{
	// code_convert takes mutable char* charset names; a string literal must
	// not be converted to char*, so pass writable copies.
	char fromCharset[] = "utf-8";
	char toCharset[] = "gb2312";
	return code_convert(fromCharset, toCharset, inbuf, inlen, outbuf, outlen);
}
// GB2312 -> UTF-8: converts inlen bytes at inbuf into outbuf (capacity outlen).
// Returns the result of code_convert, or -1 when a length does not fit the
// int parameters of code_convert.
int g2u(char *inbuf,size_t inlen,char *outbuf,size_t outlen)
{
	// code_convert takes int lengths; reject sizes that would be truncated.
	if (inlen > static_cast<size_t>(INT_MAX) || outlen > static_cast<size_t>(INT_MAX))
		return -1;

	// code_convert takes mutable char* charset names; a string literal must
	// not be converted to char*, so pass writable copies.
	char fromCharset[] = "gb2312";
	char toCharset[] = "utf-8";
	return code_convert(fromCharset, toCharset, inbuf,
		static_cast<int>(inlen), outbuf, static_cast<int>(outlen));
}
#else

// Decodes one three-byte UTF-8 sequence at pText into the WCHAR at pOut.
// The result is stored byte by byte, low-order byte first.
// NOTE(review): no check is made that pText really starts a three-byte
// sequence; callers must guarantee that.
void UTF_8ToUnicode(WCHAR* pOut,char *pText)
{
	char* const outBytes = (char *)pOut;

	// High byte: low 4 bits of byte 0, then bits 5..2 of byte 1.
	outBytes[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F);
	// Low byte: low 2 bits of byte 1, then low 6 bits of byte 2.
	outBytes[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F);
}
// Encodes the WCHAR at pText as three UTF-8 bytes written to pOut[0..2].
// The WCHAR is read byte by byte: low-order byte first, high-order byte second.
// NOTE(review): the three-byte form is emitted for every value, including
// those below 0x0800 that UTF-8 encodes in one or two bytes -- confirm callers
// only pass values that need three bytes.
void UnicodeToUTF_8(char* pOut,WCHAR* pText)
{
	const char* const unitBytes = (const char *)pText;
	const char lowByte = unitBytes[0];
	const char highByte = unitBytes[1];

	// 1110xxxx: top 4 bits of the high byte.
	pOut[0] = (0xE0 | ((highByte & 0xF0) >> 4));
	// 10xxxxxx: low 4 bits of the high byte + top 2 bits of the low byte.
	pOut[1] = (0x80 | ((highByte & 0x0F) << 2)) + ((lowByte & 0xC0) >> 6);
	// 10xxxxxx: low 6 bits of the low byte.
	pOut[2] = (0x80 | (lowByte & 0x3F));
}
// 把Unicode 转换成 GB2312 
void UnicodeToGB2312(char* pOut,unsigned short uData)
{
	WideCharToMultiByte(CP_ACP,NULL,(WCHAR*)&uData,1,pOut,sizeof(WCHAR),NULL,NULL);
	return;
}
// Converts the two bytes at gbBuffer from the system ANSI code page (CP_ACP)
// into one WCHAR stored at pOut.
// NOTE(review): the input is GB2312/GBK only when the system code page is
// Simplified Chinese -- confirm that is the deployment assumption.
void Gb2312ToUnicode(WCHAR* pOut,char *gbBuffer)
{
	const int sourceBytes = 2;
	const int destChars = 1;

	::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, gbBuffer, sourceBytes, pOut, destChars);
}
//GB2312 转为 UTF-8
void GB2312ToUTF_8ByWin(string& pOut,char *pText, int pLen)
{
	char buf[4];
	char* rst = new char[pLen + (pLen >> 2) + 2];

	memset(buf,0,4);
	memset(rst,0,pLen + (pLen >> 2) + 2);

	int i = 0;
	int j = 0;
	while(i < pLen)
	{
		//如果是英文直接复制就可以
		if( *(pText + i) >= 0)
		{
			rst[j++] = pText[i++];
		}
		else
		{
			WCHAR pbuffer;
			Gb2312ToUnicode(&pbuffer,pText+i);

			UnicodeToUTF_8(buf,&pbuffer);

			unsigned short int tmp = 0;
			tmp = rst[j] = buf[0];
			tmp = rst[j+1] = buf[1];
			tmp = rst[j+2] = buf[2];


			j += 3;
			i += 2;
		}
	}
	rst[j] = '\0';

	//返回结果
	pOut = rst;
	delete []rst;

	return;
}

// Converts the NUL-terminated multibyte string src and writes the result to
// dest (capacity dest_size bytes). Returns dest.
//
// NOTE(review): despite the name, the Windows path converts FROM the ANSI code
// page (CP_ACP) TO UTF-8; the direction is kept unchanged here -- confirm it
// is what callers expect.
//
// dest is always NUL-terminated when dest_size > 0. It is set to an empty
// string when src is NULL or the conversion fails (on Windows this includes a
// dest buffer that is too small); on the non-Windows path an over-long result
// is truncated to fit. The intermediate wide buffer is sized from the input
// instead of being a fixed 2048-element array.
char* UTF8ToString(const char* src, char* dest, int dest_size)
{
	if (dest == NULL || dest_size <= 0)
		return dest;

	dest[0] = '\0';
	if (src == NULL)
		return dest;

#ifdef _WIN32
	// Query the required length (includes the terminator because of -1).
	const int wideLen = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
	if (wideLen <= 0)
		return dest;

	std::wstring wide(static_cast<size_t>(wideLen), L'\0');
	if (MultiByteToWideChar(CP_ACP, 0, src, -1, &wide[0], wideLen) <= 0)
		return dest;

	// Returns 0 on failure, e.g. when dest is too small; dest may then hold a
	// partial, unterminated result, so reset it.
	if (WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, dest, dest_size, NULL, NULL) <= 0)
		dest[0] = '\0';
#else
	// Each wide character comes from at least one byte, so strlen(src) + 1
	// wide characters always hold the result and its terminator.
	const size_t srcLen = std::strlen(src);
	std::wstring wide(srcLen + 1, L'\0');
	if (std::mbstowcs(&wide[0], src, srcLen + 1) == (size_t)-1)
		return dest;

	// Leave one byte for the terminator, which wcstombs omits on truncation.
	const size_t written = std::wcstombs(dest, wide.c_str(), static_cast<size_t>(dest_size) - 1);
	if (written == (size_t)-1)
		dest[0] = '\0';
	else
		dest[written] = '\0';
#endif

	return dest;
}

//UTF-8 转为 GB2312
void UTF_8ToGB2312ByWin(string &pOut, char *pText, int pLen)
{
	char * newBuf = new char[pLen+1];
	newBuf[pLen] = '\0';

	char Ctemp[4] = {"\0"};

	int i =0;
	int j = 0;

	while(i < pLen)
	{
		if(pText[i] > 0)
		{
			newBuf[j++] = pText[i++];
		}
		else
		{
			WCHAR Wtemp;
			UTF_8ToUnicode(&Wtemp,pText + i);

			UnicodeToGB2312(Ctemp,Wtemp);

			newBuf[j] = Ctemp[0];
			newBuf[j + 1] = Ctemp[1];

			i += 3;
			j += 2;
		}
	}
	newBuf[j] = '\0';

	pOut = newBuf;
	delete []newBuf;

	return;
}
#endif



//GB2312 转为 UTF-8
inline void GB2312ToUTF_8(string& pOut,char *pText, int pLen)
{
#ifndef OPP_WIN32
	int outLen = pLen + (pLen >> 2) + 2;
	g2u(pText, pLen, pOut, outLen);
#else
	GB2312ToUTF_8ByWin(pOut, pText, pLen);
#endif
}

//UTF-8 转为 GB2312
inline void UTF_8ToGB2312(string &pOut, char *pText, int pLen)
{

#ifndef OPP_WIN32
	u2g(pText, pLen, pOut, pLen+1);
#else
	UTF_8ToGB2312ByWin(pOut, pText, pLen);
#endif
}

#endif // end of __K_UTILITY_H__

url: http://blog.youkuaiyun.com/lijie_sh/article/details/4396872