CUnicodeUtils
#pragma once
#include <stdint.h>
#include <string>
// Static helpers that validate, count and convert UTF-8, UTF-16 and GBK encoded text.
// Every entry point takes a raw buffer plus a byte length; the default length
// (size_t)-1 means "no explicit length, read until the zero terminator".
class CUnicodeUtils
{
public:
//
// @brief: Count the characters in a UTF-16 buffer
// @param: pData UTF-16 data (big-endian or little-endian byte order, may contain a BOM)
// @param: size  Length of the data in bytes; the default -1 reads until a zero code unit
// @ret: int32_t >= 0 is the number of characters, < 0 means the data is not valid UTF-16
static int32_t GetUtf16Count(const void* pData, size_t size = -1);
//
// @brief: Count the characters in a UTF-8 buffer
// @param: pData UTF-8 data (may contain a BOM)
// @param: size  Length of the data in bytes; the default -1 reads until a zero byte
// @ret: int32_t >= 0 is the number of characters, < 0 means the data is not valid UTF-8
static int32_t GetUtf8Count(const void* pData, size_t size = -1);
//
// @brief: Count the characters in a GBK buffer
// @param: pData GBK data (single-byte ASCII and double-byte GBK characters)
// @param: size  Length of the data in bytes; the default -1 reads until a zero byte
// @ret: int32_t >= 0 is the number of characters, < 0 means the data is not valid GBK
static int32_t GetGbkCount(const void* pData, size_t size = -1);
//
// @brief: Convert UTF-8 text to a UTF-16 string
// @param: pData UTF-8 data (may contain a BOM)
// @param: size  Length of the data in bytes; the default -1 reads until a zero byte
// @ret: std::wstring UTF-16 code units; empty when the input is not valid UTF-8
static std::wstring Utf8ToUtf16(const void* pData, size_t size = -1);
//
// @brief: Convert UTF-16 text to a UTF-8 string
// @param: pData UTF-16 data (big-endian or little-endian byte order, may contain a BOM)
// @param: size  Length of the data in bytes; the default -1 reads until a zero code unit
// @ret: std::string UTF-8 bytes; empty when the input is not valid UTF-16
static std::string Utf16ToUtf8(const void* pData, size_t size = -1);
private:
// Encodes one code point as zero-terminated UTF-8 into pBuf. The longest form is
// 6 bytes plus the terminator, so pBuf must have room for 7 bytes.
static void _CodePointToUtf8(uint32_t cp32, uint8_t* pBuf);
// Decodes UTF-8 data. Returns the character count, or -1 when the data is invalid.
// On success the decoded text is stored as UTF-8 in *pUtf8 and/or as UTF-16 in
// *pUtf16 when those pointers are not null.
static int32_t _Utf8ToUtf16(const void* pData, size_t size = -1, std::string* pUtf8 = nullptr, std::wstring* pUtf16 = nullptr);
// Decodes UTF-16 data. Returns the character count, or -1 when the data is invalid.
// On success the decoded text is stored as UTF-8 in *pUtf8 and/or as UTF-16 in
// *pUtf16 when those pointers are not null.
static int32_t _Utf16ToUtf8(const void* pData, size_t size = -1, std::string* pUtf8 = nullptr, std::wstring* pUtf16 = nullptr);
// Scans GBK data. Returns the character count, or -1 when the data is invalid.
// On success the scanned bytes are stored in *pGbk when it is not null.
static int32_t _GetGbkCount(const void* pData, size_t size = -1, std::string* pGbk = nullptr);
};
CUnicodeUtils.cpp
#include "CUnicodeUtils.h"
// ANSI GBK 编码标准
// 第一字节(称为高字节)的范围: 0x81 - 0xFE
// 第二字节(称为低字节)的范围: 0x40 - 0xFE (不含0x7F)
//
// 汉字区
// GBK/2:0XBOA1-F7FE 收录 GB 2312 汉字 6763 个,按原序排列
// GBK/3:0X8140-AOFE,收录 CJK 汉字 6080 个
// GBK/4:0XAA40-FEAO,收录 CJK 汉字和增补的汉字 8160 个
//
// 图形符号区
// GBK/1:0XA1A1-A9FE,除 GB 2312 的符号外,还增补了其它符号
// GBK/5:0XA840-A9AO,扩除非汉字区
//
// 用户自定义区
// GBK 区域中的空白区,用户可以自己定义字符
// UTF-8 编码标准
//
// 1字节 U+0000000 - U+0000007F 0xxxxxxx
// 2字节 U+0000080 - U+000007FF 110xxxxx 10xxxxxx
// 3字节 U+0000800 - U+0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
// 4字节 U+0010000 - U+001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// 5字节 U+0200000 - U+03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// 6字节 U+4000000 - U+7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// UTF16 编码标准
//
// 基本多语言平面(U+0000 - U+FFFF)
// U+000000 - U+00D7FF
// U+00D800 - U+00DFFF 保留区域
// U+00E000 - U+00FFFF
// U+010000 - U+10FFFF
//
// 辅助平面(U+10000 - U+10FFFF)
// 1.码位减去 0x10000,得到20位的代理值(0x00 - 0xFFFFF)
// 2.高10位(范围0 - 0x3FF)加 0xD800 得到高位代理(0xD800 - 0xDBFF)
// 3.低10位(范围0 - 0x3FF)加 0xDC00 得到低位代理(0xDC00 - 0xDFFF)
// Counts the characters of a UTF-16 buffer.
// Returns the character count, or a negative value when the data is not valid UTF-16.
int32_t CUnicodeUtils::GetUtf16Count(const void* pData, size_t size/* = -1*/)
{
    // Validation-only pass: neither output string is requested.
    const int32_t nCount = _Utf16ToUtf8(pData, size, nullptr, nullptr);
    return nCount;
}
// Counts the characters of a UTF-8 buffer.
// Returns the character count, or a negative value when the data is not valid UTF-8.
int32_t CUnicodeUtils::GetUtf8Count(const void* pData, size_t size/* = -1*/)
{
    // Validation-only pass: neither output string is requested.
    const int32_t nCount = _Utf8ToUtf16(pData, size, nullptr, nullptr);
    return nCount;
}
// Counts the characters of a GBK buffer.
// Returns the character count, or a negative value when the data is not valid GBK.
int32_t CUnicodeUtils::GetGbkCount(const void* pData, size_t size/* = -1*/)
{
    // Only the count is needed, so no output string is requested from the helper.
    return _GetGbkCount(pData, size, nullptr);
}
// Converts UTF-8 data to a UTF-16 string.
// Returns an empty string when the input is not valid UTF-8.
std::wstring CUnicodeUtils::Utf8ToUtf16(const void* pData, size_t size/* = -1*/)
{
    std::wstring strResult16;
    if (_Utf8ToUtf16(pData, size, nullptr, &strResult16) < 0)
    {
        // Make the failure contract explicit instead of relying on the helper
        // leaving the output untouched.
        strResult16.clear();
    }
    return strResult16;
}
// Converts UTF-16 data to a UTF-8 string.
// Returns an empty string when the input is not valid UTF-16.
std::string CUnicodeUtils::Utf16ToUtf8(const void* pData, size_t size/* = -1*/)
{
    std::string strResult8;
    if (_Utf16ToUtf8(pData, size, &strResult8, nullptr) < 0)
    {
        // Make the failure contract explicit instead of relying on the helper
        // leaving the output untouched.
        strResult8.clear();
    }
    return strResult8;
}
// Encodes one code point as a zero-terminated UTF-8 sequence in pBuf.
//
//   1 byte   U+0000    - U+007F      0xxxxxxx
//   2 bytes  U+0080    - U+07FF      110xxxxx 10xxxxxx
//   3 bytes  U+0800    - U+FFFF      1110xxxx 10xxxxxx 10xxxxxx
//   4 bytes  U+10000   - U+1FFFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//   5 bytes  U+200000  - U+3FFFFFF   111110xx 10xxxxxx (x4)
//   6 bytes  U+4000000 - U+7FFFFFFF  1111110x 10xxxxxx (x5)
//
// pBuf must have room for the sequence plus the terminator (7 bytes in the worst
// case). Values above 0x7FFFFFFF have no encoding and leave pBuf untouched.
void CUnicodeUtils::_CodePointToUtf8(uint32_t cp32, uint8_t* pBuf)
{
    if (cp32 > 0x7FFFFFFF)
    {
        return;
    }

    if (cp32 <= 0x7F)
    {
        pBuf[0] = static_cast<uint8_t>(cp32);
        pBuf[1] = 0;
        return;
    }

    // Pick the sequence length and the marker bits of the lead byte.
    int nLength = 0;
    uint8_t u8LeadMark = 0;
    if (cp32 <= 0x7FF)
    {
        nLength = 2;
        u8LeadMark = 0xC0;
    }
    else if (cp32 <= 0xFFFF)
    {
        nLength = 3;
        u8LeadMark = 0xE0;
    }
    else if (cp32 <= 0x1FFFFF)
    {
        nLength = 4;
        u8LeadMark = 0xF0;
    }
    else if (cp32 <= 0x3FFFFFF)
    {
        nLength = 5;
        u8LeadMark = 0xF8;
    }
    else
    {
        nLength = 6;
        u8LeadMark = 0xFC;
    }

    pBuf[nLength] = 0;

    // Fill the continuation bytes from the last one backwards, 6 payload bits each.
    for (int i = nLength - 1; i > 0; --i)
    {
        pBuf[i] = static_cast<uint8_t>((cp32 & 0x3F) | 0x80);
        cp32 >>= 6;
    }

    // What is left of cp32 fits in the payload bits of the lead byte.
    pBuf[0] = static_cast<uint8_t>(cp32 | u8LeadMark);
}
//
// @brief: Validate UTF-8 data, count its characters and optionally re-encode it
// @param: pData  UTF-8 data; a BOM (EF BB BF) at the very start is skipped and not counted
// @param: size   Length in bytes; (size_t)-1 means "read until the zero terminator".
//                Decoding always stops at the first zero byte, even when size is larger.
// @param: pUtf8  Optional; on success receives the text re-encoded as UTF-8 (without BOM)
// @param: pUtf16 Optional; on success receives the text as UTF-16 code units
// @ret: int32_t  >= 0 is the number of characters. -1 means pData is null, the data is
//                malformed or truncated, or UTF-16 output was requested for a code point
//                above U+10FFFF. The output strings are left untouched on failure.
//
// The 5- and 6-byte forms listed in the encoding table of this file are still accepted;
// overlong forms and encoded surrogates are not rejected.
int32_t CUnicodeUtils::_Utf8ToUtf16(const void* pData, size_t size/* = -1*/, std::string* pUtf8/* = nullptr*/, std::wstring* pUtf16/* = nullptr*/)
{
    if (nullptr == pData)
    {
        return -1;
    }

    const uint8_t* pCur = static_cast<const uint8_t*>(pData);
    const bool fBounded = (static_cast<size_t>(-1) != size);
    size_t nRemain = size;      // Bytes left to read; never decremented when unbounded
    std::wstring strOut16;      // UTF-16 output
    std::string strOut8;        // UTF-8 output
    int32_t nChCount = 0;       // Character count

    // A BOM only has meaning at the start of the data; anywhere else EF BB BF is the
    // ordinary character U+FEFF. The short-circuit keeps an unbounded read from
    // running past the zero terminator.
    if (nRemain >= 3 && 0xEF == pCur[0] && 0xBB == pCur[1] && 0xBF == pCur[2])
    {
        pCur += 3;
        if (fBounded)
        {
            nRemain -= 3;
        }
    }

    // The remaining length is tested BEFORE the byte is read, so a sized buffer is
    // never read past its end.
    while ((0 != nRemain) && (0 != *pCur))
    {
        const uint8_t chLead = *pCur;
        uint32_t cp32 = 0;      // Code point being assembled
        int nTrail = 0;         // Continuation bytes that must follow the lead byte

        if (chLead <= 0x7F)         // 0xxxxxxx (includes 0x7F)
        {
            cp32 = chLead;
        }
        else if (chLead < 0xC0)     // 10xxxxxx: continuation byte without a lead byte
        {
            return -1;
        }
        else if (chLead < 0xE0)     // 110xxxxx
        {
            cp32 = chLead & 0x1F;
            nTrail = 1;
        }
        else if (chLead < 0xF0)     // 1110xxxx
        {
            cp32 = chLead & 0x0F;
            nTrail = 2;
        }
        else if (chLead < 0xF8)     // 11110xxx
        {
            cp32 = chLead & 0x07;
            nTrail = 3;
        }
        else if (chLead < 0xFC)     // 111110xx
        {
            cp32 = chLead & 0x03;
            nTrail = 4;
        }
        else if (chLead < 0xFE)     // 1111110x
        {
            cp32 = chLead & 0x01;
            nTrail = 5;
        }
        else                        // 0xFE and 0xFF never appear in UTF-8
        {
            return -1;
        }

        ++pCur;
        if (fBounded)
        {
            --nRemain;
        }

        for (; nTrail > 0; --nTrail)
        {
            // Running out of data, hitting the terminator or meeting anything that is
            // not 10xxxxxx all mean the sequence is truncated.
            if ((0 == nRemain) || (0x80 != (*pCur & 0xC0)))
            {
                return -1;
            }
            cp32 = (cp32 << 6) | (*pCur & 0x3Fu);
            ++pCur;
            if (fBounded)
            {
                --nRemain;
            }
        }

        ++nChCount;

        if (pUtf8)
        {
            uint8_t szBuf[7] = { 0 };
            _CodePointToUtf8(cp32, szBuf);
            strOut8 += reinterpret_cast<const char*>(szBuf);
        }

        if (pUtf16)
        {
            if (cp32 < 0x10000)
            {
                strOut16.push_back(static_cast<wchar_t>(cp32));
            }
            else if (cp32 <= 0x10FFFF)
            {
                // The surrogate payload is 20 bits wide, so it must be kept in a
                // 32-bit variable before it is split.
                const uint32_t cp20 = cp32 - 0x10000;
                strOut16.push_back(static_cast<wchar_t>(0xD800 + (cp20 >> 10)));
                strOut16.push_back(static_cast<wchar_t>(0xDC00 + (cp20 & 0x3FF)));
            }
            else
            {
                // UTF-16 cannot represent anything above U+10FFFF.
                return -1;
            }
        }
    }

    if (pUtf8)
    {
        *pUtf8 = std::move(strOut8);
    }
    if (pUtf16)
    {
        *pUtf16 = std::move(strOut16);
    }
    return nChCount;
}
//
// @brief: Validate UTF-16 data, count its characters and optionally re-encode it
// @param: pData  UTF-16 data. A BOM at the very start selects the byte order
//                (FF FE = little-endian, FE FF = big-endian) and is not counted;
//                data without a BOM is decoded as little-endian.
// @param: size   Length in bytes; (size_t)-1 means "read until a zero code unit".
//                Decoding always stops at the first zero code unit, even when size is larger.
// @param: pUtf8  Optional; on success receives the text as UTF-8
// @param: pUtf16 Optional; on success receives the text as UTF-16 code units (without BOM)
// @ret: int32_t  >= 0 is the number of characters (a surrogate pair counts once).
//                -1 means pData is null, an explicit size is smaller than 2 or odd, or
//                the data contains an unpaired surrogate. The output strings are left
//                untouched on failure.
int32_t CUnicodeUtils::_Utf16ToUtf8(const void* pData, size_t size/* = -1*/, std::string* pUtf8/* = nullptr*/, std::wstring* pUtf16/* = nullptr*/)
{
    if (nullptr == pData)
    {
        return -1;
    }

    const bool fBounded = (static_cast<size_t>(-1) != size);
    if (fBounded && ((size < 2) || (0 != (size % 2))))
    {
        return -1;
    }

    // The data is read byte by byte: the caller's buffer need not be 2-byte aligned
    // and the result does not depend on the byte order of the host.
    const uint8_t* pCur = static_cast<const uint8_t*>(pData);
    size_t nRemain = size;      // Bytes left to read; never decremented when unbounded
    bool fBigEndian = false;    // Byte order of the data
    std::wstring strOut16;      // UTF-16 output
    std::string strOut8;        // UTF-8 output
    int32_t nChCount = 0;       // Character count

    const auto readUnit = [&]() -> uint16_t
    {
        const uint16_t b0 = pCur[0];
        const uint16_t b1 = pCur[1];
        return fBigEndian ? static_cast<uint16_t>((b0 << 8) | b1)
                          : static_cast<uint16_t>((b1 << 8) | b0);
    };
    const auto advance = [&]()
    {
        pCur += 2;
        if (fBounded)
        {
            nRemain -= 2;
        }
    };

    // A BOM only has meaning at the start of the data. Later on U+FEFF is an ordinary
    // character and must not switch the byte order in mid-stream.
    if (0xFF == pCur[0] && 0xFE == pCur[1])
    {
        advance();              // Little-endian
    }
    else if (0xFE == pCur[0] && 0xFF == pCur[1])
    {
        fBigEndian = true;      // Big-endian
        advance();
    }

    // The remaining length is tested BEFORE a unit is read, so a sized buffer is
    // never read past its end.
    while (nRemain >= 2)
    {
        uint32_t cp32 = readUnit();
        if (0 == cp32)
        {
            break;              // Zero code unit terminates the text
        }
        advance();

        if (cp32 >= 0xD800 && cp32 <= 0xDBFF)
        {
            // A high surrogate must be followed immediately by a low surrogate;
            // end of data or the terminator here means the pair is incomplete.
            if (nRemain < 2)
            {
                return -1;
            }
            const uint32_t cpLo = readUnit();
            if (cpLo < 0xDC00 || cpLo > 0xDFFF)
            {
                return -1;
            }
            advance();
            cp32 = 0x10000 + (((cp32 - 0xD800) << 10) | (cpLo - 0xDC00));
        }
        else if (cp32 >= 0xDC00 && cp32 <= 0xDFFF)
        {
            // Low surrogate without a preceding high surrogate.
            return -1;
        }

        ++nChCount;

        if (pUtf8)
        {
            uint8_t szBuf[7] = { 0 };
            _CodePointToUtf8(cp32, szBuf);
            strOut8 += reinterpret_cast<const char*>(szBuf);
        }

        if (pUtf16)
        {
            if (cp32 < 0x10000)
            {
                strOut16.push_back(static_cast<wchar_t>(cp32));
            }
            else
            {
                // The surrogate payload is 20 bits wide, so it must be kept in a
                // 32-bit variable before it is split.
                const uint32_t cp20 = cp32 - 0x10000;
                strOut16.push_back(static_cast<wchar_t>(0xD800 + (cp20 >> 10)));
                strOut16.push_back(static_cast<wchar_t>(0xDC00 + (cp20 & 0x3FF)));
            }
        }
    }

    if (pUtf8)
    {
        *pUtf8 = std::move(strOut8);
    }
    if (pUtf16)
    {
        *pUtf16 = std::move(strOut16);
    }
    return nChCount;
}
//
// @brief: Validate GBK data, count its characters and optionally copy it
// @param: pData  GBK data: bytes 0x01-0x7F are single-byte characters, a double-byte
//                character is a lead byte 0x81-0xFE followed by a trail byte
//                0x40-0xFE other than 0x7F
// @param: size   Length in bytes; (size_t)-1 means "read until the zero terminator".
//                Scanning always stops at the first zero byte, even when size is larger.
// @param: pGbk   Optional; on success receives the scanned bytes
// @ret: int32_t  >= 0 is the number of characters. -1 means pData is null, a byte is
//                outside the GBK ranges, or the data ends after a lead byte. The output
//                string is left untouched on failure.
int32_t CUnicodeUtils::_GetGbkCount(const void* pData, size_t size/* = -1*/, std::string* pGbk/* = nullptr*/)
{
    if (nullptr == pData)
    {
        return -1;
    }

    const uint8_t* pCur = static_cast<const uint8_t*>(pData);
    const bool fBounded = (static_cast<size_t>(-1) != size);
    size_t nRemain = size;      // Bytes left to read; never decremented when unbounded
    std::string strOutGbk;      // GBK output
    int32_t nChCount = 0;       // Character count

    // The remaining length is tested BEFORE the byte is read, so a sized buffer is
    // never read past its end.
    while ((0 != nRemain) && (0 != *pCur))
    {
        const uint8_t chLead = *pCur;
        ++pCur;
        if (fBounded)
        {
            --nRemain;
        }

        if (chLead <= 0x7F)
        {
            // Single-byte character (includes 0x7F).
            if (pGbk)
            {
                strOutGbk.push_back(static_cast<char>(chLead));
            }
        }
        else
        {
            // Lead byte: 0x81 - 0xFE
            if (chLead < 0x81 || chLead > 0xFE)
            {
                return -1;
            }
            // The trail byte is consumed right here. It may be in the ASCII range
            // (0x40 - 0x7E), so it must never be classified as a character of its own.
            if (0 == nRemain)
            {
                return -1;
            }
            const uint8_t chTrail = *pCur;
            // Trail byte: 0x40 - 0xFE, excluding 0x7F (this also rejects the terminator)
            if (chTrail < 0x40 || chTrail > 0xFE || 0x7F == chTrail)
            {
                return -1;
            }
            ++pCur;
            if (fBounded)
            {
                --nRemain;
            }
            if (pGbk)
            {
                strOutGbk.push_back(static_cast<char>(chLead));
                strOutGbk.push_back(static_cast<char>(chTrail));
            }
        }

        ++nChCount;
    }

    if (pGbk)
    {
        *pGbk = std::move(strOutGbk);
    }
    return nChCount;
}
main.cpp
// CUnicodeUtils.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。
//
#include <iostream>
#include "CUnicodeUtils.h"
// Demo driver: runs every public CUnicodeUtils entry point on sample data.
// Nothing is printed; the results are meant to be inspected in a debugger.
int main()
{
    // The buffers hold byte values above 0x7F, so they are declared unsigned:
    // brace-initializing a plain char with such a value is a narrowing conversion.

    // Sample text as UTF-16 little-endian without BOM, ended by a zero code unit.
    const unsigned char szUtf16Little[] = { 0x3C,0xD8,0x0D,0xDF,0x55,0x00,0x6E,0x00,0x69,0x00,0x63,0x00,0x6F,0x00,0x64,0x00,0x65,0x00,0x16,0x7F,0x01,0x78,0x4B,0x6D,0xD5,0x8B, 0x00,0x00 };
    // The same text (plus one more trailing character) as UTF-16 big-endian. The
    // leading FE FF byte order mark is what marks the data as big-endian, and the
    // trailing zero code unit ends the scan because no explicit size is passed.
    const unsigned char szUtf16Big[] = { 0xFE,0xFF, 0xD8,0x3C,0xDF,0x0D,0x00,0x55,0x00,0x6E,0x00,0x69,0x00,0x63,0x00,0x6F,0x00,0x64,0x00,0x65,0x7F,0x16,0x78,0x01,0x6D,0x4B,0x8B,0xD5,0xD8,0x3C,0xDF,0x0D, 0x00,0x00 };
    // NOTE(review): a u8 literal is char8_t-based from C++20 on; this line needs a
    // pre-C++20 language mode.
    const char szUtf8[] = u8"🌍Unicode编码测试";

    int nUtf16Length = CUnicodeUtils::GetUtf16Count(szUtf16Little);
    nUtf16Length = CUnicodeUtils::GetUtf16Count(szUtf16Big);
    int nUtf8Length = CUnicodeUtils::GetUtf8Count(szUtf8);

    std::string str8 = CUnicodeUtils::Utf16ToUtf8(szUtf16Little);
    std::wstring str16 = CUnicodeUtils::Utf8ToUtf16(szUtf8);

    int nGbkLength = CUnicodeUtils::GetGbkCount("789\xCC\x80");

    // UTF-8 BOM followed by the converted text.
    std::string str8Test = "\xEF\xBB\xBF";
    str8Test += str8;

    // Repeat the conversions to exercise them under load.
    for (int i = 0; i < 10000; i++)
    {
        str8 = CUnicodeUtils::Utf16ToUtf8(szUtf16Little);
        str16 = CUnicodeUtils::Utf8ToUtf16(szUtf8);
    }

    // The counters exist only for inspection in a debugger.
    (void)nUtf16Length;
    (void)nUtf8Length;
    (void)nGbkLength;
    return 0;
}