#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
// UTF-8 encoding layout
//
// 1 byte  U+00000000 - U+0000007F 0xxxxxxx
// 2 bytes U+00000080 - U+000007FF 110xxxxx 10xxxxxx
// 3 bytes U+00000800 - U+0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
// 4 bytes U+00010000 - U+001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// 5 bytes U+00200000 - U+03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// 6 bytes U+04000000 - U+7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// Note: the 5- and 6-byte forms belong to the original UTF-8 definition;
// RFC 3629 restricts UTF-8 to at most 4 bytes (code points up to U+10FFFF).
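// UTF-16 encoding layout
//
// Basic Multilingual Plane (U+0000 - U+FFFF): one 16-bit code unit equal to the code point
// (U+D800 - U+DFFF are reserved for surrogates)
//
// Supplementary planes (U+10000 - U+10FFFF): encoded as a surrogate pair
// 1. Subtract 0x10000 from the code point, giving a 20-bit value (0x00000 - 0xFFFFF)
// 2. Add 0xD800 to the high 10 bits (0 - 0x3FF) to get the high surrogate (0xD800 - 0xDBFF)
// 3. Add 0xDC00 to the low 10 bits (0 - 0x3FF) to get the low surrogate (0xDC00 - 0xDFFF)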
// Counts the characters in a UTF-8 encoded std::string by forwarding its
// bytes and length to the pointer overload below.
int GetUtf8Count(const std::string& str);
// Counts the characters in a wide string by handing it to the UTF-16
// pointer overload below.
int GetUtf16Count(const std::wstring& str);
// Pointer overloads. `size` is a length in bytes; the default of -1 converts
// to the largest size_t value, which the definitions treat as "no explicit
// length": the scan then runs until a zero code unit is found.
// Both return the character count, or -1 when the data is rejected as malformed.
int GetUtf8Count(const void* data_ptr, size_t size = -1);
int GetUtf16Count(const void* data_ptr, size_t size = -1);
// Demo entry point: prints the character count of the same sample text,
// first held as a wide string and then as a UTF-8 string, one count per line.
int main()
{
    const std::wstring wide_sample = L"我是地球🌍";
    const int wide_count = GetUtf16Count(wide_sample);
    std::cout << wide_count << std::endl;

    const std::string utf8_sample = u8"我是地球🌍";
    const int utf8_count = GetUtf8Count(utf8_sample);
    std::cout << utf8_count << std::endl;

    return 0;
}
/// @brief Counts the characters in a UTF-8 encoded std::string.
///
/// Forwards the string's byte buffer and byte length to the pointer overload
/// and returns that overload's result unchanged.
///
/// @param str UTF-8 encoded text.
/// @return Whatever the pointer overload returns for the string's bytes.
int GetUtf8Count(const std::string& str)
{
    const char* const bytes = str.c_str();
    const auto byte_length = str.size();
    return GetUtf8Count(bytes, byte_length);
}
int GetUtf16Count(const std::wstring& str)
{
return GetUtf16Count(str.c_str(), str.size() * sizeof(wchar_t));
}
/// @brief Counts the characters (code points) in a UTF-8 encoded byte buffer.
///
/// Scanning stops at the first zero byte or after @p size bytes, whichever
/// comes first. Lead bytes of the legacy 5- and 6-byte forms (0xF8 - 0xFD) are
/// accepted. Decoded values are not checked for overlong encodings or
/// surrogates. A decoded U+FEFF (byte order mark) is skipped, not counted.
///
/// @param data_ptr Start of the UTF-8 data. A null pointer is rejected.
/// @param size     Number of bytes available, or static_cast<size_t>(-1)
///                 (the declared default) to scan up to the zero terminator.
/// @return The number of characters, or -1 when the data is malformed: a null
///         pointer, an invalid lead byte, a stray or missing continuation
///         byte, or a multi-byte sequence cut off by the end of the input.
int GetUtf8Count(const void* data_ptr, size_t size/* = -1*/)
{
    constexpr size_t kUnbounded = static_cast<size_t>(-1);

    if (nullptr == data_ptr)
    {
        return -1;
    }

    const uint8_t* cursor = static_cast<const uint8_t*>(data_ptr);
    uint32_t code_point = 0;   // value being assembled; only needed to recognise the BOM
    int pending = 0;           // continuation bytes still expected for the current character
    int ch_count = 0;

    // Test the remaining size first so that a sized buffer without a zero
    // terminator is never read past its end.
    while ((0 != size) && (0 != *cursor))
    {
        const uint8_t ch = *cursor;

        if (0 == pending)
        {
            if (ch <= 0x7F)
            {
                // Single-byte (ASCII) character, including 0x7F.
                ++ch_count;
            }
            else if (ch >= 0xC0 && ch <= 0xDF)
            {
                pending = 1;
                code_point = ch & 0x1F;
            }
            else if (ch >= 0xE0 && ch <= 0xEF)
            {
                pending = 2;
                code_point = ch & 0x0F;
            }
            else if (ch >= 0xF0 && ch <= 0xF7)
            {
                pending = 3;
                code_point = ch & 0x07;
            }
            else if (ch >= 0xF8 && ch <= 0xFB)
            {
                pending = 4;
                code_point = ch & 0x03;
            }
            else if (ch >= 0xFC && ch <= 0xFD)
            {
                pending = 5;
                code_point = ch & 0x01;
            }
            else
            {
                // 0x80 - 0xBF (continuation byte with no lead) or 0xFE / 0xFF.
                return -1;
            }
        }
        else
        {
            // Inside a multi-byte sequence only 10xxxxxx bytes are allowed;
            // anything else (including ASCII) means the sequence is broken.
            if (0x80 != (ch & 0xC0))
            {
                return -1;
            }
            code_point = (code_point << 6) | (ch & 0x3F);
            --pending;
            if ((0 == pending) && (0xFEFF != code_point))
            {
                ++ch_count;
            }
        }

        ++cursor;
        if (kUnbounded != size)
        {
            --size;
        }
    }

    // Input ended in the middle of a multi-byte sequence.
    if (0 != pending)
    {
        return -1;
    }
    return ch_count;
}
/// @brief Counts the characters (code points) in a UTF-16 encoded buffer.
///
/// The buffer is read as 16-bit units in host byte order. Scanning stops at
/// the first zero unit or after @p size bytes, whichever comes first.
/// A unit that reads as 0xFEFF is a byte order mark in host order; one that
/// reads as 0xFFFE is a byte order mark in the opposite order, after which
/// every unit is byte-swapped before it is examined. Byte order marks are
/// skipped, not counted. A surrogate pair counts as one character.
///
/// @param data_ptr Start of the UTF-16 data. A null pointer is rejected.
///                 The pointer is read through uint16_t, so it is expected to
///                 be suitably aligned for that type.
/// @param size     Number of bytes available (must be even), or
///                 static_cast<size_t>(-1) (the declared default) to scan up
///                 to the zero terminator.
/// @return The number of characters (0 for an empty buffer), or -1 when the
///         data is malformed: a null pointer, an odd byte length, byte order
///         marks of both orders, a byte order mark inside a surrogate pair,
///         a low surrogate without a preceding high surrogate, or a high
///         surrogate not followed by a low surrogate.
int GetUtf16Count(const void* data_ptr, size_t size/* = -1*/)
{
    constexpr size_t kUnbounded = static_cast<size_t>(-1);

    if (nullptr == data_ptr)
    {
        return -1;
    }
    // A byte length that is not a whole number of 16-bit units cannot be UTF-16.
    if ((kUnbounded != size) && (0 != (size % 2)))
    {
        return -1;
    }

    const uint16_t* cursor = static_cast<const uint16_t*>(data_ptr);
    bool awaiting_low = false;     // a high surrogate was read; a low surrogate must follow
    bool saw_swapped_bom = false;  // 0xFFFE seen: data is in the opposite byte order
    bool saw_native_bom = false;   // 0xFEFF seen: data is in host byte order
    int ch_count = 0;

    // Test the remaining size first so that a sized buffer without a zero
    // terminator is never read past its end.
    while ((0 != size) && (0 != *cursor))
    {
        uint16_t unit = *cursor;
        ++cursor;
        if (kUnbounded != size)
        {
            size -= 2;
        }

        // Byte order marks are recognised on the raw (unswapped) unit.
        if ((0xFFFE == unit) || (0xFEFF == unit))
        {
            if (awaiting_low)
            {
                return -1;
            }
            if (0xFFFE == unit)
            {
                saw_swapped_bom = true;
            }
            else
            {
                saw_native_bom = true;
            }
            if (saw_swapped_bom && saw_native_bom)
            {
                return -1;
            }
            continue;
        }

        if (saw_swapped_bom)
        {
            unit = static_cast<uint16_t>((unit >> 8) | (unit << 8));
        }

        const bool is_high = (unit >= 0xD800) && (unit <= 0xDBFF);
        const bool is_low = (unit >= 0xDC00) && (unit <= 0xDFFF);

        if (awaiting_low)
        {
            // Track the pending state explicitly: the high surrogate 0xD800
            // has a zero offset, so a "high bits > 0" check would miss it.
            if (!is_low)
            {
                return -1;
            }
            awaiting_low = false;
            ++ch_count;
        }
        else if (is_high)
        {
            awaiting_low = true;
        }
        else if (is_low)
        {
            return -1;
        }
        else
        {
            ++ch_count;
        }
    }

    // Input ended right after a high surrogate.
    if (awaiting_low)
    {
        return -1;
    }
    return ch_count;
}
// Count the characters in UTF-8 / UTF-16 encoded text.
// First published 2024-11-07 09:52:30.