Utf8码表输出

原创已于 2024-09-22 13:29:41 修改 · 361 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#c++ #开发语言

于 2024-09-21 20:56:09 首次发布

C++ 同时被 3 个专栏收录

87 篇文章

订阅专栏

UTF8

4 篇文章

订阅专栏

UTF8码表

1 篇文章

订阅专栏

main.cpp

#include <cstdarg>
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <windows.h>
#include <tchar.h>

// printf-style formatting into a std::string.
std::string FormatA(LPCSTR pFormat, ...);
// Writes str to the file strFile in binary mode; returns false on failure.
bool ToFileA(const std::string& strFile, const std::string& str);
// Builds the code table for [uStart, uEnd] and writes it to a text file.
// code / ch / hex are the column indexes of the code point, the character
// and the hex byte dump; a negative index disables that column.
void UTF8_Output(uint32_t uStart, uint32_t uEnd, int code, int ch, int hex, const std::string& strFileNamePrefix);
// printf-style output to the process standard output.
void ConsoleOutput(const char* pFormat, ...);
// Prints the command line usage in English and Chinese.
void PrintHelp();

/// Program entry point: parses the command line and writes one UTF-8 code table.
///
/// Options are matched case-insensitively and, except for -help, take the next
/// argument as their value:
///   -start / -end        first / last code point (parsed as hexadecimal)
///   -code / -ch / -hex   column index of the code point, character, hex bytes
///   -name                output file name prefix
///   -help                print usage and exit
/// Arguments that match no option are skipped.
///
/// @return 0 on success or when help was explicitly requested,
///         -1 when fewer than two arguments were supplied.
int main(int argc, char* argv[])
{
    uint32_t uStart = 0x4E00;
    uint32_t uEnd = 0x9FFF;
    int codeIndex = 0;
    int chIndex = 1;
    int hexIndex = 2;
    std::string strFileNamePrefix = "Utf8";

    if (argc < 3)
    {
        PrintHelp();

        // "CUtf8.exe -help" is a successful request, not a usage error.
        const bool bHelpRequested = (2 == argc) && (0 == _stricmp(argv[1], "-help"));
        return bHelpRequested ? 0 : -1;
    }

    for (int i = 1; i < argc; i++)
    {
        if (0 == _stricmp(argv[i], "-help"))
        {
            PrintHelp();
            return 0;
        }

        // Every option below needs a value; an option given as the very last
        // argument has none and falls through unmatched.
        const bool bHasValue = (i + 1 < argc);

        if (0 == _stricmp(argv[i], "-start") && bHasValue)
        {
            uStart = static_cast<uint32_t>(strtoul(argv[++i], nullptr, 16));
            continue;
        }

        if (0 == _stricmp(argv[i], "-end") && bHasValue)
        {
            uEnd = static_cast<uint32_t>(strtoul(argv[++i], nullptr, 16));
            continue;
        }

        // Column indexes are read with base 16 like the other numeric options;
        // for the documented values -1..2 this equals the decimal reading.
        if (0 == _stricmp(argv[i], "-code") && bHasValue)
        {
            codeIndex = static_cast<int>(strtol(argv[++i], nullptr, 16));
            continue;
        }

        if (0 == _stricmp(argv[i], "-ch") && bHasValue)
        {
            chIndex = static_cast<int>(strtol(argv[++i], nullptr, 16));
            continue;
        }

        if (0 == _stricmp(argv[i], "-hex") && bHasValue)
        {
            hexIndex = static_cast<int>(strtol(argv[++i], nullptr, 16));
            continue;
        }

        if (0 == _stricmp(argv[i], "-name") && bHasValue)
        {
            strFileNamePrefix = argv[++i];
            continue;
        }
    }

    const clock_t tmBegin = ::clock();
    UTF8_Output(uStart, uEnd, codeIndex, chIndex, hexIndex, strFileNamePrefix);
    const clock_t tmEnd = ::clock();

    // clock() counts in CLOCKS_PER_SEC units; convert explicitly and pass a
    // type that matches the format specifier.
    const long nElapsedMs = static_cast<long>(
        static_cast<double>(tmEnd - tmBegin) * 1000.0 / CLOCKS_PER_SEC);
    ConsoleOutput("Output cost time: %ld ms\r\n", nElapsedMs);

    return 0;
}

/// Prints the command line usage to the console, first in English and then
/// in Chinese. Both sections describe the same options.
void PrintHelp()
{
    ConsoleOutput("==========Utf8 Output usage==========\r\n");
    ConsoleOutput("example: ");
    ConsoleOutput("\r\n");
    ConsoleOutput("    CUtf8.exe -start 4E00 -end 9fff -code 0 -ch 1 -hex 2 -name utf8\r\n");
    ConsoleOutput("argument: ");
    ConsoleOutput("\r\n");
    ConsoleOutput("    -start: Unicode code point start position (Range: 0x00 - 0x1FFFFF, Default: 0x4E00)\r\n");
    ConsoleOutput("      -end: Unicode code point end position (Range: 0x00 - 0x1FFFFF, Default: 0x9FFF)\r\n");
    // A negative index disables the column, hence the lower bound of -1.
    ConsoleOutput("     -code: Code column index (Range: -1 - 2, Default: 0)\r\n");
    ConsoleOutput("       -ch: Character column index (Range: -1 - 2, Default: 1)\r\n");
    ConsoleOutput("      -hex: Hex column index (Range: -1 - 2, Default: 2)\r\n");
    ConsoleOutput("     -name: Output filename prefix (Default: Utf8)\r\n");
    ConsoleOutput("\r\n");

    ConsoleOutput("==========Utf8 输出用法==========\r\n");
    ConsoleOutput("示例: ");
    ConsoleOutput("\r\n");
    ConsoleOutput("    CUtf8.exe -start 4E00 -end 9fff -code 0 -ch 1 -hex 2 -name utf8\r\n");
    ConsoleOutput("命令参数: ");
    ConsoleOutput("\r\n");
    ConsoleOutput("    -start: Unicode 码位起始位置 (范围: 0x00 - 0x1FFFFF, 默认: 0x4E00)\r\n");
    ConsoleOutput("      -end: Unicode 码位结束位置 (范围: 0x00 - 0x1FFFFF, 默认: 0x9FFF)\r\n");
    ConsoleOutput("     -code: 码位列序号 (范围: -1 - 2, 默认: 0)\r\n");
    ConsoleOutput("       -ch: 字符列序号 (范围: -1 - 2, 默认: 1)\r\n");
    ConsoleOutput("      -hex: 十六进制序号 (范围: -1 - 2, 默认: 2)\r\n");
    ConsoleOutput("     -name: 输出文件名前缀 (默认: Utf8)\r\n");
    ConsoleOutput("\r\n");
}

void ConsoleOutput(const char* pFormat, ...)
{
    size_t nCchCount = MAX_PATH;
    std::string strResult(nCchCount, 0);
    va_list args;

    va_start(args, pFormat);

    do
    {
        //格式化输出字符串
        int nSize = _vsnprintf_s(&strResult[0], nCchCount, _TRUNCATE, pFormat, args);
        if (-1 != nSize)
        {
            HANDLE console = ::GetStdHandle(STD_OUTPUT_HANDLE);
            ::WriteConsoleA(console, strResult.c_str(), nSize, NULL, NULL);
            break;
        }

        //缓冲大小超限终止
        if (nCchCount >= INT32_MAX)
        {
            break;
        }

        //重新分配缓冲
        nCchCount *= 2;
        strResult.resize(nCchCount);

    } while (true);

    va_end(args);
}

void UTF8_Output(uint32_t uStart, uint32_t uEnd, int code, int ch, int hex, const std::string& strFileNamePrefix)
{
    uint8_t szBuf[MAX_PATH] = { 0 };
    std::string strOutput;
    strOutput += "\xef\xbb\xbf";

    int nOutColumn = 0;

    if (code >= 0) nOutColumn++;
    if (ch >= 0) nOutColumn++;
    if (hex >= 0) nOutColumn++;

    for (uint32_t i = uStart; i <= uEnd; i++)
    {
        std::string strhex;
        std::string strCode;

        // 1字节
        // 0xxxxxxx
        if (i >= 0x00000000 && i <= 0x0000007F)
        {
            szBuf[0] = i;
            szBuf[1] = 0;
            strhex = FormatA("0x%02X", szBuf[0]);
            strCode = FormatA("U+%02X", i);
        }

        // 2字节
        // 110xxxxx 10xxxxxx
        if (i >= 0x00000080 && i <= 0x000007FF)
        {
            szBuf[0] = ((i >> 6) & 0x1F) | 0xC0;
            szBuf[1] = ( i & 0x3F) | 0x80;
            szBuf[2] = 0;
            strhex = FormatA("0x%02X%02X", szBuf[0], szBuf[1]);
            strCode = FormatA("U+%04X", i);
        }

        // 3字节
        // 1110xxxx 10xxxxxx 10xxxxxx
        if (i >= 0x00000800 && i <= 0x0000FFFF)
        {
            szBuf[0] = ((i >> 12) & 0x0F) | 0xE0;
            szBuf[1] = ((i >> 6) & 0x3F) | 0x80;
            szBuf[2] = (i & 0x3F) | 0x80;
            szBuf[3] = 0;
            strhex = FormatA("0x%02X%02X%02X", szBuf[0], szBuf[1], szBuf[2]);
            strCode = FormatA("U+%04X", i);
        }

        // 4字节
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if (i >= 0x00010000 && i <= 0x001FFFFF)
        {
            szBuf[0] = ((i >> 18) & 0x07) | 0xF0;
            szBuf[1] = ((i >> 12) & 0x3F) | 0x80;
            szBuf[2] = ((i >> 6) & 0x3F) | 0x80;
            szBuf[3] = (i & 0x3F) | 0x80;
            szBuf[4] = 0;
            strhex = FormatA("0x%02X%02X%02X%02X", szBuf[0], szBuf[1], szBuf[2], szBuf[3]);
            strCode = FormatA("U+%06X", i);
        }

        // 5字节
        // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
        if (i >= 0x00200000 && i <= 0x03FFFFFF)
        {
            szBuf[0] = ((i >> 24) & 0x03) | 0xF8;
            szBuf[1] = ((i >> 18) & 0x3F) | 0x80;
            szBuf[2] = ((i >> 12) & 0x3F) | 0x80;
            szBuf[3] = ((i >> 6) & 0x3F) | 0x80;
            szBuf[4] = (i & 0x3F) | 0x80;
            szBuf[5] = 0;
            strhex = FormatA("0x%02X%02X%02X%02X%02X", szBuf[0], szBuf[1], szBuf[2], szBuf[3], szBuf[4]);
            strCode = FormatA("U+%08X", i);
        }

        // 6字节
        // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
        if (i >= 0x04000000 && i <= 0x7FFFFFFF)
        {
            szBuf[0] = ((i >> 30) & 0x01) | 0xF8;
            szBuf[1] = ((i >> 24) & 0x3F) | 0x80;
            szBuf[2] = ((i >> 18) & 0x3F) | 0x80;
            szBuf[3] = ((i >> 12) & 0x3F) | 0x80;
            szBuf[4] = ((i >> 6) & 0x3F) | 0x80;
            szBuf[5] = (i & 0x3F) | 0x80;
            szBuf[6] = 0;
            strhex = FormatA("0x%02X%02X%02X%02X%02X%02X", szBuf[0], szBuf[1], szBuf[2], szBuf[3], szBuf[4], szBuf[5]);
            strCode = FormatA("U+%08X", i);
        }

        for (int nIndex = 0; nIndex < nOutColumn; nIndex++)
        {
            if (nIndex == code)
            {
                strOutput += strCode;
                if (nIndex < 2)
                {
                    strOutput += " ";
                }
                continue;
            }

            if (nIndex == ch)
            {
                strOutput += (char*)szBuf;
                if (nIndex < 2)
                {
                    strOutput += " ";
                }
                continue;
            }

            if (nIndex == hex)
            {
                strOutput += strhex;
                if (nIndex < 2)
                {
                    strOutput += " ";
                }
                continue;
            }
        }

        if (i < uEnd)
        {
            strOutput += "\r\n";
        }
    }

    std::string strFileName = strFileNamePrefix;
    strFileName += FormatA("_%08X_%08X", uStart, uEnd);

    for (int nIndex = 0; nIndex < nOutColumn; nIndex++)
    {
        if (nIndex == code)
        {
            strFileName += "_code";
            continue;
        }

        if (nIndex == ch)
        {
            strFileName += "_ch";
            continue;
        }

        if (nIndex == hex)
        {
            strFileName += "_hex";
            continue;
        }
    }

    uint32_t uCount = 0;
    if (uStart <= uEnd)
    {
        uCount = uEnd - uStart + 1;
    }

    strFileName += FormatA("(%d)", uCount);
    strFileName += ".txt";

    ToFileA(strFileName, strOutput);
}

/// Formats a printf-style message into a std::string of exactly the required
/// length.
///
/// @param pFormat printf-style format string; a null pointer yields "".
/// @param ...     arguments consumed by pFormat.
/// @return the formatted text, or an empty string when formatting fails.
std::string FormatA(const char* pFormat, ...)
{
    std::string strResult;
    if (nullptr == pFormat)
    {
        return strResult;
    }

    va_list args;
    va_start(args, pFormat);

    // First pass only measures. It runs on a copy because a va_list may not
    // be traversed again after a v*printf call has consumed it.
    va_list argsMeasure;
    va_copy(argsMeasure, args);
    const int nSize = std::vsnprintf(nullptr, 0, pFormat, argsMeasure);
    va_end(argsMeasure);

    if (nSize > 0)
    {
        // One extra element for the terminating NUL that vsnprintf writes.
        strResult.resize(static_cast<size_t>(nSize) + 1);
        std::vsnprintf(&strResult[0], strResult.size(), pFormat, args);
        strResult.resize(static_cast<size_t>(nSize));
    }

    va_end(args);

    return strResult;
}

/// Writes str to the file strFile, replacing any existing content.
///
/// The file is opened in binary mode so the bytes (including CRLF sequences
/// and embedded NULs) are stored unchanged.
///
/// @param strFile path of the file to create or overwrite.
/// @param str     bytes to write.
/// @return true when the file was opened, written and closed without error.
bool ToFileA(const std::string& strFile, const std::string& str)
{
    std::ofstream outputFile(strFile.c_str(), std::ios::binary | std::ios::out);
    if (!outputFile.is_open())
    {
        return false;
    }

    outputFile.write(str.data(), static_cast<std::streamsize>(str.size()));

    // close() flushes; a failed write or flush leaves failbit/badbit set.
    outputFile.close();

    return !outputFile.fail();
}

Output.bat

rem Run from the directory that contains this script.
cd /d %~dp0

rem https://symbl.cc/cn/unicode-table/#cjk-symbols-and-punctuation

rem NOTE(review): the -name prefixes (utf8_1 / utf8_3) look like they are meant
rem to track the encoded byte length, but several ranges below do not match
rem that reading (e.g. 80-7ff, 20000 and up) - confirm before renaming.

rem Basic Latin
CUtf8.exe -start 00 -end 7f -code 0 -ch 1 -hex 2 -name utf8_1

rem Extended Latin (range 80-7FF)
CUtf8.exe -start 80 -end 7ff -code 0 -ch 1 -hex 2 -name utf8_1

rem IPA Extensions
CUtf8.exe -start 250 -end 2AF -code 0 -ch 1 -hex 2 -name utf8_1

rem CJK Compatibility
CUtf8.exe -start 3300 -end 33FF -code 0 -ch 1 -hex 2 -name utf8_3

rem CJK Unified Ideographs Extension A
CUtf8.exe -start 3400 -end 4DBF -code 0 -ch 1 -hex 2 -name utf8_3

rem Yijing Hexagram Symbols
CUtf8.exe -start 4DC0 -end 4DFF -code 0 -ch 1 -hex 2 -name utf8_3

rem CJK Unified Ideographs
CUtf8.exe -start 4E00 -end 9fff -code 0 -ch 1 -hex 2 -name utf8_3

rem Japanese Hiragana / Katakana
CUtf8.exe -start 3040 -end 30FF -code 0 -ch 1 -hex 2 -name utf8_3

rem CJK Compatibility Ideographs
CUtf8.exe -start F900 -end FAFF -code 0 -ch 1 -hex 2 -name utf8_3

rem CJK Unified Ideographs Extension B
CUtf8.exe -start 20000 -end 2A6DF -code 0 -ch 1 -hex 2 -name utf8_3

rem CJK Unified Ideographs Extension C
CUtf8.exe -start 2A700 -end 2B73F -code 0 -ch 1 -hex 2 -name utf8_3

rem CJK Unified Ideographs Extension D
CUtf8.exe -start 2B740 -end 2B81F -code 0 -ch 1 -hex 2 -name utf8_3

rem CJK Unified Ideographs Extension E
CUtf8.exe -start 2B820 -end 2CEAF -code 0 -ch 1 -hex 2 -name utf8_3

rem CJK Unified Ideographs Extension F
CUtf8.exe -start 2CEB0 -end 2EBEF -code 0 -ch 1 -hex 2 -name utf8_3

rem CJK Compatibility Ideographs Supplement
CUtf8.exe -start 2F800 -end 2FA1F -code 0 -ch 1 -hex 2 -name utf8_3

Gitee仓库: UTF8汉字输出