UTF-8 CPP的使用

最新推荐文章于 2025-03-02 16:10:17 发布

fengbingchun

最新推荐文章于 2025-03-02 16:10:17 发布

阅读量4.4k

点赞数 2

CC 4.0 BY-SA版权

分类专栏： C/C++/C++11/14/17/20

本文链接：https://blog.youkuaiyun.com/fengbingchun/article/details/47681289

C/C++/C++11/14/17/20 专栏收录该内容

261 篇文章

订阅专栏

本文介绍了一个轻量级的UTF-8编码字符串库UTF-8CPP的使用方法，包括如何验证文件是否为有效的UTF-8编码、修复无效的UTF-8序列以及进行UTF-8与UTF-16之间的转换。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

UTF-8 CPP是一个简单、小巧、轻量级、跨平台的UTF-8编码字符串库。

下面对其使用方法进行简单的介绍：

1. 从http://sourceforge.net/projects/utfcpp/下载最新的utf8_v2_3_4.zip源码，将其解压缩；

2. 新建一个VS2013控制台工程TestUTF-8CPP，将UTF-8 CPP源码中utf8.h所在的目录加入到工程的包含目录中；

3. 参考http://utfcpp.sourceforge.net/，测试代码内容为：

#include "stdafx.h"

#include <assert.h>
#include <cstring>

#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#include "utf8.h"

// Reports whether the file named `file_name` contains valid UTF-8 encoded text.
// The bytes are streamed straight from the file's stream buffer into
// utf8::is_valid, so the content is never collected into a string first.
// Returns false as well when the file cannot be opened.
bool valid_utf8_file(const char* file_name)
{
    std::ifstream input(file_name);
    if (input) {
        std::istreambuf_iterator<char> first(input.rdbuf());
        std::istreambuf_iterator<char> last; // default-constructed == end of stream
        return utf8::is_valid(first, last);
    }

    return false; // throwing here would report the open failure more clearly
}

// Replaces every invalid UTF-8 sequence in `str` with the Unicode replacement
// character. `str` is modified in place; on return it holds the repaired text.
void fix_utf8_string(std::string& str)
{
    std::string repaired;
    utf8::replace_invalid(str.begin(), str.end(), std::back_inserter(repaired));

    // Swap rather than copy-assign: the repaired buffer is handed over to the
    // caller's string without duplicating its contents.
    str.swap(repaired);
}

// Demo entry point for the UTF-8 CPP library.
//
// 1. Opens the sample file and checks that it is valid UTF-8 as a whole.
// 2. For each line: looks for invalid sequences, prints the number of code
//    points, and round-trips the valid part through UTF-16.
// 3. Runs a handful of fixed conversions and checks their results.
//
// Returns 0 on success, and -1 when the file cannot be opened, the file fails
// validation, or the final wide-string comparison fails.
int main(int argc, char* argv[])
{
    const char* test_file_path = "../../../demo/test.txt";
    // Open the test file (expected to contain UTF-8 encoded text)
    std::ifstream fs8(test_file_path);
    if (!fs8.is_open()) {
        std::cout << "Could not open " << test_file_path << std::endl;
        return -1;
    }

    // Reject the file up front if it is not valid UTF-8, and say so instead of
    // exiting silently.
    if (!valid_utf8_file(test_file_path)) {
        std::cout << test_file_path << " is not valid UTF-8" << std::endl;
        return -1;
    }

    unsigned line_count = 1;
    std::string line;
    // Play with all the lines in the file
    while (std::getline(fs8, line)) {
        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
        std::string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
        if (end_it != line.end()) {
            std::cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
            std::cout << "This part is fine: " << std::string(line.begin(), end_it) << "\n";
        }

        // Get the line length (at least for the valid part). The iterator
        // difference type is kept as-is instead of being narrowed to int.
        const std::string::difference_type length = utf8::distance(line.begin(), end_it);
        std::cout << "Length of line " << line_count << " is " << length << "\n";

        // Convert it to utf-16
        std::vector<unsigned short> utf16line;
        utf8::utf8to16(line.begin(), end_it, std::back_inserter(utf16line));

        // And back to utf-8
        std::string utf8line;
        utf8::utf16to8(utf16line.begin(), utf16line.end(), std::back_inserter(utf8line));

        // Confirm that the conversion went OK:
        if (utf8line != std::string(line.begin(), end_it))
            std::cout << "Error in UTF-16 conversion at line: " << line_count << "\n";

        line_count++;
    }

    // Plain ASCII: four input bytes become four UTF-16 code units.
    std::string str = "ABCD";
    std::vector<unsigned short> utf16result;
    utf8::utf8to16(str.begin(), str.end(), std::back_inserter(utf16result));
    assert(utf16result.size() == 4);

    // NOTE(review): this passes the individual bytes of a narrow string to
    // utf16to8 as if each byte were a UTF-16 code unit. The outcome depends on
    // how the compiler encodes the literal and on whether char is signed, so
    // the result is not checked. Confirm what this was meant to demonstrate.
    std::string str2 = "濦粿夿旴";
    std::string utf8str;
    utf8::utf16to8(str2.begin(), str2.end(), std::back_inserter(utf8str));

    // Three code points in UTF-8; the last one lies outside the BMP and must
    // come out as a surrogate pair, giving four UTF-16 code units in total.
    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
    std::vector<unsigned short> utf16result1;
    utf8::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, std::back_inserter(utf16result1));
    assert(utf16result1.size() == 4);
    assert(utf16result1[2] == 0xd834);
    assert(utf16result1[3] == 0xdd1e);

    // The reverse direction: five UTF-16 code units (the last two forming a
    // surrogate pair) encode to 1 + 2 + 3 + 4 = 10 UTF-8 bytes.
    unsigned short utf16string[] = { 0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e };
    std::vector<unsigned char> utf8result;
    utf8::utf16to8(utf16string, utf16string + 5, std::back_inserter(utf8result));
    assert(utf8result.size() == 10);

    // A string literal may only be bound to a pointer-to-const.
    const char* szSex = "\xe7\x94\xb7\x00";
    std::basic_string<wchar_t> sex;
    utf8::utf8to16(szSex, szSex + std::strlen(szSex), std::back_inserter(sex));

    // NOTE(review): the wide literal below relies on the compiler decoding this
    // source file's encoding correctly - confirm for the toolchain in use.
    if (sex != L"男") {
        std::cout << "unicode char utf16 error" << std::endl;
        return -1;
    }

    std::cout << "ok!" << std::endl;

    return 0;
}

GitHub：https://github.com/fengbingchun/UTF-8CPP_Test