使用DeepSeek编写DuckDB支持中文字符集编码的read_csv表函数

众所周知,Pandas的read_csv函数支持指定字符集,但DuckDB不支持,虽然可以用Linux iconv命令事先转换,但是对于一般用户执行命令行有所不便。所以,让DeepSeek实现了一个。

提示词如下

请再实现一个duckdb的read_csv_e(文件名,编码名)表函数,编码支持’gbk’,‘gb2312’,‘gb18030’,‘big5’。要求支持读入这4种编码文件,查询正常显示,插入duckdb表时采用UTF8

源代码如下,csv_reader2.hpp文件

#pragma once
#include "duckdb.hpp"
#include <iconv.h>
#include <fstream>
#include <vector>

namespace duckdb {

struct CSVGlobalState {
    std::unique_ptr<std::ifstream> file;
    bool finished = false;
};

class CSVEncodingConverter {
public:
    static Value ConvertEncoding(const string &input, const string &from_encoding, const string &to_encoding = "UTF-8");
};

class ReadCSVEFunction {
public:
    static TableFunction GetFunction();
    
    struct BindData : public TableFunctionData {
        string filename;
        string encoding;
        vector<LogicalType> return_types;
        vector<string> return_names;
    };

    static unique_ptr<FunctionData> Bind(ClientContext &context, 
                                        TableFunctionBindInput &input,
                                        vector<LogicalType> &return_types,
                                        vector<string> &return_names);

    static unique_ptr<GlobalTableFunctionState> InitGlobal(ClientContext &context, 
                                                         TableFunctionInitInput &input);

    static void Function(ClientContext &context, 
                       TableFunctionInput &data, 
                       DataChunk &output);
};

} // namespace duckdb

csv_reader2.cpp文件

#include "csv_reader2.hpp"
#include <sstream>
#include <stdexcept>

using namespace duckdb;
using namespace std;

Value CSVEncodingConverter::ConvertEncoding(const string &input, const string &from_encoding, const string &to_encoding) {
    if (from_encoding == "utf8" || from_encoding == "UTF-8") {
        return Value(input);
    }

    iconv_t cd = iconv_open(to_encoding.c_str(), from_encoding.c_str());
    if (cd == (iconv_t)-1) {
        throw std::runtime_error("iconv_open failed for " + from_encoding + " to " + to_encoding);
    }

    size_t in_bytes_left = input.size();
    char *in_buf = const_cast<char*>(input.data());

    size_t out_bytes_left = input.size() * 4;
    vector<char> out_buf(out_bytes_left);
    char *out_ptr = out_buf.data();

    if (iconv(cd, &in_buf, &in_bytes_left, &out_ptr, &out_bytes_left) == (size_t)-1) {
        iconv_close(cd);
        throw std::runtime_error("iconv conversion failed");
    }

    iconv_close(cd);
    return Value(string(out_buf.data(), out_ptr - out_buf.data()));
}

duckdb::unique_ptr<FunctionData> ReadCSVEFunction::Bind(ClientContext &context, 
                                              TableFunctionBindInput &input,
                                              vector<LogicalType> &return_types,
                                              vector<string> &return_names) {
    auto result = duckdb::make_uniq<BindData>();
    
    result->filename = input.inputs[0].GetValue<string>();
    result->encoding = StringUtil::Lower(input.inputs[1].GetValue<string>());
    
    static const vector<string> SUPPORTED_ENCODINGS = {"gbk", "gb2312", "gb18030", "big5", "utf8"};
    if (find(SUPPORTED_ENCODINGS.begin(), SUPPORTED_ENCODINGS.end(), result->encoding) == SUPPORTED_ENCODINGS.end()) {
        throw BinderException("Unsupported encoding. Supported encodings are: gbk, gb2312, gb18030, big5");
    }

    ifstream file(result->filename);
    if (!file.is_open()) {
        throw IOException("Could not open file: " + result->filename);
    }

    string header_line;
    getline(file, header_line);
    file.close();
    // 新增:去除标题行末尾的\r
    if (!header_line.empty() && header_line.back() == '\r') {
        header_line.pop_back();
    }
        
    string utf8_header = CSVEncodingConverter::ConvertEncoding(header_line, result->encoding).GetValue<string>();
    
    stringstream ss(utf8_header);
    string column_name;
    while (getline(ss, column_name, ',')) {
        return_names.push_back(column_name);
        return_types.push_back(LogicalType::VARCHAR);
    }

    result->return_types = return_types;
    result->return_names = return_names;
    return std::move(result);
}

// DuckDB 1.3.0兼容版本
struct CSVFunctionGlobalState : public GlobalTableFunctionState {
	public:  // 添加这行
    duckdb::unique_ptr<CSVGlobalState> csv_state;
    //unique_ptr<CSVGlobalState> csv_state;
};

duckdb::unique_ptr<GlobalTableFunctionState> ReadCSVEFunction::InitGlobal(ClientContext &context, 
                                                                 TableFunctionInitInput &input) {
    auto result = duckdb::make_uniq<CSVFunctionGlobalState>();
    auto &bind_data = (BindData &)*input.bind_data;
    
    result->csv_state = duckdb::make_uniq<CSVGlobalState>();
    result->csv_state->file = std::make_unique<ifstream>(bind_data.filename);
    
    if (!result->csv_state->file->is_open()) {
        throw IOException("Could not open file: " + bind_data.filename);
    }
    
    // 跳过标题行
    string dummy;
    getline(*result->csv_state->file, dummy);
    
    return std::move(result);
}

void ReadCSVEFunction::Function(ClientContext &context, 
                               TableFunctionInput &data, 
                               DataChunk &output) {
    auto &bind_data = (BindData &)*data.bind_data;
    auto &global_state = ((CSVFunctionGlobalState &)*data.global_state).csv_state;
    
    if (!global_state->file || global_state->finished) {
        output.SetCardinality(0);
        return;
    }

    idx_t count = 0;
    while (count < STANDARD_VECTOR_SIZE) {
        string line;
        if (!getline(*global_state->file, line)) {
            global_state->finished = true;
            global_state->file->close();
            break;
        }

        // 新增:去除Windows换行符的\r
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
                
        string utf8_line = CSVEncodingConverter::ConvertEncoding(line, bind_data.encoding).GetValue<string>();
        
        stringstream ss(utf8_line);
        string value;
        idx_t col_idx = 0;
        
        while (getline(ss, value, ',') && col_idx < output.ColumnCount()) {
            output.data[col_idx].SetValue(count, Value(value));
            col_idx++;
        }
        
        count++;
    }
    
    output.SetCardinality(count);
}

TableFunction ReadCSVEFunction::GetFunction() {
    return TableFunction("read_csv_e", 
                        {LogicalType::VARCHAR, LogicalType::VARCHAR},
                        Function,
                        Bind,
                        InitGlobal);
}

测试文件testcsv.cpp

#include "duckdb.hpp"
//#include "readpg5.cpp"
#include "csv_reader2.hpp"
#include <iostream>
#include "duckdb/main/extension_util.hpp"
#include "duckdb/main/query_result.hpp"
#include "duckdb/common/box_renderer.hpp"
using namespace duckdb;
using namespace std;

int main() {
    DuckDB db(nullptr);
    Connection con(db);

    try {
        DatabaseInstance& db_instance = *db.instance;
        //ExtensionUtil::RegisterFunction(db_instance, PGTableFunction::GetFunction());
        // 在main函数中添加:
        ExtensionUtil::RegisterFunction(db_instance, ReadCSVEFunction::GetFunction());
    } catch (const exception &e) {
        cerr << "初始化错误: " << e.what() << endl;
        return 1;
    }


    cout << "=== 测试1: 查询中文编码csv表函数 ===" << endl;
    auto result = con.Query("SELECT * FROM read_csv_e('/par/gbk_file.csv', 'gbk') LIMIT 10");
    if (result->HasError()) {
        cerr << "查询错误: " << result->GetError() << endl;
    } else {
        result->Print();
    }
    
    
    cout << "=== 测试2: 测试创建UTF-8表 ===" << endl;
    con.Query("CREATE TABLE utf8_data AS SELECT * FROM read_csv_e('/par/big5_file.csv', 'big5')");
    result = con.Query("SELECT * FROM utf8_data");
    if (result->HasError()) {
        cerr << "查询错误: " << result->GetError() << endl;
    } else {
        result->Print();
    }    
    
    cout << "=== 测试3: 多编码测试 ===" << endl;

    result = con.Query("SELECT a.*, b.* FROM read_csv_e('gb18030_file.csv', 'gb18030') a JOIN read_csv_e('big5_file.csv', 'big5') b ON a.id = b.id");
    if (result->HasError()) {
        cerr << "查询错误: " << result->GetError() << endl;
    } else {
        //result->Print();
                            // 获取当前客户端上下文
            auto &context = *con.context;
            
            // 创建默认的 BoxRenderer 配置
            BoxRendererConfig config;
            
            // 调用 ToBox 方法获取格式化字符串
            std::string box_output = result->ToBox(context, config);
            
            // 输出结果
            std::cout << box_output << std::endl;
    }  
     cout << "\n=== 测试完成 ===" << endl;
    return 0;
}

编译命令行

export LIBRARY_PATH=/par/duck/build/src
export LD_LIBRARY_PATH=/par/duck/build/src
g++ -std=c++17 -o readcsv csv_reader2.cpp testcsv.cpp -lduckdb -liconv -I /par/duck/src/include

编译很顺利,证明系统中有iconv.h文件,但是链接时出错了,找不到动态链接库/usr/bin/ld: cannot find -liconv: No such file or directory,按照这篇文章提供的方法解决了。复制如下:

1.手动下载安装
wget https://ftp.gnu.org/pub/gnu/libiconv/libiconv-1.18.tar.gz
2.在当前目录下解压,并进入解压后的文件夹,后续步骤全程用管理员权限进行
tar -zxf libiconv-1.18.tar.gz
cd libiconv-1.18
3.编译与安装
./configure --prefix=/usr/local
make
make install
4.将其链接至/usr/lib/中
ln -s /usr/local/lib/libiconv.so.2 /usr/lib/libiconv.so.2
ldconfig

我没有做第4步,所以运行时还报一个错误./readcsv: error while loading shared libraries: libiconv.so.2: cannot open shared object file: No such file or directory, 用export LD_LIBRARY_PATH=/usr/local/lib:/par/duck/build/src把存放libiconv.so的/usr/local/lib加入搜索路径就好了。所以还是应该做到位。
再编辑一个csv文件gbk_file.csv,内容如下:

id, lang , code
1, 中文,GBK
2, 〇镕镚閫閬, GBK
3, 烎玊奣嘦勥巭嫑恏兲氼忈炛, GBK

我是用Windows下的Notepad4编辑的。也可以用其他编辑器,如vi编辑。再另存为BIG5、GB18030等编码文件备用,注意BIG5字符集收录字符较少,有些字用??代替了。
下面是运行结果:

=== 测试1: 查询中文编码csv表函数 ===
id       lang    code
VARCHAR VARCHAR VARCHAR
[ Rows: 3]
1        中文   GBK
2        〇镕镚閫閬      GBK
3        烎玊奣嘦勥巭嫑恏兲氼忈炛        GBK


=== 测试2: 测试创建UTF-8表 ===
id       lang    code
VARCHAR VARCHAR VARCHAR
[ Rows: 3]
1        中文   BIG5
2        ???閫閬         BIG5
3        烎玊??????????  BIG5


=== 测试3: 多编码测试 ===
┌─────────┬───────────────────────────┬──────────┬─────────┬─────────────────┬─────────┐
│   id    │           lang            │   code   │   id    │      lang       │   code  │
│ varchar │          varchar          │ varchar  │ varchar │     varchar     │ varchar │
├─────────┼───────────────────────────┼──────────┼─────────┼─────────────────┼─────────┤
│ 1       │  中文                     │ GB18030  │ 1       │  中文           │ BIG5    │
│ 2       │  〇镕镚閫閬               │  GB18030 │ 2       │  ???閫閬        │  BIG5   │
│ 3       │  烎玊奣嘦勥巭嫑恏兲氼忈炛 │  GB18030 │ 3       │  烎玊?????????? │  BIG5   │
└─────────┴───────────────────────────┴──────────┴─────────┴─────────────────┴─────────┘


=== 测试完成 ===
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值