众所周知,Pandas的read_csv函数支持指定字符集,但DuckDB不支持,虽然可以用Linux iconv
命令事先转换,但是对于一般用户执行命令行有所不便。所以,让DeepSeek实现了一个。
提示词如下
请再实现一个duckdb的read_csv_e(文件名,编码名)表函数,编码支持’gbk’,‘gb2312’,‘gb18030’,‘big5’。要求支持读入这4种编码文件,查询正常显示,插入duckdb表时采用UTF8
源代码如下,csv_reader2.hpp
文件
#pragma once
#include "duckdb.hpp"
#include <iconv.h>
#include <fstream>
#include <vector>
namespace duckdb {
struct CSVGlobalState {
std::unique_ptr<std::ifstream> file;
bool finished = false;
};
class CSVEncodingConverter {
public:
static Value ConvertEncoding(const string &input, const string &from_encoding, const string &to_encoding = "UTF-8");
};
class ReadCSVEFunction {
public:
static TableFunction GetFunction();
struct BindData : public TableFunctionData {
string filename;
string encoding;
vector<LogicalType> return_types;
vector<string> return_names;
};
static unique_ptr<FunctionData> Bind(ClientContext &context,
TableFunctionBindInput &input,
vector<LogicalType> &return_types,
vector<string> &return_names);
static unique_ptr<GlobalTableFunctionState> InitGlobal(ClientContext &context,
TableFunctionInitInput &input);
static void Function(ClientContext &context,
TableFunctionInput &data,
DataChunk &output);
};
} // namespace duckdb
csv_reader2.cpp
文件
#include "csv_reader2.hpp"
#include <sstream>
#include <stdexcept>
using namespace duckdb;
using namespace std;
Value CSVEncodingConverter::ConvertEncoding(const string &input, const string &from_encoding, const string &to_encoding) {
if (from_encoding == "utf8" || from_encoding == "UTF-8") {
return Value(input);
}
iconv_t cd = iconv_open(to_encoding.c_str(), from_encoding.c_str());
if (cd == (iconv_t)-1) {
throw std::runtime_error("iconv_open failed for " + from_encoding + " to " + to_encoding);
}
size_t in_bytes_left = input.size();
char *in_buf = const_cast<char*>(input.data());
size_t out_bytes_left = input.size() * 4;
vector<char> out_buf(out_bytes_left);
char *out_ptr = out_buf.data();
if (iconv(cd, &in_buf, &in_bytes_left, &out_ptr, &out_bytes_left) == (size_t)-1) {
iconv_close(cd);
throw std::runtime_error("iconv conversion failed");
}
iconv_close(cd);
return Value(string(out_buf.data(), out_ptr - out_buf.data()));
}
duckdb::unique_ptr<FunctionData> ReadCSVEFunction::Bind(ClientContext &context,
TableFunctionBindInput &input,
vector<LogicalType> &return_types,
vector<string> &return_names) {
auto result = duckdb::make_uniq<BindData>();
result->filename = input.inputs[0].GetValue<string>();
result->encoding = StringUtil::Lower(input.inputs[1].GetValue<string>());
static const vector<string> SUPPORTED_ENCODINGS = {"gbk", "gb2312", "gb18030", "big5", "utf8"};
if (find(SUPPORTED_ENCODINGS.begin(), SUPPORTED_ENCODINGS.end(), result->encoding) == SUPPORTED_ENCODINGS.end()) {
throw BinderException("Unsupported encoding. Supported encodings are: gbk, gb2312, gb18030, big5");
}
ifstream file(result->filename);
if (!file.is_open()) {
throw IOException("Could not open file: " + result->filename);
}
string header_line;
getline(file, header_line);
file.close();
// 新增:去除标题行末尾的\r
if (!header_line.empty() && header_line.back() == '\r') {
header_line.pop_back();
}
string utf8_header = CSVEncodingConverter::ConvertEncoding(header_line, result->encoding).GetValue<string>();
stringstream ss(utf8_header);
string column_name;
while (getline(ss, column_name, ',')) {
return_names.push_back(column_name);
return_types.push_back(LogicalType::VARCHAR);
}
result->return_types = return_types;
result->return_names = return_names;
return std::move(result);
}
// DuckDB 1.3.0兼容版本
struct CSVFunctionGlobalState : public GlobalTableFunctionState {
public: // 添加这行
duckdb::unique_ptr<CSVGlobalState> csv_state;
//unique_ptr<CSVGlobalState> csv_state;
};
duckdb::unique_ptr<GlobalTableFunctionState> ReadCSVEFunction::InitGlobal(ClientContext &context,
TableFunctionInitInput &input) {
auto result = duckdb::make_uniq<CSVFunctionGlobalState>();
auto &bind_data = (BindData &)*input.bind_data;
result->csv_state = duckdb::make_uniq<CSVGlobalState>();
result->csv_state->file = std::make_unique<ifstream>(bind_data.filename);
if (!result->csv_state->file->is_open()) {
throw IOException("Could not open file: " + bind_data.filename);
}
// 跳过标题行
string dummy;
getline(*result->csv_state->file, dummy);
return std::move(result);
}
void ReadCSVEFunction::Function(ClientContext &context,
TableFunctionInput &data,
DataChunk &output) {
auto &bind_data = (BindData &)*data.bind_data;
auto &global_state = ((CSVFunctionGlobalState &)*data.global_state).csv_state;
if (!global_state->file || global_state->finished) {
output.SetCardinality(0);
return;
}
idx_t count = 0;
while (count < STANDARD_VECTOR_SIZE) {
string line;
if (!getline(*global_state->file, line)) {
global_state->finished = true;
global_state->file->close();
break;
}
// 新增:去除Windows换行符的\r
if (!line.empty() && line.back() == '\r') {
line.pop_back();
}
string utf8_line = CSVEncodingConverter::ConvertEncoding(line, bind_data.encoding).GetValue<string>();
stringstream ss(utf8_line);
string value;
idx_t col_idx = 0;
while (getline(ss, value, ',') && col_idx < output.ColumnCount()) {
output.data[col_idx].SetValue(count, Value(value));
col_idx++;
}
count++;
}
output.SetCardinality(count);
}
TableFunction ReadCSVEFunction::GetFunction() {
return TableFunction("read_csv_e",
{LogicalType::VARCHAR, LogicalType::VARCHAR},
Function,
Bind,
InitGlobal);
}
测试文件testcsv.cpp
#include "duckdb.hpp"
//#include "readpg5.cpp"
#include "csv_reader2.hpp"
#include <iostream>
#include "duckdb/main/extension_util.hpp"
#include "duckdb/main/query_result.hpp"
#include "duckdb/common/box_renderer.hpp"
using namespace duckdb;
using namespace std;
int main() {
DuckDB db(nullptr);
Connection con(db);
try {
DatabaseInstance& db_instance = *db.instance;
//ExtensionUtil::RegisterFunction(db_instance, PGTableFunction::GetFunction());
// 在main函数中添加:
ExtensionUtil::RegisterFunction(db_instance, ReadCSVEFunction::GetFunction());
} catch (const exception &e) {
cerr << "初始化错误: " << e.what() << endl;
return 1;
}
cout << "=== 测试1: 查询中文编码csv表函数 ===" << endl;
auto result = con.Query("SELECT * FROM read_csv_e('/par/gbk_file.csv', 'gbk') LIMIT 10");
if (result->HasError()) {
cerr << "查询错误: " << result->GetError() << endl;
} else {
result->Print();
}
cout << "=== 测试2: 测试创建UTF-8表 ===" << endl;
con.Query("CREATE TABLE utf8_data AS SELECT * FROM read_csv_e('/par/big5_file.csv', 'big5')");
result = con.Query("SELECT * FROM utf8_data");
if (result->HasError()) {
cerr << "查询错误: " << result->GetError() << endl;
} else {
result->Print();
}
cout << "=== 测试3: 多编码测试 ===" << endl;
result = con.Query("SELECT a.*, b.* FROM read_csv_e('gb18030_file.csv', 'gb18030') a JOIN read_csv_e('big5_file.csv', 'big5') b ON a.id = b.id");
if (result->HasError()) {
cerr << "查询错误: " << result->GetError() << endl;
} else {
//result->Print();
// 获取当前客户端上下文
auto &context = *con.context;
// 创建默认的 BoxRenderer 配置
BoxRendererConfig config;
// 调用 ToBox 方法获取格式化字符串
std::string box_output = result->ToBox(context, config);
// 输出结果
std::cout << box_output << std::endl;
}
cout << "\n=== 测试完成 ===" << endl;
return 0;
}
编译命令行
export LIBRARY_PATH=/par/duck/build/src
export LD_LIBRARY_PATH=/par/duck/build/src
g++ -std=c++17 -o readcsv csv_reader2.cpp testcsv.cpp -lduckdb -liconv -I /par/duck/src/include
编译很顺利,证明系统中有iconv.h文件,但是链接时出错了,找不到动态链接库/usr/bin/ld: cannot find -liconv: No such file or directory
,按照这篇文章提供的方法解决了。复制如下:
1.手动下载安装
wget https://ftp.gnu.org/pub/gnu/libiconv/libiconv-1.18.tar.gz
2.在当前目录下解压,并进入解压后的文件夹,后续步骤全程用管理员权限进行
tar -zxf libiconv-1.18.tar.gz
cd libiconv-1.18
3.编译与安装
./configure --prefix=/usr/local
make
make install
4.将其链接至/usr/lib/中
ln -s /usr/local/lib/libiconv.so.2 /usr/lib/libiconv.so.2
ldconfig
我没有做第4步,所以运行时还报一个错误./readcsv: error while loading shared libraries: libiconv.so.2: cannot open shared object file: No such file or directory
, 用export LD_LIBRARY_PATH=/usr/local/lib:/par/duck/build/src
把存放libiconv.so的/usr/local/lib
加入搜索路径就好了。所以还是应该做到位。
再编辑一个csv文件gbk_file.csv
,内容如下:
id, lang , code
1, 中文,GBK
2, 〇镕镚閫閬, GBK
3, 烎玊奣嘦勥巭嫑恏兲氼忈炛, GBK
我是用Windows下的Notepad4编辑的。也可以用其他编辑器,如vi编辑。再另存为BIG5、GB18030等编码文件备用,注意BIG5字符集收录字符较少,有些字用??代替了。
下面是运行结果:
=== 测试1: 查询中文编码csv表函数 ===
id lang code
VARCHAR VARCHAR VARCHAR
[ Rows: 3]
1 中文 GBK
2 〇镕镚閫閬 GBK
3 烎玊奣嘦勥巭嫑恏兲氼忈炛 GBK
=== 测试2: 测试创建UTF-8表 ===
id lang code
VARCHAR VARCHAR VARCHAR
[ Rows: 3]
1 中文 BIG5
2 ???閫閬 BIG5
3 烎玊?????????? BIG5
=== 测试3: 多编码测试 ===
┌─────────┬───────────────────────────┬──────────┬─────────┬─────────────────┬─────────┐
│ id │ lang │ code │ id │ lang │ code │
│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
├─────────┼───────────────────────────┼──────────┼─────────┼─────────────────┼─────────┤
│ 1 │ 中文 │ GB18030 │ 1 │ 中文 │ BIG5 │
│ 2 │ 〇镕镚閫閬 │ GB18030 │ 2 │ ???閫閬 │ BIG5 │
│ 3 │ 烎玊奣嘦勥巭嫑恏兲氼忈炛 │ GB18030 │ 3 │ 烎玊?????????? │ BIG5 │
└─────────┴───────────────────────────┴──────────┴─────────┴─────────────────┴─────────┘
=== 测试完成 ===