前文实现了支持中文字符集的DuckDB read_csv_e表函数,考虑到read_excel不支持传统xls格式文件,在经过选型测试后选定了libxls作为实现读xls文件的工具库。
把libxls库自带的xls2csv例子和read_csv_e表函数的头文件和cpp这两个文件一起传给DeepSeek,提示词如下:
请整合附件xls转csv文件和duckdb读入csv文件表函数的代码,建立Duckdb读入xls文件表函数read_xls,参数比read_csv_e增加一个sheet编号(或sheet名)
可能是因为我删除了上传文件中的汉字,返回的说明和代码注释是纯英文的,不过正确性没问题,除了一处需要指定命名空间xls,我加上主函数包含测试用例一次编译通过。我们必须对DeepSeek对于已有的正确实现的改写和整合能力点赞。
实现源代码read_xls.cpp
#pragma once
#include "duckdb.hpp"
#include <iconv.h>
#include <vector>
#include "libxls-1.6.3/include/xls.h"
using namespace xls;
namespace duckdb {
struct XLSGlobalState {
xlsWorkBook* workbook;
xlsWorkSheet* worksheet;
unsigned int current_row;
bool finished;
};
class XLSEncodingConverter {
public:
static Value ConvertEncoding(const string &input, const string &from_encoding, const string &to_encoding = "UTF-8");
};
class ReadXLSFunction {
public:
static TableFunction GetFunction();
struct BindData : public TableFunctionData {
string filename;
string encoding;
string sheet_name;
int sheet_num = -1; // -1 means not specified
vector<LogicalType> return_types;
vector<string> return_names;
};
static unique_ptr<FunctionData> Bind(ClientContext &context,
TableFunctionBindInput &input,
vector<LogicalType> &return_types,
vector<string> &return_names);
static unique_ptr<GlobalTableFunctionState> InitGlobal(ClientContext &context,
TableFunctionInitInput &input);
static void Function(ClientContext &context,
TableFunctionInput &data,
DataChunk &output);
private:
static void OutputString(const char *string, char stringSeparator, Vector &output, idx_t count);
static void OutputNumber(double number, Vector &output, idx_t count);
static void DetermineColumns(xlsWorkSheet* pWS, vector<string> &names, vector<LogicalType> &types);
};
Value XLSEncodingConverter::ConvertEncoding(const string &input, const string &from_encoding, const string &to_encoding) {
if (from_encoding == "utf8" || from_encoding == "UTF-8") {
return Value(input);
}
iconv_t cd = iconv_open(to_encoding.c_str(), from_encoding.c_str());
if (cd == (iconv_t)-1) {
throw std::runtime_error("iconv_open failed for " + from_encoding + " to " + to_encoding);
}
size_t in_bytes_left = input.size();
char *in_buf = const_cast<char*>(input.data());
size_t out_bytes_left = input.size() * 4;
vector<char> out_buf(out_bytes_left);
char *out_ptr = out_buf.data();
if (iconv(cd, &in_buf, &in_bytes_left, &out_ptr, &out_bytes_left) == (size_t)-1) {
iconv_close(cd);
throw std::runtime_error("iconv conversion failed");
}
iconv_close(cd);
return Value(string(out_buf.data(), out_ptr - out_buf.data()));
}
void ReadXLSFunction::DetermineColumns(xlsWorkSheet* pWS, vector<string> &names, vector<LogicalType> &types) {
// Use first row as column names
xlsRow* first_row = xls_row(pWS, 0);
if (!first_row) return;
for (WORD col = 0; col <= pWS->rows.lastcol; col++) {
xlsCell *cell = xls_cell(pWS, 0, col);
if (!cell || cell->isHidden) continue;
string col_name;
if (cell->str) {
col_name = string((char*)cell->str);
} else {
col_name = "column_" + to_string(col + 1);
}
names.push_back(col_name);
types.push_back(LogicalType::VARCHAR); // Default to VARCHAR, could be refined
}
}
unique_ptr<FunctionData> ReadXLSFunction::Bind(ClientContext &context,
TableFunctionBindInput &input,
vector<LogicalType> &return_types,
vector<string> &return_names) {
auto result = make_uniq<BindData>();
// Get filename
result->filename = input.inputs[0].GetValue<string>();
// Get encoding (default to UTF-8)
if (input.inputs.size() > 1) {
result->encoding = StringUtil::Lower(input.inputs[1].GetValue<string>());
} else {
result->encoding = "UTF-8";
}
// Get sheet name or number
if (input.inputs.size() > 2) {
if (input.inputs[2].type() == LogicalType::VARCHAR) {
result->sheet_name = input.inputs[2].GetValue<string>();
} else if (input.inputs[2].type().IsNumeric()) {
result->sheet_num = input.inputs[2].GetValue<int>();
}
}
// Open workbook to get sheet info
xls_error_t error = LIBXLS_OK;
xlsWorkBook* pWB = xls_open_file(result->filename.c_str(), result->encoding.c_str(), &error);
if (!pWB) {
throw IOException("Error reading XLS file: " + string(xls_getError(error)));
}
// Validate sheet selection
if (result->sheet_num >= 0) {
if (result->sheet_num >= (int)pWB->sheets.count) {
xls_close(pWB);
throw BinderException("Sheet number " + to_string(result->sheet_num) +
" not found, file only has " + to_string(pWB->sheets.count) + " sheets");
}
} else if (!result->sheet_name.empty()) {
bool found = false;
for (unsigned int i = 0; i < pWB->sheets.count; i++) {
if (pWB->sheets.sheet[i].name &&
result->sheet_name == (char *)pWB->sheets.sheet[i].name) {
result->sheet_num = i;
found = true;
break;
}
}
if (!found) {
xls_close(pWB);
throw BinderException("Sheet \"" + result->sheet_name + "\" not found");
}
} else {
// Default to first sheet
result->sheet_num = 0;
}
// Open worksheet to determine columns
xlsWorkSheet* pWS = xls_getWorkSheet(pWB, result->sheet_num);
xls_parseWorkSheet(pWS);
DetermineColumns(pWS, return_names, return_types);
result->return_types = return_types;
result->return_names = return_names;
xls_close_WS(pWS);
xls_close(pWB);
return std::move(result);
}
struct XLSFunctionGlobalState : public GlobalTableFunctionState {
duckdb::unique_ptr<XLSGlobalState> xls_state;
};
unique_ptr<GlobalTableFunctionState> ReadXLSFunction::InitGlobal(ClientContext &context,
TableFunctionInitInput &input) {
auto result = make_uniq<XLSFunctionGlobalState>();
auto &bind_data = (BindData &)*input.bind_data;
result->xls_state = make_uniq<XLSGlobalState>();
// Open workbook
xls_error_t error = LIBXLS_OK;
result->xls_state->workbook = xls_open_file(bind_data.filename.c_str(), bind_data.encoding.c_str(), &error);
if (!result->xls_state->workbook) {
throw IOException("Error reading XLS file: " + string(xls_getError(error)));
}
// Open worksheet
int sheet_num = bind_data.sheet_num >= 0 ? bind_data.sheet_num : 0;
result->xls_state->worksheet = xls_getWorkSheet(result->xls_state->workbook, sheet_num);
xls_parseWorkSheet(result->xls_state->worksheet);
result->xls_state->current_row = 1; // Skip header row
result->xls_state->finished = false;
return std::move(result);
}
void ReadXLSFunction::OutputString(const char *string, char stringSeparator, Vector &output, idx_t count) {
std::string result;
//result += stringSeparator;
for (const char *str = string; *str; str++) {
if (*str == stringSeparator) {
result += stringSeparator;
result += stringSeparator;
} else if (*str == '\\') {
result += "\\\\";
} else {
result += *str;
}
}
// 直接输出字符串内容,不加双引号
//result += stringSeparator;
output.SetValue(count, Value(result));
}
void ReadXLSFunction::OutputNumber(double number, Vector &output, idx_t count) {
char buffer[50];
snprintf(buffer, sizeof(buffer), "%.15g", number);
output.SetValue(count, Value(buffer));
}
void ReadXLSFunction::Function(ClientContext &context,
TableFunctionInput &data,
DataChunk &output) {
auto &bind_data = (BindData &)*data.bind_data;
auto &global_state = ((XLSFunctionGlobalState &)*data.global_state).xls_state;
if (!global_state->worksheet || global_state->finished) {
output.SetCardinality(0);
return;
}
idx_t count = 0;
while (count < STANDARD_VECTOR_SIZE) {
if (global_state->current_row > (unsigned int)global_state->worksheet->rows.lastrow) {
global_state->finished = true;
break;
}
// Process cells in this row
for (WORD col = 0; col < output.ColumnCount(); col++) {
xlsCell *cell = xls_cell(global_state->worksheet, global_state->current_row, col);
if (!cell || cell->isHidden) {
output.data[col].SetValue(count, Value());
continue;
}
// Handle different cell types
if (cell->id == XLS_RECORD_RK || cell->id == XLS_RECORD_MULRK || cell->id == XLS_RECORD_NUMBER) {
OutputNumber(cell->d, output.data[col], count);
} else if (cell->id == XLS_RECORD_FORMULA || cell->id == XLS_RECORD_FORMULA_ALT) {
if (cell->l == 0) {
OutputNumber(cell->d, output.data[col], count);
} else if (cell->str) {
if (!strcmp((char *)cell->str, "bool")) {
OutputString((int)cell->d ? "true" : "false", '"', output.data[col], count);
} else if (!strcmp((char *)cell->str, "error")) {
OutputString("*error*", '"', output.data[col], count);
} else {
OutputString((char *)cell->str, '"', output.data[col], count);
}
} else {
output.data[col].SetValue(count, Value());
}
} else if (cell->str) {
OutputString((char *)cell->str, '"', output.data[col], count);
} else {
output.data[col].SetValue(count, Value());
}
}
global_state->current_row++;
count++;
}
output.SetCardinality(count);
}
TableFunction ReadXLSFunction::GetFunction() {
TableFunction table_func("read_xls",
{LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::ANY},
Function,
Bind,
InitGlobal);
// 设置默认参数值
table_func.named_parameters["encoding"] = LogicalType::VARCHAR;
table_func.named_parameters["sheet"] = LogicalType::ANY;
//table_func.named_parameters["encoding"] = Value("UTF-8");
//table_func.named_parameters["sheet"] = Value(0); // 默认第一个工作表
return table_func;
}
} // namespace duckdb
测试代码test_xls.cpp
#include "duckdb.hpp"
//#include "readpg5.cpp"
#include "read_xls.cpp"
#include <iostream>
#include "duckdb/main/extension_util.hpp"
#include "duckdb/main/query_result.hpp"
#include "duckdb/common/box_renderer.hpp"
using namespace duckdb;
using namespace std;
int main() {
DuckDB db(nullptr);
Connection con(db);
// Register the function
try {
DatabaseInstance& db_instance = *db.instance;
ExtensionUtil::RegisterFunction(db_instance, ReadXLSFunction::GetFunction());
} catch (const exception &e) {
cerr << "init err" << e.what() << endl;
return 1;
}
// Read first sheet with default encoding (UTF-8)
auto result = con.Query("SELECT * FROM read_xls('file.xls')");
if (result->HasError()) {
cerr << "query err " << result->GetError() << endl;
} else {
result->Print();
}
// Read sheet by name with specific encoding
result = con.Query("SELECT * FROM read_xls('file.xls', 'GBK', 'Sheet2')");
if (result->HasError()) {
cerr << "query err " << result->GetError() << endl;
} else {
result->Print();
}
// Read sheet by number (0-based index)
result = con.Query("SELECT * FROM read_xls('file.xls', 'UTF-8', 1)");
//result = con.Query("SELECT a.*, b.* FROM read_csv_e('gb18030_file.csv', 'gb18030') a JOIN read_csv_e('big5_file.csv', 'big5') b ON a.id = b.id");
if (result->HasError()) {
cerr << "query err " << result->GetError() << endl;
} else {
//result->Print();
auto &context = *con.context;
BoxRendererConfig config;
std::string box_output = result->ToBox(context, config);
std::cout << box_output << std::endl;
}
return 0;
}
执行结果如下:
./readxls
query err Binder Error: No function matches the given name and argument types 'read_xls(VARCHAR)'. You might need to add explicit type casts.
Candidate functions:
read_xls(VARCHAR, VARCHAR, ANY)
LINE 1: SELECT * FROM read_xls('file.xls')
^
ID Name Value
VARCHAR VARCHAR VARCHAR
[ Rows: 3]
1 "Tom" 12.4
2 "Jerry" 3
3 "张一" -12
┌─────────┬─────────┬─────────┐
│ ID │ Name │ Value │
│ varchar │ varchar │ varchar │
├─────────┼─────────┼─────────┤
│ 1 │ "Tom" │ 12.4 │
│ 2 │ "Jerry" │ 3 │
│ 3 │ "张一" │ -12 │
└─────────┴─────────┴─────────┘
csv文件通常需要在字符串左右加上分隔符,原例子就是如此。但在表中不需要,所以让他去掉了两边的双引号。但默认打开第1个sheet实现不对,他给出的修订代码default_parameters
参数在类中不存在,编译错误error: 'class duckdb::TableFunction' has no member named 'default_parameters'
, 还是注释掉了,留待以后解决。
如果要测试以上代码,请自行建立一个传统的file.xls, 并在Sheet2中填充内容。