让DeepSeek参照libxls的xls2csv例子实现的DuckDB read_xls表函数

前文实现了支持中文字符集的DuckDB read_csv_e表函数,考虑到read_excel不支持传统xls格式文件,在经过选型测试后选定了libxls作为实现读xls文件的工具库。
把libxls库自带的xls2csv例子和read_csv_e表函数的头文件和cpp这两个文件一起传给DeepSeek,提示词如下:

请整合附件xls转csv文件和duckdb读入csv文件表函数的代码,建立Duckdb读入xls文件表函数read_xls,参数比read_csv_e增加一个sheet编号(或sheet名)

可能是因为我删除了上传文件中的汉字,返回的说明和代码注释是纯英文的,不过正确性没问题,除了一处需要指定命名空间xls,我加上主函数包含测试用例一次编译通过。我们必须对DeepSeek对于已有的正确实现的改写和整合能力点赞。

实现源代码read_xls.cpp

#pragma once
#include "duckdb.hpp"
#include <iconv.h>
#include <vector>
#include "libxls-1.6.3/include/xls.h"
using namespace xls;
namespace duckdb {

struct XLSGlobalState {
    xlsWorkBook* workbook;
    xlsWorkSheet* worksheet;
    unsigned int current_row;
    bool finished;
};

class XLSEncodingConverter {
public:
    static Value ConvertEncoding(const string &input, const string &from_encoding, const string &to_encoding = "UTF-8");
};

class ReadXLSFunction {
public:
    static TableFunction GetFunction();
    
    struct BindData : public TableFunctionData {
        string filename;
        string encoding;
        string sheet_name;
        int sheet_num = -1; // -1 means not specified
        vector<LogicalType> return_types;
        vector<string> return_names;
    };

    static unique_ptr<FunctionData> Bind(ClientContext &context, 
                                        TableFunctionBindInput &input,
                                        vector<LogicalType> &return_types,
                                        vector<string> &return_names);

    static unique_ptr<GlobalTableFunctionState> InitGlobal(ClientContext &context, 
                                                         TableFunctionInitInput &input);

    static void Function(ClientContext &context, 
                       TableFunctionInput &data, 
                       DataChunk &output);

private:
    static void OutputString(const char *string, char stringSeparator, Vector &output, idx_t count);
    static void OutputNumber(double number, Vector &output, idx_t count);
    static void DetermineColumns(xlsWorkSheet* pWS, vector<string> &names, vector<LogicalType> &types);
};

Value XLSEncodingConverter::ConvertEncoding(const string &input, const string &from_encoding, const string &to_encoding) {
    if (from_encoding == "utf8" || from_encoding == "UTF-8") {
        return Value(input);
    }

    iconv_t cd = iconv_open(to_encoding.c_str(), from_encoding.c_str());
    if (cd == (iconv_t)-1) {
        throw std::runtime_error("iconv_open failed for " + from_encoding + " to " + to_encoding);
    }

    size_t in_bytes_left = input.size();
    char *in_buf = const_cast<char*>(input.data());

    size_t out_bytes_left = input.size() * 4;
    vector<char> out_buf(out_bytes_left);
    char *out_ptr = out_buf.data();

    if (iconv(cd, &in_buf, &in_bytes_left, &out_ptr, &out_bytes_left) == (size_t)-1) {
        iconv_close(cd);
        throw std::runtime_error("iconv conversion failed");
    }

    iconv_close(cd);
    return Value(string(out_buf.data(), out_ptr - out_buf.data()));
}

void ReadXLSFunction::DetermineColumns(xlsWorkSheet* pWS, vector<string> &names, vector<LogicalType> &types) {
    // Use first row as column names
    xlsRow* first_row = xls_row(pWS, 0);
    if (!first_row) return;

    for (WORD col = 0; col <= pWS->rows.lastcol; col++) {
        xlsCell *cell = xls_cell(pWS, 0, col);
        if (!cell || cell->isHidden) continue;

        string col_name;
        if (cell->str) {
            col_name = string((char*)cell->str);
        } else {
            col_name = "column_" + to_string(col + 1);
        }
        names.push_back(col_name);
        types.push_back(LogicalType::VARCHAR); // Default to VARCHAR, could be refined
    }
}

unique_ptr<FunctionData> ReadXLSFunction::Bind(ClientContext &context, 
                                              TableFunctionBindInput &input,
                                              vector<LogicalType> &return_types,
                                              vector<string> &return_names) {
    auto result = make_uniq<BindData>();
    
    // Get filename
    result->filename = input.inputs[0].GetValue<string>();
    
    // Get encoding (default to UTF-8)
    if (input.inputs.size() > 1) {
        result->encoding = StringUtil::Lower(input.inputs[1].GetValue<string>());
    } else {
        result->encoding = "UTF-8";
    }
    
    // Get sheet name or number
    if (input.inputs.size() > 2) {
        if (input.inputs[2].type() == LogicalType::VARCHAR) {
            result->sheet_name = input.inputs[2].GetValue<string>();
        } else if (input.inputs[2].type().IsNumeric()) {
            result->sheet_num = input.inputs[2].GetValue<int>();
        }
    }

    // Open workbook to get sheet info
    xls_error_t error = LIBXLS_OK;
    xlsWorkBook* pWB = xls_open_file(result->filename.c_str(), result->encoding.c_str(), &error);
    if (!pWB) {
        throw IOException("Error reading XLS file: " + string(xls_getError(error)));
    }

    // Validate sheet selection
    if (result->sheet_num >= 0) {
        if (result->sheet_num >= (int)pWB->sheets.count) {
            xls_close(pWB);
            throw BinderException("Sheet number " + to_string(result->sheet_num) + 
                                 " not found, file only has " + to_string(pWB->sheets.count) + " sheets");
        }
    } else if (!result->sheet_name.empty()) {
        bool found = false;
        for (unsigned int i = 0; i < pWB->sheets.count; i++) {
            if (pWB->sheets.sheet[i].name && 
                result->sheet_name == (char *)pWB->sheets.sheet[i].name) {
                result->sheet_num = i;
                found = true;
                break;
            }
        }
        if (!found) {
            xls_close(pWB);
            throw BinderException("Sheet \"" + result->sheet_name + "\" not found");
        }
    } else {
        // Default to first sheet
        result->sheet_num = 0;
    }

    // Open worksheet to determine columns
    xlsWorkSheet* pWS = xls_getWorkSheet(pWB, result->sheet_num);
    xls_parseWorkSheet(pWS);
    
    DetermineColumns(pWS, return_names, return_types);
    
    result->return_types = return_types;
    result->return_names = return_names;
    
    xls_close_WS(pWS);
    xls_close(pWB);
    
    return std::move(result);
}

struct XLSFunctionGlobalState : public GlobalTableFunctionState {
    duckdb::unique_ptr<XLSGlobalState> xls_state;
};

unique_ptr<GlobalTableFunctionState> ReadXLSFunction::InitGlobal(ClientContext &context, 
                                                               TableFunctionInitInput &input) {
    auto result = make_uniq<XLSFunctionGlobalState>();
    auto &bind_data = (BindData &)*input.bind_data;
    
    result->xls_state = make_uniq<XLSGlobalState>();
    
    // Open workbook
    xls_error_t error = LIBXLS_OK;
    result->xls_state->workbook = xls_open_file(bind_data.filename.c_str(), bind_data.encoding.c_str(), &error);
    if (!result->xls_state->workbook) {
        throw IOException("Error reading XLS file: " + string(xls_getError(error)));
    }
    
    // Open worksheet
    int sheet_num = bind_data.sheet_num >= 0 ? bind_data.sheet_num : 0;
    result->xls_state->worksheet = xls_getWorkSheet(result->xls_state->workbook, sheet_num);
    xls_parseWorkSheet(result->xls_state->worksheet);
    
    result->xls_state->current_row = 1; // Skip header row
    result->xls_state->finished = false;
    
    return std::move(result);
}

void ReadXLSFunction::OutputString(const char *string, char stringSeparator, Vector &output, idx_t count) {
    std::string result;
    //result += stringSeparator;
    
    for (const char *str = string; *str; str++) {
        if (*str == stringSeparator) {
            result += stringSeparator;
            result += stringSeparator;
        } else if (*str == '\\') {
            result += "\\\\";
        } else {
            result += *str;
        }
    }

    // 直接输出字符串内容,不加双引号
    //result += stringSeparator;
    output.SetValue(count, Value(result));
}

void ReadXLSFunction::OutputNumber(double number, Vector &output, idx_t count) {
    char buffer[50];
    snprintf(buffer, sizeof(buffer), "%.15g", number);
    output.SetValue(count, Value(buffer));
}

void ReadXLSFunction::Function(ClientContext &context, 
                             TableFunctionInput &data, 
                             DataChunk &output) {
    auto &bind_data = (BindData &)*data.bind_data;
    auto &global_state = ((XLSFunctionGlobalState &)*data.global_state).xls_state;
    
    if (!global_state->worksheet || global_state->finished) {
        output.SetCardinality(0);
        return;
    }

    idx_t count = 0;
    while (count < STANDARD_VECTOR_SIZE) {
        if (global_state->current_row > (unsigned int)global_state->worksheet->rows.lastrow) {
            global_state->finished = true;
            break;
        }

        // Process cells in this row
        for (WORD col = 0; col < output.ColumnCount(); col++) {
            xlsCell *cell = xls_cell(global_state->worksheet, global_state->current_row, col);
            
            if (!cell || cell->isHidden) {
                output.data[col].SetValue(count, Value());
                continue;
            }

            // Handle different cell types
            if (cell->id == XLS_RECORD_RK || cell->id == XLS_RECORD_MULRK || cell->id == XLS_RECORD_NUMBER) {
                OutputNumber(cell->d, output.data[col], count);
            } else if (cell->id == XLS_RECORD_FORMULA || cell->id == XLS_RECORD_FORMULA_ALT) {
                if (cell->l == 0) {
                    OutputNumber(cell->d, output.data[col], count);
                } else if (cell->str) {
                    if (!strcmp((char *)cell->str, "bool")) {
                        OutputString((int)cell->d ? "true" : "false", '"', output.data[col], count);
                    } else if (!strcmp((char *)cell->str, "error")) {
                        OutputString("*error*", '"', output.data[col], count);
                    } else {
                        OutputString((char *)cell->str, '"', output.data[col], count);
                    }
                } else {
                    output.data[col].SetValue(count, Value());
                }
            } else if (cell->str) {
                OutputString((char *)cell->str, '"', output.data[col], count);
            } else {
                output.data[col].SetValue(count, Value());
            }
        }

        global_state->current_row++;
        count++;
    }
    
    output.SetCardinality(count);
}

TableFunction ReadXLSFunction::GetFunction() {
    TableFunction table_func("read_xls", 
                        {LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::ANY},
                        Function,
                        Bind,
                        InitGlobal);
    // 设置默认参数值
    table_func.named_parameters["encoding"] = LogicalType::VARCHAR;
    table_func.named_parameters["sheet"] = LogicalType::ANY;
    //table_func.named_parameters["encoding"] = Value("UTF-8");
    //table_func.named_parameters["sheet"] = Value(0); // 默认第一个工作表
    
    return table_func;                        
}

} // namespace duckdb

测试代码test_xls.cpp

#include "duckdb.hpp"
//#include "readpg5.cpp"
#include "read_xls.cpp"
#include <iostream>
#include "duckdb/main/extension_util.hpp"
#include "duckdb/main/query_result.hpp"
#include "duckdb/common/box_renderer.hpp"
using namespace duckdb;
using namespace std;

int main() {
    DuckDB db(nullptr);
    Connection con(db);
    // Register the function
    try {
        DatabaseInstance& db_instance = *db.instance;

        ExtensionUtil::RegisterFunction(db_instance, ReadXLSFunction::GetFunction());
    } catch (const exception &e) {
        cerr << "init err" << e.what() << endl;
        return 1;
    }

    // Read first sheet with default encoding (UTF-8)
    auto result = con.Query("SELECT * FROM read_xls('file.xls')");

    if (result->HasError()) {
        cerr << "query err " << result->GetError() << endl;
    } else {
        result->Print();
    }
    
    
    // Read sheet by name with specific encoding
    result = con.Query("SELECT * FROM read_xls('file.xls', 'GBK', 'Sheet2')");

    if (result->HasError()) {
        cerr << "query err " << result->GetError() << endl;
    } else {
        result->Print();
    }    
    
    // Read sheet by number (0-based index)
    result = con.Query("SELECT * FROM read_xls('file.xls', 'UTF-8', 1)");
    //result = con.Query("SELECT a.*, b.* FROM read_csv_e('gb18030_file.csv', 'gb18030') a JOIN read_csv_e('big5_file.csv', 'big5') b ON a.id = b.id");
    if (result->HasError()) {
        cerr << "query err " << result->GetError() << endl;
    } else {
        //result->Print();

            auto &context = *con.context;
            BoxRendererConfig config;
            std::string box_output = result->ToBox(context, config);
            std::cout << box_output << std::endl;
    }  

    return 0;
}

执行结果如下:

 ./readxls
query err Binder Error: No function matches the given name and argument types 'read_xls(VARCHAR)'. You might need to add explicit type casts.
        Candidate functions:
        read_xls(VARCHAR, VARCHAR, ANY)


LINE 1: SELECT * FROM read_xls('file.xls')
                      ^
ID      Name    Value
VARCHAR VARCHAR VARCHAR
[ Rows: 3]
1       "Tom"   12.4
2       "Jerry" 3
3       "张一"  -12


┌─────────┬─────────┬─────────┐
│   ID    │  Name   │  Value  │
│ varchar │ varchar │ varchar │
├─────────┼─────────┼─────────┤
│ 1"Tom"12.4    │
│ 2"Jerry"3       │
│ 3"张一"-12     │
└─────────┴─────────┴─────────┘

csv文件通常需要在字符串左右加上分隔符,原例子就是如此。但在表中不需要,所以让他去掉了两边的双引号。但默认打开第1个sheet实现不对,他给出的修订代码default_parameters参数在类中不存在,编译错误error: 'class duckdb::TableFunction' has no member named 'default_parameters', 还是注释掉了,留待以后解决。
如果要测试以上代码,请自行建立一个传统的file.xls, 并在Sheet2中填充内容。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值