不同DuckDB插件对不同工具生成的xlsx文件读取速度的比较

上文提到的SharedString格式对读取性能的影响如何,下面用实验来说明。
1.生成两种格式文件
DuckDB的excel插件生成不带SharedString格式的xlsx文件,如下所示

load excel
copy (from read_csv('NYC_311_SR_2010-2020-sample-1M.csv'))to 'ducknyc.xlsx' WITH (FORMAT xlsx, HEADER true);
Run Time (s): real 62.449 user 59.112000 sys 0.852000

writexlsx库生成带SharedString格式的xlsx文件,我让DeepSeek分别用c和python编写了把csv文件转换为xlsx的程序,如下所示:

#include "xlsxwriter.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LINE_LENGTH 1024
#define MAX_FIELD_LENGTH 256

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <input.csv> <output.xlsx>\n", argv[0]);
        return 1;
    }

    const char *input_csv = argv[1];
    const char *output_xlsx = argv[2];

    // 打开 CSV 文件
    FILE *csv_file = fopen(input_csv, "r");
    if (!csv_file) {
        printf("Error: Could not open CSV file '%s'\n", input_csv);
        return 1;
    }

    // 创建工作簿和工作表
    lxw_workbook *workbook = workbook_new(output_xlsx);
    lxw_worksheet *worksheet = workbook_add_worksheet(workbook, NULL);

    char line[MAX_LINE_LENGTH];
    int row = 0;
    
    // 逐行读取 CSV 文件
    while (fgets(line, sizeof(line), csv_file)) {
        // 移除换行符
        line[strcspn(line, "\n")] = '\0';
        
        char *token;
        int col = 0;
        
        // 使用逗号分隔字段
        token = strtok(line, ",");
        while (token != NULL) {
            // 尝试将字段转换为数字
            char *endptr;
            double number = strtod(token, &endptr);
            
            if (*endptr == '\0') {
                // 如果是数字,写入数字
                worksheet_write_number(worksheet, row, col, number, NULL);
            } else {
                // 否则写入字符串
                worksheet_write_string(worksheet, row, col, token, NULL);
            }
            
            token = strtok(NULL, ",");
            col++;
        }
        
        row++;
    }

    // 关闭文件
    fclose(csv_file);
    
    // 保存并关闭工作簿
    lxw_error error = workbook_close(workbook);
    if (error) {
        printf("Error: Failed to create XLSX file: %s\n", lxw_strerror(error));
        return 1;
    }

    printf("Successfully converted '%s' to '%s'\n", input_csv, output_xlsx);
    return 0;
}

编译执行如下

git clone --depth=1 https://github.com/jmcnamara/libxlsxwriter
cd libxlsxwriter
make
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/par/libxlsxwriter/lib
gcc c2xx.c -o c2xx -lxlsxwriter -I /par/libxlsxwriter/include -L /par/libxlsxwriter/lib -O3

time ./c2xx nyc1m.csv cnyc.xlsx
Successfully converted 'nyc1m.csv' to 'cnyc.xlsx'

real	0m29.910s
user	0m28.432s
sys	0m1.420s

ls -la cnyc.xlsx
-rw-r--r-- 1 root root 188461713 Aug 15 09:10 cnyc.xlsx

c程序没有考虑转义字符的问题,转成的xlsx文件有点问题,但稀奇的是,sheetreader插件居然能识别。

import csv
import xlsxwriter
import sys

def csv_to_xlsx(input_csv, output_xlsx):
    """
    将 CSV 文件转换为 XLSX 文件
    
    参数:
        input_csv (str): 输入的 CSV 文件路径
        output_xlsx (str): 输出的 Excel 文件路径
    """
    try:
        # 创建工作簿和工作表
        workbook = xlsxwriter.Workbook(output_xlsx)
        worksheet = workbook.add_worksheet()
        
        # 设置格式 - 可选
        bold_format = workbook.add_format({'bold': True})
        date_format = workbook.add_format({'num_format': 'yyyy-mm-dd'})
        
        # 打开 CSV 文件
        with open(input_csv, 'r', encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file)
            
            for row_idx, row in enumerate(csv_reader):
                for col_idx, value in enumerate(row):
                    try:
                        # 尝试转换为浮点数
                        num_value = float(value)
                        if num_value.is_integer():
                            # 如果是整数,写入整数
                            worksheet.write_number(row_idx, col_idx, int(num_value))
                        else:
                            # 如果是浮点数,写入浮点数
                            worksheet.write_number(row_idx, col_idx, num_value)
                    except ValueError:
                        # 如果不能转换为数字,写入字符串
                        worksheet.write_string(row_idx, col_idx, value)
        
        # 关闭工作簿
        workbook.close()
        print(f"成功将 '{input_csv}' 转换为 '{output_xlsx}'")
        
    except FileNotFoundError:
        print(f"错误: 找不到输入文件 '{input_csv}'")
        sys.exit(1)
    except Exception as e:
        print(f"发生错误: {str(e)}")
        sys.exit(1)

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("用法: python csv_to_xlsx.py <input.csv> <output.xlsx>")
        sys.exit(1)
    
    input_csv = sys.argv[1]
    output_xlsx = sys.argv[2]
    
    csv_to_xlsx(input_csv, output_xlsx)

执行如下

python3 pip.pyz install xlsxwriter
time python3 c2xx.py "NYC_311_SR_2010-2020-sample-1M.csv" pynyc.xlsx
成功将 'NYC_311_SR_2010-2020-sample-1M.csv' 转换为 'pynyc.xlsx'

real	6m13.559s
user	6m4.476s
sys	0m5.680s

因为python有内置的csv读取函数,所以它正确地处理了转义,得到的xlsx文件格式正确。
xlsx文件及其中SharedString和sheet1.xml大小如下表所示:

转换工具xlsxSharedStringsheet1
duckdb excel249MB01.7GB
c188MB120MB1.1GB
python220MB107MB1.5GB

2.用不同插件读取
2.1用excel插件

D load excel;
--不带all_varchar=1选项读取python生成的文件失败
D select count(*) from 'pynyc.xlsx';
  0% ▕                                                            ▏ Run Time (s): real 3.805 user 7.372000 sys 0.132000
Invalid Input Error:
read_xlsx: Failed to parse cell 'I4018': Could not convert string '' to DOUBLE
--带all_varchar=1选项读取成功
D select count(*) from read_xlsx('pynyc.xlsx',all_varchar=1);
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000000     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 48.707 user 95.948000 sys 0.272000
header=1相当于自动判断格式,header=0相当于all_varchar,因为列标题都是字符串。

Run Time (s): real 0.111 user 0.176000 sys 0.016000
D select count(*) from read_xlsx('pynyc.xlsx',header=1);
  0% ▕                                                            ▏ Run Time (s): real 3.539 user 6.880000 sys 0.092000
Invalid Input Error:
read_xlsx: Failed to parse cell 'I4018': Could not convert string '' to DOUBLE
D select count(*) from read_xlsx('ducknyc.xlsx',header=1);
Run Time (s): real 3.411 user 5.300000 sys 0.312000
Invalid Input Error:
read_xlsx: Failed to parse cell 'N4': Could not convert string 'WILLOUGHBY AVENUE' to DOUBLE
D select count(*) from read_xlsx('cnyc.xlsx',header=1);
  0% ▕                                                            ▏ Run Time (s): real 3.804 user 7.516000 sys 0.084000
Invalid Input Error:
read_xlsx: Failed to parse cell 'I10': Could not convert string '1305 EAST 19 STREET' to DOUBLE
D 
D select count(*) from read_xlsx('pynyc.xlsx',header=0);
 
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000001     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 48.054 user 51.680000 sys 0.192000
D select count(*) from read_xlsx('ducknyc.xlsx',header=0);
 
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000001     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 44.848 user 87.600000 sys 0.444000
D select count(*) from read_xlsx('cnyc.xlsx',header=0);
 
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│    296663    │
└──────────────┘
Run Time (s): real 13.064 user 25.484000 sys 0.100000
D 

2.2用rusty_sheet插件


D select count(*) from read_sheet('pynyc.xlsx',header=1);
 
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000000     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 76.280 user 71.944000 sys 4.160000
D select count(*) from read_sheet('ducknyc.xlsx',header=1);
 
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000000     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 46.921 user 57.200000 sys 0.640000
D select count(*) from read_sheet('cnyc.xlsx',header=1);
Run Time (s): real 38.976 user 38.220000 sys 0.656000
Binder Error:
Read spreadsheet failed: Missing column name at 'AP1'
D 
D select count(*) from read_sheet('pynyc.xlsx',header=0);
 
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000001     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 77.455 user 93.204000 sys 4.148000
D select count(*) from read_sheet('ducknyc.xlsx',header=0);
 
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000001     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 49.145 user 61.764000 sys 0.704000
D select count(*) from read_sheet('cnyc.xlsx',header=0);
 
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000217     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 56.306 user 72.896000 sys 0.644000

2.3用sheetreader插件,因最高只支持Duckdb 1.2.2,故采用该版本测试。


./duckdb122
v1.2.2 7c039464e4
D .timer on
D load sheetreader;

D select count(*) from sheetreader('pynyc.xlsx',has_header=1);
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000000     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 17.722 user 46.544000 sys 0.368000
D select count(*) from sheetreader('ducknyc.xlsx',has_header=1);
Run Time (s): real 10.142 user 45.000000 sys 0.544000
Invalid Error:
Failed to retrieve shared strings file
D select count(*) from sheetreader('cnyc.xlsx',has_header=1);
Run Time (s): real 6.426 user 28.680000 sys 0.136000
Binder Error:
First row must contain only strings when has_header is set to true
D 
D select count(*) from sheetreader('pynyc.xlsx',has_header=0);
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000000     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 17.470 user 55.900000 sys 0.160000
D select count(*) from sheetreader('ducknyc.xlsx',has_header=0);
Run Time (s): real 10.124 user 44.872000 sys 0.536000
Invalid Error:
Failed to retrieve shared strings file
D select count(*) from sheetreader('cnyc.xlsx',has_header=0);
┌────────────────┐
│  count_star()  │
│     int64      │
├────────────────┤
│    1000217     │
│ (1.00 million) │
└────────────────┘
Run Time (s): real 13.244 user 42.060000 sys 0.184000

简单地数方框就知道rusty_sheet插件兼容性最好,sheetreader处理shared strings格式效率最高,它也处理不了别的格式。并行度大约是4。
DuckDB官方excel插件在全字符选项下兼容两种格式,奇怪的是带标题判断,它连自己生成的文件格式都判断错误。

对于识别的结果,我们用summarize命令来查看.
作为基准的duckdb识别原始csv文件

D create table t as from read_csv('NYC_311_SR_2010-2020-sample-1M.csv.mp3');
Run Time (s): real 11.214 user 19.248000 sys 1.760000
D summarize t;
┌──────────────────────┬─────────────┬──────────────────────┬──────────────────────┬───────────────┬───┬──────────────────────┬──────────────────────┬─────────┬─────────────────┐
│     column_name      │ column_type │         min          │         max          │ approx_unique │ … │         q50          │         q75          │  count  │ null_percentage │
│       varcharvarcharvarcharvarchar        │     int64     │   │       varcharvarchar        │  int64  │  decimal(9,2)   │
├──────────────────────┼─────────────┼──────────────────────┼──────────────────────┼───────────────┼───┼──────────────────────┼──────────────────────┼─────────┼─────────────────┤
│ Unique KeyBIGINT11465364484781731022792 │ … │ 328556514020143510000000.00 │
│ Created DateTIMESTAMP2010-01-01 00:00:002020-12-23 01:25:51717832 │ … │ 2016-02-11 14:40:2…  │ 2018-07-31 08:55:5…  │ 10000000.00 │
│ Closed DateTIMESTAMP1900-01-01 00:00:002100-01-01 00:00:00619261 │ … │ 2016-02-26 00:08:4…  │ 2018-08-02 13:09:5…  │ 10000002.86 │
│ Agency               │ VARCHAR3-1-1                │ TLC                  │            29 │ … │ NULLNULL10000000.00 │
│ Agency Name          │ VARCHAR3-1-1                │ Valuation Policy     │           686 │ … │ NULLNULL10000000.00 │
│ Complaint TypeVARCHAR../../WEB-INF/web.…  │ ZTESTINT             │           281 │ … │ NULLNULL10000000.00 │
│ Descriptor           │ VARCHAR1 Missed Collection  │ unknown odor/taste…  │          1553 │ … │ NULLNULL10000000.30...

然后用不同插件读取大家都识别正确的pynyc.xlsx。
duckdb官方excel插件全识别为varchar

D create table t as from read_xlsx('pynyc.xlsx',header=0);
D summarize t;
┌─────────────┬─────────────┬──────────────────────┬────────────────────────────────────────────────────┬───────────────┬───────┬───────┬───────┬───────┬───────┬─────────┬─────────────────┐
│ column_name │ column_type │         min          │                        max                         │ approx_unique │  avg  │  std  │  q25  │  q50  │  q75  │  count  │ null_percentage │
│   varcharvarcharvarcharvarchar                       │     int64     │ int32 │ int32 │ int32 │ int32 │ int32 │  int64  │  decimal(9,2)   │
├─────────────┼─────────────┼──────────────────────┼────────────────────────────────────────────────────┼───────────────┼───────┼───────┼───────┼───────┼───────┼─────────┼─────────────────┤
│ A1          │ VARCHAR11465364Unique Key857267NULLNULLNULLNULLNULL10000010.00 │
│ B1          │ VARCHAR01/01/2010 01:05:5…  │ Created Date920156NULLNULLNULLNULLNULL10000010.00 │
│ C1          │ VARCHAR     │                      │ Closed Date667996NULLNULLNULLNULLNULL10000010.00 │
│ D1          │ VARCHAR3-1-1                │ TLC                                                │            29NULLNULLNULLNULLNULL10000010.00 │
│ E1          │ VARCHAR3-1-1                │ Valuation Policy                                   │           686NULLNULLNULLNULLNULL10000010.00 │
│ F1          │ VARCHAR../../WEB-INF/web.…  │ ZTESTINT                                           │           281NULLNULLNULLNULLNULL10000010.00 │
│ G1          │ VARCHAR     │                      │ unknown odor/taste in drinking water (QA6)1553NULLNULLNULLNULLNULL10000010.00 │
│ H1          │ VARCHAR     │                      │ Wooded Area                                        │           149NULLNULLNULLNULLNULL10000010.00 │
│ I1          │ VARCHAR     │                      │ XXXXX                                              │           560NULLNULLNULLNULLNULL10000010.00...

sheetreader只有double和varchar两种格式

D create table t as from sheetreader('pynyc.xlsx');
Run Time (s): real 22.319 user 62.448000 sys 2.436000
D summarize t;
┌──────────────────────┬─────────────┬──────────────────────┬──────────────────────┬───────────────┬───┬────────────────────┬────────────────────┬─────────┬─────────────────┐
│     column_name      │ column_type │         min          │         max          │ approx_unique │ … │        q50         │        q75         │  count  │ null_percentage │
│       varcharvarcharvarcharvarchar        │     int64     │   │      varcharvarchar       │  int64  │  decimal(9,2)   │
├──────────────────────┼─────────────┼──────────────────────┼──────────────────────┼───────────────┼───┼────────────────────┼────────────────────┼─────────┼─────────────────┤
│ Unique KeyDOUBLE11465364.048478173.0972278 │ … │ 32847889.4505127440204486.5782267310000000.00 │
│ Created DateVARCHAR01/01/2010 01:05:5…  │ 12/31/2019 12:58:5…  │        815306 │ … │ NULLNULL10000000.00 │
│ Closed DateVARCHAR     │                      │ 12/31/2019 12:59:0…  │        736431 │ … │ NULLNULL10000000.00 │
│ Agency               │ VARCHAR3-1-1                │ TLC                  │            27 │ … │ NULLNULL10000000.00 │
│ Agency Name          │ VARCHAR3-1-1                │ Valuation Policy     │           476 │ … │ NULLNULL10000000.00 │
│ Complaint TypeVARCHAR../../WEB-INF/web.…  │ ZTESTINT             │           247 │ … │ NULLNULL10000000.00 │
│ Descriptor           │ VARCHAR     │                      │ unknown odor/taste…  │          1296 │ … │ NULLNULL10000000.00 │
│ Location TypeVARCHAR     │                      │ Wooded Area          │           147 │ … │ NULLNULL10000000.00 │
│ Incident Zip         │ DOUBLE0.0900836.0438 │ … │ 11204.5187783145711234.27829456380610000005.51...

rusty_sheet基本还原了整数类型,但日期型识别为字符串。

D load rusty_sheet;
D create table t2 as from read_sheet('pynyc.xlsx',header=1);
D summarize t2;
┌──────────────────────┬─────────────┬──────────────────────┬──────────────────────┬───────────────┬────────────────────┬───┬──────────┬──────────┬──────────┬─────────┬─────────────────┐
│     column_name      │ column_type │         min          │         max          │ approx_unique │        avg         │ … │   q25    │   q50    │   q75    │  count  │ null_percentage │
│       varcharvarcharvarcharvarchar        │     int64     │      varchar       │   │ varcharvarcharvarchar  │  int64  │  decimal(9,2)   │
├──────────────────────┼─────────────┼──────────────────────┼──────────────────────┼───────────────┼────────────────────┼───┼──────────┼──────────┼──────────┼─────────┼─────────────────┤
│ Unique KeyBIGINT1146536448478173102279232687965.858032    │ … │ 25252380328492874020484210000000.00 │
│ Created DateVARCHAR01/01/2010 01:05:5…  │ 12/31/2019 12:58:5…  │        920156NULL               │ … │ NULLNULLNULL10000000.00 │
│ Closed DateVARCHAR     │                      │ 12/31/2019 12:59:0…  │        667996NULL               │ … │ NULLNULLNULL10000000.00 │
│ Agency               │ VARCHAR3-1-1                │ TLC                  │            29NULL               │ … │ NULLNULLNULL10000000.00 │
│ Agency Name          │ VARCHAR3-1-1                │ Valuation Policy     │           686NULL               │ … │ NULLNULLNULL10000000.00 │
│ Complaint TypeVARCHAR../../WEB-INF/web.…  │ ZTESTINT             │           281NULL               │ … │ NULLNULLNULL10000000.00 │
│ Descriptor           │ VARCHAR     │                      │ unknown odor/taste…  │          1553NULL               │ … │ NULLNULLNULL10000000.00 │
│ Location TypeVARCHAR     │                      │ Wooded Area          │           149NULL               │ … │ NULLNULLNULL10000000.00 │
│ Incident Zip         │ BIGINT090083654810825.273804119031 │ … │ 10337112041123410000005.51 │
│ Incident Address     │ VARCHAR     │                      │ west 155 street an…  │        333521NULL               │ … │ NULLNULLNULL10000000.00...

以上仅是对一个文件的测试,不代表其他文件的情况。各种插件都有指定列类型的功能,加上后应该能取得更好的效果。

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值