利用DeepSeek编写使用libcsv解析csv文件并用libxlsxwriter写入xlsx文件的C程序

前面用Minicsv实现的程序不够完善,而libcsv更加完善,但调用方法比较复杂,我在网上搜到一篇liibcsv库解析csv格式文件的文章(作者:caojun97), 于是让DeepSeek将它改写为输出到xlsx,我采取了4个步骤。

第一步:

附件是使用libcsv解析csv文件的例子,把它改造成读入不固定列的csv,并输出第一行、第二行、及最后一行并根据转成数字的结果在每列后面标上类型N或T

第二步:

再添加功能,对一个大文件,先尝试读取20行,每列的最终类型由这20行决定,规则是,如果某列全是数字类型,则此列就是数字,否则是文本,如果第一行是标题行 命令行参数为-H=1,则以读取2-21行的结果确定,以上20由命令行参数 -R=20决定,规定第一个参数为文件名,如果行数不足就报错退出,仍然输出整个文件的首行、第二行和最后一行。

第三步:

数字类型包括整数和双精度浮点数,分别用I和D表示,请只给出修改此部分的代码,其他无需修改部分,用文字标出

第四步:

以刚才修改的带浮点数判断的csv读取程序为基础,把之前写xlsx的逻辑加入,数据无需每列临时判断,默认带标题行即H=1,无标题行用-H=0表示,减少取样行数为5行,带标题行的最后输出各列的标题及类型。

最终的程序如下

#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include "csv.h"
#include "xlsxwriter.h"
#include <errno.h>

#define MAX_COLS 256
#define MAX_LINE_LENGTH 65536
#define DEFAULT_SAMPLE_ROWS 5

struct ColumnInfo {
    char *value;
    char type; // 'I' for integer, 'D' for double, 'T' for text
};

struct CsvData {
    int row_count;
    int col_count;
    int sample_rows;
    bool has_header;
    bool sampling_complete;
    char *column_names[MAX_COLS];  // 列标题
    char column_types[MAX_COLS];   // 列类型
    lxw_workbook *workbook;
    lxw_worksheet *worksheet;
};

// 检查字符串是否为数字并返回类型
char get_number_type(const char *str) {
    if (str == NULL || *str == '\0') return 'T';
    char *endptr;
    strtod(str, &endptr);
    if (*endptr != '\0') return 'T'; // 不是数字
    
    // 检查是否是整数
    if (strchr(str, '.') == NULL && strchr(str, 'e') == NULL && 
        strchr(str, 'E') == NULL) {
        return 'I'; // 整数
    }
    return 'D'; // 浮点数
}

// 初始化CSV数据结构
void init_csv_data(struct CsvData *csv_data, int sample_rows, bool has_header, const char *output_file) {
    memset(csv_data, 0, sizeof(struct CsvData));
    csv_data->sample_rows = sample_rows;
    csv_data->has_header = has_header;
    csv_data->workbook = workbook_new(output_file);
    csv_data->worksheet = workbook_add_worksheet(csv_data->workbook, NULL);
}

// 释放CSV数据结构
void free_csv_data(struct CsvData *csv_data) {
    for (int i = 0; i < MAX_COLS; i++) {
        if (csv_data->column_names[i]) free(csv_data->column_names[i]);
    }
    if (csv_data->workbook) workbook_close(csv_data->workbook);
}

// 列级数据处理回调函数
void col_callback(void *s, size_t len, void *data) {
    struct CsvData *csv_data = (struct CsvData *)data;
    char *str = malloc(len + 1);
    memset(str, 0, len + 1);
    memcpy(str, (char *)s, len);

    if (csv_data->col_count >= MAX_COLS) {
        free(str);
        return;
    }

    char num_type = get_number_type(str);
    
    // 处理标题行
    if (csv_data->row_count == 0 && csv_data->has_header) {
        csv_data->column_names[csv_data->col_count] = strdup(str);
        free(str);
        csv_data->col_count++;
        return;
    }

    // 采样阶段:确定列类型
    if (!csv_data->sampling_complete) {
        int effective_row = csv_data->has_header ? csv_data->row_count - 1 : csv_data->row_count;
        
        if (effective_row < csv_data->sample_rows) {
            // 初始化列类型
            if (effective_row == 0) {
                csv_data->column_types[csv_data->col_count] = num_type;
            } else {
                // 更新列类型(浮点数覆盖整数)
                if (csv_data->column_types[csv_data->col_count] == 'I' && num_type == 'D') {
                    csv_data->column_types[csv_data->col_count] = 'D';
                } else if (num_type == 'T') {
                    csv_data->column_types[csv_data->col_count] = 'T';
                }
            }
            
            // 如果是采样最后一行,标记完成
            if (effective_row == csv_data->sample_rows - 1) {
                csv_data->sampling_complete = (csv_data->col_count == MAX_COLS - 1);
            }
        }
    }

    // 写入Excel
    int excel_row = csv_data->has_header ? csv_data->row_count - 1 : csv_data->row_count;
    switch (csv_data->column_types[csv_data->col_count]) {
        case 'I':
            worksheet_write_number(csv_data->worksheet, excel_row, csv_data->col_count, atoi(str), NULL);
            break;
        case 'D':
            worksheet_write_number(csv_data->worksheet, excel_row, csv_data->col_count, atof(str), NULL);
            break;
        default:
            worksheet_write_string(csv_data->worksheet, excel_row, csv_data->col_count, str, NULL);
    }

    free(str);
    csv_data->col_count++;
}

// 行级数据处理回调函数
void row_callback(int c, void *data) {
    struct CsvData *csv_data = (struct CsvData *)data;
    
    // 检查是否采样了足够的行
    int sampled_rows = csv_data->has_header ? 
        (csv_data->row_count > 0 ? csv_data->row_count - 1 : 0) : 
        csv_data->row_count;
    
    if (!csv_data->sampling_complete && sampled_rows >= csv_data->sample_rows) {
        csv_data->sampling_complete = true;
    }
    
    // 准备处理下一行
    csv_data->row_count++;
    csv_data->col_count = 0;
}

int csv_parse_file(const char *file_path, struct CsvData *csv_data) {
    struct csv_parser conf;
    FILE *fp = NULL;
    size_t bytes_read = 0;
    size_t retval = 0;
    char buf[1024] = { 0 };
    int result = -1;

    if (csv_init(&conf, CSV_STRICT | CSV_STRICT_FINI) != 0) {
        printf("failed to initialize csv parser\n");
        return result;
    }
    
    fp = fopen(file_path, "rb");
    if (fp == NULL) {
        fprintf(stderr, "Failed to open file %s: %s\n", file_path, strerror(errno));
        goto END;
    }
    
    while ((bytes_read = fread(buf, sizeof(char), sizeof(buf), fp)) > 0) {
        retval = csv_parse(&conf, buf, bytes_read, col_callback, row_callback, csv_data);
        if (retval != bytes_read) {
            fprintf(stderr, "Error parsing file: %s\n", csv_strerror(csv_error(&conf)));
            goto END;
        }
    }
    
    if (0 != csv_fini(&conf, col_callback, row_callback, csv_data)) {
        goto END;
    }

    // 检查是否采样了足够的行
    int sampled_rows = csv_data->has_header ? 
        (csv_data->row_count > 1 ? csv_data->row_count - 1 : 0) : 
        csv_data->row_count;
    
    if (sampled_rows < csv_data->sample_rows) {
        fprintf(stderr, "Error: File has only %d rows (need at least %d rows for sampling)\n",
               csv_data->row_count, 
               csv_data->has_header ? csv_data->sample_rows + 1 : csv_data->sample_rows);
        result = -2;
        goto END;
    }
    
    result = 0;
    
END:
    csv_free(&conf);
    if (fp) fclose(fp);
    return result;
}

void parse_args(int argc, char *argv[], int *sample_rows, bool *has_header, const char **input_file, const char **output_file) {
    *sample_rows = DEFAULT_SAMPLE_ROWS;
    *has_header = true; // 默认带标题行
    *input_file = NULL;
    *output_file = "output.xlsx";

    for (int i = 1; i < argc; i++) {
        if (strncmp(argv[i], "-H=", 3) == 0) {
            *has_header = atoi(argv[i] + 3) != 0;
        } else if (strncmp(argv[i], "-R=", 3) == 0) {
            *sample_rows = atoi(argv[i] + 3);
            if (*sample_rows <= 0) {
                fprintf(stderr, "Sample rows must be positive, using default %d\n", DEFAULT_SAMPLE_ROWS);
                *sample_rows = DEFAULT_SAMPLE_ROWS;
            }
        } else if (*input_file == NULL) {
            *input_file = argv[i];
        } else if (*output_file == NULL) {
            *output_file = argv[i];
        } else {
            fprintf(stderr, "Ignoring unknown argument: %s\n", argv[i]);
        }
    }

    if (*input_file == NULL) {
        fprintf(stderr, "Usage: %s <input.csv> [output.xlsx] [-H=0|1] [-R=sample_rows]\n", argv[0]);
        exit(EXIT_FAILURE);
    }
}

int main(int argc, char *argv[]) {
    int sample_rows;
    bool has_header;
    const char *input_file;
    const char *output_file;
    
    parse_args(argc, argv, &sample_rows, &has_header, &input_file, &output_file);
    
    struct CsvData csv_data;
    init_csv_data(&csv_data, sample_rows, has_header, output_file);
    
    printf("Processing CSV file: %s\n", input_file);
    printf("Output XLSX file: %s\n", output_file);
    printf("Sample rows: %d\n", sample_rows);
    printf("Header row: %s\n", has_header ? "yes" : "no");
    
    if (csv_parse_file(input_file, &csv_data)!=0) {
        free_csv_data(&csv_data);
        return EXIT_FAILURE;
    }
    
    // 输出列信息
    if (has_header) {
        printf("\nColumn names and types:\n");
        for (int i = 0; i < MAX_COLS && csv_data.column_names[i]; i++) {
            printf("Column %d: %s (%c)\n", i+1, csv_data.column_names[i], csv_data.column_types[i]);
        }
    }
    
    printf("\nSuccessfully converted %d rows to XLSX\n", 
           has_header ? csv_data.row_count - 1 : csv_data.row_count);
    
    free_csv_data(&csv_data);
    return EXIT_SUCCESS;
}

编译执行, 先将网上下载的libcsv源码中的csv.h和libcsv.c文件放在当前目录下

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:libxlsxwriter/lib

gcc libcsv.c ds_libcsv2xlsx.c -o ds_libcsv2xlsx -I . -O3  -I libxlsxwriter/include -lxlsxwriter -L libxlsxwriter/lib

time ./ds_libcsv2xlsx NYC_311_SR_2010-2020-sample-1M.csv dsnyc.xlsx
Ignoring unknown argument: dsnyc.xlsx
Processing CSV file: NYC_311_SR_2010-2020-sample-1M.csv
Output XLSX file: output.xlsx
Sample rows: 5
Header row: yes

Column names and types:
Column 1: Unique Key (I)
Column 2: Created Date (T)
Column 3: Closed Date (T)
Column 4: Agency (T)
Column 5: Agency Name (T)
Column 6: Complaint Type (T)
Column 7: Descriptor (T)
Column 8: Location Type (T)
Column 9: Incident Zip (I)
Column 10: Incident Address (T)
Column 11: Street Name (T)
Column 12: Cross Street 1 (T)
Column 13: Cross Street 2 (T)
Column 14: Intersection Street 1 (T)
Column 15: Intersection Street 2 (T)
Column 16: Address Type (T)
Column 17: City (T)
Column 18: Landmark (T)
Column 19: Facility Type (T)
Column 20: Status (T)
Column 21: Due Date (T)
Column 22: Resolution Description (T)
Column 23: Resolution Action Updated Date (T)
Column 24: Community Board (T)
Column 25: BBL (T)
Column 26: Borough (T)
Column 27: X Coordinate (State Plane) (I)
Column 28: Y Coordinate (State Plane) (I)
Column 29: Open Data Channel Type (T)
Column 30: Park Facility Name (T)
Column 31: Park Borough (T)
Column 32: Vehicle Type (T)
Column 33: Taxi Company Borough (T)
Column 34: Taxi Pick Up Location (T)
Column 35: Bridge Highway Name (T)
Column 36: Bridge Highway Direction (T)
Column 37: Road Ramp (T)
Column 38: Bridge Highway Segment (T)
Column 39: Latitude (D)
Column 40: Longitude (D)
Column 41: Location (T)

Successfully converted 1000000 rows to XLSX

real    1m6.379s
user    0m42.476s
sys     0m6.464s

测试生成的xlsx文件能被catamine 示例文件读取

time /par/calamine/target/release/examples/excel_to_csv /par/output.xlsx Sheet1

real    0m29.629s
user    0m16.662s
sys     0m2.356s

补记:
读取文件时的缓冲区对解析csv速度有影响。
下面是csv_parse_file()函数 char buf[bufsize] = { 0 };的bufsize分别设置成1024、65536、1024*1024的输出时间对比(单位:秒)

缓冲区大小1024655361024*1024
100万行41列665048
500万行14列442221

可见64K缓冲区一般就合适了,再增大,速度提高也很有限。

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值