利用美团龙猫编写的自解析xml转csv文件C程序-优快云博客

利用libxlm2的代码性能不高，因为通用库做了很多我们不需要要的工作，所以让他自己解析。
提示词

不依赖任何xml解析库，顺序解析所需的<row r属性，<c r属性 t属性 <v>和<t>标签内容并输出csv，其余丢弃。只写出代码，不做别的

结果出来的还是分块解析思路的代码，继续提示

以上代码的chunk里只有1个<row> ..</row>标签对，导致输出只有1行，简化思路，不要固定头尾，从第一个<row开始解析，到</row>满一行就输出一行，直到最后一个</row>或文件尾退出。给出代码，不做别的

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

// 字母列号转数字，A1 -> (0,0), B1 -> (1,0)
int col_letter_to_num(const char* col, int len) {
    int num = 0;
    for (int i = 0; i < len; i++) {
        if (col[i] >= 'A' && col[i] <= 'Z')
            num = num * 26 + (col[i] - 'A' + 1);
        else if (col[i] >= 'a' && col[i] <= 'z')
            num = num * 26 + (col[i] - 'a' + 1);
        else
            break;
    }
    return num - 1;
}

// 查找标签属性值
const char* get_attr(const char* start, const char* end, const char* attr) {
    const char* p = start;
    int attr_len = strlen(attr);
    while (p < end - attr_len) {
        if (strncmp(p, attr, attr_len) == 0 && p[attr_len] == '=') {
            p += attr_len + 1;
            if (*p == '"' || *p == '\'') {
                char q = *p;
                p++;
                const char* val = p;
                while (p < end && *p != q) p++;
                char* ret = (char*)malloc(p - val + 1);
                strncpy(ret, val, p - val);
                ret[p - val] = 0;
                return ret;
            }
        }
        p++;
    }
    return NULL;
}

// 查找标签名
const char* get_tagname(const char* start, const char* end, const char* tag) {
    const char* p = start;
    int tag_len = strlen(tag);
    while (p < end - tag_len - 1) {
        if (strncmp(p, "<", 1) == 0 && strncmp(p+1, tag, tag_len) == 0 && (p[1+tag_len]==' ' || p[1+tag_len]=='>')) {
            return p;
        }
        p++;
    }
    return NULL;
}

// 查找标签内容
const char* get_tagcontent(const char* start, const char* end, const char* tag) {
    const char* p = start;
    int tag_len = strlen(tag);
    while (p < end - tag_len - 1) {
        if (strncmp(p, "<", 1) == 0 && strncmp(p+1, tag, tag_len) == 0) {
            const char* tag_end = strchr(p, '>');
            if (!tag_end) return NULL;
            const char* content = tag_end + 1;
            const char* close = strstr(content, "</");
            if (!close || close > end) return NULL;
            const char* close_tag = close + 2;
            if (strncmp(close_tag, tag, tag_len) == 0 && close_tag[tag_len] == '>') {
                char* ret = (char*)malloc(close - content + 1);
                strncpy(ret, content, close - content);
                ret[close - content] = 0;
                return ret;
            }
        }
        p++;
    }
    return NULL;
}

// 查找下一个标签
const char* next_tag(const char* start, const char* end) {
    const char* p = start;
    while (p < end) {
        if (*p == '<') return p;
        p++;
    }
    return NULL;
}

// 查找标签结束
const char* tag_end(const char* tag) {
    const char* p = tag;
    while (*p && *p != '>' && *p != ' ') p++;
    return p;
}

// 查找属性值，返回指向属性值的指针
const char* tag_attrval(const char* tag, const char* attr, const char** val_end) {
    int attr_len = strlen(attr);
    const char* p = tag;
    while (*p && *p != '>') {
        if (strncmp(p, attr, attr_len) == 0 && p[attr_len] == '=') {
            p += attr_len + 1;
            if (*p == '"' || *p == '\'') {
                char q = *p;
                p++;
                const char* val = p;
                while (*p && *p != q) p++;
                *val_end = p;
                return val;
            }
        }
        p++;
    }
    *val_end = NULL;
    return NULL;
}

// 主函数
int main(int argc, char* argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <input.xml>\n", argv[0]);
        exit(1);
    }
    const char* fname = argv[1];
    int fd = open(fname, O_RDONLY);
    if (fd < 0) { perror("open"); exit(1); }
    struct stat sb;
    if (fstat(fd, &sb) < 0) { perror("fstat"); exit(1); }
    size_t flen = sb.st_size;
    char* mapped = (char*)mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);
    if (mapped == MAP_FAILED) { perror("mmap"); exit(1); }

    // 查找第一个<row
    const char* first_row = strstr(mapped, "<row");
    if (!first_row) {
        munmap(mapped, flen);
        close(fd);
        return 0;
    }
    // 查找最后一个</row>
    const char* last_row_end = strstr(mapped, "</row>");
    if (!last_row_end) {
        munmap(mapped, flen);
        close(fd);
        return 0;
    }
    last_row_end = strstr(last_row_end, ">");
    if (last_row_end) last_row_end += 1;
    else last_row_end = mapped + flen;
    const char* file_tail_start = last_row_end;

    // 当前行最大列
    int max_col = 0;
    // 当前解析位置
    const char* p = first_row;
    // 64K块
    const size_t BUF_SIZE = 65536;
    //char* chunk = (char*)malloc(BUF_SIZE + 100);

    // 行号缓存
    int last_row = 0;

    // 解析一行
    void output_row(const char* row_start, const char* row_end, int* max_col, int last_row) {
        // 提取r属性
        const char* r_attr_val = get_attr(row_start, row_end, "r");
        int row_num = r_attr_val ? atoi(r_attr_val) : -1;
        if (r_attr_val) free((void*)r_attr_val);
        if (row_num < 1) return;

        // 统计最大列
        int this_max_col = -1;
        const char* c = row_start;
        while (1) {
            const char* c_tag = get_tagname(c, row_end, "c");
            if (!c_tag) break;
            const char* c_end = strstr(c_tag, "</c>");
            if (!c_end || c_end > row_end) break;
            c_end = strstr(c_end, ">");
            if (c_end) c_end += 1;
            else c_end = row_end;
            const char* c_r_attr = get_attr(c_tag, c_end, "r");
            if (c_r_attr) {
                int col_len = 0;
                while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;
                int col_num = col_letter_to_num(c_r_attr, col_len);
                if (col_num > this_max_col) this_max_col = col_num;
                free((void*)c_r_attr);
            }
            c = c_end;
        }
        if (this_max_col < 0) return;
        if (this_max_col > *max_col) *max_col = this_max_col;

        // 补空行
        while (last_row < row_num - 1) {
            last_row++;
            int has_data = 0;
            for (int c = 0; c <= *max_col; c++) {
                // 全空
            }
            if (!has_data) continue;
            printf("%d", last_row);
            for (int c = 0; c <= *max_col; c++) printf(",");
            printf("\n");
        }

        // 解析c节点
        char** cells = (char**)calloc(this_max_col + 1, sizeof(char*));
        c = row_start;
        while (1) {
            const char* c_tag = get_tagname(c, row_end, "c");
            if (!c_tag) break;
            const char* c_end = strstr(c_tag, "</c>");
            if (!c_end || c_end > row_end) break;
            c_end = strstr(c_end, ">");
            if (c_end) c_end += 1;
            else c_end = row_end;
            const char* c_r_attr = get_attr(c_tag, c_end, "r");
            if (c_r_attr) {
                int col_len = 0;
                while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;
                int col_num = col_letter_to_num(c_r_attr, col_len);
                free((void*)c_r_attr);

                // 判断t属性
                const char* t_attr = get_attr(c_tag, c_end, "t");
                int is_str = (t_attr && strcmp(t_attr, "inlineStr") == 0);
                if (t_attr) free((void*)t_attr);

                const char* val = NULL;
                if (is_str) {
                    const char* t_tag = get_tagcontent(c_tag, c_end, "t");
                    val = t_tag ? t_tag : strdup("");
                } else {
                    const char* v_tag = get_tagcontent(c_tag, c_end, "v");
                    val = v_tag ? v_tag : strdup("");
                }
                cells[col_num] = (char*)val;
            }
            c = c_end;
        }

        // 输出行
        int has_data = 0;
        for (int c = 0; c <= this_max_col; c++) {
            if (cells[c] && strlen(cells[c]) > 0) {
                has_data = 1; break;
            }
        }
        if (has_data) {
            printf("%d", row_num);
            for (int c = 0; c <= this_max_col; c++) {
                if (cells[c] && strlen(cells[c]) > 0)
                    printf(",%s", cells[c]);
                else
                    printf(",");
            }
            printf("\n");
        }
        // 释放
        for (int c = 0; c <= this_max_col; c++) {
            if (cells[c]) free(cells[c]);
        }
        free(cells);
    }

    // 顺序解析
    while (1){ //p < file_tail_start) {
        // 找下一个<row
        const char* row_start = p;//get_tagname(p, file_tail_start, "row");
        if (!row_start) break;
        const char* row_end = strstr(row_start, "</row>");

        //if (!row_end || row_end > file_tail_start) break;
        row_end = strstr(row_end, ">");
        if (row_end) row_end += 1;
        else row_end = file_tail_start;
        // 解析并输出一行
        output_row(row_start, row_end, &max_col, last_row);
        last_row = last_row > 0 ? last_row : atoi(get_attr(row_start, row_end, "r"));

        // 继续
        p = row_end;
    }

    // 释放
    //free(chunk);
    munmap(mapped, flen);
    close(fd);
    return 0;
}

原始代码输出一行就退出，经过人工调试，注释了多余的条件，可以输出多行了，龙猫竟然还知道gcc允许在函数里定义其他函数，比如output_row，真博学。

gcc catxmls2.c -o catxmls2
./catxmls2 sheet13.xml
1,1,15519,785,1,17.00,24386.67,0.04,0.02,N,O,35137.0,35107.0,35146.0,DELIVER IN PERSON,TRUCK,to beans x-ray carefull
2,1,6731,732,2,36.00,58958.28,0.09,0.06,N,O,35167.0,35123.0,35175.0,TAKE BACK RETURN,MAIL, according to the final foxes. qui

Segmentation fault (core dumped)

gcc catxmls2.c -o catxmls2 -O3

time ./catxmls2 lineitem/xl/worksheets/sheet1.xml >s2.csv
Segmentation fault (core dumped)

real	0m13.215s
user	0m12.628s
sys	0m0.320s

gcc to-csv.c -o to-csv -O3
time ./to-csv lineitem/xl/worksheets/sheet1.xml A1:P1100000 s13.csv

real	0m8.349s
user	0m7.568s
sys	0m0.360s

输出100万行只比张泽鹏先生手搓的程序差50%，相当不错。还有内存错误，需要进一步研究。

后记：
内存错误原因找到了，找最后一个标签的语句const char* last_row_end = strstr(mapped, "</row>");写错了。
不能正向查找，而要反向查找。
应该是char* last_row_end = (char*)strrstr(mapped, mapped + flen, "</row>");,然后把注释掉的语句恢复，再配上如下来自分块解析代码的自定义函数就正确了。

// 辅助函数：在指定范围内查找子串
char* strrstr(const char* haystack, const char* end, const char* needle) {
    char* p = (char*)end - strlen(needle);
    while (p >= haystack) {
        if (strncmp(p, needle, strlen(needle)) == 0)
            return p;
        p--;
    }
    return NULL;
}