上一个程序把整个xml文件解析到内存,这对于大型文件不可行,利用前面expat拼接内存的思路,指导龙猫改写程序实现了分块解析并及时输出。
提示词
文件过大不能一次载入内存解析,请用如下思路分块处理:分配一块内存buf比如64K,用strstr从内存映射中找到第一个
<row位置,将前面内容读入内存buf,然后从内存映射文件尾部找到最后一个</row>位置,将其后的内容作为文件尾保存,然后回到第一个<row位置,读取64k-文件头尾总长度,然后回退到读取的最后一个</row>位置,再将文件尾接上,将拼接的内存交给xmlParseMemory函数,完成输出后,再到回退位置继续拼接下一块,直到内存映射上次找出的尾部停止。仅给出需要修改的代码,不做别的
第一次给出的代码编译通过,但执行有错,经过人工调试,发现了问题所在,于是再次提示
有2个问题,1.
memcpy(chunk + offset, cur_pos, block_size);应该在回退到本次最后一个</row>后再复制。2.输出到stdout应该在解析完本轮迭代就输出,下次内存还能重复利用。另外,读变动文件头尾复杂了,改为固定头尾,例如char fix_head[]="<sheetData>"; head_len=11; char fix_tail[]="</sheetData>"; tail_len=12;请按上述意见改写这一段代码,不做别的
他输出了如下程序
// --- Interior of main(): chunked parsing of a large spreadsheet XML ---
// NOTE(review): this excerpt starts mid-function. `mapped` (the mmap'ed file),
// `flen` (its byte length), `fd`, and the helpers `strrstr`, `get_prop`,
// `col_letter_to_num` and `parse_c_value` are defined elsewhere in the file.
// Strategy: repeatedly copy a <=64K slice of rows out of the mapping, wrap it
// in a fixed "<sheetData>...</sheetData>" envelope so it parses as a complete
// document, parse it with libxml2, print the rows as CSV, then resume right
// after the last complete </row> of the slice.
// Initialize libxml2
xmlInitParser();
const size_t BUF_SIZE = 65536; // 64K
// Fixed envelope prepended/appended to every chunk
char fix_head[] = "<sheetData>";
char fix_tail[] = "</sheetData>";
size_t head_len = strlen(fix_head);
size_t tail_len = strlen(fix_tail);
// Locate the first <row in the mapping (strstr assumes the mapping is
// NUL-terminated or that a match occurs before the end -- TODO confirm)
char* first_row = (char*)strstr(mapped, "<row");
if (!first_row) {
xmlCleanupParser();
munmap(mapped, flen);
close(fd);
return 0;
}
// Locate the last </row> in the mapping
char* last_row_end = (char*)strrstr(mapped, mapped + flen, "</row>");
if (!last_row_end) {
xmlCleanupParser();
munmap(mapped, flen);
close(fd);
return 0;
}
// Advance past the closing '>' of that "</row>" (the first '>' found from
// here is the one inside "</row>" itself)
last_row_end = (char*)strstr(last_row_end, ">");
if (last_row_end) last_row_end += 1;
else last_row_end = mapped + flen;
char* file_tail_start = last_row_end; // parsing stops at this position
// Current read position in the mapping
char* cur_pos = first_row;
// Reusable chunk buffer (+100 slack for the envelope and terminator)
char* chunk = (char*)malloc(BUF_SIZE + 100);
size_t chunk_size = 0; // NOTE(review): never used after this point
// Rows collected from the current chunk
struct row_data {
int row_num;  // value of the row's "r" attribute
int max_col;  // highest column index seen (as returned by col_letter_to_num)
char** cells; // max_col+1 strings; NULL for absent cells
};
struct row_data* all_rows = NULL;
int all_rows_size = 0, all_rows_count = 0;
// Widest row seen so far (tracked, but output uses each row's own max_col)
int max_col = 0;
while (cur_pos < file_tail_start) {
// Write the fixed head into the buffer first
size_t offset = 0;
memcpy(chunk + offset, fix_head, head_len);
offset += head_len;
// Payload budget for this chunk: buffer size minus the envelope
size_t block_size = BUF_SIZE - head_len - tail_len;
if (cur_pos + block_size > file_tail_start) {
block_size = file_tail_start - cur_pos;
}
if (block_size <= 0) break; // NOTE(review): size_t is unsigned, so this only catches 0
// Copy the raw payload
memcpy(chunk + offset, cur_pos, block_size);
offset += block_size;
// Find the last complete </row> inside this chunk
char* last_row_in_chunk = (char*)strrstr(chunk + head_len, chunk + offset, "</row>");
if (!last_row_in_chunk) {
// No complete row in this chunk; skip it.
// NOTE(review): any single row longer than the chunk budget is silently dropped.
cur_pos += block_size;
continue;
}
// Step past the '>' of that "</row>".
// NOTE(review): chunk is not NUL-terminated yet at this point, so strstr may
// scan stale bytes beyond `offset`; in practice the '>' inside "</row>"
// itself is found first -- verify.
char* last_row_end_in_chunk = (char*)strstr(last_row_in_chunk, ">");
if (last_row_end_in_chunk) last_row_end_in_chunk += 1;
else last_row_end_in_chunk = chunk + offset;
size_t real_chunk_size = last_row_end_in_chunk - chunk;
// Append the fixed tail right after the last complete row
memcpy(chunk + real_chunk_size, fix_tail, tail_len);
real_chunk_size += tail_len;
chunk[real_chunk_size] = '\0';
// Map the rewind point back into the file: the next iteration resumes here
char* last_row_end_in_file = cur_pos + (last_row_end_in_chunk - (chunk + head_len));
// Parse the assembled chunk as a standalone document
xmlDocPtr doc = xmlParseMemory(chunk, real_chunk_size);
if (!doc) {
cur_pos = last_row_end_in_file;
continue;
}
xmlNodePtr root = xmlDocGetRootElement(doc);
for (xmlNodePtr node = root; node; node = node->next) {
if (node->type == XML_ELEMENT_NODE && strcmp((const char*)node->name, "sheetData") == 0) {
for (xmlNodePtr row_node = node->children; row_node; row_node = row_node->next) {
if (row_node->type == XML_ELEMENT_NODE && strcmp((const char*)row_node->name, "row") == 0) {
const char* r_prop = get_prop(row_node, "r");
if (!r_prop) continue;
int row_num = atoi(r_prop);
if (row_num < 1) continue;
int this_max_col = -1;
// First pass: determine this row's highest column index
for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {
if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {
const char* r_attr = get_prop(cnode, "r");
if (!r_attr) continue;
int col_len = 0;
// NOTE(review): "(c <= 'Z' || c <= 'z')" reduces to "c <= 'z'", so this
// accepts 'A'..'z' (including '[', '_', lowercase); digits still stop
// the scan, which is what matters for cell refs like "B12".
while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;
int col_num = col_letter_to_num(r_attr, col_len);
if (col_num > this_max_col) this_max_col = col_num;
}
}
if (this_max_col < 0) continue;
if (this_max_col > max_col) max_col = this_max_col;
// Grow all_rows by one entry.
// NOTE(review): this reallocs once per row, and when a slot left over from
// an earlier chunk is reused, its old `cells` array is overwritten without
// being freed; combined with the final cleanup freeing `cells` for all
// `all_rows_size` entries, this bookkeeping is the prime suspect for the
// reported "double free or corruption" -- needs investigation.
all_rows = (struct row_data*)realloc(all_rows, sizeof(struct row_data) * (all_rows_size + 1));
all_rows[all_rows_count].row_num = row_num;
all_rows[all_rows_count].max_col = this_max_col;
all_rows[all_rows_count].cells = (char**)calloc(this_max_col + 1, sizeof(char*));
// Second pass: store each cell's value at its column index
for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {
if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {
const char* r_attr = get_prop(cnode, "r");
if (!r_attr) continue;
int col_len = 0;
while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;
int col_num = col_letter_to_num(r_attr, col_len);
if (col_num < 0) continue;
const char* val = parse_c_value(cnode);
all_rows[all_rows_count].cells[col_num] = strdup(val ? val : "");
// Left disabled: enabling this xmlFree caused a segfault per the write-up;
// `val` is therefore leaked -- TODO confirm parse_c_value's ownership rules.
//if (val && (val != (const char*)xmlNodeGetContent(cnode))) xmlFree((void*)val);
}
}
all_rows_count++;
all_rows_size++;
}
}
}
}
xmlFreeDoc(doc);
// Chunk fully parsed: emit its rows immediately so the memory can be reused
// Sort rows by row number (bubble sort; each chunk holds few rows)
for (int i = all_rows_count - 1; i >= 0; i--) {
for (int j = 0; j < i; j++) {
if (all_rows[j].row_num > all_rows[j+1].row_num) {
struct row_data tmp = all_rows[j];
all_rows[j] = all_rows[j+1];
all_rows[j+1] = tmp;
}
}
}
// Print each non-empty row as CSV: row number, then one field per column
for (int i = 0; i < all_rows_count; i++) {
int r = all_rows[i].row_num;
int has_data = 0;
for (int c = 0; c <= all_rows[i].max_col; c++) {
if (all_rows[i].cells[c] && strlen(all_rows[i].cells[c]) > 0) {
has_data = 1; break;
}
}
if (!has_data) continue;
printf("%d", r);
for (int c = 0; c <= all_rows[i].max_col; c++) {
if (all_rows[i].cells[c] && strlen(all_rows[i].cells[c]) > 0)
printf(",%s", all_rows[i].cells[c]);
else
printf(",");
}
printf("\n");
}
// Release this chunk's cell strings but keep the row slots for reuse.
// NOTE(review): the `cells` pointer arrays themselves are NOT freed here.
for (int i = 0; i < all_rows_count; i++) {
for (int c = 0; c <= all_rows[i].max_col; c++) {
if (all_rows[i].cells[c]) free(all_rows[i].cells[c]);
all_rows[i].cells[c] = NULL;
}
all_rows[i].row_num = 0;
all_rows[i].max_col = 0;
}
all_rows_count = 0;
// Resume right after the last complete </row> of this chunk
cur_pos = last_row_end_in_file;
}
// Final cleanup: free every surviving cells array, then the row table itself
for (int i = 0; i < all_rows_size; i++) {
free(all_rows[i].cells);
}
if (all_rows) free(all_rows);
if (chunk) free(chunk);
xmlCleanupParser();
munmap(mapped, flen);
close(fd);
return 0;
}
// Helper: find the LAST occurrence of `needle` in the range [haystack, end).
// Returns a pointer to the start of the match, or NULL if not found.
// An empty needle matches at `end` (same behavior as the original version).
//
// Fixes over the original:
//  - strlen(needle) was recomputed on every loop iteration (O(n*m) extra work);
//    it is now hoisted out of the loop and memcmp is used for the comparison.
//  - `end - strlen(needle)` could form a pointer before `haystack` when the
//    range is shorter than the needle (undefined behavior); the range is now
//    checked up front and the loop never steps below `haystack`.
char* strrstr(const char* haystack, const char* end, const char* needle) {
size_t nlen = strlen(needle);
if (nlen == 0)
return (char*)end; // empty needle: match at the end of the range
if ((size_t)(end - haystack) < nlen)
return NULL; // range too short to contain the needle
for (const char* p = end - nlen; ; p--) {
if (memcmp(p, needle, nlen) == 0)
return (char*)p;
if (p == haystack)
return NULL; // scanned the whole range without a match
}
}
按照提示覆盖原有代码,并把辅助函数移到main函数前面,就可以编译执行了,仍然需要注释掉 xmlFree函数调用,否则报Segmentation fault (core dumped)错误退出。
查看代码,他并没有机械地按要求改,而是在复制固定文件尾的时候,复制到最后一个</row>标签后,然后用\0填充。龙猫的理解力和代码准确性都很不错。
time ./catxmlp2 sheet12.xml > s12.csv
double free or corruption (!prev)
Aborted (core dumped)
real 0m34.581s
user 0m11.822s
sys 0m3.059s
time ./tocsv sheet12.xml A1:P1000000 s13.csv
real 0m13.659s
user 0m1.733s
sys 0m1.083s
通过分块处理,输出大文件没问题了,效率也不算太差,和最快的张泽鹏先生手搓代码tocsv相比只有3倍差距,比第一版好多了。仍然有内存错误,还要进一步研究。

被折叠的 条评论
为什么被折叠?



