上一个程序把整个xml文件解析到内存,这对于大型文件不可行,利用前面expat拼接内存的思路,指导龙猫改写程序实现了分块解析并及时输出。
提示词
文件过大不能一次载入内存解析,请用如下思路分块处理:分配一块内存buf比如64K,用strstr从内存映射中找到第一个
<row位置,将前面内容读入内存buf,然后从内存映射文件尾部找到最后一个</row>位置,将其后的内容作为文件尾保存,然后回到第一个<row位置,读取64k-文件头尾总长度,然后回退到读取的最后一个</row>位置,再将文件尾接上,将拼接的内存交给xmlParseMemory函数,完成输出后,再到回退位置继续拼接下一块,直到内存映射上次找出的尾部停止。仅给出需要修改的代码,不做别的
第一次给出的代码编译通过,但执行有错,经过人工调试,发现了问题所在,于是再次提示
有2个问题,1.
memcpy(chunk + offset, cur_pos, block_size);应该在回退到本次最后一个</row>后再复制。2.输出到stdout应该在解析完本轮迭代就输出,下次内存还能重复利用。另外,读变动文件头尾复杂了,改为固定头尾,例如char fix_head[]="<sheetData>"; head_len=11; char fix_tail[]="</sheetData>"; tail_len=12;请按上述意见改写这一段代码,不做别的
他输出了如下程序
// --- Interior of main(): chunked parsing of a large spreadsheet XML ---
// NOTE(review): this excerpt starts mid-function. `mapped` (the mmap'ed file),
// `flen` (its byte length), `fd`, and the helpers `strrstr`, `get_prop`,
// `col_letter_to_num` and `parse_c_value` are defined elsewhere in the file.
// Strategy: repeatedly copy a <=64K slice of rows out of the mapping, wrap it
// in a fixed "<sheetData>...</sheetData>" envelope so it parses as a complete
// document, parse it with libxml2, print the rows as CSV, then resume right
// after the last complete </row> of the slice.
// Initialize libxml2
xmlInitParser();
const size_t BUF_SIZE = 65536; // 64K
// Fixed envelope prepended/appended to every chunk
char fix_head[] = "<sheetData>";
char fix_tail[] = "</sheetData>";
size_t head_len = strlen(fix_head);
size_t tail_len = strlen(fix_tail);
// Locate the first <row in the mapping (strstr assumes the mapping is
// NUL-terminated or that a match occurs before the end -- TODO confirm)
char* first_row = (char*)strstr(mapped, "<row");
if (!first_row) {
xmlCleanupParser();
munmap(mapped, flen);
close(fd);
return 0;
}
// Locate the last </row> in the mapping
char* last_row_end = (char*)strrstr(mapped, mapped + flen, "</row>");
if (!last_row_end) {
xmlCleanupParser();
munmap(mapped, flen);
close(fd);
return 0;
}
// Advance past the closing '>' of that "</row>" (the first '>' found from
// here is the one inside "</row>" itself)
last_row_end = (char*)strstr(last_row_end, ">");
if (last_row_end) last_row_end += 1;
else last_row_end = mapped + flen;
char* file_tail_start = last_row_end; // parsing stops at this position
// Current read position in the mapping
char* cur_pos = first_row;
// Reusable chunk buffer (+100 slack for the envelope and terminator)
char* chunk = (char*)malloc(BUF_SIZE + 100);
size_t chunk_size = 0; // NOTE(review): never used after this point
// Rows collected from the current chunk
struct row_data {
int row_num;  // value of the row's "r" attribute
int max_col;  // highest column index seen (as returned by col_letter_to_num)
char** cells; // max_col+1 strings; NULL for absent cells
};
struct row_data* all_rows = NULL;
int all_rows_size = 0, all_rows_count = 0;
// Widest row seen so far (tracked, but output uses each row's own max_col)
int max_col = 0;
while (cur_pos < file_tail_start) {
// Write the fixed head into the buffer first
size_t offset = 0;
memcpy(chunk + offset, fix_head, head_len);
offset += head_len;
// Payload budget for this chunk: buffer size minus the envelope
size_t block_size = BUF_SIZE - head_len - tail_len;
if (cur_pos + block_size > file_tail_start) {
block_size = file_tail_start - cur_pos;
}
if (block_size <= 0) break; // NOTE(review): size_t is unsigned, so this only catches 0
// Copy the raw payload
memcpy(chunk + offset, cur_pos, block_size);
offset += block_size;
// Find the last complete </row> inside this chunk
char* last_row_in_chunk = (char*)strrstr(chunk + head_len, chunk + offset, "</row>");
if (!last_row_in_chunk) {
// No complete row in this chunk; skip it.
// NOTE(review): any single row longer than the chunk budget is silently dropped.
cur_pos += block_size;
continue;
}
// Step past the '>' of that "</row>".
// NOTE(review): chunk is not NUL-terminated yet at this point, so strstr may
// scan stale bytes beyond `offset`; in practice the '>' inside "</row>"
// itself is found first -- verify.
char* last_row_end_in_chunk = (char*)strstr(last_row_in_chunk, ">");
if (last_row_end_in_chunk) last_row_end_in_chunk += 1;
else last_row_end_in_chunk = chunk + offset;
size_t real_chunk_size = last_row_end_in_chunk - chunk;
// Append the fixed tail right after the last complete row
memcpy(chunk + real_chunk_size, fix_tail, tail_len);
real_chunk_size += tail_len;
chunk[real_chunk_size] = '\0';
// Map the rewind point back into the file: the next iteration resumes here
char* last_row_end_in_file = cur_pos + (last_row_end_in_chunk - (chunk + head_len));
// Parse the assembled chunk as a standalone document
xmlDocPtr doc = xmlParseMemory(chunk, real_chunk_size);
if (!doc) {
cur_pos = last_row_end_in_file;
continue;
}
xmlNodePtr root = xmlDocGetRootElement(doc);
for (xmlNodePtr node = root; node; node = node->next) {
if (node->type == XML_ELEMENT_NODE && strcmp((const char*)node->name, "sheetData") == 0) {
for (xmlNodePtr row_node = node->children; row_node; row_node = row_node->next) {
if (row_node->type == XML_ELEMENT_NODE && strcmp((const char*)row_node->name, "row") == 0) {
const char* r_prop = get_prop(row_node, "r");
if (!r_prop) continue;
int row_num = atoi(r_prop);
if (row_num < 1) continue;
int this_max_col = -1;
// First pass: determine this row's highest column index
for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {
if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {
const char* r_attr = get_prop(cnode, "r");
if (!r_attr) continue;
int col_len = 0;
// NOTE(review): "(c <= 'Z' || c <= 'z')" reduces to "c <= 'z'", so this
// accepts 'A'..'z' (including '[', '_', lowercase); digits still stop
// the scan, which is what matters for cell refs like "B12".
while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;
int col_num = col_letter_to_num(r_attr, col_len);
if (col_num > this_max_col) this_max_col = col_num;
}
}
if (this_max_col < 0) continue;
if (this_max_col > max_col) max_col = this_max_col;
// Grow all_rows by one entry.
// NOTE(review): this reallocs once per row, and when a slot left over from
// an earlier chunk is reused, its old `cells` array is overwritten without
// being freed; combined with the final cleanup freeing `cells` for all
// `all_rows_size` entries, this bookkeeping is the prime suspect for the
// reported "double free or corruption" -- needs investigation.
all_rows = (struct row_data*)realloc(all_rows, sizeof(struct row_data) * (all_rows_size + 1));
all_rows[all_rows_count].row_num = row_num;
all_rows[all_rows_count].max_col = this_max_col;
all_rows[all_rows_count].cells = (char**)calloc(this_max_col + 1, sizeof(char*));
// Second pass: store each cell's value at its column index
for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {
if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {
const char* r_attr = get_prop(cnode, "r");
if (!r_attr) continue;
int col_len = 0;
while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;
int col_num = col_letter_to_num(r_attr, col_len);
if (col_num < 0) continue;
const char* val = parse_c_value(cnode);
all_rows[all_rows_count].cells[col_num] = strdup(val ? val : "");
// Left disabled: enabling this xmlFree caused a segfault per the write-up;
// `val` is therefore leaked -- TODO confirm parse_c_value's ownership rules.
//if (val && (val != (const char*)xmlNodeGetContent(cnode))) xmlFree((void*)val);
}
}
all_rows_count++;
all_rows_size++;
}
}
}
}
xmlFreeDoc(doc);
// Chunk fully parsed: emit its rows immediately so the memory can be reused
// Sort rows by row number (bubble sort; each chunk holds few rows)
for (int i = all_rows_count - 1; i >= 0; i--) {
for (int j = 0; j < i; j++) {
if (all_rows[j].row_num > all_rows[j+1].row_num) {
struct row_data tmp = all_rows[j];
all_rows[j] = all_rows[j+1];
all_rows[j+1] = tmp;
}
}
}
// Print each non-empty row as CSV: row number, then one field per column
for (int i = 0; i < all_rows_count; i++) {
int r = all_rows[i].row_num;
int has_data = 0;
for (int c = 0; c <= all_rows[i].max_col; c++) {
if (all_rows[i].cells[c] && strlen(all_rows[i].cells[c]) > 0) {
has_data = 1; break;
}
}
if (!has_data) continue;
printf("%d", r);
for (int c = 0; c <= all_rows[i].max_col; c++) {
if (all_rows[i].cells[c] && strlen(all_rows[i].cells[c]) > 0)
printf(",%s", all_rows[i].cells[c]);
else
printf(",");
}
printf("\n");
}
// Release this chunk's cell strings but keep the row slots for reuse.
// NOTE(review): the `cells` pointer arrays themselves are NOT freed here.
for (int i = 0; i < all_rows_count; i++) {
for (int c = 0; c <= all_rows[i].max_col; c++) {
if (all_rows[i].cells[c]) free(all_rows[i].cells[c]);
all_rows[i].cells[c] = NULL;
}
all_rows[i].row_num = 0;
all_rows[i].max_col = 0;
}
all_rows_count = 0;
// Resume right after the last complete </row> of this chunk
cur_pos = last_row_end_in_file;
}
// Final cleanup: free every surviving cells array, then the row table itself
for (int i = 0; i < all_rows_size; i++) {
free(all_rows[i].cells);
}
if (all_rows) free(all_rows);
if (chunk) free(chunk);
xmlCleanupParser();
munmap(mapped, flen);
close(fd);
return 0;
}
// Helper: find the LAST occurrence of `needle` in the range [haystack, end).
// Returns a pointer to the start of the match, or NULL if not found.
// An empty needle matches at `end` (same behavior as the original version).
//
// Fixes over the original:
//  - strlen(needle) was recomputed on every loop iteration (O(n*m) extra work);
//    it is now hoisted out of the loop and memcmp is used for the comparison.
//  - `end - strlen(needle)` could form a pointer before `haystack` when the
//    range is shorter than the needle (undefined behavior); the range is now
//    checked up front and the loop never steps below `haystack`.
char* strrstr(const char* haystack, const char* end, const char* needle) {
size_t nlen = strlen(needle);
if (nlen == 0)
return (char*)end; // empty needle: match at the end of the range
if ((size_t)(end - haystack) < nlen)
return NULL; // range too short to contain the needle
for (const char* p = end - nlen; ; p--) {
if (memcmp(p, needle, nlen) == 0)
return (char*)p;
if (p == haystack)
return NULL; // scanned the whole range without a match
}
}
按照提示覆盖原有代码,并把辅助函数移到main函数前面,就可以编译执行了,仍然需要注释掉 xmlFree函数调用,否则报Segmentation fault (core dumped)错误退出。
查看代码,他并没有机械地按要求改,而是在复制固定文件尾的时候,复制到最后一个</row>标签后,然后用\0填充。龙猫的理解力和代码准确性都很不错。
time ./catxmlp2 sheet12.xml > s12.csv
double free or corruption (!prev)
Aborted (core dumped)
real 0m34.581s
user 0m11.822s
sys 0m3.059s
time ./tocsv sheet12.xml A1:P1000000 s13.csv
real 0m13.659s
user 0m1.733s
sys 0m1.083s
通过分块处理,输出大文件没问题了,效率也不算太差,和最快的张泽鹏先生手搓代码tocsv相比只有3倍差距,比第一版好多了。仍然有内存错误,还要进一步研究。

被折叠的 条评论
为什么被折叠?



