利用libxlm2的代码性能不高,因为通用库做了很多我们不需要要的工作,所以让他自己解析。
提示词
不依赖任何xml解析库,顺序解析所需的
<row r属性,<c r属性t属性<v>和<t>标签内容并输出csv,其余丢弃。只写出代码,不做别的
结果出来的还是分块解析思路的代码,继续提示
以上代码的chunk里只有1个
<row> ..</row>标签对,导致输出只有1行,简化思路,不要固定头尾,从第一个<row开始解析,到</row>满一行就输出一行,直到最后一个</row>或文件尾退出。给出代码,不做别的
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
// 字母列号转数字,A1 -> (0,0), B1 -> (1,0)
int col_letter_to_num(const char* col, int len) {
int num = 0;
for (int i = 0; i < len; i++) {
if (col[i] >= 'A' && col[i] <= 'Z')
num = num * 26 + (col[i] - 'A' + 1);
else if (col[i] >= 'a' && col[i] <= 'z')
num = num * 26 + (col[i] - 'a' + 1);
else
break;
}
return num - 1;
}
// 查找标签属性值
const char* get_attr(const char* start, const char* end, const char* attr) {
const char* p = start;
int attr_len = strlen(attr);
while (p < end - attr_len) {
if (strncmp(p, attr, attr_len) == 0 && p[attr_len] == '=') {
p += attr_len + 1;
if (*p == '"' || *p == '\'') {
char q = *p;
p++;
const char* val = p;
while (p < end && *p != q) p++;
char* ret = (char*)malloc(p - val + 1);
strncpy(ret, val, p - val);
ret[p - val] = 0;
return ret;
}
}
p++;
}
return NULL;
}
// 查找标签名
const char* get_tagname(const char* start, const char* end, const char* tag) {
const char* p = start;
int tag_len = strlen(tag);
while (p < end - tag_len - 1) {
if (strncmp(p, "<", 1) == 0 && strncmp(p+1, tag, tag_len) == 0 && (p[1+tag_len]==' ' || p[1+tag_len]=='>')) {
return p;
}
p++;
}
return NULL;
}
// 查找标签内容
const char* get_tagcontent(const char* start, const char* end, const char* tag) {
const char* p = start;
int tag_len = strlen(tag);
while (p < end - tag_len - 1) {
if (strncmp(p, "<", 1) == 0 && strncmp(p+1, tag, tag_len) == 0) {
const char* tag_end = strchr(p, '>');
if (!tag_end) return NULL;
const char* content = tag_end + 1;
const char* close = strstr(content, "</");
if (!close || close > end) return NULL;
const char* close_tag = close + 2;
if (strncmp(close_tag, tag, tag_len) == 0 && close_tag[tag_len] == '>') {
char* ret = (char*)malloc(close - content + 1);
strncpy(ret, content, close - content);
ret[close - content] = 0;
return ret;
}
}
p++;
}
return NULL;
}
// 查找下一个标签
const char* next_tag(const char* start, const char* end) {
const char* p = start;
while (p < end) {
if (*p == '<') return p;
p++;
}
return NULL;
}
// 查找标签结束
const char* tag_end(const char* tag) {
const char* p = tag;
while (*p && *p != '>' && *p != ' ') p++;
return p;
}
// 查找属性值,返回指向属性值的指针
const char* tag_attrval(const char* tag, const char* attr, const char** val_end) {
int attr_len = strlen(attr);
const char* p = tag;
while (*p && *p != '>') {
if (strncmp(p, attr, attr_len) == 0 && p[attr_len] == '=') {
p += attr_len + 1;
if (*p == '"' || *p == '\'') {
char q = *p;
p++;
const char* val = p;
while (*p && *p != q) p++;
*val_end = p;
return val;
}
}
p++;
}
*val_end = NULL;
return NULL;
}
// 主函数
int main(int argc, char* argv[]) {
if (argc != 2) {
fprintf(stderr, "Usage: %s <input.xml>\n", argv[0]);
exit(1);
}
const char* fname = argv[1];
int fd = open(fname, O_RDONLY);
if (fd < 0) { perror("open"); exit(1); }
struct stat sb;
if (fstat(fd, &sb) < 0) { perror("fstat"); exit(1); }
size_t flen = sb.st_size;
char* mapped = (char*)mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);
if (mapped == MAP_FAILED) { perror("mmap"); exit(1); }
// 查找第一个<row
const char* first_row = strstr(mapped, "<row");
if (!first_row) {
munmap(mapped, flen);
close(fd);
return 0;
}
// 查找最后一个</row>
const char* last_row_end = strstr(mapped, "</row>");
if (!last_row_end) {
munmap(mapped, flen);
close(fd);
return 0;
}
last_row_end = strstr(last_row_end, ">");
if (last_row_end) last_row_end += 1;
else last_row_end = mapped + flen;
const char* file_tail_start = last_row_end;
// 当前行最大列
int max_col = 0;
// 当前解析位置
const char* p = first_row;
// 64K块
const size_t BUF_SIZE = 65536;
//char* chunk = (char*)malloc(BUF_SIZE + 100);
// 行号缓存
int last_row = 0;
// 解析一行
void output_row(const char* row_start, const char* row_end, int* max_col, int last_row) {
// 提取r属性
const char* r_attr_val = get_attr(row_start, row_end, "r");
int row_num = r_attr_val ? atoi(r_attr_val) : -1;
if (r_attr_val) free((void*)r_attr_val);
if (row_num < 1) return;
// 统计最大列
int this_max_col = -1;
const char* c = row_start;
while (1) {
const char* c_tag = get_tagname(c, row_end, "c");
if (!c_tag) break;
const char* c_end = strstr(c_tag, "</c>");
if (!c_end || c_end > row_end) break;
c_end = strstr(c_end, ">");
if (c_end) c_end += 1;
else c_end = row_end;
const char* c_r_attr = get_attr(c_tag, c_end, "r");
if (c_r_attr) {
int col_len = 0;
while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;
int col_num = col_letter_to_num(c_r_attr, col_len);
if (col_num > this_max_col) this_max_col = col_num;
free((void*)c_r_attr);
}
c = c_end;
}
if (this_max_col < 0) return;
if (this_max_col > *max_col) *max_col = this_max_col;
// 补空行
while (last_row < row_num - 1) {
last_row++;
int has_data = 0;
for (int c = 0; c <= *max_col; c++) {
// 全空
}
if (!has_data) continue;
printf("%d", last_row);
for (int c = 0; c <= *max_col; c++) printf(",");
printf("\n");
}
// 解析c节点
char** cells = (char**)calloc(this_max_col + 1, sizeof(char*));
c = row_start;
while (1) {
const char* c_tag = get_tagname(c, row_end, "c");
if (!c_tag) break;
const char* c_end = strstr(c_tag, "</c>");
if (!c_end || c_end > row_end) break;
c_end = strstr(c_end, ">");
if (c_end) c_end += 1;
else c_end = row_end;
const char* c_r_attr = get_attr(c_tag, c_end, "r");
if (c_r_attr) {
int col_len = 0;
while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;
int col_num = col_letter_to_num(c_r_attr, col_len);
free((void*)c_r_attr);
// 判断t属性
const char* t_attr = get_attr(c_tag, c_end, "t");
int is_str = (t_attr && strcmp(t_attr, "inlineStr") == 0);
if (t_attr) free((void*)t_attr);
const char* val = NULL;
if (is_str) {
const char* t_tag = get_tagcontent(c_tag, c_end, "t");
val = t_tag ? t_tag : strdup("");
} else {
const char* v_tag = get_tagcontent(c_tag, c_end, "v");
val = v_tag ? v_tag : strdup("");
}
cells[col_num] = (char*)val;
}
c = c_end;
}
// 输出行
int has_data = 0;
for (int c = 0; c <= this_max_col; c++) {
if (cells[c] && strlen(cells[c]) > 0) {
has_data = 1; break;
}
}
if (has_data) {
printf("%d", row_num);
for (int c = 0; c <= this_max_col; c++) {
if (cells[c] && strlen(cells[c]) > 0)
printf(",%s", cells[c]);
else
printf(",");
}
printf("\n");
}
// 释放
for (int c = 0; c <= this_max_col; c++) {
if (cells[c]) free(cells[c]);
}
free(cells);
}
// 顺序解析
while (1){ //p < file_tail_start) {
// 找下一个<row
const char* row_start = p;//get_tagname(p, file_tail_start, "row");
if (!row_start) break;
const char* row_end = strstr(row_start, "</row>");
//if (!row_end || row_end > file_tail_start) break;
row_end = strstr(row_end, ">");
if (row_end) row_end += 1;
else row_end = file_tail_start;
// 解析并输出一行
output_row(row_start, row_end, &max_col, last_row);
last_row = last_row > 0 ? last_row : atoi(get_attr(row_start, row_end, "r"));
// 继续
p = row_end;
}
// 释放
//free(chunk);
munmap(mapped, flen);
close(fd);
return 0;
}
原始代码输出一行就退出,经过人工调试,注释了多余的条件,可以输出多行了,龙猫竟然还知道gcc允许在函数里定义其他函数,比如output_row,真博学。
gcc catxmls2.c -o catxmls2
./catxmls2 sheet13.xml
1,1,15519,785,1,17.00,24386.67,0.04,0.02,N,O,35137.0,35107.0,35146.0,DELIVER IN PERSON,TRUCK,to beans x-ray carefull
2,1,6731,732,2,36.00,58958.28,0.09,0.06,N,O,35167.0,35123.0,35175.0,TAKE BACK RETURN,MAIL, according to the final foxes. qui
Segmentation fault (core dumped)
gcc catxmls2.c -o catxmls2 -O3
time ./catxmls2 lineitem/xl/worksheets/sheet1.xml >s2.csv
Segmentation fault (core dumped)
real 0m13.215s
user 0m12.628s
sys 0m0.320s
gcc to-csv.c -o to-csv -O3
time ./to-csv lineitem/xl/worksheets/sheet1.xml A1:P1100000 s13.csv
real 0m8.349s
user 0m7.568s
sys 0m0.360s
输出100万行只比张泽鹏先生手搓的程序差50%,相当不错。还有内存错误,需要进一步研究。
后记:
内存错误原因找到了,找最后一个标签的语句const char* last_row_end = strstr(mapped, "</row>");写错了。
不能正向查找,而要反向查找。
应该是char* last_row_end = (char*)strrstr(mapped, mapped + flen, "</row>");,然后把注释掉的语句恢复,再配上如下来自分块解析代码的自定义函数就正确了。
// 辅助函数:在指定范围内查找子串
char* strrstr(const char* haystack, const char* end, const char* needle) {
char* p = (char*)end - strlen(needle);
while (p >= haystack) {
if (strncmp(p, needle, strlen(needle)) == 0)
return p;
p--;
}
return NULL;
}

被折叠的 条评论
为什么被折叠?



