上午的PK失利后,分析发现主要的性能问题出在expat把全表都做了完整解析。其实只需要对范围内的部分做完整解析,其余部分解析到行号或列号后就可以跳过当前行。利用这个思路,让龙猫改写了一版。
如下所示,虽然还不能像张先生的程序那样做到耗时和行数成正比,但是消除了16秒的基础时间,提升还是可观的。
起初,他没处理好缺失列的情况,把道理跟他说了,就改好了。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <expat.h>
// Size in bytes (including the terminating NUL) of the buffer that
// accumulates the text of a single cell value.
#define MAX_CELL_CONTENT 256
// Rectangular cell range requested on the command line (e.g. "A1:Z100").
// Columns are stored as single characters, so only one-letter columns
// can be represented.
typedef struct {
int start_row; // first row number of the range
int end_row; // last row number of the range
char start_col; // first column letter of the range
char end_col; // last column letter of the range
} ParseRange;
// Mutable state shared by the Expat callbacks; passed to them as user data.
typedef struct {
ParseRange range; // cell range to extract
FILE *csv; // output stream the callbacks write CSV text to
FILE *xml_file; // source XML file (intended for fseek); NOTE(review): main does not assign it
XML_Parser parser; // parser handle (used to query the current position and to stop parsing)
int in_row; // nonzero while inside a <row> element
int current_row; // row number read from the <row> "r" attribute, -1 if absent
char current_col; // column letter taken from the most recent <c> "r" attribute
int value_started; // nonzero while text of an in-range <v>/<t> is being collected
char temp_value[MAX_CELL_CONTENT]; // NUL-terminated text collected for the current cell
int value_len; // number of bytes currently stored in temp_value
int skip_row; // nonzero when the current row should be skipped
long row_start_pos; // byte index of the <row> start tag
int first_row_processed; // nonzero once the first in-range row has been seen
char first_row_max_col; // NOTE(review): only ever set to range.start_col in the visible code; never read
} ParserState;
// Parse an Excel-style range such as "A1:Z100" into *range.
//
// Only single-letter columns are supported. Lowercase column letters are
// normalised to uppercase so they compare correctly against the uppercase
// cell references found in sheet XML.
//
// Returns 0 on success. Returns -1 (leaving *range untouched) when the
// string is malformed, has trailing characters, names a non-letter column,
// uses a row number below 1, or describes an inverted range.
int parse_excel_range(const char *range_str, ParseRange *range) {
    if (range_str == NULL || range == NULL) {
        return -1;
    }

    char first_col = 0;
    char last_col = 0;
    int first_row = 0;
    int last_row = 0;
    int consumed = 0;

    // %n records how much input was consumed so trailing garbage
    // ("A1:B2xyz") can be rejected instead of silently ignored.
    if (sscanf(range_str, "%c%d:%c%d%n",
               &first_col, &first_row, &last_col, &last_row, &consumed) != 4) {
        return -1;
    }
    if (range_str[consumed] != '\0') {
        return -1;
    }

    // Fold lowercase letters to uppercase (assumes an ASCII-compatible
    // execution character set).
    if (first_col >= 'a' && first_col <= 'z') {
        first_col = (char)(first_col - 'a' + 'A');
    }
    if (last_col >= 'a' && last_col <= 'z') {
        last_col = (char)(last_col - 'a' + 'A');
    }

    if (first_col < 'A' || first_col > 'Z') return -1;
    if (last_col < 'A' || last_col > 'Z') return -1;
    if (first_row < 1) return -1;
    if (first_col > last_col) return -1;
    if (first_row > last_row) return -1;

    range->start_col = first_col;
    range->start_row = first_row;
    range->end_col = last_col;
    range->end_row = last_row;
    return 0;
}
// Expat start-tag callback.
//
// <row>: reads the row number from the "r" attribute, stops the parser once
//        the row lies past the requested range, marks rows before the range
//        as skipped, and otherwise starts a CSV line with the row number.
// <c>:   records the column of the cell from its "r" attribute.
// <v>/<t>: starts collecting text when the current cell's column is inside
//        the requested range.
//
// user_data must point to the ParserState registered with the parser.
void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs) {
    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0) {
        state->row_start_pos = XML_GetCurrentByteIndex(state->parser);
        state->in_row = 1;
        state->current_row = -1;
        state->skip_row = 0;
        // Forget the previous row's last column so a row without any cells
        // is treated as "nothing seen yet" rather than inheriting stale state.
        state->current_col = (char)(state->range.start_col - 1);

        // 1. Read the row number; a row without "r" keeps -1 and is skipped.
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_row = atoi(attrs[i + 1]);
                break;
            }
        }

        // 2. Range check.
        if (state->current_row > state->range.end_row) {
            // Mark the row as skipped before aborting: Expat may still
            // deliver the end-tag callback for a self-closing <row/>, and
            // that must not emit a CSV line for an out-of-range row.
            state->skip_row = 1;
            XML_StopParser(state->parser, XML_FALSE);
            return;
        }
        if (state->current_row < state->range.start_row) {
            // Child elements and the end tag of this row are ignored by the
            // in_row/skip_row guards in the callbacks.
            state->skip_row = 1;
            return;
        }

        if (!state->first_row_processed) {
            state->first_row_processed = 1;
            state->first_row_max_col = state->range.start_col;
        }

        // 3. Start the output line with the row number.
        fprintf(state->csv, "%d", state->current_row);
        return;
    }

    // Everything below only matters inside a row that is being exported.
    if (!state->in_row || state->skip_row) {
        return;
    }

    if (strcmp(name, "c") == 0) {
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                const XML_Char *ref = attrs[i + 1];
                XML_Char second = ref[0] ? ref[1] : '\0';
                int multi_letter = (second >= 'A' && second <= 'Z') ||
                                   (second >= 'a' && second <= 'z');
                // A reference such as "AA12" lies beyond column Z; map it to
                // a value just past 'Z' so it is never mistaken for column A.
                state->current_col = multi_letter ? (char)('Z' + 1) : (char)ref[0];
                break;
            }
        }
    }
    else if (strcmp(name, "v") == 0 || strcmp(name, "t") == 0) {
        if (state->current_col >= state->range.start_col &&
            state->current_col <= state->range.end_col) {
            state->value_started = 1;
            state->value_len = 0;
            state->temp_value[0] = '\0';
        }
    }
}
// Expat character-data callback.
//
// While a cell value is being collected, appends the delivered text to
// state->temp_value and keeps it NUL-terminated. Text that does not fit is
// truncated at the buffer limit; once the buffer is full, further chunks are
// ignored, so the stored value is always a prefix of the real content.
// NOTE(review): truncation is byte-based and may cut a multi-byte UTF-8
// sequence in half.
//
// s is not NUL-terminated; len is its length in bytes.
void XMLCALL character_data(void *user_data, const XML_Char *s, int len) {
    ParserState *state = (ParserState *)user_data;

    if (!state->value_started || len <= 0) {
        return;
    }

    // Space left, keeping one byte for the terminator.
    int room = MAX_CELL_CONTENT - 1 - state->value_len;
    if (room <= 0) {
        return;
    }

    int count = (len < room) ? len : room;
    memcpy(state->temp_value + state->value_len, s, (size_t)count);
    state->value_len += count;
    state->temp_value[state->value_len] = '\0';
}
// Expat end-tag callback.
//
// </v> or </t>: writes the collected cell text as ",<value>", preceded by
//               one empty field for every in-range column that was missing
//               since the last written cell.
// </row>:       pads the line with empty fields up to the last column of the
//               range and terminates it with a newline.
//
// The next column to be written is kept in a function-local static, so only
// one parse may be active at a time.
void XMLCALL end_element(void *user_data, const XML_Char *name) {
    // Next in-range column that has not been written on the current line;
    // 0 means "no cell written on this line yet". Kept as int so that
    // "column + 1" cannot overflow a char.
    static int next_col = 0;

    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0) {
        // Rows outside the range produce no output. The explicit end_row
        // check covers a self-closing <row/> whose end callback is still
        // delivered after the parser was asked to stop.
        if (!state->in_row || state->skip_row ||
            state->current_row > state->range.end_row) {
            return;
        }

        // Pad every column that was not written, counting from the last
        // column actually emitted rather than from the last cell seen.
        int first_missing = next_col ? next_col : state->range.start_col;
        for (int col = first_missing; col <= state->range.end_col; col++) {
            fputc(',', state->csv);
        }
        fputc('\n', state->csv);

        next_col = 0; // the next row starts again at the first column
        state->in_row = 0;
    }
    else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) && state->value_started) {
        if (state->current_col >= state->range.start_col &&
            state->current_col <= state->range.end_col) {
            if (next_col == 0) {
                next_col = state->range.start_col;
            }
            // Empty fields for columns skipped between the previous cell
            // and this one.
            for (int col = next_col; col < state->current_col; col++) {
                fputc(',', state->csv);
            }
            fprintf(state->csv, ",%s", state->temp_value);
            next_col = state->current_col + 1;
        }
        state->value_started = 0;
    }
}
// 主函数
int main(int argc, char *argv[]) {
if (argc != 3) {
printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argv[0]);
return 1;
}
ParseRange range;
if (parse_excel_range(argv[2], &range) != 0) {
printf("错误: 无效范围格式\n");
return 1;
}
// 生成CSV
char csv_filename[256];
strncpy(csv_filename, argv[1], sizeof(csv_filename) - 1);
char *ext = strrchr(csv_filename, '.');
if (ext) strcpy(ext, ".csv");
else strncat(csv_filename, ".csv", sizeof(csv_filename) - strlen(csv_filename) - 1);
FILE *csv = fopen(csv_filename, "w");
if (!csv) {
printf("错误: 无法创建CSV\n");
return 1;
}
// 输出标题行
fprintf(csv, "Row");
for (char col = range.start_col; col <= range.end_col; col++) {
fprintf(csv, ",%c", col);
}
fprintf(csv, "\n");
// 创建解析器
XML_Parser parser = XML_ParserCreate(NULL);
ParserState state = {0};
state.range = range;
state.csv = csv;
state.parser = parser; //必须
XML_SetUserData(parser, &state);
XML_SetElementHandler(parser, start_element, end_element);
XML_SetCharacterDataHandler(parser, character_data);
// 流式解析
FILE *file = fopen(argv[1], "rb");
if (!file) {
printf("错误: 无法打开文件\n");
fclose(csv);
XML_ParserFree(parser);
return 1;
}
char buffer[8192];
int done;
do {
size_t len = fread(buffer, 1, sizeof(buffer), file);
done = (len < sizeof(buffer));
if (XML_Parse(parser, buffer, len, done) == XML_STATUS_ERROR) {
break;
}
} while (!done);
// 清理
fclose(file);
fclose(csv);
XML_ParserFree(parser);
printf("CSV已保存到 %s\n", csv_filename);
return 0;
}
性能比较,expatxml10是流式版本,expatxml3是全表扫描版本
time ./expatxml10 lineitem/xl/worksheets/sheet1.xml A600000:F700000
real 0m11.906s
user 0m11.276s
sys 0m0.176s
time ./expatxml10 lineitem/xl/worksheets/sheet1.xml A100000:F200000
real 0m3.496s
user 0m3.268s
sys 0m0.072s
time ./expatxml10 lineitem/xl/worksheets/sheet1.xml A200000:F700000
real 0m11.757s
user 0m11.596s
sys 0m0.156s
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A200000:F700000
解析范围: A200000:F700000
real 0m17.494s
user 0m17.240s
sys 0m0.252s
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A100000:F200000
解析范围: A100000:F200000
real 0m17.043s
user 0m16.808s
sys 0m0.220s
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A600000:F700000
解析范围: A600000:F700000
real 0m17.040s
user 0m16.880s
sys 0m0.152s
可见新版的时间和起始位置有关,比旧版的基础时间大幅度缩短。和张先生的程序比,还有改善的余地。
301

被折叠的 条评论
为什么被折叠?



