LZAV是Aleksey Vaneev开发的一种嵌入式压缩算法,内存中压缩速度和压缩率在LZ4和ZSTD之间,我让DeepSeek编写了一个用于文件压缩的工具,并通过在文件头处保存源文件大小解决了解压缩需要了解源文件大小的问题。
为了提供不同的压缩级别,引入了-l参数:取值为1(默认值)时使用默认压缩,取其他任何值时使用深度(高压缩率)压缩。
代码如下:
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "lzav.h"
#define MAX_FILENAME 256
/* Write the command-line usage summary to stdout, one line per option. */
void print_help() {
    static const char *const usage_lines[] = {
        "lzavcli - LZAV Compression Tool (with size header)\n",
        "Usage: lzavcli [options] <input_file>\n",
        "Options:\n",
        " -l <level> compress level (default: 1,high: other)\n",
        " -o <file> Output file (default: <input>.lzav)\n",
        " -d Decompress mode\n",
        " -h Show this help\n",
    };
    const size_t line_count = sizeof usage_lines / sizeof usage_lines[0];

    for (size_t idx = 0; idx < line_count; idx++) {
        fputs(usage_lines[idx], stdout);
    }
}
/*
 * Compress the file at input_path into output_path with LZAV.
 *
 * Output layout: a 4-byte little-endian header holding the original file
 * size, immediately followed by the LZAV-compressed data.
 *
 * level: 1 selects lzav_compress_default() (sized by lzav_compress_bound());
 *        any other value selects lzav_compress_hi() (sized by
 *        lzav_compress_bound_hi()).
 *
 * The whole input is held in memory. This code passes lengths to LZAV as
 * int, so inputs larger than INT_MAX bytes are rejected instead of being
 * silently truncated.
 *
 * Returns 0 on success, -1 on failure (a message is printed to stderr).
 */
int compress_lzav(const char* input_path, const char* output_path, int level) {
    int rc = -1;
    FILE *fin = NULL;
    FILE *fout = NULL;
    void *src_buf = NULL;
    void *comp_buf = NULL;
    long file_size;
    uint32_t src_len;
    int bound;
    int comp_len;
    size_t max_len;
    int write_ok;
    uint8_t header[4];
    double ratio;
    clock_t end;
    /* clock() reports processor time used by the program, not wall time. */
    clock_t start = clock();

    fin = fopen(input_path, "rb");
    if (!fin) {
        fprintf(stderr, "Error: Cannot open input file '%s'\n", input_path);
        goto cleanup;
    }

    /* Determine the original file size. */
    if (fseek(fin, 0, SEEK_END) != 0 ||
        (file_size = ftell(fin)) < 0 ||
        fseek(fin, 0, SEEK_SET) != 0) {
        fprintf(stderr, "Error: Cannot determine size of input file '%s'\n", input_path);
        goto cleanup;
    }
    if (file_size > INT_MAX) {
        fprintf(stderr, "Error: Input file is too large (limit is %d bytes)\n", INT_MAX);
        goto cleanup;
    }
    src_len = (uint32_t)file_size;

    /* Read the whole input. Allocate at least 1 byte: malloc(0) may
     * legitimately return NULL, which must not be reported as OOM. */
    src_buf = malloc(src_len ? src_len : 1);
    if (!src_buf) {
        fprintf(stderr, "Error: Memory allocation failed\n");
        goto cleanup;
    }
    if (fread(src_buf, 1, src_len, fin) != src_len) {
        fprintf(stderr, "Error: Failed to read input file\n");
        goto cleanup;
    }
    fclose(fin);
    fin = NULL;

    /* Size the output buffer with the bound function that matches the
     * compressor that will be used, plus 32 bytes of slack. */
    bound = (level == 1) ? lzav_compress_bound((int)src_len)
                         : lzav_compress_bound_hi((int)src_len);
    if (bound <= 0 || bound > INT_MAX - 32) {
        fprintf(stderr, "Error: Input file is too large to compress\n");
        goto cleanup;
    }
    max_len = (size_t)bound + 32;
    comp_buf = malloc(max_len);
    if (!comp_buf) {
        fprintf(stderr, "Error: Memory allocation failed\n");
        goto cleanup;
    }

    /* Run the compressor. */
    comp_len = (level == 1)
        ? lzav_compress_default(src_buf, comp_buf, (int)src_len, (int)max_len)
        : lzav_compress_hi(src_buf, comp_buf, (int)src_len, (int)max_len);
    if (comp_len < 0 || (comp_len == 0 && src_len != 0)) {
        fprintf(stderr, "Error: Compression failed\n");
        goto cleanup;
    }

    /* Write the output file: 4-byte size header, then compressed data. */
    fout = fopen(output_path, "wb");
    if (!fout) {
        fprintf(stderr, "Error: Cannot create output file '%s'\n", output_path);
        goto cleanup;
    }

    /* Original size, 4 bytes, little-endian. */
    header[0] = (uint8_t)(src_len & 0xFF);
    header[1] = (uint8_t)((src_len >> 8) & 0xFF);
    header[2] = (uint8_t)((src_len >> 16) & 0xFF);
    header[3] = (uint8_t)((src_len >> 24) & 0xFF);

    write_ok = fwrite(header, 1, sizeof header, fout) == sizeof header &&
               fwrite(comp_buf, 1, (size_t)comp_len, fout) == (size_t)comp_len;
    /* fclose flushes buffered data, so its result is part of write success. */
    if (fclose(fout) != 0) {
        write_ok = 0;
    }
    fout = NULL;
    if (!write_ok) {
        fprintf(stderr, "Error: Failed to write output file '%s'\n", output_path);
        goto cleanup;
    }

    end = clock();
    /* Avoid dividing by zero when the input file is empty. */
    ratio = src_len ? (comp_len * 100.0) / src_len : 0.0;
    printf("Compressed %u bytes to %d bytes (%.2f%%)\n",
           (unsigned)src_len, comp_len, ratio);
    printf("Time: %.2f ms\n", (double)(end - start) * 1000 / CLOCKS_PER_SEC);
    rc = 0;

cleanup:
    if (fin) {
        fclose(fin);
    }
    free(src_buf);
    free(comp_buf);
    return rc;
}
/*
 * Decompress a file produced by compress_lzav().
 *
 * Expected input layout: a 4-byte little-endian header holding the original
 * (uncompressed) size, followed by the LZAV-compressed data.
 *
 * The header is untrusted input: it is range-checked before being used as an
 * allocation size or passed to lzav_decompress(), and the decompressed length
 * is compared against it before anything is written.
 *
 * Returns 0 on success, -1 on failure (a message is printed to stderr).
 */
int decompress_lzav(const char* input_path, const char* output_path) {
    int rc = -1;
    FILE *fin = NULL;
    FILE *fout = NULL;
    void *comp_buf = NULL;
    void *decomp_buf = NULL;
    uint8_t header[4];
    uint32_t src_len;
    long file_size;
    size_t comp_len;
    int result_len;
    int write_ok;
    clock_t end;
    /* clock() reports processor time used by the program, not wall time. */
    clock_t start = clock();

    fin = fopen(input_path, "rb");
    if (!fin) {
        fprintf(stderr, "Error: Cannot open input file '%s'\n", input_path);
        goto cleanup;
    }

    /* Read the original size from the header (4 bytes, little-endian). */
    if (fread(header, 1, 4, fin) != 4) {
        fprintf(stderr, "Error: Failed to read size header\n");
        goto cleanup;
    }
    src_len = (uint32_t)header[0] |
              ((uint32_t)header[1] << 8) |
              ((uint32_t)header[2] << 16) |
              ((uint32_t)header[3] << 24);
    if (src_len > INT_MAX) {
        fprintf(stderr, "Error: Size header is out of range\n");
        goto cleanup;
    }

    /* Compressed payload size = file size minus the 4-byte header. */
    if (fseek(fin, 0, SEEK_END) != 0 || (file_size = ftell(fin)) < 4) {
        fprintf(stderr, "Error: Cannot determine size of input file '%s'\n", input_path);
        goto cleanup;
    }
    if (file_size - 4 > INT_MAX) {
        fprintf(stderr, "Error: Compressed data is too large (limit is %d bytes)\n", INT_MAX);
        goto cleanup;
    }
    comp_len = (size_t)(file_size - 4);
    /* Skip the header again before reading the payload. */
    if (fseek(fin, 4, SEEK_SET) != 0) {
        fprintf(stderr, "Error: Failed to read compressed data\n");
        goto cleanup;
    }

    /* Read the compressed data. Allocate at least 1 byte: malloc(0) may
     * legitimately return NULL, which must not be reported as OOM. */
    comp_buf = malloc(comp_len ? comp_len : 1);
    if (!comp_buf) {
        fprintf(stderr, "Error: Memory allocation failed\n");
        goto cleanup;
    }
    if (fread(comp_buf, 1, comp_len, fin) != comp_len) {
        fprintf(stderr, "Error: Failed to read compressed data\n");
        goto cleanup;
    }
    fclose(fin);
    fin = NULL;

    /* Allocate the decompression buffer. */
    decomp_buf = malloc(src_len ? src_len : 1);
    if (!decomp_buf) {
        fprintf(stderr, "Error: Memory allocation failed\n");
        goto cleanup;
    }

    /* Decompress; a length that disagrees with the header means the file is
     * corrupt, and writing src_len bytes would emit uninitialized memory. */
    result_len = lzav_decompress(comp_buf, decomp_buf, (int)comp_len, (int)src_len);
    if (result_len < 0 || (uint32_t)result_len != src_len) {
        fprintf(stderr, "Error: Decompression failed\n");
        goto cleanup;
    }

    /* Write the decompressed file. */
    fout = fopen(output_path, "wb");
    if (!fout) {
        fprintf(stderr, "Error: Cannot create output file '%s'\n", output_path);
        goto cleanup;
    }
    write_ok = fwrite(decomp_buf, 1, src_len, fout) == src_len;
    /* fclose flushes buffered data, so its result is part of write success. */
    if (fclose(fout) != 0) {
        write_ok = 0;
    }
    fout = NULL;
    if (!write_ok) {
        fprintf(stderr, "Error: Failed to write output file '%s'\n", output_path);
        goto cleanup;
    }

    end = clock();
    printf("Decompressed %zu bytes to %u bytes\n", comp_len, (unsigned)src_len);
    printf("Time: %.2f ms\n", (double)(end - start) * 1000 / CLOCKS_PER_SEC);
    rc = 0;

cleanup:
    if (fin) {
        fclose(fin);
    }
    free(comp_buf);
    free(decomp_buf);
    return rc;
}
/*
 * Entry point: parse options, derive the output path if none was given, and
 * dispatch to compress_lzav() or decompress_lzav().
 *
 * Options: -h (help), -d (decompress), -l <level>, -o <file>; the first
 * non-option argument is the input file (a later one overrides an earlier).
 *
 * Paths are used directly from argv rather than copied into fixed-size
 * buffers, so long file names are never silently truncated.
 *
 * Exit status: 0 on success, 1 on usage error or operation failure.
 */
int main(int argc, char** argv) {
    const char *input_path = NULL;
    const char *output_path = NULL;
    char *default_output = NULL;
    int decompress_mode = 0;
    int compress_level = 1;
    int result;

    for (int i = 1; i < argc; i++) {
        const char *arg = argv[i];

        if (strcmp(arg, "-h") == 0) {
            print_help();
            return 0;
        } else if (strcmp(arg, "-d") == 0) {
            decompress_mode = 1;
        } else if (strcmp(arg, "-l") == 0 || strcmp(arg, "-o") == 0) {
            if (i + 1 >= argc) {
                fprintf(stderr, "Error: Option '%s' requires an argument\n", arg);
                print_help();
                return 1;
            }
            const char *value = argv[++i];
            if (arg[1] == 'o') {
                output_path = value;
            } else {
                /* strtol instead of atoi so non-numeric levels are rejected. */
                char *end = NULL;
                long parsed = strtol(value, &end, 10);
                if (end == value || *end != '\0') {
                    fprintf(stderr, "Error: Invalid compression level '%s'\n", value);
                    return 1;
                }
                /* Only "1 vs. anything else" matters to compress_lzav(). */
                compress_level = (parsed == 1) ? 1 : 2;
            }
        } else if (arg[0] != '-') {
            input_path = arg;
        } else {
            fprintf(stderr, "Error: Unknown option '%s'\n", arg);
            print_help();
            return 1;
        }
    }

    if (!input_path || !input_path[0]) {
        print_help();
        return 1;
    }

    /* No (or empty) -o: append a mode-specific extension to the input name. */
    if (!output_path || !output_path[0]) {
        const char *ext = decompress_mode ? ".decomp" : ".lzav";
        size_t need = strlen(input_path) + strlen(ext) + 1;

        default_output = malloc(need);
        if (!default_output) {
            fprintf(stderr, "Error: Memory allocation failed\n");
            return 1;
        }
        snprintf(default_output, need, "%s%s", input_path, ext);
        output_path = default_output;
    }

    if (decompress_mode) {
        result = decompress_lzav(input_path, output_path);
    } else {
        result = compress_lzav(input_path, output_path, compress_level);
    }

    free(default_output);
    return result != 0 ? 1 : 0;
}
注意:计算输出缓冲区大小的函数lzav_compress_bound和lzav_compress_bound_hi,必须分别与压缩函数lzav_compress_default和lzav_compress_hi配套使用。深度压缩所需的缓冲区更大,而且两者给出的值都超过原始文件大小——它们是压缩时输出缓冲区容量的上界,与最终输出的压缩文件大小不是一个概念。比如,源文件大小为595203472时,lzav_compress_bound返回599171513,lzav_compress_bound_hi返回623546512。
编译命令行和执行结果如下,附上lz4和zstd的压缩大小和时间。
gcc lzavhd.c -o lzavhd -I . -O3
time ./lzavhd clickhouse
Compressed 595203472 bytes to 188672728 bytes (31.70%)
Time: 1218.29 ms
real 0m3.581s
user 0m0.806s
sys 0m0.419s
time ./lzavhd -l 2 clickhouse
Compressed 595203472 bytes to 165780399 bytes (27.85%)
Time: 4225.82 ms
real 0m6.489s
user 0m3.802s
sys 0m0.434s
time ./lzavhd -d clickhouse.lzav
Decompressed 165780399 bytes to 595203472 bytes
Time: 522.35 ms
real 0m2.597s
user 0m0.114s
sys 0m0.416s
time zstd clickhouse
clickhouse : 23.59% ( 568 MiB => 134 MiB, clickhouse.zst)
real 0m3.116s
user 0m1.424s
sys 0m0.241s
time lz4 -3 clickhouse clickhous.lz4
Compressed 595203472 bytes into 186838255 bytes ==> 31.39%
real 0m2.328s
user 0m3.502s
sys 0m0.235s
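内部计时和外部用time命令计时差别挺大,留到以后分析。一个可以先记下的线索是:内部计时使用的clock()统计的是进程占用的CPU时间,而不是墙上时钟时间;上面三次运行的内部计时(约1218、4226、522毫秒)都与time输出的user+sys之和(约1.225、4.236、0.530秒)基本吻合,而real还包含了等待磁盘读写等不占用CPU的时间。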