ZLIB 1.2.8 压缩整理

最新推荐文章于 2020-07-09 10:56:05 发布

原创最新推荐文章于 2020-07-09 10:56:05 发布 · 721 阅读

1 ·

CC 4.0 BY-SA版权

系统综合同时被 3 个专栏收录

13 篇文章

订阅专栏

12 篇文章

订阅专栏

数据压缩

2 篇文章

订阅专栏

本文探讨ZLIB 1.2.8的数据压缩，包括deflate/inflate的基本功能，压缩和解压缩流程控制，以及gzip格式文件操作。重点在于使用deflateInit()初始化压缩结构体，通过调整输入输出缓冲区进行压缩和解压缩，同时对比了ZLIB与BZIP2的性能和内存占用。文中还提供了DEMO代码以展示详细流程，并提及ZLIB跨语言版本的兼容性问题。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

ZLIB 官网：http://www.zlib.net/ ，有详细的英文文档及代码。

ZLIB 数据压缩相关结构体：

/*
 * Stream descriptor handed to the deflate and inflate calls.  The application
 * fills in the input cursor (next_in / avail_in) and the output cursor
 * (next_out / avail_out) before each call; the total_* counters, msg and
 * adler fields report progress and status back.  zalloc / zfree / opaque are
 * the allocator hooks used for the library's internal state.
 */
typedef struct z_stream_s {
    z_const Bytef *next_in;     /* next input byte */
    uInt     avail_in;  /* number of bytes available at next_in */
    uLong    total_in;  /* total number of input bytes read so far */

    Bytef    *next_out; /* next output byte should be put there */
    uInt     avail_out; /* remaining free space at next_out */
    uLong    total_out; /* total number of bytes output so far */

    z_const char *msg;  /* last error message, NULL if no error */
    struct internal_state FAR *state; /* not visible by applications */

    alloc_func zalloc;  /* used to allocate the internal state */
    free_func  zfree;   /* used to free the internal state */
    voidpf     opaque;  /* private data object passed to zalloc and zfree */

    int     data_type;  /* best guess about the data type: binary or text */
    uLong   adler;      /* adler32 value of the uncompressed data */
    uLong   reserved;   /* reserved for future use */
} z_stream;

相关常量（均为宏定义）：

                        /* constants */

/*
 * Constants excerpted from zlib.h.  Note the layout: the comment describing
 * each group of #defines comes AFTER the last #define of that group.
 */
#define Z_NO_FLUSH      0
#define Z_PARTIAL_FLUSH 1
#define Z_SYNC_FLUSH    2
#define Z_FULL_FLUSH    3
#define Z_FINISH        4
#define Z_BLOCK         5
#define Z_TREES         6
/* Allowed flush values; see deflate() and inflate() below for details */

#define Z_OK            0
#define Z_STREAM_END    1
#define Z_NEED_DICT     2
#define Z_ERRNO        (-1)
#define Z_STREAM_ERROR (-2)
#define Z_DATA_ERROR   (-3)
#define Z_MEM_ERROR    (-4)
#define Z_BUF_ERROR    (-5)
#define Z_VERSION_ERROR (-6)
/* Return codes for the compression/decompression functions. Negative values
 * are errors, positive values are used for special but normal events.
 */

#define Z_NO_COMPRESSION         0
#define Z_BEST_SPEED             1
#define Z_BEST_COMPRESSION       9
#define Z_DEFAULT_COMPRESSION  (-1)
/* compression levels */

#define Z_FILTERED            1
#define Z_HUFFMAN_ONLY        2
#define Z_RLE                 3
#define Z_FIXED               4
#define Z_DEFAULT_STRATEGY    0
/* compression strategy; see deflateInit2() below for details */

#define Z_BINARY   0
#define Z_TEXT     1
#define Z_ASCII    Z_TEXT   /* for compatibility with 1.2.2 and earlier */
#define Z_UNKNOWN  2
/* Possible values of the data_type field (though see inflate()) */

#define Z_DEFLATED   8
/* The deflate compression method (the only one supported in this version) */

#define Z_NULL  0  /* for initializing zalloc, zfree, opaque */

ZLIB中给定了四种类型的数据压缩/解压缩函数，分别可以满足不同的要求。

1. deflate/inflate 提供了最基本的加解压缩的功能，不过需要对其流程进行控制，否则不方便处理。

2. 高级接口函数，这部分函数只是在少数程序里面会用到。

3. compress/uncompress 提供了数据全部在内存缓冲区中时的压缩/解压缩功能。它们封装了deflate/inflate，调用非常方便，不需要处理复杂的流程控制。缺点也显而易见，需要把所有数据都放在内存中。

4. gzip格式文件相关操作功能函数。

本文主要是研究第一种类型函数的处理及其流程的控制，如何让加解压缩这个状态机正常运行，并输出正确结果。

主要是用到的函数原型如下：

ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));	// initialize a stream for compression
ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));	// compress
ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));	// release the resources held by a compression stream

ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));	// initialize a stream for decompression
ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));	// decompress
ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));	// release the resources held by a decompression stream

压缩过程：

1. 初始化压缩结构体：调用deflateInit()初始化strm。

2. 配置strm.avail_in、strm.next_in、strm.avail_out、strm.next_out四个值，然后调用deflate进行压缩。需要注意的是，每次调用deflate之前，strm.avail_out需要大于0，即每次调用都需要保证有足够的输出缓冲区，其长度最好等于输入缓冲区的长度。如果没有更多的输入，则flush值为Z_FINISH，否则为Z_NO_FLUSH。调用之后，需要把输出的数据拷贝到其他的缓冲区中去，空出strm.next_out指定的缓冲区，以便输出更多的压缩后数据。

3. 如果待压缩的数据比较多，重复进行第二步操作。直到所有数据压缩完成。

解压过程，类似于压缩过程，只是在每次解压缩时，传入的flush值为Z_NO_FLUSH。

直接给出一个DEMO的代码，函数实现的功能类似于compress/uncompress，但是与compress/uncompress的内部实现不一样，此demo给出了详细的流程控制逻辑。

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "zlib.h"

#define BUFFER_SIZE 1024

/*
 * DEBUG_INFO(tag, index, last) - optional trace of the streaming state.
 *
 * Prints the caller's locals `ret`, `s_index`, `s_last`, `d_index`, `d_last`
 * and `strm.avail_in` / `strm.avail_out` to stderr, prefixed with `tag`.
 * The macro picks those variables up BY NAME from the calling scope; the
 * `index` and `last` arguments are accepted only to keep existing call sites
 * compiling and are not evaluated.
 *
 * Tracing is off by default; build with -DDEBUG_INFO_ENABLED=1 to turn it on.
 * The body is wrapped in do { } while (0) so the macro behaves like a single
 * statement (safe in an unbraced if/else).
 */
#ifndef DEBUG_INFO_ENABLED
#define DEBUG_INFO_ENABLED 0
#endif

#define DEBUG_INFO(tag,index,last) do{\
    if(DEBUG_INFO_ENABLED){\
    fprintf(stderr,"%s ret:[%d] s_index:[%d] s_last:[%d] d_index:[%d] d_last:[%d] avail_in:[%u] avail_out:[%u]\n",(tag),ret,s_index,s_last,d_index,d_last,(unsigned)strm.avail_in,(unsigned)strm.avail_out);\
    }\
}while(0)


int zlib_compress(char *s_buf,int *s_len,char *d_buf,int *d_len){
    int ret = 0;
    char s_in[BUFFER_SIZE];
    char d_out[BUFFER_SIZE];
    int s_index,d_index,s_last,d_last,flush;
    z_stream strm;
    memset(&strm,0,sizeof(z_stream));
    strm.zalloc    = Z_NULL;
    strm.zfree     = Z_NULL;
    strm.opaque    = Z_NULL;
    strm.avail_in  = 0;//BUFFER_SIZE;
    strm.avail_out = BUFFER_SIZE;
    strm.next_in   = s_in;
    strm.next_out  = d_out;
    s_index = d_index = 0;
    int level = 6;
    ret = deflateInit(&strm,level);
    if(ret != Z_OK){
	deflateEnd(&strm);
	return -1;
    }
    do{
	if(s_index < (*s_len) && strm.avail_in == 0){
	    s_last = (*s_len)-s_index;
	    s_last = (s_last > BUFFER_SIZE)?BUFFER_SIZE:s_last;
	    memset(s_in,0,BUFFER_SIZE);
	    memcpy(s_in,(s_buf+s_index),s_last);
	    s_index = s_index + s_last;
            strm.next_in = s_in;
	    strm.avail_in = s_last;
	    DEBUG_INFO("compress",s_index,s_last);
	}
        flush = (s_index < (*s_len))?Z_NO_FLUSH : Z_FINISH;
	do{
	    ret = deflate(&strm,flush);
	    d_last = BUFFER_SIZE - strm.avail_out;
	    if((d_index + d_last) > (*d_len)){
	    	ret = -1;break;
	    }
	    memcpy((d_buf+d_index),d_out,d_last);
	    d_index = d_index + d_last;
	    strm.next_out = d_out;
	    strm.avail_out = BUFFER_SIZE;
	    DEBUG_INFO("compress",s_index,s_last);
	}while(d_last == BUFFER_SIZE);	
    }while(flush != Z_FINISH);
    deflateEnd(&strm);
    if(ret != Z_STREAM_END){
	ret = -1;
    }else{
	ret = 0;
    }
    *d_len = d_index;
    return ret;
}

int zlib_decompress(char *s_buf,int *s_len,char *d_buf,int *d_len){
    int ret = 0;
    char s_in[BUFFER_SIZE];
    char d_out[BUFFER_SIZE];
    int s_index,d_index,s_last,d_last;
    z_stream strm;
    memset(&strm,0,sizeof(z_stream));
    strm.zalloc    = Z_NULL;
    strm.zfree     = Z_NULL;
    strm.opaque    = Z_NULL;
    strm.avail_in  = 0;//BUFFER_SIZE;
    strm.avail_out = BUFFER_SIZE;
    strm.next_in   = s_in;
    strm.next_out  = d_out;
    s_index = d_index = 0;
    int level = 6;
    ret = inflateInit(&strm);
    if(ret != Z_OK){
	inflateEnd(&strm);
	return -1;
    }
    do{
	if(s_index < (*s_len) && strm.avail_in == 0){
	    s_last = (*s_len)-s_index;
	    s_last = (s_last > BUFFER_SIZE)?BUFFER_SIZE:s_last;
	    memset(s_in,0,BUFFER_SIZE);
	    memcpy(s_in,(s_buf+s_index),s_last);
	    s_index = s_index + s_last;
            strm.next_in = s_in;
	    strm.avail_in = s_last;
	    DEBUG_INFO("decompress",s_index,s_last);
	}
	do{
	    ret = inflate(&strm,Z_NO_FLUSH);
	    //printf("inflate ret[%d]\n",ret);
	    switch(ret){
		case Z_NEED_DICT:
		    fprintf(stderr,"z_need_dict err [%d]\n",Z_NEED_DICT);
		    ret = Z_DATA_ERROR;
	        case Z_DATA_ERROR:
		    fprintf(stderr,"z_data_error [%d]\n",Z_DATA_ERROR);
		case Z_MEM_ERROR:
		    fprintf(stderr,"z_mem_error [%d]\n",Z_MEM_ERROR);
		    inflateEnd(&strm);
		    return ret;
            }
	    d_last = BUFFER_SIZE - strm.avail_out;
	    if((d_index + d_last) > (*d_len)){
	    	ret = -1;break;
	    }
	    memcpy((d_buf+d_index),d_out,d_last);
	    d_index = d_index + d_last;
	    strm.next_out = d_out;
	    strm.avail_out = BUFFER_SIZE;
	    DEBUG_INFO("decompress",d_index,d_last);
	}while(d_last == BUFFER_SIZE);	
    }while((s_index < (*s_len))&&(ret != Z_STREAM_END));
    inflateEnd(&strm);
    if(ret != Z_STREAM_END){
	ret = -1;
    }else{
	ret = 0;
    }
    *d_len = d_index;    
    return ret;
}

#include <unistd.h>
#define BUF_SIZE 10240
int main(int argc,char ** argv){
    int ret = 0; 
    
    char file_path[256];
    memset(file_path,0,256);
    int i,ch;
    opterr=0;
    while((ch=getopt(argc,argv,"f:"))!=-1){
	switch(ch){
	    case 'f':
		strncpy(file_path,optarg,255);
		break;
	    default:
		fprintf(stderr,"err params\n");
	}
    }
    int buf_len = BUF_SIZE;
    char *buf1=NULL;
    char *buf2=NULL;
    char *buf3=NULL;
    if(strlen(file_path) >0){
	FILE *fp = fopen(file_path,"rb");
	fseek(fp,0,SEEK_END);
	buf_len = (int)ftell(fp);
	fseek(fp,0,SEEK_SET);
	buf1 = (char *)malloc(buf_len + 1);
	buf2 = (char *)malloc(buf_len + 1);
	buf3 = (char *)malloc(buf_len + 1);
	memset(buf1,0,buf_len + 1);
	ret = fread(buf1,1,buf_len,fp);
	fclose(fp);
	fprintf(stderr,"file:[%s] len:[%d] read:[%d]\n",file_path,buf_len,ret);	
    }else{
	buf1 = (char *)malloc(buf_len+1);
	buf2 = (char *)malloc(buf_len+1);
	buf3 = (char *)malloc(buf_len+1);
	for(i = 0;i< buf_len;i++){
	    buf1[i] = i%256;
	}
    }
    //memset(buf1,0,BUF_SIZE+1);
    memset(buf2,0,buf_len+1);
    memset(buf3,0,buf_len+1);
    int j = 0;
    int len1,len2,len3;
    for(j = 0; j < 10 ;j++){
    len1 = len2 = len3 = buf_len;
    ret = zlib_compress(buf1,&len1,buf2,&len2);
    printf("compress: ret[%d] s_len[%d] d_len[%d]\n",ret,len1,len2);    
    ret = zlib_decompress(buf2,&len2,buf3,&len3);
    printf("decompress: ret[%d] s_len[%d] d_len[%d]\n",ret,len2,len3);
    for(i = 0;i< len3;i++){
	if(buf1[i] != buf3[i]){
	    printf("test error! index[%d]\n",i);
	    return 0;
	}
    } 
    printf("test success!\n");
    printf("pre compress:[%d] next compress:[%d] rate:[%f] multiple:[%f]\n",len1,len2,(double)len2/len1,(double)len1/len2);
    }
    FILE *fp = fopen("./out.zlib","wb");
    fwrite(buf2,1,len2,fp);
    fclose(fp);
    fprintf(stderr,"write to out.zlib success\n");
    sleep(30);
    return 0;
}

个人觉得ZLIB中的状态模型没有BZIP2的严格及复杂，各个状态相对来说比较简单，而BZIP2的则显得比较复杂。比较难以理解。

试着比较ZLIB与BZIP2的压缩速度。对于同一个文件，感觉ZLIB处理速度快很多，而且内存占用方面也相对少，数据压缩比相差不太大。目前尚不清楚这种结果是否与压缩算法对数据本身特性有特别的效果有关。

ZLIB官方还给出了多个语言对应的ZLIB版本代码，目前还没来得及进行跨语言，跨平台测试，其相互间的兼容尚未测试过。