"数据同步算法研究"一文提出了一种改进的数据同步算法。我在已实现的原型系统基础上,将文件切分、差异编码、文件同步等关键算法抽取出来,封装成动态开发库libsync,方便自己的开发应用。在本人开发的deduputil、WSIO、wsync等软件中,均使用了libsync动态函数库,现已将libsync发布至Google Code。libsync函数库提供三个API,原型描述如下:
1、int file_chunk(char *src_filename, char *chunk_filename, int chunk_algo)
功能:对文件进行切分,生成分块描述文件。
参数:src_filename为待切分文件,chunk_filename为生成的块信息描述文件,chunk_algo为文件切分算法,目前支持定长切分FSP(fixed-size partition)、CDC切分(content-defined chunking)和滑动块切分(sliding block)三种。
2、int file_delta(char *src_filename, char *chunk_filename, char *delta_filename, int chunk_algo)
功能:使用生成的块描述信息对文件进行差异编码。
参数:src_filename为待编码文件,chunk_filename为通过函数file_chunk生成的块描述文件,delta_filename为生成的差异编码文件,chunk_algo为文件切分算法。
3、int file_sync(char *src_filename, char *delta_filename)
功能:使用差异编码文件将源文件同步至目标文件。
参数:src_filename为基本文件,delta_filename为通过函数file_delta生成的差异编码文件。
- /*
- Copyright (C) 2010, Aigui Liu
- Filename: sync.h
- Author: Aigui.Liu@gmail.com
- */
/*
 * Public interface of libsync: file chunking, delta encoding and file
 * synchronization, plus the record layouts used by the chunk description
 * file and the delta file.
 *
 * The include guard is SYNC_H rather than _SYNC_H because identifiers that
 * begin with an underscore followed by an uppercase letter are reserved for
 * the implementation.
 */
#ifndef SYNC_H
#define SYNC_H

#include <stdint.h>

/*
 * Tunable sizes. Their exact roles are defined by the library sources, which
 * are not part of this header; the notes below are inferred from the names
 * and should be confirmed against the implementation.
 */
#define BLOCK_SZ            2048    /* nominal block size */
#define BLOCK_MIN_SZ        1024    /* presumably the smallest block allowed */
#define BLOCK_MAX_SZ        4096    /* presumably the largest block allowed */
#define BLOCK_WIN_SZ        48      /* presumably the rolling-window width */
#define NAME_MAX_SZ         256     /* presumably the file-name buffer size */
#define BUF_MAX_SZ          32768   /* presumably the I/O buffer size */
#define HASHTABLE_BUCKET_SZ 1024    /* presumably the hash-table bucket count */
#define CHUNK_CDC_D         BLOCK_SZ /* CDC parameter "D"; same value as BLOCK_SZ */
#define CHUNK_CDC_R         13       /* CDC parameter "R" -- TODO confirm meaning */

/* Chunking algorithm selector, passed as the chunk_algo argument. */
enum chunk_algo {
    CHUNK_FSP = 0,  /* fixed-size partition */
    CHUNK_CDC,      /* content-defined chunking */
    CHUNK_SBC       /* sliding block chunking */
};

/*
 * Chunk file header and block entry.
 *
 * NOTE(review): if these structs are written to disk verbatim, their layout
 * depends on compiler padding -- confirm before exchanging files between
 * different platforms.
 */
typedef struct _chunk_file_header {
    uint32_t block_sz;  /* block size */
    uint32_t block_nr;  /* number of block entries */
} chunk_file_header;
#define CHUNK_FILE_HEADER_SZ (sizeof(chunk_file_header))

typedef struct _chunk_block_entry {
    uint64_t offset;        /* block offset */
    uint32_t len;           /* block length */
    uint8_t md5[16 + 1];    /* 16 bytes plus one extra byte (presumably a terminator) */
    uint8_t csum[10 + 1];   /* 10 bytes plus one extra byte (presumably a terminator) */
} chunk_block_entry;
#define CHUNK_BLOCK_ENTRY_SZ (sizeof(chunk_block_entry))

/* Delta file header and block entry. */
typedef struct _delta_file_header {
    uint32_t block_nr;          /* number of block entries */
    uint32_t last_block_sz;     /* size of the last block */
    uint64_t last_block_offset; /* offset in delta file */
} delta_file_header;
#define DELTA_FILE_HEADER_SZ (sizeof(delta_file_header))

typedef struct _delta_block_entry {
    uint64_t offset;    /* block offset */
    uint32_t len;       /* block length */
    uint8_t embeded;    /* 1, block in delta file; 0, block in source file. */
} delta_block_entry;
#define DELTA_BLOCK_ENTRY_SZ (sizeof(delta_block_entry))

/*
 * Split src_filename into blocks using chunk_algo (an enum chunk_algo value)
 * and write the block description to chunk_filename.
 * Callers treat a return value of -1 as failure.
 */
int file_chunk(char *src_filename, char *chunk_filename, int chunk_algo);

/*
 * Delta-encode src_filename against the block description in chunk_filename
 * (produced by file_chunk) and write the result to delta_filename.
 * Callers treat a return value of -1 as failure.
 */
int file_delta(char *src_filename, char *chunk_filename, char *delta_filename, int chunk_algo);

/*
 * Synchronize src_filename using the delta file produced by file_delta.
 * Callers treat a return value of -1 as failure.
 */
int file_sync(char *src_filename, char *delta_filename);

#endif /* SYNC_H */
数据同步有PULL和PUSH两种应用模式,PULL是将远程数据同步到本地,而PUSH是将本地数据同步到远程。对应到同步算法,主要区别在于数据分块和差异编码位置不同。PULL和PUSH同步模式步骤分别如下所述。
PULL同步模式流程:
1、本地对文件A进行数据切分,生成数据块描述文件chunk;
2、上传chunk文件至远程服务器;
3、远程服务器对文件B进行差异编码,生成差异编码文件delta;
4、下载delta文件至本地;
5、本地使用delta文件将文件A同步为文件B的内容,相当于下载文件B到本地文件A。
例子程序:
- /*
- Copyright (c) 2010, Aigui Liu
- All rights reserved.
- Filename: wspull.c
- Author: Aigui.Liu@gmail.com
- */
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <fcntl.h>
- #include <unistd.h>
- #include "wsioclient.h"
- #include "wsio_api.h"
- #include "sync.h"
- /* main function */
- int main(int argc, char **argv)
- {
- char fap[32] = {0};
- char server[256] = {0};
- char *local_filename = NULL;
- char remote_filename[256] = {0};
- char local_chunkname[256] = {0};
- char remote_chunkname[256] = {0};
- char local_deltaname[256] = {0};
- char remote_deltaname[256] = {0};
- int ret = 0;
- if (argc < 3) {
- fprintf(stderr, "Usage: wspull remote_filename local_filename/n");
- exit(0);
- }
- if(argv[1]) {
- if(parseURL(argv[1], fap, server, remote_filename) == -1) {
- fprintf(stderr, "wsio address is illegal/n");
- exit(1);
- }
- }
- wsio_init(fap, server);
- local_filename = argv[2];
- if (0 == access(local_filename, F_OK)) {
- sprintf(local_chunkname, "%s_%d", tmpnam(NULL), getpid());
- sprintf(remote_chunkname, "%s_%d", tmpnam(NULL), getpid());
- sprintf(local_deltaname, "%s_%d", tmpnam(NULL), getpid());
- sprintf(remote_deltaname, "%s_%d", tmpnam(NULL), getpid());
- ret = file_chunk(local_filename, local_chunkname, CHUNK_CDC);
- if (ret == -1) {
- fprintf(stderr, "local file chunking failed/n");
- return -1;
- }
- ret = wsio_putfile(local_chunkname, remote_chunkname);
- if (ret == -1) {
- fprintf(stderr, "upload chunking file failed/n");
- return -1;
- }
- ret = wsio_deltafile(remote_filename, remote_chunkname, remote_deltaname, CHUNK_CDC);
- if (ret == -1) {
- fprintf(stderr, "remote file delta coding failed/n");
- return -1;
- }
- ret = wsio_getfile(remote_deltaname, local_deltaname);
- if (ret == -1) {
- fprintf(stderr, "download delta file failed/n");
- }
- ret = file_sync(local_filename, local_deltaname);
- if (ret == -1) {
- fprintf(stderr, "local file sync failed/n");
- return -1;
- }
- } else {
- ret = wsio_getfile(remote_filename, local_filename);
- if (ret == -1) {
- fprintf(stderr, "download file failed/n");
- return -1;
- }
- }
- wsio_unlink(remote_chunkname);
- wsio_unlink(remote_deltaname);
- unlink(local_chunkname);
- unlink(local_deltaname);
- wsio_destroy();
- fprintf(stderr, "pull %s to %s %s/n", argv[1], local_filename, (ret == 0)? "sucessfully" : "failed");
- return 0;
- }
PUSH同步模式流程:
1、远程服务器对文件B进行数据切分,生成数据块描述文件chunk;
2、下载chunk文件至本地;
3、本地对文件A进行差异编码,生成差异编码文件delta;
4、上传delta文件至远程服务器;
5、远程服务器使用delta文件将文件B同步为文件A的内容,相当于上传文件A到远程文件B。
例子程序:
- /*
- Copyright (c) 2010, Aigui Liu
- All rights reserved.
- Filename: wspush.c
- Author: Aigui.Liu@gmail.com
- */
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <fcntl.h>
- #include <unistd.h>
- #include "wsioclient.h"
- #include "wsio_api.h"
- #include "sync.h"
- /* main function */
- int main(int argc, char **argv)
- {
- char fap[32] = {0};
- char server[256] = {0};
- char remote_filename[256] = {0};
- char *local_filename = NULL;
- char local_chunkname[256] = {0};
- char remote_chunkname[256] = {0};
- char local_deltaname[256] = {0};
- char remote_deltaname[256] = {0};
- int ret = 0;
- if (argc < 3) {
- fprintf(stderr, "Usage: wspush local_filename remote_filename/n");
- exit(0);
- }
- if(argv[2]) {
- if(parseURL(argv[2], fap, server, remote_filename) == -1) {
- fprintf(stderr, "wsio address is illegal/n");
- exit(1);
- }
- }
- wsio_init(fap, server);
- local_filename = argv[1];
- if (0 == wsio_access(remote_filename, F_OK)) {
- sprintf(local_chunkname, "%s_%d", tmpnam(NULL), getpid());
- sprintf(remote_chunkname, "%s_%d", tmpnam(NULL), getpid());
- sprintf(local_deltaname, "%s_%d", tmpnam(NULL), getpid());
- sprintf(remote_deltaname, "%s_%d", tmpnam(NULL), getpid());
- ret = wsio_chunkfile(remote_filename, remote_chunkname, CHUNK_CDC);
- if (ret == -1) {
- fprintf(stderr, "remote file chunking failed/n");
- return -1;
- }
- ret = wsio_getfile(remote_chunkname, local_chunkname);
- if (ret == -1) {
- fprintf(stderr, "download chunking file failed/n");
- return -1;
- }
- ret = file_delta(local_filename, local_chunkname, local_deltaname, CHUNK_CDC);
- if (ret == -1) {
- fprintf(stderr, "local file delta coding failed/n");
- return -1;
- }
- ret = wsio_putfile(local_deltaname, remote_deltaname);
- if (ret == -1) {
- fprintf(stderr, "upload delta file failed/n");
- }
- ret = wsio_syncfile(remote_filename, remote_deltaname);
- if (ret == -1) {
- fprintf(stderr, "remote file sync failed/n");
- return -1;
- }
- } else {
- ret = wsio_putfile(local_filename, remote_filename);
- if (ret == -1) {
- fprintf(stderr, "upload file failed/n");
- return -1;
- }
- }
- wsio_unlink(remote_chunkname);
- wsio_unlink(remote_deltaname);
- unlink(local_chunkname);
- unlink(local_deltaname);
- wsio_destroy();
- fprintf(stderr, "push %s to %s %s/n", local_filename, remote_filename, (ret == 0)? "sucessfully" : "failed");
- return 0;
- }