BeansDB源码剖析——bitcask.c

BeansDB 是一款高可用的分布式键值存储系统,支持高效的数据读写。本文详细介绍 BeansDB 核心组件 Bitcask 的实现原理,包括数据文件管理、树结构(HTree)维护以及数据读写流程等关键技术点。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

/*
*  Beansdb - A high available distributed key-value storage system:
*
*      http://beansdb.googlecode.com
*
*  Copyright 2010 Douban Inc.  All rights reserved.
*
*  Use and distribution licensed under the BSD license.  See
*  the LICENSE file for full text.
*
*  Authors:
*      Davies Liu <davies.liu@gmail.com>
*
*/

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <math.h>
#include <time.h>

#include "bitcask.h"
#include "htree.h"
#include "record.h"

/* Upper bound on the number of data files (buckets) one bitcask may hold;
 * bc_open() scans at most this many and bc_flush() aborts once it is reached. */
#define MAX_BUCKET_COUNT 256

/* Largest value length accepted by bc_set(). */
const uint32_t MAX_RECORD_SIZE = 50 * 1024 * 1024; // 50M
/* Once the active data file could grow past this, bc_flush() starts a new bucket. */
const uint32_t MAX_BUCKET_SIZE = (uint32_t)1024 * 1024 * 1024 * 2; // 2G
/* bc_flush() stops doubling the write buffer once it has reached this size. */
const uint32_t WRITE_BUFFER_SIZE = 1024 * 1024 * 4; // 4M

/* printf-style path templates: bitcask directory, then zero-padded bucket number.
 * NOTE(review): the two NEW_* templates are not referenced in the code visible
 * here; presumably they are used by the optimizer in record.c -- confirm. */
const char DATA_FILE[] = "%s/%03d.data";
const char HINT_FILE[] = "%s/%03d.hint.qlz";
const char NEW_DATA_FILE[] = "%s/%03d.data.new";
const char NEW_HINT_FILE[] = "%s/%03d.hint.new.qlz";

/* In-memory state of one open bitcask directory. */
struct bitcask_t {
	char*  path; // directory holding the data and hint files (heap copy owned by the bitcask)
	int    depth; // depth handed to ht_new() for every tree of this bitcask
	HTree* tree; // index over the records of all buckets; much larger than curr_tree
	int    curr; // number of the active bucket; all lower-numbered buckets are already complete data files
	HTree* curr_tree; // index of the active bucket only (exactly one exists at a time)
	// write_buffer stages records for the active data file; it is flushed when it fills up
	char   *write_buffer; // staging buffer
	int    wbuf_size; // allocated size of write_buffer
	int    wbuf_start_pos; // file offset that write_buffer[0] corresponds to; the buffer is smaller than the file,
	// so this is the current end of the active data file
	int    wbuf_curr_pos; // number of valid bytes in write_buffer
	/*
	Relation to Item.pos:
	pos = item->pos & 0xffffff00 is the record's offset inside its data file,
	and wbuf_start_pos is the file offset of the start of write_buffer, so
	bc->write_buffer + pos - bc->wbuf_start_pos is the address of the record
	inside write_buffer (only meaningful when the record lives in the active
	bucket and has not been flushed yet).
	*/
	pthread_mutex_t flush_lock; // held for the whole body of bc_flush()
	pthread_mutex_t buffer_lock; // taken around updates of write_buffer and the wbuf_* fields
	pthread_mutex_t write_lock; // taken by bc_set(), bc_close() and the index update in bc_optimize()
};

/*
 * Open the bitcask stored in directory `path`, creating the directory if it
 * does not exist. A bitcask holds at most MAX_BUCKET_COUNT data files; each
 * data file is called a bucket.
 *
 * Steps:
 *  1. Allocate and initialise the Bitcask object.
 *  2. Walk the consecutive data files 000, 001, ... and load each of them into
 *     bc->tree, from its hint file when one exists and from the data file
 *     itself otherwise.
 *  3. Set bc->curr to the number of the first missing data file, which
 *     becomes the active bucket.
 *
 * path   - bitcask directory; must not be NULL.
 * depth  - depth passed to ht_new(); values above 4 are rejected.
 * before - 0 loads everything. Otherwise the hint file is only used when it
 *          exists and either the hint file or the data file was last modified
 *          before `before`; in every other case scanDataFileBefore() is used.
 *          NOTE(review): presumably that helper keeps only records older than
 *          `before` -- confirm in record.c.
 *
 * Returns the new Bitcask, or NULL on invalid arguments, mkdir failure,
 * allocation failure, or when `path` is too long for the internal buffers.
 */
Bitcask* bc_open(const char *path, int depth, time_t before)
{
	if (path == NULL || depth > 4) return NULL;
	if (0 != access(path, F_OK) && 0 != mkdir(path, 0750)){
		fprintf(stderr, "mkdir %s failed\n", path);
		return NULL;
	}

	// 1. calloc gives the same all-zero start state as malloc + memset
	Bitcask* bc = calloc(1, sizeof(Bitcask));
	if (bc == NULL) {
		fprintf(stderr, "out of memory while opening %s\n", path);
		return NULL;
	}
	// initialise the locks first so the failure path can always destroy them
	pthread_mutex_init(&bc->buffer_lock, NULL);
	pthread_mutex_init(&bc->write_lock, NULL);
	pthread_mutex_init(&bc->flush_lock, NULL);

	bc->path = strdup(path);
	bc->depth = depth;
	bc->tree = ht_new(depth);
	bc->curr_tree = ht_new(depth);
	bc->wbuf_size = 1024 * 4;
	bc->write_buffer = malloc(bc->wbuf_size);
	if (bc->path == NULL || bc->write_buffer == NULL) {
		fprintf(stderr, "out of memory while opening %s\n", path);
		goto OPEN_FAIL;
	}

	// 2
	char datapath[255], hintpath[255];
	int i=0;
	for (i=0; i<MAX_BUCKET_COUNT; i++) {
		int dlen = snprintf(datapath, sizeof(datapath), DATA_FILE, path, i);
		int hlen = snprintf(hintpath, sizeof(hintpath), HINT_FILE, path, i);
		if (dlen < 0 || dlen >= (int)sizeof(datapath)
			|| hlen < 0 || hlen >= (int)sizeof(hintpath)) {
			// a truncated name would silently address the wrong file
			fprintf(stderr, "path %s is too long\n", path);
			goto OPEN_FAIL;
		}

		// buckets are numbered consecutively: the first missing data file ends the scan
		FILE* f = fopen(datapath, "rb");
		if (NULL == f) break;
		fclose(f);

		struct stat st;
		if (before == 0){
			if (0 == lstat(hintpath, &st)){
				// normal start-up: rebuild the index from the hint file
				scanHintFile(bc->tree, i, hintpath, NULL);
			}else{
				// no hint file: scan the data file, passing the hint path along
				// (presumably so the helper can write the missing hint file)
				scanDataFile(bc->tree, i, datapath, hintpath);
			}
		}else{
			// note: the second lstat() overwrites st with the data file's metadata
			if (0 == lstat(hintpath, &st) &&
				(st.st_mtime < before
					|| (0 == lstat(datapath, &st) && st.st_mtime < before))){
				scanHintFile(bc->tree, i, hintpath, NULL);
			}else{
				scanDataFileBefore(bc->tree, i, datapath, before);
			}
		}
	}
	// 3
	bc->curr = i;
	//    ht_optimize(bc->tree);

	return bc;

OPEN_FAIL:
	pthread_mutex_destroy(&bc->buffer_lock);
	pthread_mutex_destroy(&bc->write_lock);
	pthread_mutex_destroy(&bc->flush_lock);
	if (bc->tree != NULL) ht_destroy(bc->tree);
	if (bc->curr_tree != NULL) ht_destroy(bc->curr_tree);
	free(bc->write_buffer);
	free(bc->path);
	free(bc);
	return NULL;
}

/*
 * bc_close() is not thread safe, should stop other threads before call it.
 *
 * Steps:
 *  1. Flush write_buffer into the active data file.
 *  2. Write the hint file for the active bucket from bc->curr_tree.
 *  3. Destroy bc->tree.
 *  4. Release the locks and every remaining allocation, including `bc`
 *     itself; `bc` must not be used after this call.
 */
void bc_close(Bitcask *bc)
{
	pthread_mutex_lock(&bc->write_lock);

	// 1
	bc_flush(bc, 0);

	// 2
	if (NULL != bc->curr_tree) {
		char buf[255];
		int len = snprintf(buf, sizeof(buf), HINT_FILE, bc->path, bc->curr);
		if (len < 0 || len >= (int)sizeof(buf)) {
			// never write a hint file under a truncated name
			fprintf(stderr, "hint path for %s is too long\n", bc->path);
			ht_destroy(bc->curr_tree);
		} else {
			// NOTE(review): no caller in this file destroys a tree after
			// build_hint(), so it presumably takes ownership -- confirm.
			build_hint(bc->curr_tree, buf);
		}
		bc->curr_tree = NULL;
	}
	bc->curr = 0;
	// 3
	ht_destroy(bc->tree);
	// 4. a mutex must be unlocked before it is destroyed
	pthread_mutex_unlock(&bc->write_lock);
	pthread_mutex_destroy(&bc->write_lock);
	pthread_mutex_destroy(&bc->buffer_lock);
	pthread_mutex_destroy(&bc->flush_lock);
	free(bc->path);
	free(bc->write_buffer);
	free(bc);
}

/*
 * ht_visit() callback used by bc_optimize(): copy the position of `it` into
 * the tree passed through `args`.
 *
 * The entry is only rewritten when the position changed but the bucket number
 * (low 8 bits of pos) is the same in both trees. A differing bucket means the
 * key also exists in another data file, and then the target tree's entry wins.
 */
void update_items(Item *it, void *args)
{
	HTree *target = args;
	Item *known = ht_get(target, it->name);
	if (known == NULL) {
		fprintf(stderr, "Bug, item missed after optimized\n");
		return;
	}

	const int moved = (it->pos != known->pos);
	const int same_bucket = ((it->pos & 0xff) == (known->pos & 0xff));
	if (moved && same_bucket) {
		// keep hash and version of the existing entry, take only the new position
		ht_add(target, known->name, it->pos, known->hash, known->ver);
	}
	// ht_get() handed us a heap copy
	free(known);
}

/*
 * Compact the completed data files of a bitcask.
 *
 * Over time keys get overwritten or deleted, so older data files accumulate
 * records that bc->tree no longer points to. For every bucket below bc->curr:
 *  1. build the data and hint file names;
 *  2. call optimizeDataFile() (record.c), which decides whether the file is
 *     worth rewriting and returns NULL when nothing was done
 *     (NOTE(review): the exact meaning of `limit` is defined there -- confirm);
 *  3. otherwise it returns a tree holding the new record positions, which are
 *     merged back into bc->tree under write_lock;
 *  4. a fresh hint file is written from that tree.
 */
void bc_optimize(Bitcask *bc, int limit)
{
	int i;

	for (i=0; i < bc->curr; i++) {
		// 1
		char data[255], hint[255];
		int dlen = snprintf(data, sizeof(data), DATA_FILE, bc->path, i);
		int hlen = snprintf(hint, sizeof(hint), HINT_FILE, bc->path, i);
		if (dlen < 0 || dlen >= (int)sizeof(data)
			|| hlen < 0 || hlen >= (int)sizeof(hint)) {
			// a truncated name would make us rewrite the wrong file
			fprintf(stderr, "path %s is too long, skip bucket %d\n", bc->path, i);
			continue;
		}

		// 2
		HTree *cur_tree = optimizeDataFile(bc->tree, i, data, hint, limit);
		if (NULL == cur_tree) continue;

		// 3
		pthread_mutex_lock(&bc->write_lock);
		ht_visit(cur_tree, update_items, bc->tree);
		pthread_mutex_unlock(&bc->write_lock);

		// 4
		build_hint(cur_tree, hint);
	}
}

/*
 * Look up `key` and return its record, or NULL when the key is absent,
 * deleted, or unreadable. The caller releases the result with free_record().
 *
 * A value can live in one of three places:
 *  a. a completed data file,
 *  b. the active data file (already flushed),
 *  c. bc->write_buffer (not flushed yet).
 *
 * Steps:
 *  1. Find the key's Item in bc->tree.
 *  2. Split item->pos into bucket number (low 8 bits) and file offset
 *     (upper 24 bits).
 *  3. 3.1 If the record is in the active bucket at or beyond wbuf_start_pos,
 *         decode it straight from write_buffer.
 *     3.2 Otherwise (cases a and b alike) read it from the data file.
 *  4. If no valid record was obtained from the file, drop the key from
 *     bc->tree.
 */
DataRecord* bc_get(Bitcask *bc, const char* key)
{
	// 1
	Item *item = ht_get(bc->tree, key);
	if (NULL == item) return NULL;
	// a negative version marks a deleted item
	if (item->ver < 0){
		free(item);
		return NULL;
	}

	// 2
	int bucket = item->pos & 0xff;
	uint32_t pos = item->pos & 0xffffff00;
	if (bucket > bc->curr) {
		fprintf(stderr, "BUG: invalid bucket %d > %d\n", bucket, bc->curr);
		ht_remove(bc->tree, key);
		free(item);
		return NULL;
	}

	DataRecord* r = NULL;
	if (bucket == bc->curr) {
		pthread_mutex_lock(&bc->buffer_lock);
		// 3.1 re-check under the lock: a flush may have moved on in the meantime
		if (bucket == bc->curr && pos >= (uint32_t)bc->wbuf_start_pos){
			// offset of the record inside write_buffer
			int p = pos - bc->wbuf_start_pos;
			r = decode_record(bc->write_buffer + p, bc->wbuf_curr_pos - p);
		}
		pthread_mutex_unlock(&bc->buffer_lock);

		if (r != NULL){
			free(item);
			return r;
		}
	}

	// 3.2 the record is in an older bucket, or in the active one but already flushed
	char data[255];
	int len = snprintf(data, sizeof(data), DATA_FILE, bc->path, bucket);
	if (len < 0 || len >= (int)sizeof(data)) {
		// leave the index alone: the record may well be intact on disk
		fprintf(stderr, "path %s is too long\n", bc->path);
		free(item);
		return NULL;
	}
	FILE *f = fopen(data, "rb");
	if (NULL == f){
		goto GET_END;
	}

	if (0 != fseek(f, pos, SEEK_SET)){
		fprintf(stderr, "IOError: seek file %d to %u failed\n", bucket, pos);
		goto GET_END;
	}

	r = read_record(f, true);
	if (NULL == r){
		fprintf(stderr, "Bug: get %s failed in %s %d %u\n", key, bc->path, bucket, pos);
	}else{
		// the record found at that offset must carry the requested key
		if (strcmp(key, r->key) != 0){
			fprintf(stderr, "Bug: record %s is not expected %s\n", r->key, key);
			free_record(r);
			r = NULL;
		}
	}
GET_END:
	// 4
	if (NULL == r)
		ht_remove(bc->tree, key);
	if (f != NULL) fclose(f);
	free(item);
	return r;
}

/* Argument bundle handed to build_thread(), which frees `path` and the struct itself. */
struct build_thread_args {
	HTree *tree; // tree passed on to build_hint()
	char *path; // heap-allocated hint file path
};

/*
 * Thread entry point that writes one hint file.
 *
 * `param` is a heap-allocated struct build_thread_args. The path string and
 * the struct are released here once build_hint() has returned. Always returns
 * NULL.
 */
void* build_thread(void *param)
{
	struct build_thread_args *job = param;
	char *hint_path = job->path;

	build_hint(job->tree, hint_path);

	free(hint_path);
	free(job);
	return NULL;
}

/*
 * Write the contents of write_buffer to the active data file.
 *
 * limit - flush only when more than `limit` KiB are buffered (0 flushes any
 *         non-empty buffer).
 *
 * Data files have a size limit, so a flush may also finish the active bucket
 * and start a new one:
 *  1. Open the active data file and verify that its size equals
 *     wbuf_start_pos.
 *  2. Append the buffer to the file.
 *  3. If the write was partial, move the unwritten tail to the buffer start.
 *  4. Advance the buffer positions; grow the buffer while it is still below
 *     WRITE_BUFFER_SIZE.
 *  5. If the file could exceed MAX_BUCKET_SIZE:
 *     5.1 write whatever is still buffered,
 *     5.2 build the hint file for the finished bucket on a detached thread
 *         (synchronously if the thread cannot be started),
 *     5.3 move on to bucket curr+1 with a fresh curr_tree.
 *
 * Broken invariants and I/O failures are treated as fatal (exit(1)).
 *
 * NOTE(review): step 2 reads write_buffer without holding buffer_lock while
 * bc_set() may realloc it under that lock. This is only safe if every caller
 * of bc_flush() holds write_lock; callers outside this file are not visible
 * here -- confirm.
 */
void bc_flush(Bitcask *bc, int limit)
{
	if (bc->curr >= MAX_BUCKET_COUNT) {
		fprintf(stderr, "reach max bucket count\n");
		exit(1);
	}

	pthread_mutex_lock(&bc->flush_lock);
	if (bc->wbuf_curr_pos > limit * 1024) {
		// 1
		char buf[255];
		int len = snprintf(buf, sizeof(buf), DATA_FILE, bc->path, bc->curr);
		if (len < 0 || len >= (int)sizeof(buf)) {
			fprintf(stderr, "path %s is too long\n", bc->path);
			exit(1);
		}
		FILE *f = fopen(buf, "ab");
		if (f == NULL) {
			fprintf(stderr, "open file %s for flushing failed.\n", buf);
			exit(1);
		}
		// The initial position of an append-mode stream is implementation
		// defined, so move to the end explicitly before asking for the size.
		if (0 != fseek(f, 0, SEEK_END)) {
			fprintf(stderr, "seek to end of %s failed\n", buf);
			exit(1);
		}
		int last_pos = ftell(f);
		if (last_pos != bc->wbuf_start_pos) {
			fprintf(stderr, "last pos not match: %d != %d\n", last_pos, bc->wbuf_start_pos);
			exit(1);
		}

		// 2
		int n = (int)fwrite(bc->write_buffer, 1, bc->wbuf_curr_pos, f);

		pthread_mutex_lock(&bc->buffer_lock);
		// 3
		if (n < bc->wbuf_curr_pos) {
			memmove(bc->write_buffer, bc->write_buffer + n, bc->wbuf_curr_pos - n);
		}

		// 4
		bc->wbuf_start_pos += n;
		bc->wbuf_curr_pos -= n;
		if (bc->wbuf_curr_pos == 0 && bc->wbuf_size < WRITE_BUFFER_SIZE) {
			// The buffer is empty, so a fresh allocation avoids copying.
			// Growing is only an optimisation: keep the old buffer on failure.
			char *bigger = malloc((size_t)bc->wbuf_size * 2);
			if (bigger != NULL) {
				free(bc->write_buffer);
				bc->write_buffer = bigger;
				bc->wbuf_size *= 2;
			}
		}

		// 5. one more full buffer might not fit into this bucket any more
		if (bc->wbuf_start_pos + bc->wbuf_size > MAX_BUCKET_SIZE) {
			// 5.1
			if (bc->wbuf_curr_pos > 0) {
				if (fwrite(bc->write_buffer, 1, bc->wbuf_curr_pos, f) < (size_t)bc->wbuf_curr_pos){
					fprintf(stderr, "write to %s failed\n", buf);
					exit(1);
				}
			}
			// 5.2
			char hintpath[255];
			len = snprintf(hintpath, sizeof(hintpath), HINT_FILE, bc->path, bc->curr);
			if (len < 0 || len >= (int)sizeof(hintpath)) {
				fprintf(stderr, "path %s is too long\n", bc->path);
				exit(1);
			}
			struct build_thread_args *args = malloc(sizeof(*args));
			char *path_copy = strdup(hintpath);
			pthread_t build_ptid;
			int started = 0;
			if (args != NULL && path_copy != NULL) {
				// the thread takes over the finished bucket's tree and frees args
				args->tree = bc->curr_tree;
				args->path = path_copy;
				started = (0 == pthread_create(&build_ptid, NULL, build_thread, args));
			}
			if (started) {
				// nobody joins this thread; detach so its resources are reclaimed
				pthread_detach(build_ptid);
			} else {
				// no thread available: build the hint file right here instead
				build_hint(bc->curr_tree, hintpath);
				free(path_copy);
				free(args);
			}
			// 5.3 next bucket
			bc->curr ++;
			bc->curr_tree = ht_new(bc->depth);
			bc->wbuf_start_pos = 0;
			bc->wbuf_curr_pos = 0;
		}
		pthread_mutex_unlock(&bc->buffer_lock);

		// fclose() flushes the stdio buffer, so a failure here means lost data
		if (0 != fclose(f)) {
			fprintf(stderr, "close %s after flushing failed\n", buf);
		}
	}
	pthread_mutex_unlock(&bc->flush_lock);
}

/*
 * Store, replace, delete or synchronise one key. This is the central write
 * operation and also the mechanism behind replica sync.
 *
 * key     - NUL-terminated key.
 * value   - value bytes; not retained after the call returns.
 * vlen    - value length; must be 0 for deletes and at most MAX_RECORD_SIZE.
 * flag    - stored verbatim in the record.
 * version - 0: local write (replace or add); -1: local delete;
 *           any other value: sync write carrying an explicit version.
 *
 * Version rules:
 *  - every update increments the version;
 *  - a delete of a live key stores -(version + 1);
 *  - re-adding a deleted key stores -oldv + 1.
 * Example: two nodes add a key and delete it again (version -2). Node 1 goes
 * down and node 2 writes the key again, giving version 3. When they sync,
 * node 1 offers -2 against 3, sees that it is behind and catches up.
 *
 * Steps:
 *  1. Read the key's current version (oldv) from bc->tree.
 *  2. Derive the operation and the new version from version and oldv.
 *  3. Update both trees and the data file:
 *     3.1 identical value already stored: only the version in the trees is
 *         updated (for sync writes);
 *     3.2 otherwise a new record is appended to the active bucket, for
 *         deletes as well. The superseded record is removed later by
 *         bc_optimize().
 *
 * Returns true on success. Returns false for invalid arguments, a delete of
 * a missing key, a stale sync version, or an allocation/encoding failure.
 */
bool bc_set(Bitcask *bc, const char* key, char* value, int vlen, int flag, int version)
{
	if ((version < 0 && vlen > 0) || vlen < 0 || (uint32_t)vlen > MAX_RECORD_SIZE){
		fprintf(stderr, "invalid set cmd \n");
		return false;
	}

	bool suc = false;
	DataRecord *r = NULL;   // record being written; released at SET_END
	char *rbuf = NULL;      // encoded form of r; released at SET_END
	int klen = 0, rlen = 0, pos = 0;
	pthread_mutex_lock(&bc->write_lock);

	// 1
	int oldv = 0, ver = version;
	Item *it = ht_get(bc->tree, key);
	if (it != NULL) {
		oldv = it->ver;
	}

	// 2
	if (version == 0 && oldv > 0){ // replace
		ver = oldv + 1;
	} else if (version == 0 && oldv <= 0){ // add
		// Going from deleted (or unknown) back to live must move past the
		// tombstone's version, otherwise peers that still hold the tombstone
		// reject the new value as stale during sync. oldv == 0 still gives 1.
		ver = -oldv + 1;
	} else if (version < 0 && oldv <= 0) { // delete, but the key is missing or already deleted
		goto SET_END;
	} else if (version == -1) { // delete
		ver = - abs(oldv) - 1;
	} else if (abs(version) <= abs(oldv)) { // sync, stale
		// e.g. version  oldv
		//         5       8    the incoming write is not the newest one
		//        -5       8    the item it wanted to delete has been replaced
		goto SET_END;
	} else { // sync, newer
		// e.g. version  oldv
		//         8       5    update
		//         8      -5    insert
		//        -8       5    delete
		ver = version;
	}

	uint16_t hash = gen_hash(value, vlen);
	// deleted items carry hash 0
	if (ver < 0) hash = 0;

	// 3.1 same hash as the stored item: the value may be unchanged
	if (NULL != it && hash == it->hash) {
		DataRecord *old = bc_get(bc, key);
		if (old != NULL && old->flag == flag && vlen == old->vsz
			&& memcmp(value, old->value, vlen) == 0) {
			if (version != 0){
				ht_add(bc->tree, key, it->pos, it->hash, ver);
				// parentheses are required: == binds tighter than &
				if ((it->pos & 0xff) == bc->curr){
					if (bc->curr_tree == NULL) {
						fprintf(stderr, "BUG: curr_tree should not be NULL\n");
					}else{
						ht_add(bc->curr_tree, key, it->pos, it->hash, ver);
					}
				}
			}
			suc = true;
			free_record(old);
			goto SET_END;
		}
		// hash collision or different flag: a new record is written below
		if (old != NULL) free_record(old);
	}

	// 3.2 key unknown or value changed: append a new record (deletes too)
	klen = strlen(key);
	r = malloc(sizeof(DataRecord) + klen);
	if (r == NULL) {
		fprintf(stderr, "out of memory in bc_set\n");
		goto SET_END;
	}
	r->ksz = klen;
	memcpy(r->key, key, klen);
	r->vsz = vlen;
	r->value = value;
	r->free_value = false; // value belongs to the caller
	r->flag = flag;
	r->version = ver;
	r->tstamp = time(NULL);

	rbuf = encode_record(r, &rlen);
	// the low 8 bits of a position hold the bucket number, so every encoded
	// record length must be a multiple of 256
	if (rbuf == NULL || (rlen & 0xff) != 0){
		fprintf(stderr, "encode_record() failed with %d\n", rlen);
		goto SET_END;
	}

	pthread_mutex_lock(&bc->buffer_lock);
	// no room left for this record: flush first
	if (bc->wbuf_curr_pos + rlen > bc->wbuf_size) {
		pthread_mutex_unlock(&bc->buffer_lock);
		bc_flush(bc, 0);
		pthread_mutex_lock(&bc->buffer_lock);
	}
	// the record may be larger than the whole buffer
	while (bc->wbuf_curr_pos + rlen > bc->wbuf_size) {
		char *grown = realloc(bc->write_buffer, (size_t)bc->wbuf_size * 2);
		if (grown == NULL) {
			// the old buffer is still valid and untouched
			fprintf(stderr, "out of memory in bc_set\n");
			pthread_mutex_unlock(&bc->buffer_lock);
			goto SET_END;
		}
		bc->write_buffer = grown;
		bc->wbuf_size *= 2;
	}
	memcpy(bc->write_buffer + bc->wbuf_curr_pos, rbuf, rlen);

	// the flush above may have started a new bucket, so read curr only now
	pos = (bc->wbuf_start_pos + bc->wbuf_curr_pos) | bc->curr;
	bc->wbuf_curr_pos += rlen;
	pthread_mutex_unlock(&bc->buffer_lock);

	ht_add(bc->tree, key, pos, hash, ver);
	ht_add(bc->curr_tree, key, pos, hash, ver);
	suc = true;

SET_END:
	pthread_mutex_unlock(&bc->write_lock);
	free(rbuf);
	if (r != NULL) free_record(r);
	free(it);
	return suc;
}

/*
 * Delete `key`: a bc_set() with an empty value and version -1.
 * Returns what bc_set() returns.
 */
bool bc_delete(Bitcask *bc, const char* key)
{
	const int delete_version = -1;
	const bool removed = bc_set(bc, key, "", 0, 0, delete_version);
	return removed;
}

/* Forward to ht_get_hash() on the bitcask's full index; `pos` and `count` are passed through unchanged. */
uint16_t bc_get_hash(Bitcask *bc, const char * pos, int *count)
{
	HTree *all_keys = bc->tree;
	return ht_get_hash(all_keys, pos, count);
}

/* Forward to ht_list() on the bitcask's full index and return its result unchanged. */
char* bc_list(Bitcask *bc, const char* pos, const char* prefix)
{
	HTree *all_keys = bc->tree;
	return ht_list(all_keys, pos, prefix);
}

/*
 * Return the count that ht_get_hash() reports for position "@" of the full
 * index. When `curr` is non-NULL and an active-bucket tree exists, the same
 * query on that tree is stored in *curr; otherwise *curr is left untouched.
 */
uint32_t   bc_count(Bitcask *bc, uint32_t* curr)
{
	uint32_t all = 0;
	ht_get_hash(bc->tree, "@", &all);

	if (curr == NULL || bc->curr_tree == NULL) {
		return all;
	}
	ht_get_hash(bc->curr_tree, "@", curr);
	return all;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值