TFS File System Strategy Analysis
First, let us look at the main characteristics of the TFS distributed file system:
1. A completely flat data organization that abandons the directory structure of traditional file systems.
2. A private file system built directly on top of the block device, avoiding the performance loss caused by data fragmentation in file systems such as EXT3.
3. A single process manages a single disk, doing away with the RAID5 mechanism.
4. A central control node with an HA mechanism, striking a balance between safety/stability and performance/complexity.
5. Metadata is kept as small as possible so that all of it can be loaded into memory, speeding up access (see the sketch after this list).
6. Load balancing and redundancy/safety strategies across racks and IDCs.
7. Completely smooth capacity expansion.
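As an aside on points 1 and 5: because a TFS file name encodes the block id and the in-block file id, the nameserver only has to keep a small fixed-size record per logical block plus the block-to-dataserver mapping in memory. The sketch below illustrates the idea; the struct layout, field names and the BlockMap typedef are illustrative assumptions, not the actual TFS BlockInfo definition.

#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

// Hypothetical compact per-block record: a handful of integers per logical block,
// so tens of millions of blocks still fit comfortably in the nameserver's RAM.
struct CompactBlockMeta
{
  uint32_t block_id_;   // logical block id (also encoded in the TFS file name)
  int32_t  version_;    // version used for replica consistency checks
  int32_t  file_count_; // number of small files packed into this block
  int32_t  size_;       // bytes used in the block
};

// The nameserver only needs block -> replica locations; per-file metadata lives
// inside the block on the dataservers themselves.
typedef std::map<uint32_t, std::pair<CompactBlockMeta, std::vector<uint64_t> > > BlockMap;

int main()
{
  std::printf("fixed metadata per block: %zu bytes\n", sizeof(CompactBlockMeta));
  return 0;
}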
"Before the source code, there are no secrets." The best way to understand TFS is to read its source code. This series of TFS source-code walkthroughs consists of five articles; the code excerpts have been selected so that they either showcase the technically interesting parts of TFS or are especially helpful for reading the rest of the code.
Here we pick two scenarios and look at the strategies TFS applies in each of them.
(1). Handling the heartbeat message sent by a dataserver; on every heartbeat the nameserver must decide whether to create new logical blocks for that dataserver:
/* Handle the heartbeat message sent by a dataserver */
int HeartManagement::join_ds(Message* msg)
{
  SetDataserverMessage* message = dynamic_cast<SetDataserverMessage*> (msg);
  // get the dataserver's status information
  DataServerStatInfo *ds_stat_info = message->get_ds();
  // server_id is derived from the dataserver's address and port
  uint64_t server_id = ds_stat_info->id_;
  RespHeartMessage *result_msg = new RespHeartMessage();
  // the dataserver reports itself as dead
  if (ds_stat_info->status_ == DATASERVER_STATUS_DEAD)
  {
    // release the dataserver
    meta_mgr_.leave_ds(server_id);
    result_msg->set_status(HEART_MESSAGE_OK);
    message->reply_message(result_msg);
    message->free();
    return TFS_SUCCESS;
  }
  MetaManager::EXPIRE_BLOCK_LIST expire_list;
  bool isnew = false;
  // MetaManager& meta_mgr_: handle the joining dataserver
  meta_mgr_.join_ds(*ds_stat_info, isnew);
  // isnew == true means this is a newly joined dataserver
  if (isnew)
  {
    TBSYS_LOG(INFO, "dataserver(%s) join: use capacity(%" PRI64_PREFIX "u),total capacity(%" PRI64_PREFIX "u), has_block(%s)",
        tbsys::CNetUtil::addrToString(server_id).c_str(), ds_stat_info->use_capacity_, ds_stat_info->total_capacity_,
        message->get_has_block() == HAS_BLOCK_FLAG_YES ? "Yes" : "No");
    if (meta_mgr_.get_fs_name_system()->get_ns_global_info()->owner_role_ == NS_ROLE_MASTER)
    {
      replicate_lancher_.inc_stop_balance_count();
    }
  }
  // the heartbeat carries a block report
  if (message->get_has_block() == HAS_BLOCK_FLAG_YES)
  {
    // MetaManager& meta_mgr_: process the blocks reported by this dataserver
    meta_mgr_.report_blocks(server_id, *message->get_blocks(), expire_list);
    // only the master nameserver handles the expired blocks
    if (meta_mgr_.get_fs_name_system()->get_ns_global_info()->owner_role_ == NS_ROLE_MASTER)
    {
      uint32_t expire_blocks_size = 0;
      uint32_t i = 0;
      MetaManager::EXPIRE_BLOCK_LIST::iterator iter = expire_list.begin();
      for (; iter != expire_list.end(); ++iter)
      {
        if (iter->first == server_id)
        {
          // expired blocks on the reporting dataserver itself are returned in the reply
          vector<uint32_t>& expire_blocks = iter->second;
          expire_blocks_size = expire_blocks.size();
          for (i = 0; i < expire_blocks_size; ++i)
          {
            if (!replicate_lancher_.get_executor().is_replicating_block(expire_blocks[i]))
              result_msg->add_expire_id(expire_blocks[i]);
          }
        }
        else
        {
          // expired blocks that live on other dataservers are removed from those dataservers directly
          NameServer::rm_block_from_ds(iter->first, iter->second);
        }
      }
    }
    result_msg->set_status(HEART_EXP_BLOCK_ID);
    TBSYS_LOG(INFO, "dataserver(%s) join: use capacity(%" PRI64_PREFIX "u),total capacity(%" PRI64_PREFIX "u), block count(%u)",
        tbsys::CNetUtil::addrToString(server_id).c_str(), ds_stat_info->use_capacity_, ds_stat_info->total_capacity_,
        message->get_blocks()->size());
  }
  else
  {
    // the heartbeat from this dataserver does not carry any block information
    ServerCollect* servre_collect = meta_mgr_.get_block_ds_mgr().get_ds_collect(server_id);
    int32_t block_count = -1;
    if (servre_collect != NULL)
    {
      block_count = servre_collect->get_block_list().size();
    }
    // a newly joined dataserver, or one whose block list is empty, is told to send a full block report
    if (isnew || servre_collect->get_block_list().size() == 0)
    {
      TBSYS_LOG(INFO, "reply dataserver(%s) heart msg need send block, isnew(%s),current block count(%u)",
          tbsys::CNetUtil::addrToString(server_id).c_str(), isnew ? "true" : "false", block_count);
      result_msg->set_status(HEART_NEED_SEND_BLOCK_INFO);
    }
    else
    {
      result_msg->set_status(HEART_MESSAGE_OK);
    }
    // after a master/slave switch, keep asking for block reports until ds_dead_time_ has elapsed
    if ((meta_mgr_.get_fs_name_system()->get_ns_global_info()->switch_time_ != 0) && (block_count != ds_stat_info->block_count_))
    {
      TBSYS_LOG(DEBUG, "new ds block count(%d): old ds block count(%d)", ds_stat_info->block_count_, block_count);
      if (time(NULL) < (meta_mgr_.get_fs_name_system()->get_ns_global_info()->switch_time_ + SYSPARAM_NAMESERVER.ds_dead_time_))
        result_msg->set_status(HEART_NEED_SEND_BLOCK_INFO);
      else
        meta_mgr_.get_fs_name_system()->get_ns_global_info()->switch_time_ = 0;
    }
  }
  message->reply_message(result_msg);
  if (message->get_has_block() == HAS_BLOCK_FLAG_YES)
  {
    // SYSPARAM_NAMESERVER.add_primary_block_count_ = 5: number of writable blocks added at a time.
    // On every heartbeat that carries a block report, check whether new logical blocks
    // should be created for this dataserver.
    meta_mgr_.check_primary_writable_block(server_id, SYSPARAM_NAMESERVER.add_primary_block_count_);
  }
  message->free();
  message = NULL;
  return TFS_SUCCESS;
}
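Before moving on, here is a compact, standalone sketch (not TFS code) that summarizes how join_ds picks the reply status for a heartbeat. It is simplified: it ignores the block-count mismatch condition that guards the switch-window check, and within_switch_window stands in for the time(NULL) < switch_time_ + ds_dead_time_ test above. The enum names are hypothetical stand-ins for the HEART_* constants.

#include <cstdint>
#include <cstdio>

enum HeartStatus { STATUS_OK, STATUS_EXP_BLOCK_ID, STATUS_NEED_SEND_BLOCK_INFO };

static HeartStatus pick_reply_status(bool ds_dead, bool has_block_report, bool is_new,
                                     int64_t known_block_count, bool within_switch_window)
{
  if (ds_dead)                          return STATUS_OK;                   // dataserver left: just acknowledge
  if (has_block_report)                 return STATUS_EXP_BLOCK_ID;         // report processed: reply with expired block ids
  if (is_new || known_block_count == 0) return STATUS_NEED_SEND_BLOCK_INFO; // ask for a full block report
  if (within_switch_window)             return STATUS_NEED_SEND_BLOCK_INFO; // re-collect reports after a master switch
  return STATUS_OK;
}

int main()
{
  // e.g. a brand-new dataserver whose first heartbeat carries no block report:
  std::printf("%d\n", pick_reply_status(false, false, true, 0, false)); // prints 2 (need block info)
  return 0;
}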
The meta_mgr_.check_primary_writable_block() method:
// If the number of primary writable blocks on this dataserver is below the configured limit,
// send messages to create new logical blocks for it.
// add_block_count: number of writable blocks added at a time (add_primary_block_count = 5)
int MetaManager::check_primary_writable_block(const uint64_t ds_id, const int32_t add_block_count, bool promote)
{
  ServerCollect* server_collect = meta_mgr_.get_ds_collect(ds_id);
  int32_t need_add_block_count = 0;
  if (server_collect != NULL)
  {
    // the dataserver's disks are full: nothing to do
    if (server_collect->is_disk_full())
    {
      return 0;
    }
    // number of primary writable blocks this dataserver currently provides
    int32_t current = static_cast<int32_t> (server_collect->get_primary_writable_block_list()->size());
    // parameter: max_write_file_count = 5, the maximum number of writable blocks a dataserver may hold at a time
    if (current >= SYSPARAM_NAMESERVER.max_write_file_count_)
    {
      TBSYS_LOG(INFO, "check primary writableblock in dataserver(%s), current_primary_block_count(%u) >= max_write_file_count(%d), no need to add new block",
          tbsys::CNetUtil::addrToString(ds_id).c_str(), current, SYSPARAM_NAMESERVER.max_write_file_count_);
      return 0;
    }
    // number of logical blocks that need to be added
    need_add_block_count = std::min(add_block_count, (SYSPARAM_NAMESERVER.max_write_file_count_ - current));
    TBSYS_LOG(INFO, "check primary writableblock in dataserver(%s), current primary block count(%u), need add block count(%d)",
        tbsys::CNetUtil::addrToString(ds_id).c_str(), current, need_add_block_count);
  }
  if (need_add_block_count > 0)
  {
    if (promote)
    {
      promote_primary_write_block(server_collect, need_add_block_count);
    }
    // only the master nameserver creates new blocks
    if (fs_name_system_->get_ns_global_info()->owner_role_ != NS_ROLE_MASTER)
    {
      return 0;
    }
    // next logical block id: the current maximum block id plus 1 (max_block_id_ starts at 0)
    uint32_t next_block_id = meta_mgr_.get_max_block_id() + 0x01;
    // number of alive dataservers
    int32_t alive_ds_size = meta_mgr_.get_alive_ds_size();
    // the largest block id held by this dataserver
    uint32_t max_block_id = server_collect->get_max_block_id();
    uint32_t diff = __gnu_cxx::abs(next_block_id - max_block_id);
    if ((diff < static_cast<uint32_t> (add_block_count)) && (alive_ds_size > 0x01))
    {
      TBSYS_LOG(INFO, "next_block_id(%u) - max_block_id(%d) <= add_block_count(%d)," "can't add new block in this dataserver",
          next_block_id, max_block_id, add_block_count);
      return 0;
    }
    uint32_t new_block_id = 0;
    // loop and send a create-new-block message to the dataservers for each block to add
    for (int32_t i = 0; i < need_add_block_count; ++i, new_block_id = 0)
    {
      add_new_block(new_block_id, ds_id);
    }
  }
  return need_add_block_count;
}
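A quick numeric illustration of the sizing arithmetic above, using the default values quoted in the comments (add_primary_block_count_ = 5, max_write_file_count_ = 5) and an assumed current count of 3 writable primary blocks:

#include <algorithm>
#include <cstdio>

int main()
{
  const int add_block_count      = 5; // SYSPARAM_NAMESERVER.add_primary_block_count_
  const int max_write_file_count = 5; // SYSPARAM_NAMESERVER.max_write_file_count_
  const int current              = 3; // writable primary blocks the dataserver already has (assumed)

  // need_add_block_count = min(add_block_count, max_write_file_count - current) = min(5, 2) = 2
  const int need_add_block_count = std::min(add_block_count, max_write_file_count - current);
  std::printf("need_add_block_count = %d\n", need_add_block_count);
  return 0;
}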
The BlockCollect* MetaManager::add_new_block() method:
// Add a new logical block; the block_id parameter defaults to 0
BlockCollect* MetaManager::add_new_block(uint32_t& block_id, const uint64_t ds_id)
{
  vector<uint64_t> elect_ds_list;
  if (ds_id != 0)
  {
    ServerCollect * server_collect = meta_mgr_.get_ds_collect(ds_id);
    if ((server_collect != NULL) && (meta_mgr_.server_writable(server_collect)))
    {
      // first put the requesting dataserver itself into the elected list elect_ds_list
      elect_ds_list.push_back(ds_id);
    }
    else
    {
      TBSYS_LOG(DEBUG, "add_new_block : server %s not writable", tbsys::CNetUtil::addrToString(ds_id).c_str());
      return NULL;
    }
  }
  // determine how many more dataservers are needed; if ds_id is 0 this is simply the minimum replica count
  uint32_t need_ds_size = SYSPARAM_NAMESERVER.min_replication_ - elect_ds_list.size();
  TBSYS_LOG(DEBUG, "add new block , need_ds_size(%u)", elect_ds_list.size());
  if (need_ds_size > 0)
  {
    ScopedRWLock scoped_lock(meta_mgr_.get_server_mutex(), WRITE_LOCKER);
    // elect the dataservers to write to, e.g. using the weight-comparison strategies
    elect_write_ds(meta_mgr_, need_ds_size, elect_ds_list);
    // not enough dataservers could be elected to satisfy the minimum replica count
    if (static_cast<int32_t>(elect_ds_list.size()) < SYSPARAM_NAMESERVER.min_replication_)
    {
      TBSYS_LOG(ERROR, "there's no any dataserver can be writable.");
      return NULL;
    }
  }
  // generate a new logical block id
  if (0 == block_id)
  {
    block_id = meta_mgr_.get_avail_block_id();
  }
  BlockChunkPtr ptr = meta_mgr_.get_block_chunk(block_id);
  ptr->mutex_.wrlock();
  // this block id is newly generated
  BlockCollect* block_collect = ptr->create(block_id);
  const BlockInfo *block_info = block_collect->get_block_info();
  block_collect->set_creating_flag(BlockCollect::BLOCK_CREATE_FLAG_YES); // mark the block as being created
  uint32_t new_block_id = block_info->block_id_;
  ptr->mutex_.unlock();
  // write the create-block operation log (it also has to be forwarded to the slave nameserver)
  if (oplog_sync_mgr_.log(block_info, OPLOG_INSERT, elect_ds_list) != TFS_SUCCESS)
  {
    TBSYS_LOG(WARN, "LogBlockInfo Fail, block_id: %u", block_id);
  }
  VUINT64 add_success_ds_list;
  // send the create-new-block message to each elected dataserver (including ds_id itself)
  for (uint32_t i = 0; i < elect_ds_list.size(); ++i)
  {
    TBSYS_LOG(DEBUG, "dataserver(%s)", tbsys::CNetUtil::addrToString(elect_ds_list[i]).c_str());
    NewBlockMessage nbmsg;
    nbmsg.add_new_id(new_block_id);
    if (send_message_to_server(elect_ds_list[i], &nbmsg, NULL) == TFS_SUCCESS)
    {
      add_success_ds_list.push_back(elect_ds_list[i]);
      TBSYS_LOG(INFO, "add block:%u on server:%s succeed", new_block_id, tbsys::CNetUtil::addrToString(elect_ds_list[i]).c_str());
    }
    else
    {
      TBSYS_LOG(INFO, "add block:%u on server:%s failed", new_block_id, tbsys::CNetUtil::addrToString(elect_ds_list[i]).c_str());
    }
  }
  // if every dataserver failed, roll back
  if (add_success_ds_list.size() == 0)
  {
    oplog_sync_mgr_.log(block_info, OPLOG_REMOVE, add_success_ds_list);
    ptr->mutex_.wrlock();
    ptr->remove(new_block_id);
    ptr->mutex_.unlock();
    TBSYS_LOG(ERROR, "add block(%u) failed, rollback", new_block_id);
    return NULL;
  }
  for (uint32_t i = 0; i < add_success_ds_list.size(); ++i)
  {
    // after the block has been created, record the block <-> dataserver relation
    meta_mgr_.build_ds_block_relation(new_block_id, add_success_ds_list[i], false);
  }
  ptr->mutex_.wrlock();
  block_collect->set_creating_flag(); // creation finished: clear the creating flag
  ptr->mutex_.unlock();
  return block_collect;
}
Note the elect_write_ds(meta_mgr_, need_ds_size, elect_ds_list) call: it selects the dataservers to write to using weight-comparison style strategies, which are analyzed in detail in part (3) below.
(2). The second scenario: when the nameserver balances logical-block load, it has to migrate and replicate logical blocks:
// Logical-block load balancing
int ReplicateLauncher::balance()
{
  if (destroy_flag_ == NS_DESTROY_FLAGS_YES)
    return TFS_SUCCESS;
  if ((pause_flag_ & PAUSE_FLAG_PAUSE_BALANCE))
  {
    TBSYS_LOG(INFO, "pause balance.");
    return TFS_SUCCESS;
  }
  // get the layout manager (the block <-> dataserver map)
  LayoutManager& block_ds_map = meta_mgr_.get_block_ds_mgr();
  int64_t current_stop_balance_count = stop_balance_count_;
  int32_t ds_size = block_ds_map.get_alive_ds_size();
  if (ds_size <= 1)
  {
    TBSYS_LOG(ERROR, "ds_size(%d) <= 1, must be stop balance", ds_size);
    return TFS_SUCCESS;
  }
  if (executor_.get_replicating_map().size() >= static_cast<uint32_t> (ds_size / 2))
  {
    TBSYS_LOG(ERROR, "replicating size(%u) > ds_size(%d)", executor_.get_replicating_map().size(), ds_size / 2);
    return TFS_SUCCESS;
  }
  int64_t total_block_count = 0;
  int64_t total_capacity = 0;
  const SERVER_MAP* ds_maps = block_ds_map.get_ds_map();
  SERVER_MAP::const_iterator iter = ds_maps->begin();
  DataServerStatInfo* ds_stat_info = NULL;
  // iterate over all dataservers
  for (; iter != ds_maps->end(); ++iter)
  {
    if (!iter->second->is_alive())
      continue;
    ds_stat_info = iter->second->get_ds();
    // accumulate the total block count and the total remaining usable capacity
    total_block_count += ds_stat_info->block_count_;
    total_capacity += (ds_stat_info->total_capacity_ * SYSPARAM_NAMESERVER.max_use_capacity_ratio_) / 100;
    total_capacity -= ds_stat_info->use_capacity_;
  }
  // total used bytes and total block count as reported by the blocks
  int64_t total_bytes = block_ds_map.cacl_all_block_bytes();
  int64_t block_count = block_ds_map.cacl_all_block_count();
  int64_t block_size = 0;
  // average used size per logical block
  if (total_bytes > 0 && block_count > 0)
  {
    block_size = total_bytes / block_count;
  }
  else
  {
    block_size = SYSPARAM_NAMESERVER.max_block_size_;
  }
  // total capacity = remaining usable capacity + total block count * average block size
  total_capacity += total_block_count * block_size;
  if (total_capacity == 0)
  {
    TBSYS_LOG(ERROR, "total_capacity(%"PRI64_PREFIX"d)", total_capacity);
    return TFS_SUCCESS;
  }
  TBSYS_LOG(INFO, "build move plan");
  INT64_INT_MAP ds_src_maps;
  INT64_INT_MAP ds_dest_maps;
  vector<ServerCollect*> dest_ds_list_desc;
  int64_t max_src_count = 0;
  int64_t max_dest_count = 0;
  int64_t current_block_count = 0;
  int64_t current_average_block_size = 0;
  int64_t should_block_count = 0;
  for (iter = ds_maps->begin(); iter != ds_maps->end(); ++iter)
  {
    if (!iter->second->is_alive())
      continue;
    ds_stat_info = iter->second->get_ds();
    // number of logical blocks currently on this dataserver
    current_block_count = ds_stat_info->block_count_;
    // this dataserver's "effective capacity": just as total_capacity was increased by
    // total_block_count * block_size, the per-server value adds current_block_count * block_size
    // to its remaining usable space
    current_average_block_size = ((ds_stat_info->total_capacity_ * SYSPARAM_NAMESERVER.max_use_capacity_ratio_) / 100
        - ds_stat_info->use_capacity_ + current_block_count * block_size);
    // the number of logical blocks this dataserver should hold
    should_block_count = current_average_block_size * total_block_count / total_capacity;
    TBSYS_LOG(INFO, "dataserver(%s), should block count(%"PRI64_PREFIX"d), current block count(%"PRI64_PREFIX"d)",
        tbsys::CNetUtil::addrToString(iter->first).c_str(), should_block_count, current_block_count);
    if (should_block_count + 10 < current_block_count)
    {
      // it currently holds more blocks than it should: blocks must be moved away from it
      ds_src_maps.insert(INT64_INT_MAP::value_type(iter->first, current_block_count - should_block_count));
      max_src_count += (current_block_count - should_block_count);
    }
    else if (should_block_count > current_block_count)
    {
      // otherwise blocks need to be moved onto it from other dataservers
      ds_dest_maps.insert(INT64_INT_MAP::value_type(iter->first, should_block_count - current_block_count));
      max_dest_count += (should_block_count - current_block_count);
      // also keep it in the destination ServerCollect* list
      dest_ds_list_desc.push_back(iter->second);
    }
  }
  if ((ds_dest_maps.size() == 0) || (ds_src_maps.size() == 0))
  {
    TBSYS_LOG(INFO, "block without moving data");
    return TFS_SUCCESS;
  }
  else
  {
    TBSYS_LOG(INFO, "src(%u),dest(%u)", ds_src_maps.size(), ds_dest_maps.size());
  }
  // the source and destination totals have to match
  if (max_src_count > max_dest_count)
  {
    max_src_count = max_dest_count;
  }
  if (max_src_count > static_cast<int64_t> (ds_dest_maps.size() * SYSPARAM_NAMESERVER.replicate_max_count_per_server_))
  {
    max_src_count = ds_dest_maps.size() * SYSPARAM_NAMESERVER.replicate_max_count_per_server_;
  }
  ReplicateExecutor::REPL_BLOCK_MAP need_move_block_map;
  ReplicateStrategy::counter_type ds_dest_counter;
  INT64_INT_MAP_ITER ds_src_iter = ds_src_maps.begin();
  INT64_INT_MAP_ITER ds_dest_iter;
  vector<ServerCollect*>::iterator server_collect_iter;
  set<uint32_t>::iterator block_list_iter;
  ServerCollect* server_collect = NULL;
  BlockCollect* block_collect = NULL;
  VUINT64 ds_list;
  uint32_t block_id = 0;
  int32_t move_count = 0;
  int32_t elect_count = 0;
  uint64_t server_id = 0;
  for (; ds_src_iter != ds_src_maps.end(); ++ds_src_iter)
  {
    server_collect = block_ds_map.get_ds_collect(ds_src_iter->first);
    if (server_collect == NULL)
      continue;
    const set<uint32_t>& blks = server_collect->get_block_list();
    // number of logical blocks that have to be moved off this dataserver
    move_count = ds_src_iter->second;
    elect_count = 0;
    server_id = 0;
    block_list_iter = blks.begin();
    for (; block_list_iter != blks.end(); ++block_list_iter)
    {
      block_id = (*block_list_iter);
      block_collect = block_ds_map.get_block_collect(block_id);
      if (block_collect == NULL)
        continue;
      // only full blocks are moved: skip blocks that are not yet full
      if (!block_collect->is_full())
        continue;
      // skip the block if it is already in the move map
      if (need_move_block_map.find(block_id) != need_move_block_map.end())
        continue;
      ds_list = *(block_collect->get_ds());
      // skip the block if its replica count differs from the minimum replication count
      if (ds_list.size() != static_cast<uint32_t> (SYSPARAM_NAMESERVER.min_replication_))
        continue;
      // skip the block if it is not actually on this dataserver
      VUINT64::iterator where = find(ds_list.begin(), ds_list.end(), ds_src_iter->first);
      if (where == ds_list.end())
        continue;
      // remove this dataserver first: the block will be moved away from it
      ds_list.erase(where);
      // then elect a destination from the candidate destination dataservers
      // arguments: dest_ds_list_desc: destination dataserver list; ds_dest_counter: per-destination replication counters (ReplicateDestStrategy::counter_type& dest_counter)
      // ds_src_iter->first: the source dataserver server_id the block is moved away from
      // server_id: the elected destination dataserver
      bool bret = elect_move_dest_ds(dest_ds_list_desc, ds_dest_counter, ds_list, ds_src_iter->first, server_id);
      if (!bret)
      {
        TBSYS_LOG(ERROR, "cannot elect move dest server block:%u, source:%s,dest:%s", block_id,
            tbsys::CNetUtil::addrToString(ds_src_iter->first).c_str(), tbsys::CNetUtil::addrToString(server_id).c_str());
        continue;
      }
      else
      {
        // look up server_id in ds_dest_counter and increase the destination's counter by one
        ReplicateStrategy::inc_ds_count(ds_dest_counter, server_id);
      }
      // insert the move task into need_move_block_map
      ReplBlock *replicate_block = new ReplBlock();
      replicate_block->block_id_ = block_id;
      replicate_block->source_id_ = ds_src_iter->first;
      replicate_block->destination_id_ = server_id;
      need_move_block_map.insert(ReplicateExecutor::REPL_BLOCK_MAP::value_type(replicate_block->block_id_, replicate_block));
      // one less block to move off this dataserver, one less block to move overall
      --move_count;
      --max_src_count;
      ++elect_count;
      // find the chosen destination dataserver by server_id
      ds_dest_iter = ds_dest_maps.find(server_id);
      if (ds_dest_iter != ds_dest_maps.end())
      {
        // one less block needs to be moved onto the destination
        ds_dest_iter->second--;
        if (ds_dest_iter->second <= 0)
        {
          // the destination does not need any more blocks: remove it from ds_dest_maps
          ds_dest_maps.erase(ds_dest_iter);
          // and remove it from the destination ServerCollect* list as well
          server_collect_iter = dest_ds_list_desc.begin();
          for (; server_collect_iter != dest_ds_list_desc.end(); ++server_collect_iter)
          {
            server_collect = (*server_collect_iter);
            if (server_collect->get_ds()->id_ == server_id)
            {
              dest_ds_list_desc.erase(server_collect_iter);
              break;
            }
          }
        }
      }
      if (dest_ds_list_desc.size() == 0)
        break;
      if (move_count <= 0 || max_src_count <= 0)
        break;
      if (elect_count >= SYSPARAM_NAMESERVER.replicate_max_count_per_server_)
        break;
    }
    if (max_src_count <= 0)
      break;
  }
  TBSYS_LOG(INFO, "need to move the data block size: %u", need_move_block_map.size());
  REPL_BLOCK_LIST need_move_block_list;
  ReplBlock* replicate_block = NULL;
  ReplicateExecutor::REPL_BLOCK_MAP_ITER rep_iter = need_move_block_map.begin();
  // copy the move tasks into a list
  for (; rep_iter != need_move_block_map.end(); ++rep_iter)
  {
    replicate_block = rep_iter->second;
    need_move_block_list.push_back(replicate_block);
    TBSYS_LOG(DEBUG, "move plan: block:%u, from %s to %s", replicate_block->block_id_,
        tbsys::CNetUtil::addrToString(replicate_block->source_id_).c_str(),
        tbsys::CNetUtil::addrToString(replicate_block->destination_id_).c_str());
  }
  sort(need_move_block_list);
  int32_t retry = 0;
  uint32_t oldone = 0;
  uint32_t need_move_block_list_size = need_move_block_list.size();
  uint32_t uncomplete_move_block_num = need_move_block_list_size;
  while (uncomplete_move_block_num && retry < 10)
  {
    oldone = uncomplete_move_block_num;
    uncomplete_move_block_num = 0;
    for (uint32_t i = 0; i < need_move_block_list_size; ++i)
    {
      replicate_block = need_move_block_list.at(i);
      if (replicate_block->block_id_ == 0)
        continue;
      // the commands are sent inside a retry loop, so check first: if this block_id_ is
      // already in the replicating map, do not send it again for now
      if (executor_.is_replicating_block(replicate_block, SYSPARAM_NAMESERVER.replicate_max_count_per_server_))
      {
        TBSYS_LOG(INFO, "move plans to give up: block:%u, from %s to %s", replicate_block->block_id_,
            tbsys::CNetUtil::addrToString(replicate_block->source_id_).c_str(),
            tbsys::CNetUtil::addrToString(replicate_block->destination_id_).c_str());
        uncomplete_move_block_num++;
        continue;
      }
      if (NS_DESTROY_FLAGS_YES == destroy_flag_)
        break;
      block_collect = block_ds_map.get_block_collect(replicate_block->block_id_);
      if (block_collect == NULL)
      {
        replicate_block->block_id_ = 0;
      }
      // sending the command also registers the block in the replicating map
      executor_.send_replicate_cmd(replicate_block->source_id_, replicate_block->destination_id_,
          replicate_block->block_id_, REPLICATE_BLOCK_MOVE_FLAG_YES);
      replicate_block->block_id_ = 0;
      if (ds_size != block_ds_map.get_alive_ds_size())
        break;
      if (stop_balance_count_ > current_stop_balance_count)
        break;
    }
    TBSYS_LOG(INFO, "total moved count: %u, failed to move the block number: %u", need_move_block_list_size, uncomplete_move_block_num);
    if (NS_DESTROY_FLAGS_YES == destroy_flag_)
      break;
    if (ds_size != block_ds_map.get_alive_ds_size())
      break;
    if (stop_balance_count_ > current_stop_balance_count)
      break;
    Func::sleep(10, reinterpret_cast<int32_t*> (&destroy_flag_));
    if (NS_DESTROY_FLAGS_YES == destroy_flag_)
      break;
    if (ds_size != block_ds_map.get_alive_ds_size())
      break;
    if (stop_balance_count_ > current_stop_balance_count)
      break;
    if ((pause_flag_ & PAUSE_FLAG_PAUSE_BALANCE))
      break;
    if (oldone == uncomplete_move_block_num)
    {
      retry++;
    }
    else
    {
      retry = 0;
    }
  }
  TBSYS_LOG(INFO, "balance plan exit, remain uncomplete move(%u)", uncomplete_move_block_num);
  rep_iter = need_move_block_map.begin();
  for (; rep_iter != need_move_block_map.end(); ++rep_iter)
  {
    tbsys::gDelete(rep_iter->second);
  }
  return TFS_SUCCESS;
}
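To make the plan-building arithmetic concrete, the following small sketch applies the should_block_count formula to two hypothetical dataservers. All numbers are made up; "effective capacity" stands for the usable-minus-used space plus block_count * block_size, exactly as computed above.

#include <cstdint>
#include <cstdio>

int main()
{
  // hypothetical "effective capacities" of two dataservers
  const int64_t eff_a = 800LL << 30;   // ~800 GB
  const int64_t eff_b = 200LL << 30;   // ~200 GB
  const int64_t blocks_a = 9000, blocks_b = 1000;
  const int64_t total_blocks = blocks_a + blocks_b;
  const int64_t total_capacity = eff_a + eff_b;

  // each dataserver should hold blocks in proportion to its effective capacity
  const int64_t should_a = eff_a * total_blocks / total_capacity; // 8000
  const int64_t should_b = eff_b * total_blocks / total_capacity; // 2000
  std::printf("A: has %lld, should have %lld -> move source (sheds %lld blocks)\n",
              (long long)blocks_a, (long long)should_a, (long long)(blocks_a - should_a));
  std::printf("B: has %lld, should have %lld -> move destination (gains %lld blocks)\n",
              (long long)blocks_b, (long long)should_b, (long long)(should_b - blocks_b));
  return 0;
}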
Note the elect_move_dest_ds(dest_ds_list_desc, ds_dest_counter, ds_list, ds_src_iter->first, server_id) call: it also picks the destination dataserver for the copy according to a strategy.
(3). Now let us look at the concrete implementation of the strategy classes in detail:
template<typename Strategy>
class StoreWeight
{
public:
  StoreWeight(Strategy& strategy, DS_WEIGHT& weight) : strategy_(strategy), weights_(weight)
  {
  }
  virtual ~StoreWeight()
  {
  }
  void operator()(const ServerCollect* server_collect) const
  {
    // compute this dataserver's election weight (for WriteStrategy this is its elect_seq_, initially 1)
    int64_t weight = strategy_.calc(server_collect);
    TBSYS_LOG(DEBUG, "weight(%"PRI64_PREFIX"d)", weight);
    // a weight of 0 filters the dataserver out; otherwise store it in the multimap keyed by weight
    if (weight > 0)
    {
      weights_.insert(make_pair(weight, const_cast<ServerCollect*> (server_collect)));
    }
  }
  const DS_WEIGHT& get_weight() const
  {
    return weights_;
  }
private:
  Strategy& strategy_;
  DS_WEIGHT& weights_;
};
struct ExcludeGroupElectOperation
{
  // function-call operator: delegate to elect_ds_exclude_group()
  int32_t operator()(const DS_WEIGHT& weights, const int32_t elect_count, int64_t& elect_seq, common::VUINT64& elect_ds_list) const
  {
    return elect_ds_exclude_group(weights, elect_count, elect_seq, elect_ds_list);
  }
};
// Template function for electing dataservers
template<typename Strategy, typename ElectType>
int32_t elect_ds(Strategy& strategy, ElectType op, const LayoutManager& meta, const int32_t elect_count,
    int64_t& elect_seq, common::VUINT64& elect_ds_list)
{
  // definition: typedef std::multimap<int32_t, ServerCollect*> DS_WEIGHT; a multimap, so several servers can share one weight
  DS_WEIGHT weights;
  StoreWeight<Strategy> store(strategy, weights); // weights is passed into the functor's constructor
  const common::SERVER_MAP* ds_map = meta.get_ds_map();
  common::SERVER_MAP::const_iterator iter = ds_map->begin();
  for (; iter != ds_map->end(); ++iter)
  {
    // iterate over every ServerCollect* and compute its election weight, collecting the results
    // into the DS_WEIGHT multimap. For WriteStrategy this calls WriteStrategy::calc(), which
    // filters out dataservers whose current load or used-capacity ratio is too high.
    store(iter->second);
  }
  // invoke the election operation (its operator()); elect_seq starts out at 1
  int32_t result = op(weights, elect_count, elect_seq, elect_ds_list);
  if (elect_seq <= 0)
  {
    // the sequence counter was reset or wrapped: start every dataserver over at 1
    elect_seq = 1;
    iter = ds_map->begin();
    for (; iter != ds_map->end(); ++iter)
    {
      iter->second->elect(1);
    }
  }
  return result;
}
// Check a dataserver's load against the cluster average.
// Arguments: current load, total load, this dataserver's used capacity, total used capacity, number of alive dataservers.
bool check_average(int32_t current_load, int32_t total_load, int64_t use_capacity, int64_t total_use_capacity, int64_t alive_ds_count)
{
  if (alive_ds_count == 0)
  {
    TBSYS_LOG(DEBUG, "alive dataserver not found alive_ds_count(%"PRI64_PREFIX"d)", alive_ds_count);
    return 0;
  }
  // average load and average used capacity per alive dataserver
  int64_t average_load = total_load / alive_ds_count;
  int64_t average_use = total_use_capacity / alive_ds_count;
  // return true when the current load is below twice the average load and the used capacity
  // does not exceed twice the average used capacity
  return (((current_load < average_load * 2) || (total_load == 0))
      && ((use_capacity <= average_use * 2) || (total_use_capacity == 0)));
}
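A tiny illustration of the twice-the-average rule with made-up numbers:

#include <cstdint>
#include <cstdio>

int main()
{
  // hypothetical cluster: 4 alive dataservers, total load 400, total used capacity 4 TB
  const int64_t alive = 4, total_load = 400, total_use = 4LL << 40;
  const int64_t avg_load = total_load / alive;   // 100
  const int64_t avg_use  = total_use  / alive;   // 1 TB
  // a dataserver with load 150 and 1.5 TB used passes (both below twice the average);
  // one with load 250 is rejected regardless of its capacity usage
  std::printf("avg_load=%lld avg_use=%lld\n", (long long)avg_load, (long long)avg_use);
  std::printf("load 150: %s\n", (150 < avg_load * 2) ? "ok" : "too high");
  std::printf("load 250: %s\n", (250 < avg_load * 2) ? "ok" : "too high");
  return 0;
}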
int BaseStrategy::check(const ServerCollect* server_collect) const
{
  // first make sure the dataserver is still alive
  if (!server_collect->is_alive())
  {
    TBSYS_LOG(DEBUG, "dataserver(%s) is dead, can't join ", CNetUtil::addrToString(server_collect->get_ds()->id_).c_str());
    return 0;
  }
  const DataServerStatInfo* ds_stat_info = server_collect->get_ds();
  // arguments: this dataserver's load, total load, this dataserver's used capacity,
  // total capacity, number of alive dataservers
  return check_average(ds_stat_info->current_load_, global_info_.total_load_,
      ds_stat_info->use_capacity_, global_info_.total_capacity_, global_info_.alive_server_count_);
}

int64_t WriteStrategy::calc(const ServerCollect* server_collect) const
{
  // a dataserver whose disks are full can never be elected for writing
  if (server_collect->is_disk_full())
  {
    TBSYS_LOG(DEBUG, "dataserver(%s) is full , can't join elect list", CNetUtil::addrToString(server_collect->get_ds()->id_).c_str());
    return 0;
  }
  // then run the base check: load below twice the average and used capacity below twice the average
  if (BaseStrategy::check(server_collect) == 0)
  {
    TBSYS_LOG(DEBUG, "BaseStrategy::check == 0 , can't join elect list");
    return 0;
  }
  // the weight is the dataserver's election sequence number (default 1 in the ServerCollect
  // constructor), so servers that were elected recently sort later in the multimap
  return server_collect->get_elect_seq();
}
int ReplicateStrategy::check(const ServerCollect* server_collect) const
{
  // number of replication tasks already assigned to this dataserver
  int64_t count = get_ds_count(counter_, server_collect->get_ds()->id_);
  // its replication ratio, relative to the per-server replication limit
  copy_count_ = percent(count, SYSPARAM_NAMESERVER.replicate_max_count_per_server_);
  if (count >= SYSPARAM_NAMESERVER.replicate_max_count_per_server_)
    return 0x00;
  return 0x01;
}
int64_t ReplicateSourceStrategy::calc(const ServerCollect* server_collect) const
{
  // if this candidate already carries the maximum number of replication tasks, return 0 (filter it out)
  if (!ReplicateStrategy::check(server_collect))
    return 0;
  BaseStrategy::normalize(server_collect);
  // weight = replication ratio at 70% plus load ratio at 30%
  return copy_count_ * 70 + load_ * 30;
}
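The following sketch just evaluates the 70/30 weighting for two hypothetical candidates, under the assumption that percent() scales its ratio to the range 0..100 (the exact scale does not change the comparison). A lower weight sorts earlier in the multimap and is therefore preferred.

#include <cstdint>
#include <cstdio>

static int64_t weight(int64_t copy_ratio, int64_t load_ratio)
{
  return copy_ratio * 70 + load_ratio * 30; // lower is better: elected first from the multimap
}

int main()
{
  // candidate A: 20% of its replication quota used, 50% relative load
  // candidate B: 60% of its replication quota used, 10% relative load
  std::printf("A = %lld\n", (long long)weight(20, 50)); // 2900
  std::printf("B = %lld\n", (long long)weight(60, 10)); // 4500 -> A is preferred
  return 0;
}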
// The actual election routine; the layout manager's elect_seq starts at 1
int32_t elect_ds_exclude_group(const DS_WEIGHT& weights, const int32_t elect_count, int64_t& elect_seq, VUINT64& elect_ds_list)
{
  if (elect_count == 0)
  {
    TBSYS_LOG(DEBUG, "current elect count(%d) <= 0, must be return", elect_count);
    return 0;
  }
  // collect the network groups (racks) of the dataservers already in the result list
  std::set<uint32_t> existlan;
  for (uint32_t i = 0; i < elect_ds_list.size(); ++i)
  {
    uint32_t lan = Func::get_lan(elect_ds_list[i], SYSPARAM_NAMESERVER.group_mask_);
    existlan.insert(lan);
  }
  //dump_weigths(weights);
  // weights: the candidate dataservers selected by Strategy::calc(), keyed by their weight;
  // the multimap is already sorted by elect_seq in ascending order
  DS_WEIGHT::const_iterator iter = weights.begin();
  int32_t need_elect_count = elect_count;
  TBSYS_LOG(DEBUG, "weights.size(%u), need_elect_count(%d)", weights.size(), need_elect_count);
  DataServerStatInfo* ds_stat_info = NULL;
  while (iter != weights.end() && need_elect_count > 0)
  {
    ds_stat_info = iter->second->get_ds();
    uint32_t dlan = Func::get_lan(ds_stat_info->id_, SYSPARAM_NAMESERVER.group_mask_);
    // skip dataservers whose group (rack) has already been chosen
    if (existlan.find(dlan) == existlan.end())
    {
      existlan.insert(dlan);
      elect_ds_list.push_back(ds_stat_info->id_);
      if (elect_seq > 0)
      {
        // Re-stamp the elected dataserver with a new, larger sequence number.
        // (1) Suppose, for example, 7 dataservers pass the weight check and we take 3 at a time.
        //     Every server's elect_seq_ starts at 1, so the first three are taken; after election
        //     their elect_seq_ become 2, 3 and 4, and the layout manager's current_elect_seq_ becomes 4.
        // (2) On the next election, the servers that were not elected (still at 1) sort first in the
        //     multimap and therefore get elected first; the three elected now receive 5, 6 and 7,
        //     current_elect_seq_ becomes 7, and the remaining un-elected server keeps its old value.
        // (3) And so on: because the multimap sorts ascending by elect_seq, servers that were not
        //     elected recently have a higher chance of being elected next time.
        iter->second->elect(++elect_seq);
      }
      --need_elect_count;
    }
    ++iter;
  }
  TBSYS_LOG(DEBUG, "current elect_count(%d)", elect_count - need_elect_count);
  return elect_count - need_elect_count;
}
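The elect_seq rotation is easier to see in isolation. The sketch below is standalone demonstration code, not TFS code: seven hypothetical dataservers, three elected per round, and each elected server re-stamped with a growing global counter so it moves to the back of the queue.

#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main()
{
  std::vector<int64_t> seq(7, 1); // 7 hypothetical dataservers, elect_seq starts at 1
  int64_t global_seq = 1;

  for (int round = 1; round <= 3; ++round)
  {
    std::multimap<int64_t, int> weights; // weight (= elect_seq) -> server index
    for (int i = 0; i < (int)seq.size(); ++i)
      weights.insert(std::make_pair(seq[i], i));

    std::printf("round %d elected:", round);
    int need = 3;
    for (std::multimap<int64_t, int>::iterator it = weights.begin(); it != weights.end() && need > 0; ++it, --need)
    {
      seq[it->second] = ++global_seq; // re-stamp: pushed to the back for the next round
      std::printf(" ds%d", it->second);
    }
    std::printf("  (global_seq=%lld)\n", (long long)global_seq);
  }
  return 0;
}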
// Elect the dataservers to write to
int32_t elect_write_ds(const LayoutManager& meta, const int32_t elect_count, VUINT64& elect_ds_list)
{
  WriteStrategy strategy(meta.get_elect_seq(), *meta.get_ns_global_info());
  int64_t elect_seq = meta.get_elect_seq(); // the layout manager's election sequence number starts at 1
  int32_t ret = elect_ds(strategy, ExcludeGroupElectOperation(), meta, elect_count, elect_seq, elect_ds_list);
  meta.set_elect_seq(elect_seq);
  return ret;
}
// Elect the destination dataserver for a block move
// typedef std::map<uint64_t, uint32_t> counter_type;
// arguments: ds_list: candidate destination dataservers; dest_counter: per-destination replication counters;
// elect_ds_list: dataservers that already hold this logical block; src_ds: source dataserver; dest_ds: chosen destination
bool elect_move_dest_ds(const vector<ServerCollect*>& ds_list, const ReplicateDestStrategy::counter_type& dest_counter,
    const VUINT64& elect_ds_list, const uint64_t src_ds, uint64_t & dest_ds)
{
  // first find the highest load among the candidate destinations
  vector<ServerCollect*>::const_iterator maxit = std::max_element(ds_list.begin(), ds_list.end(), CompareLoad());
  int32_t max_load = 1;
  if (maxit != ds_list.end())
    max_load = (*maxit)->get_ds()->current_load_;
  NsGlobalInfo ginfo;
  ginfo.max_load_ = max_load; // only max_load & alive_server_count could be useful, calc.
  ginfo.alive_server_count_ = ds_list.size();
  ReplicateSourceStrategy strategy(1, ginfo, dest_counter);
  DS_WEIGHT weights;
  StoreWeight<ReplicateSourceStrategy> store(strategy, weights);
  // compute the replication weight of every candidate destination: replication ratio 70%, load 30%
  std::for_each(ds_list.begin(), ds_list.end(), store);
  std::set<uint32_t> existlan;
  uint32_t elect_ds_list_size = elect_ds_list.size();
  // record the groups (racks) of the dataservers that already hold this logical block
  for (uint32_t i = 0; i < elect_ds_list_size; ++i)
  {
    uint32_t lan = Func::get_lan(elect_ds_list[i], SYSPARAM_NAMESERVER.group_mask_);
    existlan.insert(lan);
  }
  dest_ds = 0;
  uint64_t first_elect_ds = 0;
  uint32_t dlan = 0;
  DataServerStatInfo* ds_stat_info = NULL;
  // after the for_each, weights is sorted in ascending order of weight
  DS_WEIGHT::const_iterator iter = weights.begin();
  while (iter != weights.end())
  {
    ds_stat_info = iter->second->get_ds();
    dlan = Func::get_lan(ds_stat_info->id_, SYSPARAM_NAMESERVER.group_mask_);
    if ((first_elect_ds == 0) && (existlan.find(dlan) == existlan.end()))
    {
      // remember the first acceptable candidate (lowest weight, group not yet used)
      first_elect_ds = ds_stat_info->id_;
    }
    if ((dest_ds == 0) && (existlan.find(dlan) == existlan.end())
        && (ReplicateStrategy::get_ds_ip(src_ds) == ReplicateStrategy::get_ds_ip(ds_stat_info->id_)))
    {
      // if several dataservers run on the same machine, prefer a destination with the same IP as the source
      dest_ds = ds_stat_info->id_;
    }
    if ((first_elect_ds != 0) && (dest_ds != 0))
    {
      break;
    }
    ++iter;
  }
  // if no same-IP destination was found, fall back to the first acceptable candidate
  if (dest_ds == 0)
  {
    dest_ds = first_elect_ds;
  }
  return (dest_ds != 0);
}
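Both election functions exclude candidates that fall into a network group (rack) already in use, via Func::get_lan(id, group_mask_). The sketch below shows the idea under the assumption that the group is simply the server's IPv4 address ANDed with the mask and that the IP sits in the low 32 bits of the server id; the real Func::get_lan and the server-id encoding may differ.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for Func::get_lan: the "group" is the IPv4 address ANDed with the mask,
// so dataservers behind the same mask (e.g. the same rack subnet) fall into one group.
static uint32_t get_lan(uint64_t server_id, uint32_t group_mask)
{
  const uint32_t ip = static_cast<uint32_t>(server_id & 0xFFFFFFFFu); // assume the low 32 bits hold the IP
  return ip & group_mask;
}

int main()
{
  const uint32_t mask = 0xFFFFFF00u;   // /24: one group per rack subnet (illustrative)
  const uint64_t a = 0x0A000102u;      // 10.0.1.2
  const uint64_t b = 0x0A000103u;      // 10.0.1.3 -> same group as a, would be skipped
  const uint64_t c = 0x0A000203u;      // 10.0.2.3 -> different group, acceptable replica location
  std::printf("a,b same group: %s\n", get_lan(a, mask) == get_lan(b, mask) ? "yes" : "no");
  std::printf("a,c same group: %s\n", get_lan(a, mask) == get_lan(c, mask) ? "yes" : "no");
  return 0;
}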