Having read through the TFS file system source code, this post describes how data block replication is handled when TFS writes a file.
The client-side interface function:
// Packet assembly for the write operation: the write protocol sends the data
// as a series of independent message packets
int TfsFile::tfs_write(char *data, const int32_t len)
{
  if (is_open_flag_ == TFS_FILE_OPEN_FLAG_NO) {
    snprintf(error_message_, ERR_MSG_SIZE, "tfs file(%s) don't open file", file_name_);
    return -2;
  }
  if (!(mode_ & WRITE_MODE)) {
    snprintf(error_message_, ERR_MSG_SIZE, "open file mode isn't write.");
    return -1;
  }
  // issue the write request
  WriteDataMessage dsmessage;
  dsmessage.set_file_number(file_number_);
  dsmessage.set_block_id(block_id_);
  dsmessage.set_file_id(file_id_);
  dsmessage.set_offset(offset_);
  dsmessage.set_length(len);        // number of bytes to write
  dsmessage.set_ds_list(ds_list_);  // list of dataservers
  dsmessage.set_data(data);         // buffer to be written
  Message *message = client_->call(&dsmessage);  // the client sends the packet
}
From this interface function we can see that a file write is sent as multiple data packets.
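As a rough illustration of that idea (this is only a sketch, not TFS code: the segment size and the write callback are assumptions made for the example), writing a large buffer becomes a loop that hands one segment at a time to the client call, each segment ending up as its own message packet:

#include <cstdint>
#include <cstdio>

// Sketch only: split a large buffer into fixed-size segments and hand each
// segment to a write function (e.g. a thin wrapper around TfsFile::tfs_write).
// The segment size is an assumption for illustration, not a TFS constant.
static const int32_t SEGMENT_SIZE = 8 * 1024;

typedef int (*write_fn_t)(const char* data, int32_t len);

int write_in_segments(const char* data, int32_t len, write_fn_t write_fn)
{
  int32_t written = 0;
  while (written < len) {
    int32_t step = (len - written < SEGMENT_SIZE) ? (len - written) : SEGMENT_SIZE;
    int ret = write_fn(data + written, step);  // one message packet per call
    if (ret < 0) {
      return ret;                              // propagate the error code
    }
    written += step;
  }
  return written;
}

// stand-in for the real client call: just counts how many packets would be sent
static int g_packets = 0;
static int fake_write(const char*, int32_t) { ++g_packets; return 0; }

int main()
{
  char buf[20000] = {0};
  write_in_segments(buf, sizeof(buf), fake_write);
  std::printf("would send %d packets\n", g_packets);
  return 0;
}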
Next, let's walk through the concrete implementation.
(1). Inside client_->call(&dsmessage), the packet is placed into the send queue:
// post a packet to the send queue
bool Connection::postPacket(Packet *packet, IPacketHandler *packetHandler, void *args, bool noblocking)
{
  if (!isConnectState()) {
    if (_iocomponent == NULL || _iocomponent->isAutoReconn() == false) {
      return false;
    } else if (_outputQueue.size() > 10) {
      return false;
    } else {
      TCPComponent *ioc = dynamic_cast<TCPComponent*>(_iocomponent);
      bool ret = false;
      if (ioc != NULL) {
        _outputCond.lock();
        ret = ioc->init(false);   // try to connect once more and register the write event
        _outputCond.unlock();
      }
      if (!ret) return false;
    }
  }
  // if this is a client and a queue-length limit is set
  _outputCond.lock();
  _queueTotalSize = _outputQueue.size() + _channelPool.getUseListCount() + _myQueue.size();
  // if the queue limit is exceeded, return immediately
  if (!_isServer && _queueLimit > 0 && noblocking && _queueTotalSize >= _queueLimit) {
    _outputCond.unlock();
    return false;
  }
  _outputCond.unlock();
  Channel *channel = NULL;
  packet->setExpireTime(_queueTimeout);        // set the timeout
  if (_streamer->existPacketHeader()) {        // the packet has a header
    uint32_t chid = packet->getChannelId();    // taken from the packet
    if (_isServer) {
      assert(chid != 0);                       // must not be empty
    } else {
      channel = _channelPool.allocChannel();
      // no channel available
      if (channel == NULL) {
        TBSYS_LOG(WARN, "failed to allocate channel, id: %u", chid);
        return false;
      }
      channel->setHandler(packetHandler);
      channel->setArgs(args);
      packet->setChannel(channel);             // attach the channel
    }
  }
  _outputCond.lock();
  // push into the output queue
  _outputQueue.push(packet);
  if (_iocomponent != NULL && _outputQueue.size() == 1U) {
    _iocomponent->enableWrite(true);
  }
  _outputCond.unlock();
  // if this is a client with a queue-length limit
  if (!_isServer && _queueLimit > 0) {
    _outputCond.lock();
    _queueTotalSize = _outputQueue.size() + _channelPool.getUseListCount() + _myQueue.size();
    if (_queueTotalSize > _queueLimit && noblocking == false) {
      bool *stop = NULL;
      if (_iocomponent && _iocomponent->getOwner()) {
        stop = _iocomponent->getOwner()->getStop();
      }
      while (_queueTotalSize > _queueLimit && stop && *stop == false) {
        if (_outputCond.wait(1000) == false) {
          if (!isConnectState()) {
            break;
          }
          _queueTotalSize = _outputQueue.size() + _channelPool.getUseListCount() + _myQueue.size();
        }
      }
    }
    _outputCond.unlock();
  }

  if (_isServer && _iocomponent) {
    _iocomponent->subRef();
  }

  return true;
}
After the packet has been placed into the send queue, a dedicated thread scans for socket events:
// socket event detection; called by run() inside the transport thread
void Transport::eventLoop(SocketEvent *socketEvent)
{
  IOEvent events[MAX_SOCKET_EVENTS];
  while (!_stop) {
    // check whether any events have occurred
    int cnt = socketEvent->getEvents(1000, events, MAX_SOCKET_EVENTS);
    if (cnt < 0) {
      TBSYS_LOG(INFO, "getEvents error: %s(%d)\n", strerror(errno), errno);
    }
    for (int i = 0; i < cnt; i++) {
      IOComponent *ioc = events[i]._ioc;
      if (ioc == NULL) {
        continue;
      }
      if (events[i]._errorOccurred) {  // an error occurred
        removeComponent(ioc);
        continue;
      }
      ioc->addRef();
      // read / write
      bool rc = true;
      if (events[i]._readOccurred) {
        rc = ioc->handleReadEvent();
      }
      if (rc && events[i]._writeOccurred) {
        rc = ioc->handleWriteEvent();
      }
      ioc->subRef();
      if (!rc) {
        removeComponent(ioc);
      }
    }
  }
}
This in turn triggers the call to ioc->handleWriteEvent():
// called by Transport when the socket is writable
bool TCPComponent::handleWriteEvent()
{
  _lastUseTime = tbsys::CTimeUtil::getTime();
  bool rc = true;
  if (_state == TBNET_CONNECTED) {
    // already connected, so write the data
    rc = _connection->writeData();
  } else if (_state == TBNET_CONNECTING) {
    int error = _socket->getSoError();
    if (error == 0) {
      enableWrite(true);
      _connection->clearOutputBuffer();
      _state = TBNET_CONNECTED;
    } else {
      TBSYS_LOG(ERROR, "connect to %s failed: %s(%d)", _socket->getAddr().c_str(), strerror(error), error);
      if (_socketEvent) {
        _socketEvent->removeEvent(_socket);
      }
      _socket->close();
      _state = TBNET_CLOSED;
    }
  }
  return rc;
}
which actually calls _connection->writeData():
bool TCPConnection::writeData()
{
  // move _outputQueue into _myQueue; thread synchronization is needed because
  // several threads touch the queue
  _outputCond.lock();
  _outputQueue.moveTo(&_myQueue);
  if (_myQueue.size() == 0 && _output.getDataLen() == 0) {
    // nothing to send
    _iocomponent->enableWrite(false);
    _outputCond.unlock();
    return true;
  }
  _outputCond.unlock();
  Packet *packet;
  int ret;
  int writeCnt = 0;
  int myQueueSize = _myQueue.size();
  do {
    // READ_WRITE_SIZE = 8K
    while (_output.getDataLen() < READ_WRITE_SIZE) {
      // stop once the queue is empty
      if (myQueueSize == 0)
        break;
      packet = _myQueue.pop();   // pop the head element
      myQueueSize--;
      // TfsPacketStreamer encodes the packet and appends it to the output buffer,
      // so packets are written out in batches
      _streamer->encode(packet, &_output);
      _channelPool.setExpireTime(packet->getChannel(), packet->getExpireTime());
      packet->free();
      TBNET_COUNT_PACKET_WRITE(1);
    }
    if (_output.getDataLen() == 0) {
      break;
    }
    // write data
    ret = _socket->write(_output.getData(), _output.getDataLen());
    if (ret > 0) {
      _output.drainData(ret);   // advance the data pointer after writing
    }
    writeCnt++;
  } while (ret > 0 && _output.getDataLen() == 0 && myQueueSize > 0 && writeCnt < 10);

  // shrink the buffer
  _output.shrink();

  _outputCond.lock();
  int queueSize = _outputQueue.size() + _myQueue.size() + (_output.getDataLen() > 0 ? 1 : 0);
  if ((queueSize == 0 || _writeFinishClose) && _iocomponent != NULL) {
    _iocomponent->enableWrite(false);
  }
  _outputCond.unlock();
  if (_writeFinishClose) {
    TBSYS_LOG(ERROR, "active disconnect.");
    return false;
  }

  // if this is a client with a queue-length limit
  if (!_isServer && _queueLimit > 0 && _queueTotalSize > _queueLimit) {
    _outputCond.lock();
    _queueTotalSize = queueSize + _channelPool.getUseListCount();
    if (_queueTotalSize <= _queueLimit) {
      _outputCond.broadcast();   // wake up waiting threads
    }
    _outputCond.unlock();
  }
  return true;
}
The send queue is moved into _myQueue in one step (so the shared queue lock is held only briefly; the encoding and the socket writes then run without it), after which packets are popped from _myQueue, encoded, and written to the socket.
(2). When TFS writes a file, the data packets have to be written to several DataServers. In dsmessage.set_ds_list(ds_list_), the parameter ds_list_ is the list of dataservers to write to, and it includes one primary (master) dataserver. The write message is sent to this primary dataserver, which then pushes the data on to the other dataservers.
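As a rough sketch of that topology (not TFS code; treating the first entry of the list as the primary is an assumption made purely for illustration), the request is addressed to a single server but carries the complete replica list, which is what later lets the primary forward the data:

#include <cstdint>
#include <cstdio>
#include <vector>

typedef std::vector<uint64_t> VUINT64;

// Sketch only: the client sends the write packet to one dataserver (the
// primary), but the packet itself carries the full replica list so the
// primary knows which other dataservers to push the data to.
struct WriteRequestSketch
{
  uint64_t target;   // the server the client actually talks to
  VUINT64 ds_list;   // full replica list embedded in the message
};

WriteRequestSketch make_write_request(const VUINT64& ds_list)
{
  WriteRequestSketch req;
  req.target = ds_list.empty() ? 0 : ds_list.front();  // assumed primary
  req.ds_list = ds_list;
  return req;
}

int main()
{
  VUINT64 servers;
  servers.push_back(1001);   // assumed primary
  servers.push_back(1002);
  servers.push_back(1003);
  WriteRequestSketch req = make_write_request(servers);
  std::printf("send to server %llu, replica list size %u\n",
              (unsigned long long)req.target, (unsigned)req.ds_list.size());
  return 0;
}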
Let's look at how the dataserver handles the request once it receives it:
int DataService::write_data(WriteDataMessage* message)
{
  WriteDataInfo write_info = message->get_write_info();
  int32_t lease_id = message->get_lease_id();
  int32_t version = message->get_block_version();
  char* msg_data = message->get_data();
  /* ... omitted ... */
  if (Master_Server_Role == write_info.is_server_) {
    message->set_server(Slave_Server_Role);
    message->set_lease_id(lease_id);
    message->set_block_version(version);
    ret = post_message_to_server(message, message->get_ds_list());
    if (ret >= 0) {
      if (0 == ret) {
        // no slave
        message->reply_message(new StatusMessage(STATUS_MESSAGE_OK));
      }
      return TFS_SUCCESS;
    } else {
      ds_requester_.req_block_write_complete(write_info.block_id_, lease_id, EXIT_SENDMSG_ERROR);
      return MessageFactory::send_error_message(message, TBSYS_LOG_LEVEL(ERROR), data_server_info_.id_,
          "write data fail to other dataserver (send): blockid: %u, fileid: %" PRI64_PREFIX "u, datalen: %d",
          write_info.block_id_, write_info.file_id_, write_info.length_);
    }
  }

  // the master should not execute this statement, while the slave will
  message->reply_message(new StatusMessage(STATUS_MESSAGE_OK));
}
Here the real work is done by the post_message_to_server() method:
int DataService::post_message_to_server(Message* message, const VUINT64& ds_list)
{
  VUINT64 erase_self;
  for (uint32_t i = 0; i < ds_list.size(); ++i) {
    if (ds_list[i] == data_server_info_.id_) {
      continue;
    }
    erase_self.push_back(ds_list[i]);
  }
  if (erase_self.size() == 0) {
    return 0;
  }
  // pass `this` (DataService*) along as the callback pointer
  if (async_post_message_to_servers(message, erase_self, this) == TFS_SUCCESS) {
    return 1;
  } else {
    return -1;
  }
}
The method is straightforward: it first filters itself out of the list, leaving the addresses of the other dataservers, and then calls async_post_message_to_servers():
// synchronously push to the dataservers
int async_post_message_to_servers(const Message* message, VUINT64& ds_list, AsyncCallback* cb)
{
  if (!ds_list.size()) {
    return EXIT_GENERAL_ERROR;
  }
  AsyncClient* client = CLIENT_POOL.get_async_client(ds_list, cb);
  if (client->connect() != TFS_SUCCESS) {
    CLIENT_POOL.release_client(client);
    return EXIT_CONNECT_ERROR;
  }
  uint32_t send = client->post(const_cast<Message*> (message));
  if (send < ds_list.size()) {
    if (send == 0) {
      CLIENT_POOL.release_client(client);
    }
    return EXIT_GENERAL_ERROR;
  }
  // if all messages were posted to the servers, don't release the client object;
  // wait for the async response messages
  return TFS_SUCCESS;
}
At this point a new AsyncClient* client has been created and client->post() is called. Personally I think the code here is slightly off: the data block is supposed to be written to the other dataservers synchronously, yet a piece seems to be missing. Posting a packet only puts it into the connection's send queue; the real sending happens later in the transport thread, so simply returning here does not give the effect of a synchronous wait. It could perhaps be rewritten like this:
cond_.lock();
uint32_t send = client->post(const_cast<Message*> (message));
cond_.wait(CLIENT_POOL.max_timeout_);
cond_.unlock();
That is, add a condition variable and wait on it; once the other dataservers have written the data and their responses arrive, a cond_.signal() wakes the waiter.
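A minimal sketch of what that wait/notify pairing could look like, using standard C++ synchronization primitives as a stand-in for the tbsys condition variable used in the TFS code (the class and method names here are invented for illustration). The guard flag matters: it protects against spurious wakeups and against the responses arriving before the waiter actually blocks, which the bare cond_.wait() in the snippet above would not handle.

#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

// Sketch only: block the poster until the response handler reports that the
// push to the other dataservers has completed (or a timeout expires).
class SyncPushWaiter
{
 public:
  SyncPushWaiter() : done_(false), ok_(false) {}

  // called by the response path, e.g. when the last slave reply is handled
  void mark_done(bool ok)
  {
    std::lock_guard<std::mutex> guard(mutex_);
    done_ = true;
    ok_ = ok;
    cond_.notify_all();
  }

  // called by the poster right after client->post(); returns false on timeout
  bool wait_for_completion(int timeout_ms)
  {
    std::unique_lock<std::mutex> lock(mutex_);
    if (!cond_.wait_for(lock, std::chrono::milliseconds(timeout_ms),
                        [this] { return done_; })) {
      return false;   // timed out: treat the replication as failed
    }
    return ok_;
  }

 private:
  std::mutex mutex_;
  std::condition_variable cond_;
  bool done_;
  bool ok_;
};

int main()
{
  SyncPushWaiter waiter;
  // simulate the slave responses arriving on another thread
  std::thread responder([&] { waiter.mark_done(true); });
  bool ok = waiter.wait_for_completion(3000);
  std::printf("synchronous push %s\n", ok ? "completed" : "timed out or failed");
  responder.join();
  return 0;
}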
(3). The AsyncClient::post() method:
// actually sends the message packets from the primary dataserver to the other dataservers
int32_t AsyncClient::post(const Message* msg)
{
  save_message(msg);
  int32_t i = 0;
  int32_t list_size = ds_list_.size();
  for (i = 0; i < list_size; ++i) {
    uint64_t dsip = Func::addr_inc_port(ds_list_[i], 1);
    Message* send_msg = ClientManager::gClientManager.factory_.clone_message(
        const_cast<Message*> (msg), 2, false);
    if (send_msg == NULL) {
      TBSYS_LOG(ERROR, "clone message failure, pcode:%d", const_cast<Message*>(msg)->getPCode());
      break;
    }
    send_msg->set_auto_free(true);
    mutex_.lock();
    if (!ClientManager::gClientManager.connmgr_->sendPacket(dsip, send_msg, NULL, (void*) ((long) call_id_))) {
      TBSYS_LOG(ERROR, "client(%d) send message cid(%d) to server(%s) failure, pcode:%d",
          call_id_, send_msg->getChannelId(), tbsys::CNetUtil::addrToString(dsip).c_str(), send_msg->getPCode());
      delete send_msg;
      call_over_ = true;
      mutex_.unlock();
      break;
    }
    ++send_success_count_;
    TBSYS_LOG(DEBUG, "client(%d) post packet(%d) success, count:%d, dssize:%d, over:%d",
        call_id_, const_cast<Message*>(msg)->getChannelId(), send_success_count_, list_size, call_over_);
    if (send_success_count_ >= ds_list_.size()) {
      call_over_ = true;
      i = ds_list_.size();
      mutex_.unlock();
      break;
    }
    mutex_.unlock();
  }
  return i;
}
From this we can see that it loops over the other dataservers and sends the packet to each of them, placing it into the corresponding connection's send queue; that triggers the socket write event, which eventually performs the real socket send.
Next, each dataserver receives the packet, performs the write, and sends a reply back; on the primary dataserver this triggers the socket read path:
bool TCPComponent::handleReadEvent()
{
  _lastUseTime = tbsys::CTimeUtil::getTime();
  bool rc = false;
  if (_state == TBNET_CONNECTED) {
    rc = _connection->readData();
  }
  return rc;
}
And _connection->readData() looks like this:
bool TCPConnection::readData()
{
  _input.ensureFree(READ_WRITE_SIZE);   // make sure the receive buffer has READ_WRITE_SIZE (8K) free
  int ret = _socket->read(_input.getFree(), _input.getFreeLen());
  int readCnt = 0;
  int freeLen = 0;
  bool broken = false;
  while (ret > 0) {
    _input.pourData(ret);               // advance the buffer's free pointer
    freeLen = _input.getFreeLen();
    while (1) {
      if (!_gotHeader) {
        _gotHeader = _streamer->getPacketInfo(&_input, &_packetHeader, &broken);
        if (broken) break;
      }
      // if enough data has arrived, decode it and call handlePacket
      if (_gotHeader && _input.getDataLen() >= _packetHeader._dataLen) {
        handlePacket(&_input, &_packetHeader);
        _gotHeader = false;
        _packetHeader._dataLen = 0;
        TBNET_COUNT_PACKET_READ(1);     // note that multiple packets can be read in one pass
      } else {
        break;
      }
    }
    if (broken || freeLen > 0 || readCnt >= 10) {
      break;
    }
    if (_packetHeader._dataLen - _input.getDataLen() > READ_WRITE_SIZE) {
      _input.ensureFree(_packetHeader._dataLen - _input.getDataLen());
    } else {
      _input.ensureFree(READ_WRITE_SIZE);
    }
    ret = _socket->read(_input.getFree(), _input.getFreeLen());
    readCnt++;
  }

  // batch callback?
  if (_isServer && _serverAdapter->_batchPushPacket && _inputQueue.size() > 0) {
    _serverAdapter->handleBatchPacket(this, _inputQueue);
    _inputQueue.clear();
  }

  _input.shrink();
  if (!broken) {
    if (ret == 0) {
      broken = true;
    } else if (ret < 0) {
      int error = Socket::getLastError();
      broken = (error != EAGAIN);
    }
  } else {
    _gotHeader = false;
  }
  return !broken;
}
which calls the handlePacket(&_input, &_packetHeader) method:
bool Connection::handlePacket(DataBuffer *input, PacketHeader *header)
{
  Packet *packet;
  IPacketHandler::HPRetCode rc;
  void *args = NULL;
  Channel *channel = NULL;
  IPacketHandler *packetHandler = NULL;

  if (_streamer->existPacketHeader() && !_isServer) {   // the packet has a header
    uint32_t chid = header->_chid;                      // taken from the header
    chid = (chid & 0xFFFFFFF);
    channel = _channelPool.offerChannel(chid);

    // channel not found
    if (channel == NULL) {
      input->drainData(header->_dataLen);
      TBSYS_LOG(WARN, "channel not found, id: %u, %s", chid,
                tbsys::CNetUtil::addrToString(getServerId()).c_str());
      return false;
    }
    packetHandler = channel->getHandler();
    args = channel->getArgs();
  }

  // decode
  packet = _streamer->decode(input, header);
  if (packet == NULL) {
    packet = &ControlPacket::BadPacket;
  } else {
    packet->setPacketHeader(header);
    // batch mode: just push into the queue and return
    if (_isServer && _serverAdapter->_batchPushPacket) {
      if (_iocomponent) _iocomponent->addRef();
      _inputQueue.push(packet);
      if (_inputQueue.size() >= 15) {   // flush once more than 15 packets have accumulated
        _serverAdapter->handleBatchPacket(this, _inputQueue);
        _inputQueue.clear();
      }
      return true;
    }
  }

  // invoke the handler
  if (_isServer) {   // server side
    if (_iocomponent) _iocomponent->addRef();
    rc = _serverAdapter->handlePacket(this, packet);   // arguments: connection pointer and packet pointer
  } else {
    if (packetHandler == NULL) {   // fall back to the default handler
      packetHandler = _defaultPacketHandler;
    }
    assert(packetHandler != NULL);
    rc = packetHandler->handlePacket(packet, args);
    channel->setArgs(NULL);
    // release the channel once the response has come back
    if (channel) {
      _channelPool.appendChannel(channel);
    }
  }
  return true;
}
So the actual processing is done by packetHandler->handlePacket(packet, args):
// callback that actually handles the received response packets
tbnet::IPacketHandler::HPRetCode ClientManager::handlePacket(tbnet::Packet *packet, void *args)
{
  int id = static_cast<int> ((reinterpret_cast<long>(args)));
  int complete = Callee::CALL_UNFINISH;
  mutex_.lock();
  CLIENTS_MAP::iterator it = clients_.find(id);
  if (it != clients_.end()) {
    if (it->second) {
      // this is where the response packets are collected and counted,
      // e.g. when the primary dataserver has sent write requests to the other dataservers
      complete = it->second->handlePacket(packet, args);
    }
    if (complete == Callee::CALL_FINISHED) {
      delete it->second;
      it->second = NULL;
      clients_.erase(it);
    }
  } else if (packet->isRegularPacket()) {
    delete packet;
  }
  mutex_.unlock();
  return tbnet::IPacketHandler::FREE_CHANNEL;
}
After this detour, what actually gets called is it->second->handlePacket(packet, args), which is the following:
int AsyncClient::handlePacket(tbnet::Packet* packet, void*)
{
  bool response_result = false;
  bool error_occured = true;
  int response_status = CALL_UNFINISH;
  std::string err_msg;
  ++handle_response_count_;
  Message* message = NULL;
  StatusMessage* s_msg = NULL;
  if (!packet->isRegularPacket()) {
    // failed
    tbnet::ControlPacket* ctrl = static_cast<tbnet::ControlPacket*> (packet);
    TBSYS_LOG(ERROR, "client(%d) handle error control packet:%d\n", call_id_, ctrl->getCommand());
    err_msg.assign("client handle controlpacket, maybe timeout or disconnect.");
    goto out;
  }

  message = static_cast<Message*> (packet);
  if (message->get_message_type() == STATUS_MESSAGE) {
    s_msg = (StatusMessage*) message;
    if (s_msg->get_status() != STATUS_MESSAGE_OK) {
      TBSYS_LOG(ERROR, "client(%d) handle response error %d,%s\n", call_id_, s_msg->get_status(), s_msg->get_error());
      err_msg.assign(s_msg->get_error());
      goto out;
    } else {
      // ok, check whether all servers have returned
      handle_success_count_++;
      error_occured = false;
      goto out;
    }
  } else {
    // got an unexpected packet...
    TBSYS_LOG(ERROR, "get wired packet %s,%d\n", message->get_name(), message->getChannelId());
    goto out;
  }

out:
  // all messages posted & all responses handled, or an error response handled
  mutex_.lock();
  if (call_over_ && ((handle_response_count_ >= send_success_count_) || error_occured))
    response_status = CALL_FINISHED;
  // all messages posted & all responses handled successfully
  if (call_over_ && (handle_success_count_ >= ds_list_.size())) {
    response_result = true;
  }
  TBSYS_LOG(DEBUG, "client(%d) handle packet response_status:%d,response_result:%d, "
      "error_occured:%d, _callOver:%d, _handleResponseCount:%d,_handleSucessCount:%d, _sendSucessCount:%d\n",
      call_id_, response_status, response_result, error_occured, call_over_,
      handle_response_count_, handle_success_count_, send_success_count_);
  mutex_.unlock();

  if ((response_status == CALL_FINISHED) && all_message_posted()) {
    callback_->command_done(send_message_, response_result, err_msg);
  }
  // only allMessagePosted needs the callback...
  if (packet->isRegularPacket()) {
    delete packet;
  }
  return response_status;
}
Here the number of responses received from the other dataservers is compared against the number of dataservers, to decide whether all of the responses have arrived; when the counts match, the push to the other dataservers is complete. Again the code seems to be missing a piece: for the push to be truly synchronous, a cond_.signal() would have to be issued here as well, to tell the primary dataserver that the push has finished.
Summary:
Parts (1), (2) and (3) above describe TFS's synchronous data push. It is actually quite different from pipelined data block replication, and personally I feel an implementation based on pipeline replication would be nicer.
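To make the contrast concrete, here is a minimal sketch (not TFS code; the node and function names are invented) of the chained, pipeline-style replication referred to above, where the data hops from replica to replica instead of being pushed from the primary to all replicas in parallel:

#include <cstdio>
#include <string>
#include <vector>

// Sketch only: each node in the chain writes its local copy and forwards the
// data to the next node; the acknowledgement travels back along the same chain.
struct Node
{
  std::string name;
  void write_local(const std::string& data)
  {
    std::printf("%s stored %u bytes\n", name.c_str(), (unsigned)data.size());
  }
};

// Returns true only if this node and every downstream node succeeded.
bool pipeline_write(std::vector<Node>& chain, size_t pos, const std::string& data)
{
  if (pos >= chain.size()) {
    return true;                                  // end of the chain, start acking back
  }
  chain[pos].write_local(data);                   // write the local copy
  return pipeline_write(chain, pos + 1, data);    // then forward to the next replica
}

int main()
{
  std::vector<Node> chain(3);
  chain[0].name = "ds1 (primary)";
  chain[1].name = "ds2";
  chain[2].name = "ds3";
  bool ok = pipeline_write(chain, 0, std::string(4096, 'x'));
  std::printf("pipeline write %s\n", ok ? "succeeded" : "failed");
  return 0;
}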