HDFS Source Code Analysis: Checksum Data Verification

This article walks through how an HDFS client obtains a file's checksum: fetching the file's blocks and each block's checksum, then combining them into an MD5MD5CRC32FileChecksum. It also dissects how DataXceiver computes a block checksum on the DataNode, and how checksums are verified when reading a file. Finally, it explains the relationship between Block, Chunk, and Packet and the role each plays in data verification.

How the client obtains a file's checksum

The overall flow is:

1. Fetch all blocks of the file.

2. Iterate over the blocks.

3. For each block, get its replicas and the DataNodes hosting them, and ask one of those DataNodes for the block's checksum.

4. From each reply, read the block's bytes-per-checksum (bytesPerCRC), crc-per-block (crcPerBlock), and block MD5.

5. Compute the file MD5 from the MD5s of all blocks (see the sketch after this list).

6. Combine the file MD5 with bytesPerCRC and crcPerBlock into the final MD5MD5CRC32FileChecksum.
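
The heart of steps 5 and 6 is easy to miss inside the long method below: the file-level MD5 is nothing more than the MD5 of every block MD5 concatenated in order. A minimal standalone sketch (class and method names here are illustrative, not from the HDFS source):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.List;

public class FileMd5Sketch {
  static byte[] fileMd5(List<byte[]> blockMd5s)
      throws IOException, NoSuchAlgorithmException {
    ByteArrayOutputStream md5out = new ByteArrayOutputStream();
    for (byte[] blockMd5 : blockMd5s) {
      md5out.write(blockMd5);               // mirrors md5.write(md5out) per block
    }
    // mirrors MD5Hash.digest(md5out.getData()) at the end of getFileChecksum
    return MessageDigest.getInstance("MD5").digest(md5out.toByteArray());
  }
}

The full DFSClient#getFileChecksum implementation: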

public MD5MD5CRC32FileChecksum getFileChecksum(String src, long length)
      throws IOException {
    checkOpen();
    Preconditions.checkArgument(length >= 0);
    //get block locations for the file range
    LocatedBlocks blockLocations = callGetBlockLocations(namenode, src, 0,
        length, dfsClientConf.preferedPool);
    if (null == blockLocations) {
      throw new FileNotFoundException("File does not exist: " + src);
    }
    List<LocatedBlock> locatedblocks = blockLocations.getLocatedBlocks();
    final DataOutputBuffer md5out = new DataOutputBuffer();
    int bytesPerCRC = -1;
    DataChecksum.Type crcType = DataChecksum.Type.DEFAULT;
    long crcPerBlock = 0;
    boolean refetchBlocks = false;
    int lastRetriedIndex = -1;

    // get block checksum for each block
    long remaining = length;
    if (src.contains(HdfsConstants.SEPARATOR_DOT_SNAPSHOT_DIR_SEPARATOR)) {
      remaining = Math.min(length, blockLocations.getFileLength());
    }
    for(int i = 0; i < locatedblocks.size() && remaining > 0; i++) {
      if (refetchBlocks) {  // refetch to get fresh tokens
        blockLocations = callGetBlockLocations(namenode, src, 0, length, dfsClientConf.preferedPool);
        if (null == blockLocations) {
          throw new FileNotFoundException("File does not exist: " + src);
        }
        locatedblocks = blockLocations.getLocatedBlocks();
        refetchBlocks = false;
      }
      LocatedBlock lb = locatedblocks.get(i);
      final ExtendedBlock block = lb.getBlock();
      if (remaining < block.getNumBytes()) {
        block.setNumBytes(remaining);
      }
      remaining -= block.getNumBytes();
      final DatanodeInfo[] datanodes = lb.getLocations();
      
      //try each datanode location of the block
      final int timeout = 3000 * datanodes.length + dfsClientConf.socketTimeout;
      boolean done = false;
      for(int j = 0; !done && j < datanodes.length; j++) {
        DataOutputStream out = null;
        DataInputStream in = null;
        
        try {
          //connect to a datanode
          IOStreamPair pair = connectToDN(datanodes[j], timeout, lb);
          out = new DataOutputStream(new BufferedOutputStream(pair.out,
              HdfsConstants.SMALL_BUFFER_SIZE));
          in = new DataInputStream(pair.in);

          if (LOG.isDebugEnabled()) {
            LOG.debug("write to " + datanodes[j] + ": "
                + Op.BLOCK_CHECKSUM + ", block=" + block);
          }
          // get block MD5
          new Sender(out).blockChecksum(block, lb.getBlockToken());

          final BlockOpResponseProto reply =
            BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in));

          if (reply.getStatus() != Status.SUCCESS) {
            if (reply.getStatus() == Status.ERROR_ACCESS_TOKEN) {
              throw new InvalidBlockTokenException();
            } else {
              throw new IOException("Bad response " + reply + " for block "
                  + block + " from datanode " + datanodes[j]);
            }
          }
          
          OpBlockChecksumResponseProto checksumData =
            reply.getChecksumResponse();

          //read byte-per-checksum
          final int bpc = checksumData.getBytesPerCrc();
          if (i == 0) { //first block
            bytesPerCRC = bpc;
          }
          else if (bpc != bytesPerCRC) {
            throw new IOException("Byte-per-checksum not matched: bpc=" + bpc
                + " but bytesPerCRC=" + bytesPerCRC);
          }
          
          //read crc-per-block
          final long cpb = checksumData.getCrcPerBlock();
          if (locatedblocks.size() > 1 && i == 0) {
            crcPerBlock = cpb;
          }

          //read md5
          final MD5Hash md5 = new MD5Hash(
              checksumData.getMd5().toByteArray());
          md5.write(md5out);
          
          // read crc-type
          final DataChecksum.Type ct;
          if (checksumData.hasCrcType()) {
            ct = PBHelper.convert(checksumData
                .getCrcType());
          } else {
            LOG.debug("Retrieving checksum from an earlier-version DataNode: " +
                      "inferring checksum by reading first byte");
            ct = inferChecksumTypeByReading(lb, datanodes[j]);
          }

          if (i == 0) { // first block
            crcType = ct;
          } else if (crcType != DataChecksum.Type.MIXED
              && crcType != ct) {
            // if crc types are mixed in a file
            crcType = DataChecksum.Type.MIXED;
          }

          done = true;

          if (LOG.isDebugEnabled()) {
            if (i == 0) {
              LOG.debug("set bytesPerCRC=" + bytesPerCRC
                  + ", crcPerBlock=" + crcPerBlock);
            }
            LOG.debug("got reply from " + datanodes[j] + ": md5=" + md5);
          }
        } catch (InvalidBlockTokenException ibte) {
          if (i > lastRetriedIndex) {
            if (LOG.isDebugEnabled()) {
              LOG.debug("Got access token error in response to OP_BLOCK_CHECKSUM "
                  + "for file " + src + " for block " + block
                  + " from datanode " + datanodes[j]
                  + ". Will retry the block once.");
            }
            lastRetriedIndex = i;
            done = true; // actually it's not done; but we'll retry
            i--; // repeat at i-th block
            refetchBlocks = true;
            break;
          }
        } catch (IOException ie) {
          LOG.warn("src=" + src + ", datanodes["+j+"]=" + datanodes[j], ie);
        } finally {
          IOUtils.closeStream(in);
          IOUtils.closeStream(out);
        }
      }

      if (!done) {
        throw new IOException("Fail to get block MD5 for " + block);
      }
    }

    //compute file MD5
    final MD5Hash fileMD5 = MD5Hash.digest(md5out.getData()); 
    switch (crcType) {
      case CRC32:
        return new MD5MD5CRC32GzipFileChecksum(bytesPerCRC,
            crcPerBlock, fileMD5);
      case CRC32C:
        return new MD5MD5CRC32CastagnoliFileChecksum(bytesPerCRC,
            crcPerBlock, fileMD5);
      default:
        // If there is no block allocated for the file,
        // return one with the magic entry that matches what previous
        // hdfs versions return.
        if (locatedblocks.size() == 0) {
          return new MD5MD5CRC32GzipFileChecksum(0, 0, fileMD5);
        }

        // we should never get here since the validity was checked
        // when getCrcType() was called above.
        return null;
    }
  }
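
For reference, a hedged usage sketch showing how a caller reaches this method through the public FileSystem API (GetChecksumDemo is a hypothetical class; FileSystem#getFileChecksum is the real entry point). The algorithm name of the returned checksum encodes bytesPerCRC and crcPerBlock, e.g. MD5-of-262144MD5-of-512CRC32C:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetChecksumDemo {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    FileChecksum cs = fs.getFileChecksum(new Path(args[0]));
    System.out.println(cs.getAlgorithmName() + " " + cs);
  }
}

This is also the checksum printed by hadoop fs -checksum, which is why two files can only be compared this way when they share the same block size and bytes-per-checksum settings.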

How DataXceiver computes the Block Checksum

The DataNode reads the block's meta file (the header plus the stored per-chunk CRCs) and computes an MD5 over it:

  • For a whole-block checksum, the MD5 is computed directly over the entire checksum section of the meta file.
  • For a partial-block checksum (the client requested fewer bytes than the replica's visible length), calcPartialBlockChecksum digests only the CRCs covering the requested range.

  @Override
  public void blockChecksum(final ExtendedBlock block,
      final Token<BlockTokenIdentifier> blockToken) throws IOException {
    updateCurrentThreadName("Getting checksum for block " + block);
    final DataOutputStream out = new DataOutputStream(
        getOutputStream());
    checkAccess(out, true, block, blockToken,
        Op.BLOCK_CHECKSUM, BlockTokenSecretManager.AccessMode.READ);
    // client side now can specify a range of the block for checksum
    long requestLength = block.getNumBytes();
    Preconditions.checkArgument(requestLength >= 0);
    long visibleLength = datanode.data.getReplicaVisibleLength(block);
    boolean partialBlk = requestLength < visibleLength;

    final LengthInputStream metadataIn = datanode.data
        .getMetaDataInputStream(block);
    
    final DataInputStream checksumIn = new DataInputStream(
        new BufferedInputStream(metadataIn, HdfsConstants.IO_FILE_BUFFER_SIZE));
    try {
      //read metadata file
      final BlockMetadataHeader header = BlockMetadataHeader
          .readHeader(checksumIn);
      final DataChecksum checksum = header.getChecksum();
      final int csize = checksum.getChecksumSize();
      final int bytesPerCRC = checksum.getBytesPerChecksum();
      final long crcPerBlock = csize <= 0 ? 0 : 
        (metadataIn.getLength() - BlockMetadataHeader.getHeaderSize()) / csize;

      final MD5Hash md5 = partialBlk && crcPerBlock > 0 ? 
          calcPartialBlockChecksum(block, requestLength, checksum, checksumIn)
            : MD5Hash.digest(checksumIn);
      if (LOG.isDebugEnabled()) {
        LOG.debug("block=" + block + ", bytesPerCRC=" + bytesPerCRC
            + ", crcPerBlock=" + crcPerBlock + ", md5=" + md5);
      }

      //write reply
      BlockOpResponseProto.newBuilder()
        .setStatus(SUCCESS)
        .setChecksumResponse(OpBlockChecksumResponseProto.newBuilder()             
          .setBytesPerCrc(bytesPerCRC)
          .setCrcPerBlock(crcPerBlock)
          .setMd5(ByteString.copyFrom(md5.getDigest()))
          .setCrcType(PBHelper.convert(checksum.getChecksumType())))
        .build()
        .writeDelimitedTo(out);
      out.flush();
    } catch (IOException ioe) {
      LOG.info("blockChecksum " + block + " received exception " + ioe);
      incrDatanodeNetworkErrors();
      throw ioe;
    } finally {
      IOUtils.closeStream(out);
      IOUtils.closeStream(checksumIn);
      IOUtils.closeStream(metadataIn);
    }

    //update metrics
    datanode.metrics.addBlockChecksumOp(elapsed());
  }
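
For the partial-block branch, a simplified sketch of what calcPartialBlockChecksum amounts to (the body below is a distillation under assumptions, not the actual DataXceiver helper): digest the stored CRCs of every chunk that lies entirely within requestLength.

import java.io.DataInputStream;
import java.security.MessageDigest;

public class PartialChecksumSketch {
  static byte[] partialMd5(DataInputStream checksumIn, long requestLength,
      int bytesPerCrc, int checksumSize) throws Exception {
    MessageDigest md = MessageDigest.getInstance("MD5");
    long fullChunks = requestLength / bytesPerCrc;  // chunks fully inside the range
    byte[] crc = new byte[checksumSize];
    for (long i = 0; i < fullChunks; i++) {
      checksumIn.readFully(crc);   // one stored CRC per chunk in the meta file
      md.update(crc);
    }
    // The real code additionally recomputes the CRC of the trailing partial
    // chunk (requestLength % bytesPerCrc bytes) from the block data itself
    // and folds it into the digest before finishing.
    return md.digest();
  }
}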

Checksum verification when reading a file

  /**
   * Open a DataInputStream to a DataNode so that it can be read from.
   * We get block ID and the IDs of the destinations at startup, from the namenode.
   */
  private synchronized DNAddrPair blockSeekTo(long target) throws IOException {
    if (target >= getFileLength()) {
      throw new IOException("Attempted to read past end of file");
    }

    // Will be getting a new BlockReader.
    closeCurrentBlockReader();

    //
    // Connect to best DataNode for desired Block, with potential offset
    //
    DatanodeInfo chosenNode = null;
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once
    
    boolean connectFailedOnce = false;

    while (true) {
      //
      // Compute desired block
      //
      LocatedBlock targetBlock = getBlockAt(target, true);
      assert (target==pos) : "Wrong position " + pos + " expect " + target;
      long offsetIntoBlock = target - targetBlock.getStartOffset();

      DNAddrPair retval = chooseDataNode(targetBlock);
      chosenNode = retval.info;
      InetSocketAddress targetAddr = retval.addr;
      StorageType storageType = retval.storageType;

      long startCreateBlockReader = Time.monotonicNow(), setupConnectionSpan = 0;
      try {
        if (targetBlock.getLocations().length == 1) {
          dfsClient.getConf().effectSocketTimeout.set(dfsClient.getConf().socketTimeoutForEC);
        } else {
          dfsClient.getConf().effectSocketTimeout.set(dfsClient.getConf().socketTimeout);
        }
        ExtendedBlock blk = targetBlock.getBlock();
        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
        CachingStrategy curCachingStrategy;
        boolean shortCircuitForbidden;
        synchronized(infoLock) {
          curCachingStrategy = cachingStrategy;
          shortCircuitForbidden = shortCircuitForbidden();
        }
        blockReader = new BlockReaderFactory(dfsClient.getConf()).
            setInetSocketAddress(targetAddr).
            setRemotePeerFactory(dfsClient).
            setDatanodeInfo(chosenNode).
            setStorageType(storageType).
            setFileName(src).
            setBlock(blk).
            setBlockToken(accessToken).
            setStartOffset(offsetIntoBlock).
            setVerifyChecksum(verifyChecksum).
            setClientName(dfsClient.clientName).
            setLength(blk.getNumBytes() - offsetIntoBlock).
            setCachingStrategy(curCachingStrategy).
            setAllowShortCircuitLocalReads(!shortCircuitForbidden).
            setClientCacheContext(dfsClient.getClientContext()).
            setUserGroupInformation(dfsClient.ugi).
            setConfiguration(dfsClient.getConfiguration()).
            setTracer(dfsClient.getTracer()).
            setZoneLease(multiplexZoneLease).
            setZoneOppositeIdc(zoneOppositeIdc).
            setFailoverRemoteRead(retval.failoverRemoteRead).
            build();
        DFSClientFaultInjector.get().readFromDatanodeDelay();
        setupConnectionSpan = Time.monotonicNow() - startCreateBlockReader;
        addBlockReaderInfoAndReset(chosenNode, setupConnectionSpan);
        if(connectFailedOnce) {
          DFSClient.LOG.info("Successfully connected to " + targetAddr +
                             " for " + blk + ", scope:" + chosenNode.getScope());
        }
        return retval;
      } catch (IOException ex) {
        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + ex);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
        } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
          refetchToken--;
          fetchBlockAt(target);
        } else {
          connectFailedOnce = true;
          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block"
            + ", add to deadNodes and continue. " + ex, ex);
          // Put chosen node into dead list, continue
          ioExceptionHandler(targetBlock, chosenNode, null, ex);
        }
      } finally {
        if (setupConnectionSpan == 0)
          setupConnectionSpan = Time.monotonicNow() - startCreateBlockReader;
        dfsClient.stats.incrementReadSetupConnectionSpan(setupConnectionSpan);
      }
    }
  }

The call chain is as follows:

1. readWithStrategy

        a. blockSeekTo (this step creates the BlockReader, via the BlockReaderFactory builder call shown in blockSeekTo above)

Inside build(), the call chain for a remote read continues:

build() ---> getRemoteBlockReaderFromDomain() ---> getRemoteBlockReader ---> RemoteBlockReader2.newBlockReader

        b. readBuffer (delegates the read to the BlockReader object)

After the BlockReader has been created, the client calls the ReaderStrategy's doRead() method, which in turn invokes blockReader.read() to read the block data.
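
From the caller's perspective all of this is transparent. A short usage sketch (ReadDemo is hypothetical; the APIs are the standard public ones): a corrupted chunk surfaces as a ChecksumException from read(), after which the client marks that DataNode dead and retries the range against another replica.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadDemo {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    try (FSDataInputStream in = fs.open(new Path(args[0]))) {
      byte[] buf = new byte[8192];
      int n;
      while ((n = in.read(buf)) > 0) {
        // every chunk in the returned bytes was verified against its CRC on receipt
      }
    }
  }
}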

Block, Chunk, and Packet

1. A block is made up of a sequence of chunks.

2. Each chunk = checksum (4 bytes) + data (512 bytes); the data-to-checksum ratio is 128:1, so a 128 MB block has a roughly 1 MB checksum (meta) file alongside it.

3. Multiple chunks form a packet: packet = header + the checksums of all chunks in the packet + the chunk data (see the verification sketch after this list).
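
A minimal sketch of what per-chunk verification amounts to, using java.util.zip.CRC32 for illustration (HDFS defaults to CRC32C, and the real work happens in DataChecksum#verifyChunkedSums, called from readNextPacket below):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.zip.CRC32;

public class ChunkVerifySketch {
  // data holds a packet's chunk data; checksums holds one 4-byte CRC per chunk.
  static void verifyChunkedSums(ByteBuffer data, ByteBuffer checksums,
      int bytesPerChecksum) throws IOException {
    byte[] chunk = new byte[bytesPerChecksum];
    CRC32 crc = new CRC32();
    while (data.hasRemaining()) {
      int n = Math.min(bytesPerChecksum, data.remaining());
      data.get(chunk, 0, n);        // next chunk (the last one may be short)
      crc.reset();
      crc.update(chunk, 0, n);
      if ((int) crc.getValue() != checksums.getInt()) {
        throw new IOException("checksum mismatch for chunk");
      }
    }
  }
}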

readWithStrategy ---> readBuffer ---> reader.doRead ---> blockReader.read(buf, off, len) ---> RemoteBlockReader2.read ---> readNextPacket

private void readNextPacket() throws IOException {
    //Read packet headers.
    packetReceiver.receiveNextPacket(in);

    PacketHeader curHeader = packetReceiver.getHeader();
    // get the data portion of this packet
    curDataSlice = packetReceiver.getDataSlice();
    assert curDataSlice.capacity() == curHeader.getDataLen();
    
    if (LOG.isTraceEnabled()) {
      LOG.trace("DFSClient readNextPacket got header " + curHeader);
    }

    // Sanity check the lengths
    if (!curHeader.sanityCheck(lastSeqNo)) {
         throw new IOException("BlockReader: error in packet header " +
                               curHeader);
    }
    
    if (curHeader.getDataLen() > 0) {
      int chunks = 1 + (curHeader.getDataLen() - 1) / bytesPerChecksum;
      int checksumsLen = chunks * checksumSize;

      assert packetReceiver.getChecksumSlice().capacity() == checksumsLen :
        "checksum slice capacity=" + packetReceiver.getChecksumSlice().capacity() + 
          " checksumsLen=" + checksumsLen;
      
      lastSeqNo = curHeader.getSeqno();
      if (verifyChecksum && curDataSlice.remaining() > 0) {
        // N.B.: the checksum error offset reported here is actually
        // relative to the start of the block, not the start of the file.
        // This is slightly misleading, but preserves the behavior from
        // the older BlockReader.
        checksum.verifyChunkedSums(curDataSlice,
            packetReceiver.getChecksumSlice(),
            filename, curHeader.getOffsetInBlock());
      }
      bytesNeededToFinish -= curHeader.getDataLen();
    }    
    
    // First packet will include some data prior to the first byte
    // the user requested. Skip it.
    if (curHeader.getOffsetInBlock() < startOffset) {
      int newPos = (int) (startOffset - curHeader.getOffsetInBlock());
      curDataSlice.position(newPos);
    }

    // If we've now satisfied the whole client read, read one last packet
    // header, which should be empty
    if (bytesNeededToFinish <= 0) {
      readTrailingEmptyPacket();
      DFSClientFaultInjector.get().sleepForDatanodeReadTimeout();
      if (verifyChecksum) {
        sendReadResult(Status.CHECKSUM_OK);
      } else {
        sendReadResult(Status.SUCCESS);
      }
    }
  }
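
To make the chunk arithmetic concrete: a packet carrying 64 KB of data with bytesPerChecksum = 512 and checksumSize = 4 holds 1 + (65536 - 1) / 512 = 128 chunks, so its checksum slice is 128 × 4 = 512 bytes. Once bytesNeededToFinish reaches zero, the client reads the trailing empty packet and reports CHECKSUM_OK (or SUCCESS when verification is disabled) back to the DataNode.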
