客户端获取文件的checksum
整体流程如下:
1、获取文件的所有block
2、遍历每个block
3、获取block的所有副本以及所在的datanode,去datanode上获取这个block的checksum
4、从每个block的checksum响应中读取byte-per-checksum(每多少字节数据计算一个crc)、crc-per-block(每个block包含的crc个数)、crc类型以及该block的MD5
5、根据所有block的MD5计算文件的MD5
6、用 bytesPerCRC + crcPerBlock + 文件的MD5 构造MD5MD5CRC32FileChecksum(根据crc类型返回Gzip或Castagnoli子类)
/**
 * Computes the checksum of the first {@code length} bytes of a file.
 *
 * <p>For every located block in the range, the datanodes holding a replica
 * are asked in turn for the block checksum (bytes-per-CRC, CRCs-per-block,
 * CRC type and the MD5 of the block's CRCs). The per-block MD5s are
 * appended to a buffer whose MD5 becomes the file-level digest.
 *
 * <p>If a datanode rejects the block token, the block locations are fetched
 * again (to obtain fresh tokens) and the same block is retried once.
 *
 * @param src    path of the file
 * @param length number of leading bytes to cover; must be {@code >= 0}
 * @return the file checksum; {@code null} when the CRC type is neither
 *         CRC32 nor CRC32C and the file has at least one block
 * @throws FileNotFoundException if the namenode returns no block locations
 * @throws IOException if no datanode can supply the checksum of some block
 */
public MD5MD5CRC32FileChecksum getFileChecksum(String src, long length)
    throws IOException {
  checkOpen();
  Preconditions.checkArgument(length >= 0);
  //get block locations for the file range
  LocatedBlocks blockLocations = callGetBlockLocations(namenode, src, 0,
      length, dfsClientConf.preferedPool);
  if (null == blockLocations) {
    throw new FileNotFoundException("File does not exist: " + src);
  }
  List<LocatedBlock> locatedblocks = blockLocations.getLocatedBlocks();
  final DataOutputBuffer md5out = new DataOutputBuffer();
  int bytesPerCRC = -1;
  DataChecksum.Type crcType = DataChecksum.Type.DEFAULT;
  long crcPerBlock = 0;
  boolean refetchBlocks = false;
  int lastRetriedIndex = -1;

  // get block checksum for each block
  long remaining = length;
  if (src.contains(HdfsConstants.SEPARATOR_DOT_SNAPSHOT_DIR_SEPARATOR)) {
    // for snapshot paths, never look past the file length reported by the namenode
    remaining = Math.min(length, blockLocations.getFileLength());
  }
  for (int i = 0; i < locatedblocks.size() && remaining > 0; i++) {
    if (refetchBlocks) {  // refetch to get fresh tokens
      blockLocations = callGetBlockLocations(namenode, src, 0, length,
          dfsClientConf.preferedPool);
      if (null == blockLocations) {
        throw new FileNotFoundException("File does not exist: " + src);
      }
      locatedblocks = blockLocations.getLocatedBlocks();
      refetchBlocks = false;
    }
    LocatedBlock lb = locatedblocks.get(i);
    final ExtendedBlock block = lb.getBlock();
    if (remaining < block.getNumBytes()) {
      // only part of the last block is requested; tell the datanode how much
      block.setNumBytes(remaining);
    }
    // Remember how much this block consumed so that it can be given back
    // if the block has to be retried after a token refresh.
    final long blockBytes = block.getNumBytes();
    remaining -= blockBytes;
    final DatanodeInfo[] datanodes = lb.getLocations();

    //try each datanode location of the block
    final int timeout = 3000 * datanodes.length + dfsClientConf.socketTimeout;
    boolean done = false;
    for (int j = 0; !done && j < datanodes.length; j++) {
      DataOutputStream out = null;
      DataInputStream in = null;

      try {
        //connect to a datanode
        IOStreamPair pair = connectToDN(datanodes[j], timeout, lb);
        out = new DataOutputStream(new BufferedOutputStream(pair.out,
            HdfsConstants.SMALL_BUFFER_SIZE));
        in = new DataInputStream(pair.in);

        if (LOG.isDebugEnabled()) {
          LOG.debug("write to " + datanodes[j] + ": "
              + Op.BLOCK_CHECKSUM + ", block=" + block);
        }
        // get block MD5
        new Sender(out).blockChecksum(block, lb.getBlockToken());

        final BlockOpResponseProto reply =
            BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in));

        if (reply.getStatus() != Status.SUCCESS) {
          if (reply.getStatus() == Status.ERROR_ACCESS_TOKEN) {
            throw new InvalidBlockTokenException();
          } else {
            throw new IOException("Bad response " + reply + " for block "
                + block + " from datanode " + datanodes[j]);
          }
        }

        OpBlockChecksumResponseProto checksumData =
            reply.getChecksumResponse();

        //read byte-per-checksum
        final int bpc = checksumData.getBytesPerCrc();
        if (i == 0) { //first block
          bytesPerCRC = bpc;
        } else if (bpc != bytesPerCRC) {
          throw new IOException("Byte-per-checksum not matched: bpc=" + bpc
              + " but bytesPerCRC=" + bytesPerCRC);
        }

        //read crc-per-block
        final long cpb = checksumData.getCrcPerBlock();
        if (locatedblocks.size() > 1 && i == 0) {
          crcPerBlock = cpb;
        }

        //read md5
        final MD5Hash md5 = new MD5Hash(
            checksumData.getMd5().toByteArray());
        md5.write(md5out);

        // read crc-type
        final DataChecksum.Type ct;
        if (checksumData.hasCrcType()) {
          ct = PBHelper.convert(checksumData
              .getCrcType());
        } else {
          LOG.debug("Retrieving checksum from an earlier-version DataNode: " +
              "inferring checksum by reading first byte");
          ct = inferChecksumTypeByReading(lb, datanodes[j]);
        }

        if (i == 0) { // first block
          crcType = ct;
        } else if (crcType != DataChecksum.Type.MIXED
            && crcType != ct) {
          // if crc types are mixed in a file
          crcType = DataChecksum.Type.MIXED;
        }

        done = true;

        if (LOG.isDebugEnabled()) {
          if (i == 0) {
            LOG.debug("set bytesPerCRC=" + bytesPerCRC
                + ", crcPerBlock=" + crcPerBlock);
          }
          LOG.debug("got reply from " + datanodes[j] + ": md5=" + md5);
        }
      } catch (InvalidBlockTokenException ibte) {
        if (i > lastRetriedIndex) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Got access token error in response to OP_BLOCK_CHECKSUM "
                + "for file " + src + " for block " + block
                + " from datanode " + datanodes[j]
                + ". Will retry the block once.");
          }
          lastRetriedIndex = i;
          done = true; // actually it's not done; but we'll retry
          i--; // repeat at i-th block
          // Give back the bytes charged for this block: it will be charged
          // again when the loop revisits it. Without this, the block would be
          // counted twice and, when it is the last block of the requested
          // range, the loop would exit without ever retrying it.
          remaining += blockBytes;
          refetchBlocks = true;
          break;
        } else {
          // This block was already retried once with fresh tokens; report the
          // failure and move on to the next datanode.
          LOG.warn("src=" + src + ", datanodes["+j+"]=" + datanodes[j], ibte);
        }
      } catch (IOException ie) {
        LOG.warn("src=" + src + ", datanodes["+j+"]=" + datanodes[j], ie);
      } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
      }
    }

    if (!done) {
      throw new IOException("Fail to get block MD5 for " + block);
    }
  }

  //compute file MD5
  // NOTE(review): getData() may return the buffer's whole backing array rather
  // than only the bytes written - confirm. Left untouched on purpose because
  // any change here would alter the checksum values produced.
  final MD5Hash fileMD5 = MD5Hash.digest(md5out.getData());
  switch (crcType) {
    case CRC32:
      return new MD5MD5CRC32GzipFileChecksum(bytesPerCRC,
          crcPerBlock, fileMD5);
    case CRC32C:
      return new MD5MD5CRC32CastagnoliFileChecksum(bytesPerCRC,
          crcPerBlock, fileMD5);
    default:
      // If there is no block allocated for the file,
      // return one with the magic entry that matches what previous
      // hdfs versions return.
      if (locatedblocks.size() == 0) {
        return new MD5MD5CRC32GzipFileChecksum(0, 0, fileMD5);
      }

      // we should never get here since the validity was checked
      // when getCrcType() was called above.
      return null;
  }
}
DataXceiver 计算Block Checksum
读取Block对应的meta文件内容,并计算其MD5值
- 如果是整个Block校验,直接使用全部meta文件内容计算其MD5值
- 如果是部分Block校验(请求长度小于副本可见长度,且crcPerBlock > 0),则调用calcPartialBlockChecksum按请求长度计算MD5值
/**
 * Answers a block-checksum request: reads the block's metadata stream,
 * derives bytes-per-CRC and CRCs-per-block from it, computes an MD5 over the
 * stored checksums and writes the result back to the requester.
 *
 * <p>When the requested length ({@code block.getNumBytes()}) is smaller than
 * the replica's visible length and the metadata holds at least one CRC, the
 * MD5 comes from {@code calcPartialBlockChecksum}; otherwise it is the digest
 * of everything left in the metadata stream after the header.
 *
 * @param block      the block to checksum; its numBytes is the requested length
 * @param blockToken access token checked for READ access
 * @throws IOException on access or I/O failure (rethrown after it is logged
 *         and the network-error counter is incremented)
 */
@Override
public void blockChecksum(final ExtendedBlock block,
    final Token<BlockTokenIdentifier> blockToken) throws IOException {
  updateCurrentThreadName("Getting checksum for block " + block);
  final DataOutputStream replyOut = new DataOutputStream(getOutputStream());
  checkAccess(replyOut, true, block, blockToken,
      Op.BLOCK_CHECKSUM, BlockTokenSecretManager.AccessMode.READ);

  // The client may ask for only a leading range of the block.
  final long requestLength = block.getNumBytes();
  Preconditions.checkArgument(requestLength >= 0);
  final long visibleLength = datanode.data.getReplicaVisibleLength(block);
  final boolean partialBlk = visibleLength > requestLength;

  final LengthInputStream metaStream =
      datanode.data.getMetaDataInputStream(block);
  final DataInputStream metaIn = new DataInputStream(
      new BufferedInputStream(metaStream, HdfsConstants.IO_FILE_BUFFER_SIZE));

  try {
    // The header of the metadata file describes the checksum in use.
    final DataChecksum checksum =
        BlockMetadataHeader.readHeader(metaIn).getChecksum();
    final int checksumSize = checksum.getChecksumSize();
    final int bytesPerCRC = checksum.getBytesPerChecksum();

    // Number of CRC entries stored after the header.
    final long crcPerBlock;
    if (checksumSize <= 0) {
      crcPerBlock = 0;
    } else {
      crcPerBlock =
          (metaStream.getLength() - BlockMetadataHeader.getHeaderSize())
              / checksumSize;
    }

    final MD5Hash md5;
    if (partialBlk && crcPerBlock > 0) {
      md5 = calcPartialBlockChecksum(block, requestLength, checksum, metaIn);
    } else {
      md5 = MD5Hash.digest(metaIn);
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug("block=" + block + ", bytesPerCRC=" + bytesPerCRC
          + ", crcPerBlock=" + crcPerBlock + ", md5=" + md5);
    }

    // Assemble and send the reply.
    final OpBlockChecksumResponseProto.Builder checksumReply =
        OpBlockChecksumResponseProto.newBuilder()
            .setBytesPerCrc(bytesPerCRC)
            .setCrcPerBlock(crcPerBlock)
            .setMd5(ByteString.copyFrom(md5.getDigest()))
            .setCrcType(PBHelper.convert(checksum.getChecksumType()));
    final BlockOpResponseProto response = BlockOpResponseProto.newBuilder()
        .setStatus(SUCCESS)
        .setChecksumResponse(checksumReply)
        .build();
    response.writeDelimitedTo(replyOut);
    replyOut.flush();
  } catch (IOException ioe) {
    LOG.info("blockChecksum " + block + " received exception " + ioe);
    incrDatanodeNetworkErrors();
    throw ioe;
  } finally {
    IOUtils.closeStream(replyOut);
    IOUtils.closeStream(metaIn);
    IOUtils.closeStream(metaStream);
  }

  // Reached only when the reply was sent successfully.
  datanode.metrics.addBlockChecksumOp(elapsed());
}
读文件校验checksum
/**
 * Creates a new BlockReader positioned at file offset {@code target} and
 * returns the datanode address pair it was connected to.
 *
 * <p>The method loops until a reader is built. On failure it retries after
 * clearing the data encryption key (at most once), after refetching the
 * block location when a token refetch is needed (at most once), or after
 * handing the failure to {@code ioExceptionHandler} (per the log message,
 * the node is added to deadNodes) so that another node can be chosen.
 *
 * @param target absolute offset in the file to read from
 * @return the chosen datanode/address pair
 * @throws IOException if {@code target} is at or beyond the file length, or
 *         if a callee gives up
 */
private synchronized DNAddrPair blockSeekTo(long target) throws IOException {
  if (target >= getFileLength()) {
    throw new IOException("Attempted to read past end of file");
  }

  // A fresh BlockReader is about to be created; release the current one.
  closeCurrentBlockReader();

  int tokenRefetchesLeft = 1;  // a new access token is fetched at most once
  int keyRefetchesLeft = 1;    // a new encryption key is fetched at most once
  boolean sawConnectFailure = false;

  for (;;) {
    // Locate the block containing the target offset.
    final LocatedBlock located = getBlockAt(target, true);
    assert (target==pos) : "Wrong postion " + pos + " expect " + target;
    final long offsetIntoBlock = target - located.getStartOffset();

    // Pick the datanode to read from.
    final DNAddrPair chosen = chooseDataNode(located);
    final DatanodeInfo chosenNode = chosen.info;
    final InetSocketAddress targetAddr = chosen.addr;
    final StorageType storageType = chosen.storageType;

    final long startCreateBlockReader = Time.monotonicNow();
    long setupConnectionSpan = 0;
    try {
      // A block with a single location gets the EC socket timeout,
      // any other block the regular one.
      if (located.getLocations().length != 1) {
        dfsClient.getConf().effectSocketTimeout.set(
            dfsClient.getConf().socketTimeout);
      } else {
        dfsClient.getConf().effectSocketTimeout.set(
            dfsClient.getConf().socketTimeoutForEC);
      }

      final ExtendedBlock blk = located.getBlock();
      final Token<BlockTokenIdentifier> accessToken = located.getBlockToken();

      // Snapshot the settings guarded by infoLock.
      final CachingStrategy curCachingStrategy;
      final boolean shortCircuitForbidden;
      synchronized (infoLock) {
        curCachingStrategy = cachingStrategy;
        shortCircuitForbidden = shortCircuitForbidden();
      }

      blockReader = new BlockReaderFactory(dfsClient.getConf())
          .setInetSocketAddress(targetAddr)
          .setRemotePeerFactory(dfsClient)
          .setDatanodeInfo(chosenNode)
          .setStorageType(storageType)
          .setFileName(src)
          .setBlock(blk)
          .setBlockToken(accessToken)
          .setStartOffset(offsetIntoBlock)
          .setVerifyChecksum(verifyChecksum)
          .setClientName(dfsClient.clientName)
          .setLength(blk.getNumBytes() - offsetIntoBlock)
          .setCachingStrategy(curCachingStrategy)
          .setAllowShortCircuitLocalReads(!shortCircuitForbidden)
          .setClientCacheContext(dfsClient.getClientContext())
          .setUserGroupInformation(dfsClient.ugi)
          .setConfiguration(dfsClient.getConfiguration())
          .setTracer(dfsClient.getTracer())
          .setZoneLease(multiplexZoneLease)
          .setZoneOppositeIdc(zoneOppositeIdc)
          .setFailoverRemoteRead(chosen.failoverRemoteRead)
          .build();
      DFSClientFaultInjector.get().readFromDatanodeDelay();

      setupConnectionSpan = Time.monotonicNow() - startCreateBlockReader;
      addBlockReaderInfoAndReset(chosenNode, setupConnectionSpan);
      if (sawConnectFailure) {
        DFSClient.LOG.info("Successfully connected to " + targetAddr +
            " for " + blk + ", scope:" + chosenNode.getScope());
      }
      return chosen;
    } catch (IOException ex) {
      if (ex instanceof InvalidEncryptionKeyException && keyRefetchesLeft > 0) {
        DFSClient.LOG.info("Will fetch a new encryption key and retry, "
            + "encryption key was invalid when connecting to " + targetAddr
            + " : " + ex);
        // Drop the invalid key so the next attempt obtains a new one.
        keyRefetchesLeft--;
        dfsClient.clearDataEncryptionKey();
        continue;
      }
      if (tokenRefetchesLeft > 0 && tokenRefetchNeeded(ex, targetAddr)) {
        tokenRefetchesLeft--;
        fetchBlockAt(target);
        continue;
      }
      sawConnectFailure = true;
      DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block"
          + ", add to deadNodes and continue. " + ex, ex);
      // Let the handler record the failed node, then try again.
      ioExceptionHandler(located, chosenNode, null, ex);
    } finally {
      // If the span was not recorded on the success path, measure it here.
      if (setupConnectionSpan == 0) {
        setupConnectionSpan = Time.monotonicNow() - startCreateBlockReader;
      }
      dfsClient.stats.incrementReadSetupConnectionSpan(setupConnectionSpan);
    }
  }
}
调用链如下:
1、readWithStrategy
a、blockSeekTo(这一步会完成BlockReader对象的创建)
blockReader = new BlockReaderFactory(dfsClient.getConf()).
setInetSocketAddress(targetAddr).
setRemotePeerFactory(dfsClient).
setDatanodeInfo(chosenNode).
setStorageType(storageType).
setFileName(src).
setBlock(blk).
setBlockToken(accessToken).
setStartOffset(offsetIntoBlock).
setVerifyChecksum(verifyChecksum).
setClientName(dfsClient.clientName).
setLength(blk.getNumBytes() - offsetIntoBlock).
setCachingStrategy(curCachingStrategy).
setAllowShortCircuitLocalReads(!shortCircuitForbidden).
setClientCacheContext(dfsClient.getClientContext()).
setUserGroupInformation(dfsClient.ugi).
setConfiguration(dfsClient.getConfiguration()).
setTracer(dfsClient.getTracer()).
setZoneLease(multiplexZoneLease).
setZoneOppositeIdc(zoneOppositeIdc).
setFailoverRemoteRead(retval.failoverRemoteRead).
build();
调用链如下:
build() ---> getRemoteBlockReaderFromDomain() ---> getRemoteBlockReader ---> RemoteBlockReader2.newBlockReader
b、readBuffer(委托BlockReader对象读取数据)
在创建BlockReader对象后,client会调用BlockReader 对象的doRead()方法读取数据块。
Block、Chunk、Packet
1、block由一个个的chunk组成
2、每个chunk = checksum(4 Byte) + data(512 Byte),数据与检验值的比值为128:1,所以对于一个128M的block会有一个1M的校验文件与之对应。
3、多个chunk组成一个packet,packet = header + 该packet内所有chunk的校验和 + 这些chunk的数据

readWithStrategy --> readBuffer ---> reader.doRead --> blockReader.read(buf, off, len);--->
RemoteBlockReader2.read ---> readNextPacket
/**
 * Receives the next packet from the datanode, validates its header,
 * optionally verifies the chunk checksums of its data, and positions the
 * data slice at the first byte the caller asked for.
 *
 * <p>Once the bytes still needed reach zero or below, the trailing empty
 * packet is read and the read status is reported back.
 *
 * @throws IOException if the packet header fails its sanity check, or if
 *         receiving or verifying the packet fails
 */
private void readNextPacket() throws IOException {
  // Receive the packet and pick up its header.
  packetReceiver.receiveNextPacket(in);
  final PacketHeader header = packetReceiver.getHeader();

  // Data portion of this packet
  curDataSlice = packetReceiver.getDataSlice();
  assert curDataSlice.capacity() == header.getDataLen();

  if (LOG.isTraceEnabled()) {
    LOG.trace("DFSClient readNextPacket got header " + header);
  }

  // Reject packets whose header is inconsistent with the last sequence number.
  if (!header.sanityCheck(lastSeqNo)) {
    throw new IOException("BlockReader: error in packet header " +
        header);
  }

  if (header.getDataLen() > 0) {
    // One checksum per chunk, the last chunk possibly being partial.
    final int chunkCount = 1 + (header.getDataLen() - 1) / bytesPerChecksum;
    final int checksumsLen = chunkCount * checksumSize;

    assert packetReceiver.getChecksumSlice().capacity() == checksumsLen :
        "checksum slice capacity=" + packetReceiver.getChecksumSlice().capacity() +
        " checksumsLen=" + checksumsLen;

    lastSeqNo = header.getSeqno();
    if (verifyChecksum && curDataSlice.remaining() > 0) {
      // N.B.: the offset passed for error reporting is relative to the
      // start of the block, not the start of the file. Slightly
      // misleading, but it preserves the older BlockReader's behavior.
      checksum.verifyChunkedSums(curDataSlice,
          packetReceiver.getChecksumSlice(),
          filename, header.getOffsetInBlock());
    }
    bytesNeededToFinish -= header.getDataLen();
  }

  // The first packet may start before the requested offset; skip the
  // leading bytes the caller did not ask for.
  if (startOffset > header.getOffsetInBlock()) {
    final int skip = (int) (startOffset - header.getOffsetInBlock());
    curDataSlice.position(skip);
  }

  // Whole read satisfied: consume the final, empty packet and report status.
  if (bytesNeededToFinish <= 0) {
    readTrailingEmptyPacket();
    DFSClientFaultInjector.get().sleepForDatanodeReadTimeout();
    sendReadResult(verifyChecksum ? Status.CHECKSUM_OK : Status.SUCCESS);
  }
}

本文详细介绍了HDFS客户端如何获取文件的checksum,包括获取文件的各个block及其checksum,计算MD5FileChecksum的过程。同时解析了DataXceiver计算Block Checksum的步骤,以及在读文件时的校验流程。最后,阐述了Block、Chunk和Packet的关系,描述了它们在数据校验中的作用。

被折叠的 条评论
为什么被折叠?



