HBase Flush 流程梳理
接上一篇继续梳理HBase Flush。
HRegion
public FlushResult flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker)
throws IOException {
if (this.closing.get()) {
String msg = "Skipping flush on " + this + " because closing";
LOG.debug(msg);
return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
}
MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
status.enableStatusJournal(false);
status.setStatus("Acquiring readlock on region");
lock.readLock().lock();
try {
if (this.closed.get()) {
String msg = "Skipping flush on " + this + " because closed";
LOG.debug(msg);
status.abort(msg);
return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
}
if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-flush hooks");
coprocessorHost.preFlush();
}
if (numMutationsWithoutWAL.get() > 0) {
numMutationsWithoutWAL.set(0);
dataInMemoryWithoutWAL.set(0);
}
synchronized (writestate) {
if (!writestate.flushing && writestate.writesEnabled) {
this.writestate.flushing = true;
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("NOT flushing memstore for region " + this
+ ", flushing=" + writestate.flushing + ", writesEnabled="
+ writestate.writesEnabled);
}
String msg = "Not flushing since "
+ (writestate.flushing ? "already flushing"
: "writes not enabled");
status.abort(msg);
return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
}
}
try {
Collection<Store> specificStoresToFlush =
forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush();
FlushResult fs = internalFlushcache(specificStoresToFlush,
status, writeFlushRequestWalMarker);
if (coprocessorHost != null) {
status.setStatus("Running post-flush coprocessor hooks");
coprocessorHost.postFlush();
}
if(fs.isFlushSucceeded()) {
flushesQueued.set(0L);
}
status.markComplete("Flush successful");
return fs;
} finally {
synchronized (writestate) {
writestate.flushing = false;
this.writestate.flushRequested = false;
writestate.notifyAll();
}
}
} finally {
lock.readLock().unlock();
if (LOG.isDebugEnabled()) {
LOG.debug("Flush status journal:\n" + status.prettyPrintJournal());
}
status.cleanup();
}
}
Flush线程会调用HRegion中的flush()方法,该方法直接调用flushcache方法。flushcache首先检查region是否正在关闭或者已经关闭,执行coprocessor 的preFlush方法,再检查这个region是否正在flushing或者不可写入,之后调用internalFlushcache方法开始flush,最后执行coprocessor的postFlush方法。读锁保证不会出现flush没有完成但region被close的情况。
internalFlushcache() 方法
protected FlushResult internalFlushcache(final WAL wal, final long myseqid,
final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
throws IOException {
PrepareFlushResult result
= internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker);
if (result.result == null) {
return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
} else {
return result.result;
}
}
从这个方法可以看出实际flush分三个阶段,prepare,flush 和 commit。
internalPrepareFlushCache() 方法
检查阶段
if (this.rsServices != null && this.rsServices.isAborted()) {
throw new IOException("Aborting flush because server is aborted...");
}
final long startTime = EnvironmentEdgeManager.currentTime();
if (this.memstoreSize.get() <= 0) {
MultiVersionConcurrencyControl.WriteEntry writeEntry = null;
this.updatesLock.writeLock().lock();
try {
if (this.memstoreSize.get() <= 0) {
if (wal != null) {
writeEntry = mvcc.begin();
long flushOpSeqId = writeEntry.getWriteNumber();
FlushResult flushResult = new FlushResultImpl(
FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY,
flushOpSeqId,
"Nothing to flush",
writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
mvcc.completeAndWait(writeEntry);
writeEntry = null;
return new PrepareFlushResult(flushResult, myseqid);
} else {
return new PrepareFlushResult(
new FlushResultImpl(
FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY,
"Nothing to flush",
false),
myseqid);
}
}
} finally {
this.updatesLock.writeLock().unlock();
if (writeEntry != null) {
mvcc.complete(writeEntry);
}
}
}
首先检查regionserver是否在aborting,如果是则抛出异常,之后检查该region的memstore大小是否大于0,如果不大于0,则flush nothing,返回一个FlushResult的对象,标记memstore 为空。updatelock 上写锁保证mvcc进行记录的时候,其他线程无法更新memstore的内容。
写入日志信息
if (LOG.isInfoEnabled()) {
StringBuilder perCfExtras = null;
if (!isAllFamilies(storesToFlush)) {
perCfExtras = new StringBuilder();
for (Store store: storesToFlush) {
perCfExtras.append("; ").append(store.getColumnFamilyName());
perCfExtras.append("=").append(StringUtils.byteDesc(store.getMemStoreSize()));
}
}
LOG.info("Flushing " + storesToFlush.size() + "/" + stores.size() +
" column families, memstore=" + StringUtils.byteDesc(this.memstoreSize.get()) +
((perCfExtras != null && perCfExtras.length() > 0)? perCfExtras.toString(): "") +
((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + myseqid));
}
保存MemStore快照
status.setStatus("Obtaining lock to block concurrent updates");
// block waiting for the lock for internal flush
this.updatesLock.writeLock().lock();
status.setStatus("Preparing to flush by snapshotting stores in " +
getRegionInfo().getEncodedName());
long totalFlushableSizeOfFlushableStores = 0;
Set<byte[]> flushedFamilyNames = new HashSet<byte[]>();
for (Store store: storesToFlush) {
flushedFamilyNames.add(store.getFamily().getName());
}
TreeMap<byte[], StoreFlushContext> storeFlushCtxs
= new TreeMap<byte[], StoreFlushContext>(Bytes.BYTES_COMPARATOR);
TreeMap<byte[], List<Path>> committedFiles = new TreeMap<byte[], List<Path>>(
Bytes.BYTES_COMPARATOR);
TreeMap<byte[], Long> storeFlushableSize
= new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
long flushOpSeqId = HConstants.NO_SEQNUM;
long flushedSeqId = HConstants.NO_SEQNUM;
byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
long trxId = 0;
MultiVersionConcurrencyControl.WriteEntry writeEntry = mvcc.begin();
mvcc.completeAndWait(writeEntry);
writeEntry = null;
try {
try {
if (wal != null) {
Long earliestUnflushedSequenceIdForTheRegion =
wal.startCacheFlush(encodedRegionName, flushedFamilyNames);
if (earliestUnflushedSequenceIdForTheRegion == null) {
String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing.";
status.setStatus(msg);
return new PrepareFlushResult(
new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false),
myseqid);
}
flushOpSeqId = getNextSequenceId(wal);
flushedSeqId =
earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM?
flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1;
} else {
flushedSeqId = flushOpSeqId = myseqid;
}
for (Store s : storesToFlush) {
totalFlushableSizeOfFlushableStores += s.getFlushableSize();
storeFlushCtxs.put(s.getFamily().getName(), s.createFlushContext(flushOpSeqId));
committedFiles.put(s.getFamily().getName(), null);
storeFlushableSize.put(s.getFamily().getName(), s.getFlushableSize());
}
if (wal != null && !writestate.readOnly) {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
trxId = WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
desc, false, mvcc);
}
for (StoreFlushContext flush : storeFlushCtxs.values()) {
flush.prepare();
}
} catch (IOException ex) {
if (wal != null) {
if (trxId > 0) {
try {
FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
getRegionInfo(), flushOpSeqId, committedFiles);
WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
desc, false, mvcc);
} catch (Throwable t) {
LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL:" +
StringUtils.stringifyException(t));
}
}
wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
throw ex;
}
} finally {
this.updatesLock.writeLock().unlock();
}
String s = "Finished memstore snapshotting " + this +
", syncing WAL and waiting on mvcc, flushsize=" + totalFlushableSizeOfFlushableStores;
status.setStatus(s);
if (LOG.isTraceEnabled()) LOG.trace(s);
if (wal != null) {
try {
wal.sync();
} catch (IOException ioe) {
wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
throw ioe;
}
}
} finally {
if (writeEntry != null) {
mvcc.complete(writeEntry);
}
}
return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
flushOpSeqId, flushedSeqId, totalFlushableSizeOfFlushableStores);
先更新MVCC版本,写入WAL,flush.prepare() 方法实质上是保存一个store中memstore的快照。
internalFlushCacheAndCommit() 方法
通过上一步internalPrepareFlushCache() 方法返回的结果准备上下文:
TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
long startTime = prepareResult.startTime;
long flushOpSeqId = prepareResult.flushOpSeqId;
long flushedSeqId = prepareResult.flushedSeqId;
long totalFlushableSizeOfFlushableStores = prepareResult.totalFlushableSize;
String s = "Flushing stores of " + this;
status.setStatus(s);
if (LOG.isTraceEnabled()) LOG.trace(s);
boolean compactionRequested = false;
long flushedOutputFileSize = 0;
之后先将所有要flush的memstore快照写入新的file(保存在tmp目录),最后通过commit将临时文件移动到对应familyName的目录。