Flush过程,对应MemStoreFlusher
1.检查是否需要做global flush(全局memstore内存超过低水位isAboveLowWaterMark),需要则取当前rs上memstore最大的region进行flush
// Fragment (part of the surrounding code is elided with "...."): handling of
// global memstore pressure once the low water mark has been exceeded.
if (isAboveLowWaterMark()) {
....
// Pick the region with the largest memstore and flush it
if (!flushOneForGlobalPressure()) {
// Wasn't able to flush any region, but we're above low water mark
// This is unlikely to happen, but might happen when closing the
// entire server - another thread is flushing regions. We'll just
// sleep a little bit to avoid spinning, and then pretend that
// we flushed one, so anyone blocked will check again
// NOTE(review): the 1000 ms sleep below runs while `lock` is held.
lock.lock();
try {
Thread.sleep(1000);
flushOccurred.signalAll();
} finally {
lock.unlock();
}
}
// Done with this round: requeue another task so that the memory limit
// is checked again.
// Enqueue another one of these tokens so we'll wake up again
wakeupFlushThread();
2.region flush开始
3.检查region下的store file是否超过限制,默认7个,超过则本次不flush,延迟(blockingWaitTime / 100 毫秒)后重新入队(requeue)再检查;meta region不受此限制
// Fragment (part of the body is elided with "......"): for a non-meta region
// that has too many store files, the visible path puts the queue entry back
// on the flush queue with a delay instead of flushing it now, and reports
// success to the caller anyway.
if (!fqe.region.getRegionInfo().isMetaRegion() &&
isTooManyStoreFiles(region)) {
......
// Put back on the queue. Have it come back out of the queue
// after a delay of this.blockingWaitTime / 100 ms.
this.flushQueue.add(fqe.requeue(this.blockingWaitTime / 100));
// Tell a lie, it's not flushed but it's ok
return true;
}
}
4.拿Region的读锁(防止flush期间region被关闭),再拿updatesLock的写锁,阻塞写入
5.MVCC里增加一个事务,代表flush操作
// Open an MVCC write entry that represents the flush, then advance the
// memstore to it.
// NOTE(review): the exact semantics of advanceMemstore are not visible in
// this excerpt - confirm against the MVCC class.
w = mvcc.beginMemstoreInsert();
mvcc.advanceMemstore(w);
6.拿Log sequence id
7.take snapshot,kvList引用切换
/**
 * Moves the current key-value set into the snapshot slot and starts a fresh,
 * empty key-value set in its place.
 *
 * <p>Runs entirely under the write lock. Nothing is changed when a previous
 * snapshot is still present (a warning is logged) or when the current
 * key-value set is empty.
 */
void snapshot() {
  this.lock.writeLock().lock();
  try {
    // A non-empty snapshot means the previous flush either failed or never
    // cleaned up after itself; warn and leave everything untouched.
    if (!this.snapshot.isEmpty()) {
      LOG.warn("Snapshot called again without clearing previous. " +
          "Doing nothing. Another ongoing flush or did we fail last attempt?");
      return;
    }

    // Nothing buffered, nothing to snapshot.
    if (this.kvset.isEmpty()) {
      return;
    }

    // Swap references: the live set becomes the snapshot and a new empty
    // set takes over as the live set.
    this.snapshot = this.kvset;
    this.kvset = new KeyValueSkipListSet(this.comparator);

    // The time range tracker follows its data into the snapshot.
    this.snapshotTimeRangeTracker = this.timeRangeTracker;
    this.timeRangeTracker = new TimeRangeTracker();

    // The live set is empty again, so the tracked heap size no longer
    // includes any keys.
    this.size.set(DEEP_OVERHEAD);

    // Give the new live set a fresh allocation buffer, if one is in use.
    if (allocator != null) {
      this.allocator = new MemStoreLAB(conf);
    }
  } finally {
    this.lock.writeLock().unlock();
  }
}
8.MVCC等待之前的事务完成
// Wait on the MVCC write entry w; per the step description this blocks until
// the transactions started before it have completed.
// NOTE(review): confirm against the MVCC class, which is not shown here.
mvcc.waitForRead(w);
9.将内存中的kv数据flush到hfile,流程如下
10.生成一个临时目录,使用UUID生成一个文件名
11.使用StoreScanner遍历内存中的kv数据,循环append入write cache
12.writer最终flush到hfile
/**
 * Writes the given sorted set of KeyValues out through a store file writer
 * created in the temporary directory.
 *
 * <p>The set is read through a {@code StoreScanner} configured as a minor
 * compaction so that delete markers are retained. Each KeyValue whose
 * memstoreTS is not above the smallest read point is written as a shallow
 * copy with memstoreTS reset to 0. The metadata (including
 * {@code logCacheFlushId}) is appended and the writer closed in a
 * {@code finally} block, i.e. also when the scan loop throws.
 *
 * <p>NOTE(review): the tail of this method is elided in this excerpt
 * ("......"), so the non-null return value and the handling of
 * {@code flushedSize} and {@code flushed} are not visible here.
 *
 * @param set the KeyValues to write; when empty nothing is written
 * @param logCacheFlushId log sequence number stored in the file's metadata
 * @param snapshotTimeRangeTracker time range tracker handed to the writer
 * @param flushedSize not referenced in the visible part of the method
 * @param status task monitor that receives progress messages
 * @return {@code null} when {@code set} is empty; otherwise not visible in
 *         this excerpt
 * @throws IOException declared by the method; raised by the writer/scanner
 *         calls
 */
private Path internalFlushCache(final SortedSet<KeyValue> set,
final long logCacheFlushId,
TimeRangeTracker snapshotTimeRangeTracker,
AtomicLong flushedSize,
MonitoredTask status)
throws IOException {
StoreFile.Writer writer;
// Find the smallest read point across all the Scanners.
long smallestReadPoint = region.getSmallestReadPoint();
long flushed = 0;
Path pathName;
// Don't flush if there are no entries.
if (set.size() == 0) {
return null;
}
// Read the KV set through a scan. Note that the in-memory KVs are sorted:
// by row key first, then by family and qualifier, then by timestamp.
Scan scan = new Scan();
scan.setMaxVersions(scanInfo.getMaxVersions());
// Use a store scanner to find which rows to flush.
// Note that we need to retain deletes, hence
// treat this as a minor compaction.
// NOTE(review): the read point is fetched a second time here instead of
// reusing smallestReadPoint from above - confirm this is intended.
InternalScanner scanner = new StoreScanner(this, scan, Collections
.singletonList(new CollectionBackedScanner(set, this.comparator)),
ScanType.MINOR_COMPACT, this.region.getSmallestReadPoint(),
HConstants.OLDEST_TIMESTAMP);
try {
// TODO: We can fail in the below block before we complete adding this
// flush to list of store files. Add cleanup of anything put on filesystem
// if we fail.
synchronized (flushLock) {
status.setStatus("Flushing " + this + ": creating writer");
// A. Write the map out to the disk
writer = createWriterInTmp(set.size());
writer.setTimeRangeTracker(snapshotTimeRangeTracker);
// Path of the file in the temporary directory
pathName = writer.getPath();
try {
List<KeyValue> kvs = new ArrayList<KeyValue>();
boolean hasMore;
do {
hasMore = scanner.next(kvs);
if (!kvs.isEmpty()) {
for (KeyValue kv : kvs) {
// If we know that this KV is going to be included always, then let us
// set its memstoreTS to 0. This will help us save space when writing to disk.
if (kv.getMemstoreTS() <= smallestReadPoint) {
// let us not change the original KV. It could be in the memstore
// changing its memstoreTS could affect other threads/scanners.
kv = kv.shallowCopy();
kv.setMemstoreTS(0);
}
// Append the KV to the writer's cache
writer.append(kv);
flushed += this.memstore.heapSizeChange(kv, true);
}
kvs.clear();
}
} while (hasMore);
} finally {
// Write out the log sequence number that corresponds to this output
// hfile. The hfile is current up to and including logCacheFlushId.
status.setStatus("Flushing " + this + ": appending metadata");
writer.appendMetadata(logCacheFlushId, false);
status.setStatus("Flushing " + this + ": closing flushed file");
// Closing the writer flushes the file to HDFS
writer.close();
}
}
}
......
}
13.将store file从tmp下move到正式目录,并添加到Store file列表
14.清理snapshot
/**
 * Adds a newly written store file to the store file list and clears the
 * memstore snapshot it was written from.
 *
 * @param sf the store file to add
 * @param set the snapshot set handed to {@code memstore.clearSnapshot}
 * @return the result of {@code needsCompaction()} after the update
 * @throws IOException declared by the method
 */
private boolean updateStorefiles(final StoreFile sf,
    final SortedSet<KeyValue> set)
throws IOException {
  this.lock.writeLock().lock();
  try {
    // Build the replacement list from a copy of the current one, then
    // swap it in.
    ArrayList<StoreFile> withNewFile = new ArrayList<StoreFile>(this.storefiles);
    withNewFile.add(sf);
    this.storefiles = sortAndClone(withNewFile);

    // The snapshot has been written out; release its memory.
    this.memstore.clearSnapshot(set);
  } finally {
    // The write lock is only needed while the store file list or the
    // memstore is being changed. It is released before the observers are
    // notified; see HBASE-4485 for the deadlock that holding it longer
    // could cause.
    this.lock.writeLock().unlock();
  }

  // Let listeners know that the set of readers has changed.
  notifyChangedReadersObservers();

  final boolean compactionNeeded = needsCompaction();
  return compactionNeeded;
}
15.修改global和memstore的内存大小
/**
 * Applies a size delta to this object's memstore size counter and, when
 * {@code rsAccounting} is set, to that accounting object as well.
 *
 * @param memStoreSize the delta to add
 *        (NOTE(review): presumably negative after a flush - confirm with callers)
 * @return the memstore size counter's value after the delta has been applied
 */
public long addAndGetGlobalMemstoreSize(long memStoreSize) {
  if (this.rsAccounting != null) {
    rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
  }
  // addAndGet rather than getAndAdd: the method name promises the updated
  // value, and getAndAdd would hand callers the size from before this delta.
  return this.memstoreSize.addAndGet(memStoreSize);
}
16.flush成功后,HLog增加一条flush完成的记录;序列号小于等于该flush sequence id的日志记录对应的数据已经持久化到hfile,故障恢复时不再需要重放
/**
 * Records in the log that a cache flush has completed.
 *
 * <p>Unless the log is closed, an entry built from
 * {@code completeCacheFlushLogEdit()} is appended under {@code updateLock}
 * and the log is synced up to the resulting transaction id. In every case -
 * including the closed case and failures - the region's snapshot entry is
 * removed from {@code lastSeqWritten} and {@code cacheFlushLock} is released.
 * The elapsed time is only recorded when the append and sync completed
 * normally.
 *
 * @param encodedRegionName encoded name of the flushed region
 * @param tableName name of the table the region belongs to
 * @param logSeqId sequence id used for the key of the flush entry
 * @param isMetaRegion not referenced by this method body
 * @throws IOException declared by the method
 */
public void completeCacheFlush(final byte [] encodedRegionName,
    final byte [] tableName, final long logSeqId, final boolean isMetaRegion)
throws IOException {
  final long startedAt = System.currentTimeMillis();
  try {
    if (this.closed) {
      return;
    }
    final long flushTxid;
    synchronized (updateLock) {
      // The edit that marks the flush as complete.
      WALEdit flushMarker = completeCacheFlushLogEdit();
      HLogKey markerKey = makeKey(encodedRegionName, tableName, logSeqId,
          System.currentTimeMillis(), HConstants.DEFAULT_CLUSTER_ID);
      logSyncerThread.append(new Entry(markerKey, flushMarker));
      flushTxid = this.unflushedEntries.incrementAndGet();
      this.numEntries.incrementAndGet();
    }
    // Sync the transaction to the file system (HDFS).
    this.sync(flushTxid);
  } finally {
    // Removing the snapshot's entry does not need updateLock. It is done in
    // the finally clause so that getOldestOutstandingSeqNum() is never
    // confused by a stale entry.
    this.lastSeqWritten.remove(getSnapshotName(encodedRegionName));
    this.cacheFlushLock.unlock();
  }
  doWALTime.inc(System.currentTimeMillis() - startedAt);
}
17.唤醒等待的业务线程