转载自:http://iwinit.iteye.com/blog/1827404
Flush过程,对应MemStoreFlusher
1.检查是否需要做global flush(全局memstore大小超过低水位),需要则取当前rs上memstore最大的region进行flush
- if (isAboveLowWaterMark()) {
- ....
- //获取memstore最大的region进行flush
- if (!flushOneForGlobalPressure()) {
- // Wasn't able to flush any region, but we're above low water mark
- // This is unlikely to happen, but might happen when closing the
- // entire server - another thread is flushing regions. We'll just
- // sleep a little bit to avoid spinning, and then pretend that
- // we flushed one, so anyone blocked will check again
- lock.lock();
- try {
- Thread.sleep(1000);
- flushOccurred.signalAll();
- } finally {
- lock.unlock();
- }
- }
- //完了,再requeue一个任务,再次check是否超过内存限制
- // Enqueue another one of these tokens so we'll wake up again
- wakeupFlushThread();
2.region flush开始
3.检查region下的store file数量是否超过限制(默认7个,meta region除外),超过则延迟一段时间后重新入队(requeue),稍后再尝试flush
- if (!fqe.region.getRegionInfo().isMetaRegion() &&
- isTooManyStoreFiles(region)) {
- ......
- // Put back on the queue. Have it come back out of the queue
- // after a delay of this.blockingWaitTime / 100 ms.
- this.flushQueue.add(fqe.requeue(this.blockingWaitTime / 100));
- // Tell a lie, it's not flushed but it's ok
- return true;
- }
- }
4.拿Region的读锁(防止flush期间region被关闭),再拿updatesLock的写锁,阻塞写请求
5.MVCC里增加一个事务,代表flush操作
- w = mvcc.beginMemstoreInsert();
- mvcc.advanceMemstore(w);
6.拿Log sequence id
7.take snapshot,将kvset引用切换为snapshot,并为kvset新建一个空集合
- void snapshot() {
- this.lock.writeLock().lock();
- try {
- // If snapshot currently has entries, then flusher failed or didn't call
- // cleanup. Log a warning.
- if (!this.snapshot.isEmpty()) {
- LOG.warn("Snapshot called again without clearing previous. " +
- "Doing nothing. Another ongoing flush or did we fail last attempt?");
- } else {
- if (!this.kvset.isEmpty()) {
- //引用切换
- this.snapshot = this.kvset;
- this.kvset = new KeyValueSkipListSet(this.comparator);
- this.snapshotTimeRangeTracker = this.timeRangeTracker;
- this.timeRangeTracker = new TimeRangeTracker();
- // Reset heap to not include any keys
- this.size.set(DEEP_OVERHEAD);
- // Reset allocator so we get a fresh buffer for the new memstore
- if (allocator != null) {
- this.allocator = new MemStoreLAB(conf);
- }
- }
- }
- } finally {
- this.lock.writeLock().unlock();
- }
- }
8.MVCC等待之前的事务完成
- mvcc.waitForRead(w);
9.将内存中的kv数据flush到hfile,流程如下
10.生成一个临时目录,使用UUID生成一个文件名
11.使用StoreScanner遍历内存中的kv数据,循环append入write cache
12.writer最终flush到hfile
- private Path internalFlushCache(final SortedSet<KeyValue> set,
- final long logCacheFlushId,
- TimeRangeTracker snapshotTimeRangeTracker,
- AtomicLong flushedSize,
- MonitoredTask status)
- throws IOException {
- StoreFile.Writer writer;
- // Find the smallest read point across all the Scanners.
- long smallestReadPoint = region.getSmallestReadPoint();
- long flushed = 0;
- Path pathName;
- // Don't flush if there are no entries.
- if (set.size() == 0) {
- return null;
- }
- //scan方式扫描KVlist数据,注意内存中的kv数据是有序的,先rowkey排序,再按family和qualifier,再按Timestamp
- Scan scan = new Scan();
- scan.setMaxVersions(scanInfo.getMaxVersions());
- // Use a store scanner to find which rows to flush.
- // Note that we need to retain deletes, hence
- // treat this as a minor compaction.
- InternalScanner scanner = new StoreScanner(this, scan, Collections
- .singletonList(new CollectionBackedScanner(set, this.comparator)),
- ScanType.MINOR_COMPACT, this.region.getSmallestReadPoint(),
- HConstants.OLDEST_TIMESTAMP);
- try {
- // TODO: We can fail in the below block before we complete adding this
- // flush to list of store files. Add cleanup of anything put on filesystem
- // if we fail.
- synchronized (flushLock) {
- status.setStatus("Flushing " + this + ": creating writer");
- // A. Write the map out to the disk
- writer = createWriterInTmp(set.size());
- writer.setTimeRangeTracker(snapshotTimeRangeTracker);
- //临时目录
- pathName = writer.getPath();
- try {
- List<KeyValue> kvs = new ArrayList<KeyValue>();
- boolean hasMore;
- do {
- hasMore = scanner.next(kvs);
- if (!kvs.isEmpty()) {
- for (KeyValue kv : kvs) {
- // If we know that this KV is going to be included always, then let us
- // set its memstoreTS to 0. This will help us save space when writing to disk.
- if (kv.getMemstoreTS() <= smallestReadPoint) {
- // let us not change the original KV. It could be in the memstore
- // changing its memstoreTS could affect other threads/scanners.
- kv = kv.shallowCopy();
- kv.setMemstoreTS(0);
- }
- //往cache中写数据
- writer.append(kv);
- flushed += this.memstore.heapSizeChange(kv, true);
- }
- kvs.clear();
- }
- } while (hasMore);
- } finally {
- // Write out the log sequence number that corresponds to this output
- // hfile. The hfile is current up to and including logCacheFlushId.
- status.setStatus("Flushing " + this + ": appending metadata");
- writer.appendMetadata(logCacheFlushId, false);
- status.setStatus("Flushing " + this + ": closing flushed file");
- //flush到HDFS
- writer.close();
- }
- }
- }
- ......
- }
13.将store file从tmp下move到正式目录,并添加到Store file列表
14.清理snapshot
- private boolean updateStorefiles(final StoreFile sf,
- final SortedSet<KeyValue> set)
- throws IOException {
- this.lock.writeLock().lock();
- try {
- //添加到storeFile中
- ArrayList<StoreFile> newList = new ArrayList<StoreFile>(storefiles);
- newList.add(sf);
- storefiles = sortAndClone(newList);
- //释放内存
- this.memstore.clearSnapshot(set);
- } finally {
- // We need the lock, as long as we are updating the storefiles
- // or changing the memstore. Let us release it before calling
- // notifyChangeReadersObservers. See HBASE-4485 for a possible
- // deadlock scenario that could have happened if continue to hold
- // the lock.
- this.lock.writeLock().unlock();
- }
- // Tell listeners of the change in readers.
- notifyChangedReadersObservers();
- return needsCompaction();
- }
15.修改global(rs级别)和当前region的memstore内存大小计数
- public long addAndGetGlobalMemstoreSize(long memStoreSize) {
- if (this.rsAccounting != null) {
- rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
- }
- return this.memstoreSize.getAndAdd(memStoreSize);
- }
16.flush成功后,HLog追加一条flush完成的记录并sync到HDFS;sequence id不大于本次flush的logSeqId的日志所对应的数据已经持久化到hfile,恢复时无需再回放
- public void completeCacheFlush(final byte [] encodedRegionName,
- final byte [] tableName, final long logSeqId, final boolean isMetaRegion)
- throws IOException {
- long start = System.currentTimeMillis();
- try {
- if (this.closed) {
- return;
- }
- long txid = 0;
- synchronized (updateLock) {
- //flush的事务数据
- WALEdit edit = completeCacheFlushLogEdit();
- HLogKey key = makeKey(encodedRegionName, tableName, logSeqId,
- System.currentTimeMillis(), HConstants.DEFAULT_CLUSTER_ID);
- logSyncerThread.append(new Entry(key, edit));
- txid = this.unflushedEntries.incrementAndGet();
- this.numEntries.incrementAndGet();
- }
- // sync txn to file system
- //写入HDFS
- this.sync(txid);
- } finally {
- // updateLock not needed for removing snapshot's entry
- // Cleaning up of lastSeqWritten is in the finally clause because we
- // don't want to confuse getOldestOutstandingSeqNum()
- this.lastSeqWritten.remove(getSnapshotName(encodedRegionName));
- this.cacheFlushLock.unlock();
- }
- long took = System.currentTimeMillis() - start;
- doWALTime.inc(took);
- }
17.唤醒等待的业务线程