leveldb 读写流程
API
// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
//
// This is an abstract interface: every operation below except the static
// Open() factory is declared pure virtual (= 0) and must be provided by a
// subclass. The "..." lines mark members elided from this excerpt.
class LEVELDB_EXPORT DB {
public:
// Open the database with the specified "name".
// Stores a pointer to a heap-allocated database in *dbptr and returns
// OK on success.
// Stores nullptr in *dbptr and returns a non-OK status on error.
// Caller should delete *dbptr when it is no longer needed.
static Status Open(const Options& options, const std::string& name,
DB** dbptr);
...
// Set the database entry for "key" to "value". Returns OK on success,
// and a non-OK status on error.
// Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options, const Slice& key,
const Slice& value) = 0;
// Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database.
// Note: consider setting options.sync = true.
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
// Apply the specified updates to the database.
// "updates" is the batch of buffered operations to apply.
// Returns OK on success, non-OK on failure.
// Note: consider setting options.sync = true.
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
// If the database contains an entry for "key" store the
// corresponding value in *value and return OK.
//
// If there is no entry for "key" leave *value unchanged and return
// a status for which Status::IsNotFound() returns true.
//
// May return some other Status on an error.
virtual Status Get(const ReadOptions& options, const Slice& key,
std::string* value) = 0;
// Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// Caller should delete the iterator when it is no longer needed.
// The returned iterator should be deleted before this db is deleted.
virtual Iterator* NewIterator(const ReadOptions& options) = 0;
// Return a handle to the current DB state. Iterators created with
// this handle will all observe a stable snapshot of the current DB
// state. The caller must call ReleaseSnapshot(result) when the
// snapshot is no longer needed.
virtual const Snapshot* GetSnapshot() = 0;
// Release a previously acquired snapshot. The caller must not
// use "snapshot" after this call.
virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
...
};
// DBImpl derives from DB and overrides its virtual write, read, iterator and
// snapshot operations. The "..." lines mark members elided from this excerpt.
class DBImpl : public DB {
 public:
  ...
  // Implementations of the DB interface
  Status Put(const WriteOptions&, const Slice& key,
             const Slice& value) override;
  Status Delete(const WriteOptions&, const Slice& key) override;
  Status Write(const WriteOptions& options, WriteBatch* updates) override;
  Status Get(const ReadOptions& options, const Slice& key,
             std::string* value) override;
  Iterator* NewIterator(const ReadOptions&) override;
  const Snapshot* GetSnapshot() override;
  void ReleaseSnapshot(const Snapshot* snapshot) override;
  ...
};  // A class definition must be terminated with ';'.
Put & Delete
Put 和 Delete 本质上最后都是调用 Write:Put 写入带 kTypeValue 标记的 KV,Delete 写入带 kTypeDeletion 标记的 KV。
// DBImpl adds nothing of its own for Put: it forwards straight to the
// base-class default DB::Put with the same options, key and value.
Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
  return this->DB::Put(o, key, val);
}
// Default convenience implementation that subclasses of DB may call:
// records the single key/value pair in a local WriteBatch and submits that
// batch through Write(), returning whatever status Write() reports.
Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
  WriteBatch single_put;
  single_put.Put(key, value);
  return Write(opt, &single_put);
}
// DBImpl adds nothing of its own for Delete: it forwards straight to the
// base-class default DB::Delete with the same options and key.
Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
  return this->DB::Delete(options, key);
}
// Default convenience implementation of Delete: records a deletion of "key"
// in a local WriteBatch and submits that batch through Write(), returning
// whatever status Write() reports.
Status DB::Delete(const WriteOptions& opt, const Slice& key) {
  WriteBatch single_delete;
  single_delete.Delete(key);
  return Write(opt, &single_delete);
}
WriteBatch
WriteBatch 封装一个或者多个要写的数据
// WriteBatch buffers a group of Put/Delete updates so they can be handed to
// the database together. All buffered data lives in the string rep_.
// The "..." lines mark members elided from this excerpt.
class LEVELDB_EXPORT WriteBatch {
public:
...
WriteBatch();
// Intentionally copyable.
WriteBatch(const WriteBatch&) = default;
WriteBatch& operator=(const WriteBatch&) = default;
// The destructor drops any buffered updates by calling Clear().
~WriteBatch() {
Clear(); }
// Store the mapping "key->value" in the database.
void Put(const Slice& key, const Slice& value);
// If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(const Slice& key);
// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
static const size_t kHeader = 12;
// Clear all updates buffered in this batch.
void Clear() ;
...
private:
// WriteBatchInternal is granted access to the private representation.
friend class WriteBatchInternal;
std::string rep_; // See comment in write_batch.cc for the format of rep_
};
WriteBatch 的 std::string rep_存储了所有通过 Put/Delete 接口传入的数据。
按照一定格式记录了:sequence (8 byte,该WriteBatch中第一条KV的seq), count (4 byte,该WriteBatch总共包含多少条KV), [操作类型(Put or Delete),key/value的长度及key/value本身]…
Write
外部调写的接口是可以并发的,但leveldb内部保证了写是单线程无并发的
WriteBatch 封装了数据;DBImpl::Writer 则封装了 WriteBatch、绑定 mutex 的条件变量(同步原语)、是否 sync,以及完成标记 done 和结果 status。
// Information kept for every waiting writer.
struct DBImpl::Writer {
  // The condition variable is bound to the mutex passed in by the caller.
  // Every plain member is given a defined initial value so that a Writer is
  // never observed with an indeterminate batch/sync/done, even if a caller
  // forgets to assign one of them before queuing it.
  explicit Writer(port::Mutex* mu)
      : batch(nullptr), sync(false), done(false), cv(mu) {}

  Status status;      // Result reported back to the waiting writer.
  WriteBatch* batch;  // Updates submitted by this writer (not owned here).
  bool sync;          // Whether this writer asked for a synced write.
  bool done;          // Set once this writer's batch has been handled.
  port::CondVar cv;   // Used to wait for / signal this writer.
};
先初始化 Writer
Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
Writer w(&mutex_);
w.batch = updates;
w.sync = options.sync;
w.done = false;
把 w 放进 writers_ 队列,只要自己还没有被写入(一定还在队列里),且队列前面还有其他 Writer,这个线程就等着。
MutexLock l(&mutex_);
writers_.push_back(&w);
while (!w.done && &w != writers_.front()) {
w.cv.Wait();
}
if (w.done) {
// 自己已经被写入(一定已经被弹出队列了),可以返回结果了
return w.status;
}
走到这里,这时候这个线程的 Writer w 已经是队列的第一个了,且拿到了mutex_
// May temporarily unlock and wait.
Status status = MakeRoomForWrite(updates == nullptr);
BuildBatchGroup 的作用是把队列中多个线程的 WriteBatch 写请求聚合到一个 WriteBatch 中;当然聚合后的 size 是有上限的,不能过大。
- last_writer = &w
- 向后遍历 writers_ 队列,逐渐递增 batch
- 更新 last_writer, 相当于记录哪些 writer 中的 batch 已经被聚合了,后面不用再写这些 writer。
uint64_t last_sequence = versions_->LastSequence(); // 获取 last seq
Writer* last_writer = &w;
if (status.ok() && updates != nullptr) {
// nullptr batch is for compactions
WriteBatch* write_batch = BuildBatchGroup(&last_writer);
设置这个聚合过后的write_batch的seq,并且last_sequence += write_batch的entry数量
WriteBatchInternal::SetSequence(write_batch, last_sequence + 1);
last_sequence += WriteBatchInternal::Count(write_batch);
先将write_batch的rep_这一整个string(WriteBatchInternal::Contents(write_batch))作为一条record写入log
再将这个write_batch的每一条entry一条一条的写入mem table,且每一条entry的seq递增(WriteBatchInternal::InsertInto(write_batch, mem_))
// Add to log and apply to memtable. We can release the lock
// during this phase since &w is currently responsible for logging
// and protects against concurrent loggers and concurrent writes
// into mem_.
{
mutex_.Unlock();
status = log_->AddRecord(WriteBatchInternal::Contents(write_batch));
bool sync_error = false;
if (status.ok() && options.sync) {
// sync
status = logfile_->Sync();
if (!status.ok()) {
sync_error = true;
}
}
if (status.ok()) {
status = WriteBatchInternal::InsertInto(write_batch, mem_);
}
mutex_.Lock();
if (sync_error) {
// The state of the log file is indeterminate: the log record we
// just added may or may not show up when the DB is re-opened.
// So we force the DB into a mode where all future writes fail.
RecordBackgroundError(status);
}
}
if (write_batch == tmp_batch_) tmp_batch_->Clear();
更新VersionSet::last_sequence_
versions_->SetLastSequence(last_sequence);
}
因为 last_writer 以及在它之前的 writer 都已经被 BuildBatchGroup 聚合到一起写入了,所以要先将这些 writer 弹出队列,并唤醒它们所属的线程,最后再唤醒剩余队列中的第一个 writer。
while (true) {
Writer* ready = writers_.front();
writers_.pop_front();
if (ready != &w) {
// 不用唤醒 w,因为现在正在执行的就是 w 线程
ready->status = status;
ready->done = true;
ready->cv